diff --git a/README.md b/README.md index 5d5f90b..d9ff2f7 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,10 @@ ![Build and test](https://github.com/mikegoatly/html2md/workflows/Build%20and%20test/badge.svg) -Convert an HTML page to markdown, including re-linking and downloading of images. +Reverse engineer markdown from an HTML page, including: + +- Re-linking and downloading of images +- Front Matter metadata generation ## Usage as a dotnet tool @@ -27,13 +30,28 @@ If unspecified the entire body tag will be processed, otherwise only text contai Allows for specific tags to be ignored. --image-path-prefix|--ipp -The prefix to apply to all rendered image URLs - helpful when you're going to be serving images from a different location, relative or absolute. +The prefix to apply to all rendered image URLs - helpful when you're going to be serving images from a +different location, relative or absolute. --default-code-language The default language to use on code blocks converted from pre tags - defaults to csharp --code-language-class-map -Map between a pre tag's class names and languages. E.g. you might map the class name "sh_csharp" to "csharp" and "sh_powershell" to "powershell". +Map between a pre tag's class names and languages. E.g. you might map the class name "sh_csharp" to "csharp" +and "sh_powershell" to "powershell". + +--front-matter-data +Allows for configuration of information to be extracted to a Front Matter property. This can be an XPath to an element +or attribute in the HTML page, a string constant or a supported macro. +Supported macros: +RelativeUriPath: The relative path of the page being converted. e.g. for https://example.com/pages/page-1 the macro would +return /pages/page-1 + +--front-matter-data-list +Allows for configuration of list-based information to be extracted to a Front Matter property. + +--front-matter-delimiter +The delimiter to write out for the Front Matter section of the converted document. 
The default is --- ``` ## Usage as a nuget package @@ -61,7 +79,50 @@ ConversionResult converted = await converter.ConvertAsync( ``` -`ConvertedDocument` exposes: +You can also extract Front Matter metadata: + +``` csharp + +var options = new ConversionOptions +{ + FrontMatter = + { + Enabled = true, + SingleValueProperties = + { + { "Title", "//h1" }, + { "Author", "{{'Mike Goatly'}}" }, + { "RedirectFrom", @"{{RelativeUriPath}}" } + }, + ArrayValueProperties = + { + { "Tags", @"//p[@class='tags']/a" } + } + } +}; + +var converter = new MarkdownConverter(options); + +ConversionResult converted = await converter.ConvertAsync("https://goatly.net/some-article"); + +``` + +Where the resulting markdown would be: + +``` +--- +Title: Article Title +Author: Mike Goatly +RedirectFrom: /some-article +Tags: + - Help + - Coding +--- +``` + +### `ConvertedDocument` + +`ConvertedDocument` is the result of a conversion process, containing: - `Documents`: The markdown representations of all the converted pages. - `Images`: A collection of images referenced in the documents. Each image includes the downloaded raw data as a byte array. @@ -78,14 +139,22 @@ The default is `csharp`. - `ExcludeTags`: The set of tags to exclude from the conversion process. You can use this if there are certain parts of a document you don't want translating to markdown, e.g. aside, nav, etc. - `CodeLanguageClassMap`: A dictionary mapping between class names that can appear on `pre` tags and the language they map to.E.g. you might map the class name "sh_csharp" to "csharp" and "sh_powershell" to "powershell". +- `FrontMatter`: Configuration for how Front Matter metadata should be emitted into a converted document. + - `Enabled`: Whether Front Matter metadata should be emitted. Defaults to `false`. + - `Delimiter`: The delimiter to write to the Front Matter section. Defaults to `---`. + - `SingleValueProperties`: Configuration of information to be extracted to a Front Matter property. 
This can be an XPath to an element +or attribute in the HTML page, a string constant or a supported macro. Supported macros: + - RelativeUriPath: The relative path of the page being converted. e.g. for https://example.com/pages/page-1 the macro would +return /pages/page-1 + - `ArrayValueProperties`: Configuration of list-based information to be extracted to a Front Matter property. ## Converted content -### `` +### `` and `` `italic` becomes `*italic*` -### `` +### `` and `` `bold` becomes `**bold**` diff --git a/src/Html2md.Core/ConversionOptions.cs b/src/Html2md.Core/ConversionOptions.cs index 4db8f90..8b02ab6 100644 --- a/src/Html2md.Core/ConversionOptions.cs +++ b/src/Html2md.Core/ConversionOptions.cs @@ -19,5 +19,8 @@ public class ConversionOptions : IConversionOptions /// public IDictionary CodeLanguageClassMap { get; set; } = new Dictionary(); + + /// + public FrontMatterOptions FrontMatter { get; set; } = new FrontMatterOptions(); } } diff --git a/src/Html2md.Core/FrontMatterExtractor.cs b/src/Html2md.Core/FrontMatterExtractor.cs new file mode 100644 index 0000000..2a7c282 --- /dev/null +++ b/src/Html2md.Core/FrontMatterExtractor.cs @@ -0,0 +1,75 @@ +using HtmlAgilityPack; +using System; +using System.Text; +using System.Text.RegularExpressions; + +namespace Html2md +{ + public class FrontMatterExtractor + { + public string? 
Extract(FrontMatterOptions options, HtmlDocument document, Uri pageUri)
+        {
+            // Front Matter is opt-in; null tells the caller there is nothing to write.
+            if (!options.Enabled)
+            {
+                return null;
+            }
+
+            var builder = new StringBuilder();
+            builder.AppendLine(options.Delimiter);
+
+            // Scalar properties are written as "Key: value".
+            foreach (var singleValue in options.SingleValueProperties)
+            {
+                builder
+                    .Append(singleValue.Key)
+                    .Append(": ")
+                    .AppendLine(ExtractValue(singleValue.Value, document, pageUri));
+            }
+
+            // List properties are written as "Key:" followed by one " - item" line per matching node.
+            foreach (var arrayValue in options.ArrayValueProperties)
+            {
+                builder
+                    .Append(arrayValue.Key)
+                    .AppendLine(":");
+
+                // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when the
+                // XPath matches nothing, so guard before enumerating; the key is still written,
+                // just with no entries beneath it.
+                var matches = document.DocumentNode.SelectNodes(arrayValue.Value);
+                if (matches == null)
+                {
+                    continue;
+                }
+
+                foreach (var match in matches)
+                {
+                    builder
+                        .Append(" - ")
+                        .AppendLine(match.GetDirectInnerText().Trim());
+                }
+            }
+
+            builder.AppendLine(options.Delimiter);
+
+            return builder.ToString();
+        }
+
+        /// <summary>
+        /// Resolves the configured value of a single-value Front Matter property.
+        /// </summary>
+        /// <param name="xpathOrMacro">
+        /// Either a macro wrapped in double braces (a quoted constant such as {{'text'}}, or
+        /// {{RelativeUriPath}}), or an XPath evaluated against <paramref name="document"/>.
+        /// </param>
+        /// <param name="document">The HTML document the XPath is evaluated against.</param>
+        /// <param name="pageUri">The URI of the page; its LocalPath backs the RelativeUriPath macro.</param>
+        /// <returns>
+        /// The constant text, the macro expansion, or the trimmed attribute value / direct inner text of
+        /// the first matching node. An XPath that matches nothing yields an empty string.
+        /// </returns>
+        /// <exception cref="ArgumentException">The value starts with {{ but is not a recognised macro.</exception>
+        private static string ExtractValue(string xpathOrMacro, HtmlDocument document, Uri pageUri)
+        {
+            // Ordinal comparison: "{{" is syntax, not linguistic text.
+            if (xpathOrMacro.StartsWith("{{", StringComparison.Ordinal))
+            {
+                // {{'text'}} is a string constant: drop the 3-character {{' prefix and '}} suffix.
+                if (Regex.IsMatch(xpathOrMacro, @"^{{'[^']*'}}$"))
+                {
+                    return xpathOrMacro.Substring(3, xpathOrMacro.Length - 6);
+                }
+
+                return xpathOrMacro switch
+                {
+                    "{{RelativeUriPath}}" => pageUri.LocalPath,
+                    _ => throw new ArgumentException("Unknown macro " + xpathOrMacro),
+                };
+            }
+
+            var node = document.DocumentNode.SelectSingleNode(xpathOrMacro);
+            if (node == null)
+            {
+                // Nothing matched: emit an empty value instead of failing the whole conversion
+                // with a NullReferenceException.
+                return string.Empty;
+            }
+
+            // When the XPath ends in /@name, the named attribute is read from the selected node
+            // via GetAttributeValue rather than taking the node's text.
+            var attributeName = Regex.Match(xpathOrMacro, @"/@(\w+)$");
+            if (attributeName.Success)
+            {
+                return node.GetAttributeValue(attributeName.Groups[1].Value, string.Empty).Trim();
+            }
+
+            return node.GetDirectInnerText().Trim();
+        }
+    }
+}
diff --git a/src/Html2md.Core/FrontMatterOptions.cs b/src/Html2md.Core/FrontMatterOptions.cs
new file mode 100644
index 0000000..6b36297
--- /dev/null
+++ b/src/Html2md.Core/FrontMatterOptions.cs
@@ -0,0 +1,32 @@
+using System.Collections.Generic;
+
+namespace Html2md
+{
+    ///
+    /// Configuration for writing Front Matter sections to converted documents. 
+ /// + public class FrontMatterOptions + { + /// + /// Gets or sets a value indicating whether Front Matter should be written to converted documents. + /// + public bool Enabled { get; set; } + + /// + /// Gets or sets the delimiter that should be written to the Front Matter section. Default is ---. + /// + public string Delimiter { get; set; } = "---"; + + /// + /// Gets or sets the XPath or macro properties that should be written to the Front Matter section. + /// If an XPath is provided and more than one element matches, then the first is used. + /// + public Dictionary SingleValueProperties { get; set; } = new Dictionary(); + + /// + /// Gets or sets the XPath properties that should be written to the Front Matter section as a list. Each matching + /// value will be written as an entry in the list. + /// + public Dictionary ArrayValueProperties { get; set; } = new Dictionary(); + } +} diff --git a/src/Html2md.Core/Html2md.Core.csproj b/src/Html2md.Core/Html2md.Core.csproj index 24ba0ba..6de4c4b 100644 --- a/src/Html2md.Core/Html2md.Core.csproj +++ b/src/Html2md.Core/Html2md.Core.csproj @@ -1,7 +1,7 @@ - netstandard2.0 + netstandard2.1 8.0 enable Html2md @@ -13,8 +13,9 @@ LICENSE https://github.com/mikegoatly/html2md convert-html convert-markdown html markdown conversion - 1.0.1 + 1.1.0 https://github.com/mikegoatly/html2md + Added support for extracting Front Matter metadata diff --git a/src/Html2md.Core/IConversionOptions.cs b/src/Html2md.Core/IConversionOptions.cs index 2d9321e..3ff1c7a 100644 --- a/src/Html2md.Core/IConversionOptions.cs +++ b/src/Html2md.Core/IConversionOptions.cs @@ -1,5 +1,4 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; namespace Html2md { @@ -35,5 +34,10 @@ public interface IConversionOptions /// a document you don't want translating to markdown, e.g. aside, nav, etc. /// ISet ExcludeTags { get; } + + /// + /// Gets the FrontMatter configuration to apply to the conversion process. 
+ /// + FrontMatterOptions FrontMatter { get; } } } diff --git a/src/Html2md.Core/MarkdownConverter.cs b/src/Html2md.Core/MarkdownConverter.cs index d49c5a9..70b4312 100644 --- a/src/Html2md.Core/MarkdownConverter.cs +++ b/src/Html2md.Core/MarkdownConverter.cs @@ -20,6 +20,7 @@ public class MarkdownConverter private readonly IConversionOptions options; private readonly ILogger logger; private readonly HttpClient httpClient; + private readonly FrontMatterExtractor frontMatterExtractor = new FrontMatterExtractor(); public MarkdownConverter(IConversionOptions options, ILogger? logger = null) : this(options, null, logger) @@ -70,6 +71,12 @@ private async Task ConvertAsync(Uri pageUri, StringBuilder bu var doc = new HtmlDocument(); doc.LoadHtml(content); + var frontMatter = this.frontMatterExtractor.Extract(this.options.FrontMatter, doc, pageUri); + if (frontMatter != null) + { + builder.Append(frontMatter); + } + this.logger.LogDebug("Processing page content"); this.ProcessNode(pageUri, doc.DocumentNode, builder, imageCollector, false); diff --git a/src/Html2md/CommandLineArgs.cs b/src/Html2md/CommandLineArgs.cs index 48e0585..5ae931f 100644 --- a/src/Html2md/CommandLineArgs.cs +++ b/src/Html2md/CommandLineArgs.cs @@ -52,6 +52,20 @@ public CommandLineArgs(string[] args) SaveArg(args, ref i, ref this.imageOutputLocation); break; + case "--front-matter-delimiter": + this.FrontMatter.Delimiter = GetArgParameter(args, ref i) ?? 
this.FrontMatter.Delimiter;
+                    break;
+
+                case "--front-matter-data":
+                    AddArg(args, ref i, this.FrontMatter.SingleValueProperties);
+                    this.FrontMatter.Enabled = true;
+                    break;
+
+                case "--front-matter-data-list":
+                    AddArg(args, ref i, this.FrontMatter.ArrayValueProperties);
+                    this.FrontMatter.Enabled = true;
+                    break;
+
                 case "--image-path-prefix":
                 case "--ipp":
                     SaveArg(args, ref i, ref this.imagePathPrefix!);
@@ -65,13 +79,13 @@ public CommandLineArgs(string[] args)
                 case "--include-tags":
                 case "--it":
                 case "-t":
-                    SaveArg(args, ref i, ref this.includeTags);
+                    AddArg(args, ref i, ref this.includeTags);
                     break;
 
                 case "--exclude-tags":
                 case "--et":
                 case "-e":
-                    SaveArg(args, ref i, ref this.excludeTags);
+                    AddArg(args, ref i, ref this.excludeTags);
                     break;
 
                 case "--code-language-class-map":
@@ -123,7 +137,7 @@ private void SaveArg(string[] args, ref int i, ref string? arg)
             arg = GetArgParameter(args, ref i);
         }
 
-        private void SaveArg(string[] args, ref int i, ref HashSet arg)
+        private void AddArg(string[] args, ref int i, ref HashSet arg)
         {
             var argValue = GetArgParameter(args, ref i);
             if (argValue != null)
@@ -132,6 +146,34 @@ private void SaveArg(string[] args, ref int i, ref HashSet arg)
             }
         }
 
+        /// <summary>
+        /// Reads the next argument value as a "key:value" pair and stores it in <paramref name="arg"/>.
+        /// </summary>
+        /// <remarks>
+        /// Only the first colon separates the key from the value, so values that themselves contain
+        /// colons (XPath axes such as <c>following-sibling::p</c>, or constants such as
+        /// <c>{{'a: b'}}</c>) are kept intact. A value with no colon sets <see cref="Error"/>.
+        /// </remarks>
+        private void AddArg(string[] args, ref int i, Dictionary<string, string> arg)
+        {
+            var argIndex = i;
+            var argValue = GetArgParameter(args, ref i);
+            if (argValue != null)
+            {
+                // Limit the split to two parts: everything after the first ':' is the value.
+                var pair = argValue.Split(':', 2);
+
+                if (pair.Length != 2)
+                {
+                    this.Error = "Malformed argument value for " + args[argIndex];
+                }
+                else
+                {
+                    arg[pair[0]] = pair[1];
+                }
+            }
+        }
+
         private void SaveArg(string[] args, ref int i, ref Dictionary arg)
         {
             var argIndex = i;
@@ -185,5 +227,7 @@ public LogLevel LogLevel
         public ISet ExcludeTags => this.excludeTags;
 
         public IDictionary CodeLanguageClassMap => this.codeLanguageClassMap;
+
+        public FrontMatterOptions FrontMatter { get; } = new FrontMatterOptions();
     }
 }
\ No newline at end of file
diff --git a/src/Html2md/Html2md.csproj b/src/Html2md/Html2md.csproj
index 
fbb63a1..95ed366 100644 --- a/src/Html2md/Html2md.csproj +++ b/src/Html2md/Html2md.csproj @@ -15,9 +15,10 @@ Copyright Mike Goatly LICENSE https://github.com/mikegoatly/html2md - 1.0.2 + 1.1.0 convert-html convert-markdown html markdown conversion https://github.com/mikegoatly/html2md + Added support for extracting Front Matter metadata diff --git a/src/Html2md/Program.cs b/src/Html2md/Program.cs index 0585842..e9cdeb1 100644 --- a/src/Html2md/Program.cs +++ b/src/Html2md/Program.cs @@ -88,6 +88,17 @@ private static void WriteHelp() Console.WriteLine("--code-language-class-map "); Console.WriteLine("Map between a pre tag's class names and languages. E.g. you might map the class name \"sh_csharp\" to \"csharp\" and \"sh_powershell\" to \"powershell\"."); Console.WriteLine(); + Console.WriteLine("--front-matter-data "); + Console.WriteLine("Allows for configuration of information to be extracted to a Front Matter property. This can be an XPath to an element or attribute in the HTML page, a string constant or a supported macro."); + Console.WriteLine("Supported macros:"); + Console.WriteLine("RelativeUriPath: The relative path of the page being converted. e.g. for https://example.com/pages/page-1 the macro would return /pages/page-1"); + Console.WriteLine(); + Console.WriteLine("--front-matter-data-list "); + Console.WriteLine("Allows for configuration of list-based information to be extracted to a Front Matter property."); + Console.WriteLine(); + Console.WriteLine("--front-matter-delimiter "); + Console.WriteLine("The delimiter to write out for the Front Matter section of the converted document. 
The default is ---"); + Console.WriteLine(); } } } diff --git a/test/Html2md.Tests.Unit/CommandLineArgsTests.cs b/test/Html2md.Tests.Unit/CommandLineArgsTests.cs index 1e271ac..1222266 100644 --- a/test/Html2md.Tests.Unit/CommandLineArgsTests.cs +++ b/test/Html2md.Tests.Unit/CommandLineArgsTests.cs @@ -41,6 +41,78 @@ public void WithFullArgumentNames_ShouldSetValuesCorrectly() }); } + [Fact] + public void WithFrontMatterData_ShouldSetParameterAndEnableFrontMatter() + { + var sut = new CommandLineArgs(new[] { + "-o", + "c:\\test\\output", + "-u", + "http://goatly.net", + "--front-matter-data", + "title://h1", + }); + + sut.FrontMatter.Enabled.Should().BeTrue(); + sut.FrontMatter.SingleValueProperties.Should().BeEquivalentTo( + new Dictionary + { + { "title", "//h1" } + }); + sut.FrontMatter.ArrayValueProperties.Should().BeEmpty(); + } + + [Fact] + public void WithFrontMatterListData_ShouldSetParameterAndEnableFrontMatter() + { + var sut = new CommandLineArgs(new[] { + "-o", + "c:\\test\\output", + "-u", + "http://goatly.net", + "--front-matter-data-list", + "title://h1", + }); + + sut.FrontMatter.Enabled.Should().BeTrue(); + sut.FrontMatter.SingleValueProperties.Should().BeEmpty(); + sut.FrontMatter.ArrayValueProperties.Should().BeEquivalentTo( + new Dictionary + { + { "title", "//h1" } + }); + } + + [Fact] + public void WithNoFrontMatterData_ShouldLeaveFrontMatterConfigDisabledWithDefaultDelmiter() + { + var sut = new CommandLineArgs(new[] { + "-o", + "c:\\test\\output", + "-u", + "http://goatly.net" + }); + + sut.FrontMatter.Enabled.Should().BeFalse(); + sut.FrontMatter.Delimiter.Should().Be("---"); + } + + [Fact] + public void WithCustomFrontMatterDelimiter_ShouldSetParameterAccordingly() + { + var sut = new CommandLineArgs(new[] { + "-o", + "c:\\test\\output", + "-u", + "http://goatly.net", + "--front-matter-delimiter", + "~~~" + }); + + sut.FrontMatter.Enabled.Should().BeFalse(); + sut.FrontMatter.Delimiter.Should().Be("~~~"); + } + [Fact] public void 
WithAbbreviatedArgumentNames_ShouldSetValuesCorrectly() { diff --git a/test/Html2md.Tests.Unit/FrontMatterExtractorTests.cs b/test/Html2md.Tests.Unit/FrontMatterExtractorTests.cs new file mode 100644 index 0000000..fd0f23f --- /dev/null +++ b/test/Html2md.Tests.Unit/FrontMatterExtractorTests.cs @@ -0,0 +1,191 @@ +using FluentAssertions; +using HtmlAgilityPack; +using System; +using Xunit; + +namespace Html2md.Tests.Unit +{ + public class FrontMatterExtractorTests + { + private static readonly string testPage = @" + + + + +
+
+ + +

Adding Application Insights to an existing Windows Store project using Visual Studio 2013 Update 3

+ +

+ Tags: + Application Insights, WinRT , Visual Studio

+ + +
+

As of Update 3, Visual Studio 2013 now has support for Application Insights built in, so I thought I’d have a play with it again.

Right now, my primary focus is adding instrumentation to aWindows 8 Store app I’m working on.I’d tried to do it with the previous Application Insights release (which was in preview), but found the need to explicitly build the app for all the various CPU targets a burden I could do without.The release that comes with Update 3 would seem to have fixed this, by allowing you to target Any CPU. < p > This is a summary of my experience of adding Application Insights to my existing real - world project. < p > First I removed any indication that I had ever had Application Insights added to the store project by removing the ApplicationInsights.config file that was already there from earlier attempts. < p > Then I right - clicked on the project, and selected Add Application Insights – on doing this, I received the following error: < blockquote > < p > Could not add Application Insights to project.& nbsp;

Failed to install package:

Microsoft.ApplicationInsights.WindowsStore

with error:

An error occurred while applying transformation to 'App.xaml' in project '<PROJECT>: No element in the source document matches '/_defaultNamespace:Application/_defaultNamespace:Application.Resources'

It turns out that the installer didn’t like the fact that my application had an unexpected App.xaml structure, due to the use of Prism as the application framework:

<prism:MvvmAppBase x:Class='Chordle.UI.App'
xmlns=http://schemas.microsoft.com/winfx/2006/xaml/presentation
xmlns:x=http://schemas.microsoft.com/winfx/2006/xaml
xmlns:prism='using:Microsoft.Practices.Prism.StoreApps'>
    <prism:MvvmAppBase.Resources>
        <ResourceDictionary> +…
        </ResourceDictionary>
    </prism:MvvmAppBase.Resources>
</prism:MvvmAppBase>
+

So to get around this, I had to comment out my existing XAML and add in a temporary Application.Resources area, like this:

<!--<prism:MvvmAppBase x:Class='Chordle.UI.App'
xmlns=http://schemas.microsoft.com/winfx/2006/xaml/presentation
xmlns:x=http://schemas.microsoft.com/winfx/2006/xaml
xmlns:prism='using:Microsoft.Practices.Prism.StoreApps'>
    <prism:MvvmAppBase.Resources>
        <ResourceDictionary> +…
        </ResourceDictionary>
    </prism:MvvmAppBase.Resources>
</prism:MvvmAppBase>—->
+<xaml:Application xmlns:xaml='http://schemas.microsoft.com/winfx/2006/xaml/presentation'>
<xaml:Application.Resources />
</xaml:Application>
+

And after closing App.xaml, I tried to add App Insights again, this time it succeeded, but I obviously had to fix-up the App.xaml file by uncommenting the original XAML, and moving the new ai:TelemetryContext resource into my own resource dictionary structure.

+

After all this, I finally discovered that currently you can’t yet view Windows Store/Phone telemetry in the preview Azure Portal, which is where the telemetry is going now, so there’s no way for me to test our whether this has actually worked… I’ll write another post when I’ve got more to add!

+ +

No Comments

+ +
+ + +".Replace('\'', '"'); + + private static readonly Uri testPageUri = new Uri("http://goatly.net/2012/03/some-post"); + + [Fact] + public void WhenDisabledShouldReturnNull() + { + this.RunFrontMatterTest( + new FrontMatterOptions() { Enabled = false }, + testPage, + testPageUri, + null); + } + + [Fact] + public void ShouldMapInnerTextForSingleProperties() + { + this.RunFrontMatterTest( + new FrontMatterOptions() + { + Enabled = true, + SingleValueProperties = + { + { "title", "//article/header/h1" }, + { "firsttag", "//p[@class='tags']/a" } + } + }, + testPage, + testPageUri, + @"--- +title: Adding Application Insights to an existing Windows Store project using Visual Studio 2013 Update 3 +firsttag: Application Insights +--- +"); + } + + [Fact] + public void ShouldUseAlternativeDelimiterIfProvided() + { + this.RunFrontMatterTest( + new FrontMatterOptions() + { + Enabled = true, + Delimiter = "~~~", + SingleValueProperties = + { + { "title", "//article/header/h1" } + } + }, + testPage, + testPageUri, + @"~~~ +title: Adding Application Insights to an existing Windows Store project using Visual Studio 2013 Update 3 +~~~ +"); + } + + [Fact] + public void ShouldMapAttributeValuesForSingleProperties() + { + this.RunFrontMatterTest( + new FrontMatterOptions() + { + Enabled = true, + SingleValueProperties = + { + { "generator", "/html/head/meta[@name='generator']/@content" } + } + }, + testPage, + testPageUri, + @"--- +generator: Orchard +--- +"); + } + + [Fact] + public void ShouldReturnConstantMacroValue() + { + this.RunFrontMatterTest( + new FrontMatterOptions() + { + Enabled = true, + SingleValueProperties = + { + { "author", @"{{'Mike Goatly'}}" } + } + }, + testPage, + testPageUri, + @"--- +author: Mike Goatly +--- +"); + } + + [Fact] + public void ShouldExpandMacroValues() + { + this.RunFrontMatterTest( + new FrontMatterOptions() + { + Enabled = true, + SingleValueProperties = + { + { "RedirectFrom", @"{{RelativeUriPath}}" } + } + }, + testPage, + testPageUri, + 
@"--- +RedirectFrom: /2012/03/some-post +--- +"); + } + + [Fact] + public void ShouldMapArrayValues() + { + this.RunFrontMatterTest( + new FrontMatterOptions() + { + Enabled = true, + ArrayValueProperties = + { + { "Tags", @"//p[@class='tags']/a" } + } + }, + testPage, + testPageUri, + @"--- +Tags: + - Application Insights + - WinRT + - Visual Studio +--- +"); + } + + private void RunFrontMatterTest(FrontMatterOptions frontMatterOptions, string testPage, Uri pageUri, string expectedResult) + { + var document = new HtmlDocument(); + document.LoadHtml(testPage); + var result = new FrontMatterExtractor().Extract(frontMatterOptions, document, pageUri); + + result.Should().Be(expectedResult); + } + } +} diff --git a/test/Html2md.Tests.Unit/MarkdownConverterTests.cs b/test/Html2md.Tests.Unit/MarkdownConverterTests.cs index e690ef0..13d2ac9 100644 --- a/test/Html2md.Tests.Unit/MarkdownConverterTests.cs +++ b/test/Html2md.Tests.Unit/MarkdownConverterTests.cs @@ -56,6 +56,28 @@ await TestConverter( ""); } + [Fact] + public async Task ShouldApplyFrontMatterWhenConfigured() + { + await TestConverter( + "

Doc title

test

", + @"--- +Title: Doc title +--- +test + +", + new ConversionOptions + { + ExcludeTags = { "h1" }, + FrontMatter = + { + Enabled = true, + SingleValueProperties = { { "Title", "/body/h1" } } + } + }); + } + [Fact] public async Task ShouldConvertEm() {