Skip to content

Finish LLM text exporter #1417

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
using Markdig.Syntax;
using Markdig.Syntax.Inlines;

namespace Elastic.Markdown.Slices;
namespace Elastic.Markdown;

public interface IDescriptionGenerator
{
Expand Down
11 changes: 9 additions & 2 deletions src/Elastic.Markdown/DocumentationGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
using Elastic.Markdown.Exporters;
using Elastic.Markdown.IO;
using Elastic.Markdown.Links.CrossLinks;
using Elastic.Markdown.Slices;
using Markdig.Syntax;
using Microsoft.Extensions.Logging;

Expand Down Expand Up @@ -174,6 +173,7 @@ await Parallel.ForEachAsync(DocumentationSet.Files, ctx, async (file, token) =>
_logger.LogInformation("-> Processed {ProcessedFiles}/{TotalFileCount} files", processedFiles, totalFileCount);
});
_logger.LogInformation("-> Processed {ProcessedFileCount}/{TotalFileCount} files", processedFileCount, totalFileCount);

}

private void HintUnusedSubstitutionKeys()
Expand Down Expand Up @@ -246,7 +246,14 @@ private async Task ProcessFile(HashSet<string> offendingFiles, DocumentationFile
foreach (var exporter in _markdownExporters)
{
var document = context.MarkdownDocument ??= await markdown.ParseFullAsync(ctx);
_ = await exporter.ExportAsync(new MarkdownExportContext { Document = document, File = markdown }, ctx);
_ = await exporter.ExportAsync(new MarkdownExportFileContext
{
BuildContext = Context,
Resolvers = DocumentationSet.MarkdownParser.Resolvers,
Document = document,
SourceFile = markdown,
DefaultOutputFile = outputFile
}, ctx);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
using System.IO.Abstractions;
using Elastic.Documentation.Configuration;
using Elastic.Markdown.IO;
using Elastic.Markdown.Slices;
using Markdig.Syntax;

namespace Elastic.Markdown.Exporters;
Expand Down
17 changes: 14 additions & 3 deletions src/Elastic.Markdown/Exporters/IMarkdownExporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,32 @@
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.IO.Abstractions;
using Elastic.Documentation.Configuration;
using Elastic.Markdown.IO;
using Elastic.Markdown.Myst;
using Markdig.Syntax;

namespace Elastic.Markdown.Exporters;

public class MarkdownExportContext

public record MarkdownExportContext
{
}
public record MarkdownExportFileContext
{
public required BuildContext BuildContext { get; init; }
public required IParserResolvers Resolvers { get; init; }
public required MarkdownDocument Document { get; init; }
public required MarkdownFile File { get; init; }
public required MarkdownFile SourceFile { get; init; }
public required IFileInfo DefaultOutputFile { get; init; }
public string? LLMText { get; set; }
}

public interface IMarkdownExporter
{
ValueTask StartAsync(Cancel ctx = default);
ValueTask StopAsync(Cancel ctx = default);
ValueTask<bool> ExportAsync(MarkdownExportContext context, Cancel ctx);
ValueTask<bool> ExportAsync(MarkdownExportFileContext fileContext, Cancel ctx);
ValueTask<bool> FinishExportAsync(IDirectoryInfo outputFolder, Cancel ctx);
}
127 changes: 127 additions & 0 deletions src/Elastic.Markdown/Exporters/LLMTextExporter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Buffers;
using System.IO.Abstractions;
using System.IO.Compression;
using System.Text;
using Elastic.Documentation.Configuration;
using Elastic.Markdown.Helpers;
using Elastic.Markdown.Myst;
using Elastic.Markdown.Myst.FrontMatter;

namespace Elastic.Markdown.Exporters;

/// <summary>
/// Exports every markdown source file as plain "LLM text": the raw markdown with
/// <c>:::{include}</c> directives that point at <c>_snippets</c> markdown files inlined and
/// substitutions replaced. Once all files are exported the generated <c>*.md</c> files are
/// bundled into <c>llm.zip</c>.
/// </summary>
public class LLMTextExporter : IMarkdownExporter
{
    /// <summary>Opening marker of an include directive.</summary>
    private const string IncludeMarker = ":::{include}";

    /// <summary>
    /// The markers <see cref="Read"/> scans for: an include directive or a bare closing fence
    /// terminated by either line-ending style. Built once because creating
    /// <see cref="SearchValues{T}"/> is comparatively expensive and <see cref="Read"/> recurses.
    /// </summary>
    private static readonly SearchValues<string> DirectiveMarkers =
        SearchValues.Create([IncludeMarker, ":::\r\n", ":::\n"], StringComparison.OrdinalIgnoreCase);

    /// <summary>No start-up work is required for this exporter.</summary>
    public ValueTask StartAsync(Cancel ctx = default) => ValueTask.CompletedTask;

    /// <summary>No shutdown work is required for this exporter.</summary>
    public ValueTask StopAsync(Cancel ctx = default) => ValueTask.CompletedTask;

    /// <summary>
    /// Writes the LLM text of a single markdown file to disk.
    /// </summary>
    /// <remarks>
    /// The text is taken from <see cref="MarkdownExportFileContext.LLMText"/> when already set,
    /// otherwise it is computed through <see cref="ToLLMText"/> and cached on the context.
    /// The target is <see cref="MarkdownExportFileContext.DefaultOutputFile"/>, except for
    /// <c>index.md</c> files: the index in the documentation output root is written to
    /// <c>llms.md</c>, any other index is written to <c>&lt;parent folder&gt;.md</c>.
    /// </remarks>
    /// <param name="fileContext">The file being exported together with its build context.</param>
    /// <param name="ctx">Token used to cancel the file write.</param>
    /// <returns>Always <c>true</c> once the file has been written.</returns>
    public async ValueTask<bool> ExportAsync(MarkdownExportFileContext fileContext, Cancel ctx)
    {
        var source = fileContext.SourceFile.SourceFile;
        var fs = source.FileSystem;
        var llmText = fileContext.LLMText ??= ToLLMText(fileContext.BuildContext, fileContext.SourceFile.YamlFrontMatter, fileContext.Resolvers, source);

        // write to the output version of the Markdown file directly
        var outputFile = fileContext.DefaultOutputFile;
        if (outputFile.Name == "index.md")
        {
            var root = fileContext.BuildContext.DocumentationOutputDirectory;
            // TODO in FinishExportAsync find a way to generate llms.txt
            // e.g. should it embed all the links?
            outputFile = outputFile.Directory!.FullName == root.FullName
                ? fs.FileInfo.New(Path.Combine(root.FullName, "llms.md"))
                // Write to a file named after the parent folder
                : fs.FileInfo.New(outputFile.Directory!.FullName + ".md");
        }

        if (outputFile.Directory is { Exists: false })
            outputFile.Directory.Create();

        await fs.File.WriteAllTextAsync(outputFile.FullName, llmText, ctx);
        return true;
    }

    /// <summary>
    /// Bundles every <c>*.md</c> file found (recursively) under <c>&lt;outputFolder&gt;/docs</c>
    /// into <c>&lt;outputFolder&gt;/docs/llm.zip</c>, using paths relative to that folder as entry names.
    /// </summary>
    /// <param name="outputFolder">The build output folder that contains the <c>docs</c> directory.</param>
    /// <param name="ctx">Cancellation token (currently not observed by this method).</param>
    /// <returns>Always <c>true</c> once the archive has been written.</returns>
    public ValueTask<bool> FinishExportAsync(IDirectoryInfo outputFolder, Cancel ctx)
    {
        var outputDirectory = Path.Combine(outputFolder.FullName, "docs");
        var zipPath = Path.Combine(outputDirectory, "llm.zip");
        var markdownFiles = Directory.GetFiles(outputDirectory, "*.md", SearchOption.AllDirectories);

        // FileMode.Create truncates an archive left behind by a previous build.
        // ZipFile.Open(path, ZipArchiveMode.Create) would throw an IOException in that case.
        using (var stream = new FileStream(zipPath, FileMode.Create))
        using (var zip = new ZipArchive(stream, ZipArchiveMode.Create))
        {
            foreach (var file in markdownFiles)
            {
                var relativePath = Path.GetRelativePath(outputDirectory, file);
                _ = zip.CreateEntryFromFile(file, relativePath);
            }
        }

        return ValueTask.FromResult(true);
    }

    /// <summary>
    /// Produces the LLM text for <paramref name="source"/>: the file contents with snippet
    /// includes inlined (see <see cref="Read"/>) and substitutions replaced.
    /// </summary>
    /// <param name="buildContext">Build context; supplies the documentation source directory and parser state.</param>
    /// <param name="frontMatter">Front matter of the file, made available to the substitution pass.</param>
    /// <param name="resolvers">Provides the cross link resolver and documentation file lookup for the parser state.</param>
    /// <param name="source">The markdown file to read.</param>
    /// <returns>The processed markdown text.</returns>
    public static string ToLLMText(BuildContext buildContext, YamlFrontMatter? frontMatter, IParserResolvers resolvers, IFileInfo source)
    {
        var sb = DocumentationObjectPoolProvider.StringBuilderPool.Get();
        string full;
        try
        {
            Read(source, source.FileSystem, sb, buildContext.DocumentationSourceDirectory);
            full = sb.ToString();
        }
        finally
        {
            // hand the builder back to the pool even when reading a file fails
            DocumentationObjectPoolProvider.StringBuilderPool.Return(sb);
        }

        var state = new ParserState(buildContext)
        {
            YamlFrontMatter = frontMatter,
            MarkdownSourcePath = source,
            CrossLinkResolver = resolvers.CrossLinkResolver,
            DocumentationFileLookup = resolvers.DocumentationFileLookup
        };
        return full.ReplaceSubstitutions(new ParserContext(state));
    }

    /// <summary>
    /// Appends the contents of <paramref name="source"/> to <paramref name="sb"/>.
    /// Each <c>:::{include}</c> directive (its line plus everything up to and including its
    /// closing <c>:::</c> fence) is removed; when the directive targets an existing <c>.md</c>
    /// file whose path contains <c>_snippets</c>, that file is read recursively in its place.
    /// All other text, including other directives and their closing fences, is copied verbatim.
    /// </summary>
    /// <param name="source">File to read.</param>
    /// <param name="fs">File system used to read <paramref name="source"/> and resolve includes.</param>
    /// <param name="sb">Receives the resulting text.</param>
    /// <param name="setDirectory">Root used to resolve include paths that start with <c>/</c>.</param>
    private static void Read(IFileInfo source, IFileSystem fs, StringBuilder sb, IDirectoryInfo setDirectory)
    {
        var text = fs.File.ReadAllText(source.FullName).AsSpan();

        // start of the text that has not been copied to (or dropped from) the output yet
        var copyFrom = 0;
        // true after an include line: the next bare fence closes that include and must be dropped
        var insideInclude = false;

        int offset;
        while ((offset = text[copyFrom..].IndexOfAny(DirectiveMarkers)) >= 0)
        {
            var cursor = copyFrom + offset;
            var marker = text[cursor..];

            if (marker.StartsWith(IncludeMarker, StringComparison.OrdinalIgnoreCase))
            {
                _ = sb.Append(text[copyFrom..cursor].TrimEnd("\r\n".AsSpan()));

                var pathStart = marker.IndexOf('}') + 1;
                var lineEnd = marker.IndexOf('\n');
                // the directive may sit on the very last line without a terminating newline
                if (lineEnd < 0)
                    lineEnd = marker.Length;

                var relativeFile = marker[pathStart..lineEnd].Trim();
                // a leading '/' anchors the include at the documentation set root
                var includePath = relativeFile.StartsWith('/')
                    ? Path.Combine(setDirectory.FullName, relativeFile.TrimStart('/').ToString())
                    : Path.GetFullPath(Path.Combine(source.Directory!.FullName, relativeFile.ToString()));
                var includeSource = fs.FileInfo.New(includePath);

                // Extension includes the leading period, comparing against "md" never matches.
                // NOTE(review): snippets that include each other cyclically would recurse without bound.
                if (includeSource.Extension == ".md" && includePath.Contains("_snippets") && includeSource.Exists)
                {
                    // the preceding text had its trailing newlines trimmed; keep the snippet in its own paragraph
                    if (sb.Length > 0)
                    {
                        EnsureTrailingNewLine(sb);
                        _ = sb.Append('\n');
                    }

                    Read(includeSource, fs, sb, setDirectory);
                    EnsureTrailingNewLine(sb);
                }

                copyFrom = cursor + lineEnd;
                insideInclude = true;
                continue;
            }

            // a bare fence: ":::" followed by "\n" or "\r\n"
            var fenceEnd = cursor + (marker[3] == '\r' ? 5 : 4);
            if (!insideInclude)
            {
                // closing fence of some other directive: its content belongs in the output
                _ = sb.Append(text[copyFrom..fenceEnd]);
            }

            insideInclude = false;
            copyFrom = fenceEnd;
        }

        var tail = text[copyFrom..];
        // closing fence of the last include sitting at the end of the file without a newline
        if (insideInclude && tail.Trim() is ":::")
            return;

        _ = sb.Append(tail);
    }

    /// <summary>Appends a line feed unless <paramref name="sb"/> is empty or already ends with one.</summary>
    private static void EnsureTrailingNewLine(StringBuilder sb)
    {
        if (sb.Length > 0 && sb[sb.Length - 1] != '\n')
            _ = sb.Append('\n');
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
using Elastic.Documentation.Site.Navigation;
using Elastic.Markdown.Extensions.DetectionRules;
using Elastic.Markdown.IO;
using Elastic.Markdown.Page;
using Markdig.Syntax;
using RazorSlices;
using IFileInfo = System.IO.Abstractions.IFileInfo;

namespace Elastic.Markdown.Slices;
namespace Elastic.Markdown;

public class HtmlWriter(
DocumentationSet documentationSet,
Expand Down Expand Up @@ -99,7 +100,7 @@ private async Task<string> RenderLayout(MarkdownFile markdown, MarkdownDocument
if (PositionalNavigation.MarkdownNavigationLookup.TryGetValue("docs-content://versions.md", out var item))
allVersionsUrl = item.Url;

var slice = Index.Create(new IndexViewModel
var slice = Page.Index.Create(new IndexViewModel
{
SiteName = siteName,
DocSetName = DocumentationSet.Name,
Expand Down
1 change: 0 additions & 1 deletion src/Elastic.Markdown/IO/DocumentationFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
using Elastic.Documentation.Site;
using Elastic.Markdown.Myst;
using Elastic.Markdown.Myst.FrontMatter;
using Elastic.Markdown.Slices;

namespace Elastic.Markdown.IO;

Expand Down
13 changes: 1 addition & 12 deletions src/Elastic.Markdown/IO/MarkdownFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
using Elastic.Markdown.Links.CrossLinks;
using Elastic.Markdown.Myst;
using Elastic.Markdown.Myst.Directives;
using Elastic.Markdown.Myst.Directives.Include;
using Elastic.Markdown.Myst.FrontMatter;
using Elastic.Markdown.Myst.InlineParsers;
using Elastic.Markdown.Slices;
using Markdig;
using Markdig.Extensions.Yaml;
using Markdig.Renderers.Roundtrip;
Expand Down Expand Up @@ -185,17 +185,6 @@ public async Task<MarkdownDocument> ParseFullAsync(Cancel ctx)
return document;
}

/// <summary>
/// Writes <paramref name="document"/> through a <see cref="RoundtripRenderer"/> into an
/// in-memory writer and returns the resulting markdown text.
/// </summary>
/// <param name="document">The parsed markdown document to render.</param>
/// <returns>The text produced by the renderer.</returns>
public static string ToLLMText(MarkdownDocument document)
{
    var buffer = new StringWriter();
    using (buffer)
    {
        var renderer = new RoundtripRenderer(buffer);
        renderer.Write(document);
        return buffer.ToString();
    }
}

private IReadOnlyDictionary<string, string> GetSubstitutions()
{
var globalSubstitutions = _globalSubstitutions;
Expand Down
79 changes: 0 additions & 79 deletions src/Elastic.Markdown/IO/Navigation/DocumentationGroup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
// See the LICENSE file in the project root for more information

using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using Elastic.Documentation;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Configuration.TableOfContents;
Expand All @@ -12,84 +11,6 @@

namespace Elastic.Markdown.IO.Navigation;

/// <summary>
/// Navigation leaf that wraps a single <see cref="MarkdownFile"/> belonging to a <see cref="DocumentationGroup"/>.
/// </summary>
/// <param name="Model">The markdown file this item represents.</param>
/// <param name="Group">The group the file belongs to; seeds <see cref="Parent"/> and <see cref="NavigationRoot"/>.</param>
/// <param name="Hidden">Hidden flag for this item, <c>false</c> by default.</param>
[DebuggerDisplay("Current: {Model.RelativePath}")]
public record FileNavigationItem(MarkdownFile Model, DocumentationGroup Group, bool Hidden = false)
    : ILeafNavigationItem<MarkdownFile>
{
    /// <summary>The parent node, initially the <c>Group</c> passed to the constructor.</summary>
    public INodeNavigationItem<INavigationModel, INavigationItem>? Parent { get; set; } = Group;

    /// <summary>The navigation root, captured from <c>Group.NavigationRoot</c> at construction.</summary>
    public IRootNavigationItem<INavigationModel, INavigationItem> NavigationRoot { get; } = Group.NavigationRoot;

    /// <summary>The url of the wrapped <see cref="MarkdownFile"/>.</summary>
    public string Url
    {
        get { return Model.Url; }
    }

    /// <summary>The navigation title of the wrapped <see cref="MarkdownFile"/>.</summary>
    public string NavigationTitle
    {
        get { return Model.NavigationTitle; }
    }

    /// <summary>Index of this item within the navigation; assigned externally.</summary>
    public int NavigationIndex { get; set; }
}

/// <summary>
/// Keeps track of <see cref="TableOfContentsTree"/> instances keyed by their source <see cref="Uri"/>.
/// </summary>
public class TableOfContentsTreeCollector
{
    private readonly Dictionary<Uri, TableOfContentsTree> _treesBySource = [];

    /// <summary>Registers <paramref name="tree"/> under <paramref name="source"/>, replacing any earlier registration.</summary>
    public void Collect(Uri source, TableOfContentsTree tree)
    {
        _treesBySource[source] = tree;
    }

    /// <summary>Registers <paramref name="tree"/> under the source of <paramref name="tocReference"/>.</summary>
    public void Collect(TocReference tocReference, TableOfContentsTree tree)
    {
        Collect(tocReference.Source, tree);
    }

    /// <summary>Looks up the tree registered for <paramref name="source"/>.</summary>
    /// <returns><c>true</c> when a tree was registered; <paramref name="tree"/> is then non-null.</returns>
    public bool TryGetTableOfContentsTree(Uri source, [NotNullWhen(true)] out TableOfContentsTree? tree)
    {
        return _treesBySource.TryGetValue(source, out tree);
    }
}


/// <summary>
/// A <see cref="DocumentationGroup"/> that also acts as a navigation root: it sets
/// <c>NavigationRoot</c> to itself and registers itself with the <see cref="TreeCollector"/>
/// under its <see cref="Source"/> when constructed.
/// </summary>
[DebuggerDisplay("Toc >{Depth} {FolderName} {Source} ({NavigationItems.Count} items)")]
public class TableOfContentsTree : DocumentationGroup, IRootNavigationItem<MarkdownFile, INavigationItem>
{
/// <summary>The uri this tree is registered under in the <see cref="TreeCollector"/>.</summary>
public Uri Source { get; }

/// <summary>The collector this tree registered itself with.</summary>
public TableOfContentsTreeCollector TreeCollector { get; }

/// <summary>
/// Creates a top-level tree: the base group is built with folder name <c>"."</c>, depth <c>0</c>
/// and neither a top-level tree nor a parent.
/// </summary>
/// <param name="source">Uri identifying this tree; used as the registration key.</param>
/// <param name="context">Build context forwarded to the base group.</param>
/// <param name="lookups">Navigation lookups forwarded to the base group.</param>
/// <param name="treeCollector">Collector that receives this tree.</param>
/// <param name="fileIndex">Running file index, passed by reference to the base group.</param>
public TableOfContentsTree(
Uri source,
BuildContext context,
NavigationLookups lookups,
TableOfContentsTreeCollector treeCollector,
ref int fileIndex)
: base(".", treeCollector, context, lookups, source, ref fileIndex, 0, null, null)
{
TreeCollector = treeCollector;
NavigationRoot = this;

Source = source;
TreeCollector.Collect(source, this);

// Edge case: when the tree holds exactly one item and that item is a group without any
// navigation items of its own, drop it so the tree collapses down to the root (this).
if (NavigationItems.Count == 1 && NavigationItems.First() is DocumentationGroup { NavigationItems.Count: 0 })
NavigationItems = [];


}

/// <summary>
/// Creates a nested tree for <paramref name="folderName"/> at the given <paramref name="depth"/>.
/// Unlike the public constructor, a single empty child group is not collapsed here.
/// </summary>
/// <param name="source">Uri identifying this tree; used as the registration key.</param>
/// <param name="folderName">Folder name forwarded to the base group.</param>
/// <param name="treeCollector">Collector that receives this tree.</param>
/// <param name="context">Build context forwarded to the base group.</param>
/// <param name="lookups">Navigation lookups forwarded to the base group.</param>
/// <param name="fileIndex">Running file index, passed by reference to the base group.</param>
/// <param name="depth">Depth forwarded to the base group.</param>
/// <param name="toplevelTree">Top-level tree forwarded to the base group.</param>
/// <param name="parent">Parent group forwarded to the base group, if any.</param>
internal TableOfContentsTree(
Uri source,
string folderName,
TableOfContentsTreeCollector treeCollector,
BuildContext context,
NavigationLookups lookups,
ref int fileIndex,
int depth,
IRootNavigationItem<MarkdownFile, INavigationItem> toplevelTree,
DocumentationGroup? parent
) : base(folderName, treeCollector, context, lookups, source, ref fileIndex, depth, toplevelTree, parent)
{
Source = source;
TreeCollector = treeCollector;
// a table of contents tree is always its own navigation root, even when nested
NavigationRoot = this;
TreeCollector.Collect(source, this);
}

/// <summary>The default navigation of a tree is the tree itself.</summary>
protected override IRootNavigationItem<MarkdownFile, INavigationItem> DefaultNavigation => this;

// We rely on IsPrimaryNavEnabled to determine if we should show the dropdown
/// <inheritdoc />
public bool IsUsingNavigationDropdown => false;
}

[DebuggerDisplay("Group >{Depth} {FolderName} ({NavigationItems.Count} items)")]
public class DocumentationGroup : INodeNavigationItem<MarkdownFile, INavigationItem>
{
Expand Down
18 changes: 18 additions & 0 deletions src/Elastic.Markdown/IO/Navigation/FileNavigationItem.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Diagnostics;
using Elastic.Documentation.Site.Navigation;

namespace Elastic.Markdown.IO.Navigation;

/// <summary>
/// Navigation leaf that wraps a single <see cref="MarkdownFile"/> belonging to a <see cref="DocumentationGroup"/>.
/// </summary>
/// <param name="Model">The markdown file this item represents.</param>
/// <param name="Group">The group the file belongs to; seeds <see cref="Parent"/> and <see cref="NavigationRoot"/>.</param>
/// <param name="Hidden">Hidden flag for this item, <c>false</c> by default.</param>
[DebuggerDisplay("Current: {Model.RelativePath}")]
public record FileNavigationItem(MarkdownFile Model, DocumentationGroup Group, bool Hidden = false)
    : ILeafNavigationItem<MarkdownFile>
{
    /// <summary>The parent node, initially the <c>Group</c> passed to the constructor.</summary>
    public INodeNavigationItem<INavigationModel, INavigationItem>? Parent { get; set; } = Group;

    /// <summary>The navigation root, captured from <c>Group.NavigationRoot</c> at construction.</summary>
    public IRootNavigationItem<INavigationModel, INavigationItem> NavigationRoot { get; } = Group.NavigationRoot;

    /// <summary>The url of the wrapped <see cref="MarkdownFile"/>.</summary>
    public string Url
    {
        get { return Model.Url; }
    }

    /// <summary>The navigation title of the wrapped <see cref="MarkdownFile"/>.</summary>
    public string NavigationTitle
    {
        get { return Model.NavigationTitle; }
    }

    /// <summary>Index of this item within the navigation; assigned externally.</summary>
    public int NavigationIndex { get; set; }
}
Loading
Loading