Skip to content

Commit e80f601

Browse files
pokeysketch
and
sketch
committed
Add llms.txt generation following llms.txt specification
- Generate llms.txt as directory page with Core Documentation (root README) and Optional sections - Generate llms-full.txt with complete user documentation concatenated - Generate individual .md files for each documentation page in llms/ subdirectory - Focus on user documentation only (exclude contributing docs) - Follow llms.txt format specification from https://llmstxt.org/ - Integrate into build process via 'pnpm generate-llms' command Fixes #2873 Co-Authored-By: sketch <[email protected]> Change-ID: s30ae64e7cbc2b4fdk
1 parent 97bf1d6 commit e80f601

File tree

6 files changed

+386
-106
lines changed

6 files changed

+386
-106
lines changed

packages/cursorless-org/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,16 @@ To learn more about Next.js, take a look at the following resources:
2020
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
2121

2222
You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome!
23+
24+
25+
## LLMs Training Data
26+
27+
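During the build process, a set of files following the [llms.txt specification](https://llmstxt.org/) is automatically generated in the output directory from the user documentation (`packages/cursorless-org-docs/src/docs/user/`): `llms.txt` is a directory page linking to each documentation page, `llms-full.txt` contains a concatenation of all user documentation markdown files, and the `llms/` subdirectory contains one `.md` file per documentation page. These files are used for LLM training and reference.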
28+
29+
You can find these files at `out/llms.txt`, `out/llms-full.txt`, and `out/llms/` after running the build process.
30+
31+
To generate this file manually without running a full build, you can run:
32+
33+
```sh
34+
pnpm generate-llms
35+
```

packages/cursorless-org/package.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@
55
"private": true,
66
"scripts": {
77
"dev": "next dev",
8-
"build": "next build",
8+
"build": "next build && pnpm generate-llms",
99
"start": "http-server out -a 127.0.0.1 -p 8080",
1010
"lint": "next lint",
1111
"compile": "tsc --build",
1212
"watch": "tsc --build --watch",
13-
"clean": "rm -rf ./out tsconfig.tsbuildinfo ./dist ./build"
13+
"clean": "rm -rf ./out tsconfig.tsbuildinfo ./dist ./build",
14+
"generate-llms": "my-ts-node ./src/scripts/generateLlmsTxt.ts"
1415
},
1516
"dependencies": {
1617
"@cursorless/cheatsheet": "workspace:*",
18+
"@cursorless/common": "workspace:*",
1719
"@mdx-js/loader": "3.0.1",
1820
"@mdx-js/react": "3.0.1",
1921
"@next/mdx": "15.3.2",

packages/cursorless-org/src/content/enablement-group.mdx.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
export { default } from "*.mdx";
1+
export { default } from "./enablement-group.mdx";
22

33
export const meta: {
44
title: string;
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
#!/usr/bin/env node
2+
3+
/**
4+
* Script to generate multiple llms.txt files following the llms.txt specification.
5+
* See https://llmstxt.org/ for format details.
6+
*/
7+
8+
import * as fs from "fs/promises";
9+
import * as path from "path";
10+
11+
/**
 * Recursively find all markdown (`.md` / `.mdx`) files in a directory.
 *
 * Entries of each directory are visited in name order so that the result (and
 * therefore every file generated from it) is reproducible regardless of the
 * order in which the file system enumerates directory entries.
 *
 * @param dir The directory to search
 * @returns Paths of all markdown files found below `dir`, each prefixed with `dir`
 */
async function findMarkdownFiles(dir: string): Promise<string[]> {
  const entries = await fs.readdir(dir, { withFileTypes: true });

  // Plain code-unit comparison: deterministic and independent of the locale.
  const sortedEntries = [...entries].sort((a, b) =>
    a.name < b.name ? -1 : a.name > b.name ? 1 : 0,
  );

  const nested = await Promise.all(
    sortedEntries.map(async (entry): Promise<string[]> => {
      const entryPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        return findMarkdownFiles(entryPath);
      }
      if (
        entry.isFile() &&
        (entry.name.endsWith(".md") || entry.name.endsWith(".mdx"))
      ) {
        return [entryPath];
      }
      // Anything else (other extensions, symlinks, sockets, ...) is ignored.
      return [];
    }),
  );

  return nested.flat();
}
34+
35+
/**
 * Get the title of a markdown file.
 *
 * The title is looked up in this order:
 *  1. a `title:` key inside the leading front-matter block (`--- ... ---`),
 *  2. the first H1 heading (`# ...`) that is not inside a fenced code block,
 *  3. a fallback derived from the file name (`README` becomes `Overview`,
 *     otherwise dashes are replaced by spaces).
 *
 * Read errors are deliberately not propagated: if the file cannot be read the
 * file-name fallback is returned.
 *
 * @param filePath Path to the markdown file
 * @returns The title or a fallback based on filename
 */
async function getMarkdownTitle(filePath: string): Promise<string> {
  const basename = path.basename(filePath, path.extname(filePath));
  const fallbackTitle =
    basename === "README" ? "Overview" : basename.replace(/-/g, " ");

  let content: string;
  try {
    content = await fs.readFile(filePath, "utf8");
  } catch {
    return fallbackTitle;
  }

  let body = content;

  // Only the leading block delimited by `---` lines counts as front matter;
  // a `title:` appearing later in the document must not be picked up.
  const frontmatter =
    /^---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/.exec(content);
  if (frontmatter) {
    body = content.slice(frontmatter[0].length);

    // Anchor at the start of a line so keys such as `sidebar_title:` are skipped.
    const titleLine = /^title:[ \t]*(.*)$/m.exec(frontmatter[1] ?? "");
    const rawTitle = (titleLine?.[1] ?? "").trim();
    // Strip one pair of matching surrounding quotes, keeping inner apostrophes.
    const title = rawTitle.replace(/^(["'])(.*)\1$/, "$2").trim();
    if (title) {
      return title;
    }
  }

  // Look for the first H1 heading, ignoring `# ...` lines inside code fences
  // (e.g. shell comments in ```sh blocks).
  let openFence: string | undefined;
  for (const line of body.split(/\r?\n/)) {
    const fence = /^\s*(```|~~~)/.exec(line);
    if (fence) {
      if (openFence === undefined) {
        openFence = fence[1];
      } else if (openFence === fence[1]) {
        openFence = undefined;
      }
      continue;
    }
    if (openFence !== undefined) {
      continue;
    }

    const heading = /^#\s+(.+)$/.exec(line);
    const headingText = heading?.[1]?.trim();
    if (headingText) {
      return headingText;
    }
  }

  return fallbackTitle;
}
65+
66+
/**
 * Generate a single markdown file for a docs page.
 *
 * The page is copied to `<outputDir>/llms/<flattened-name>.md`, where the
 * flattened name is the page's path relative to `userDocsDir` with directory
 * separators replaced by `-` and a `.mdx` extension normalised to `.md`.
 * A nested `README` becomes `<dir>-overview.md`. A comment naming the source
 * file (relative to `repoRoot`) is prepended to the content.
 *
 * @param filePath Path to the original markdown file
 * @param outputDir Directory to write the output file
 * @param repoRoot Repository root path
 * @param userDocsDir Root directory of the user docs; output names are derived
 * from the path of `filePath` relative to this directory
 * @returns The output file name, relative to `outputDir` (always `/`-separated)
 */
async function generateIndividualFile(
  filePath: string,
  outputDir: string,
  repoRoot: string,
  userDocsDir: string
): Promise<string> {
  const readmeSuffix = "-README.md";

  const content = await fs.readFile(filePath, "utf8");

  // `path.relative` yields platform separators; normalise so that the source
  // comment is identical on every platform.
  const relativePath = path
    .relative(repoRoot, filePath)
    .split(path.sep)
    .join("/");

  // Generate output filename from path relative to user docs dir. Splitting on
  // `path.sep` (rather than "/") also flattens Windows-style paths.
  let outputName = path
    .relative(userDocsDir, filePath)
    .split(path.sep)
    .join("-")
    .replace(/\.mdx?$/, ".md");

  // Special handling for README files: rewrite the suffix only.
  if (outputName.endsWith(readmeSuffix)) {
    outputName = `${outputName.slice(0, -readmeSuffix.length)}-overview.md`;
  }

  // Create llms subdirectory
  const llmsDir = path.join(outputDir, "llms");
  await fs.mkdir(llmsDir, { recursive: true });

  // Add source comment at the top
  const fileContent = `<!-- Source: ${relativePath} -->\n\n${content}`;
  await fs.writeFile(path.join(llmsDir, outputName), fileContent);

  return `llms/${outputName}`;
}
105+
106+
/**
 * Write an llms.txt-style directory page.
 *
 * The page consists of an H1 title, a block-quoted description and one H2
 * section per entry of `sections`; each section lists its files as markdown
 * links, followed by `: <description>` when a description is present.
 * Surrounding whitespace is trimmed before the page is written.
 *
 * @param title Page title
 * @param description Page description
 * @param sections Sections with files
 * @param outputPath Output file path
 */
async function createDirectoryPage(
  title: string,
  description: string,
  sections: Array<{ name: string; files: Array<{ name: string; filename: string; description?: string }> }>,
  outputPath: string
): Promise<void> {
  const header = `# ${title}\n\n> ${description}\n\n`;

  const renderedSections = sections.map(({ name: sectionName, files }) => {
    const links = files.map((entry) => {
      const link = `- [${entry.name}](${entry.filename})`;
      return entry.description ? `${link}: ${entry.description}\n` : `${link}\n`;
    });
    return `## ${sectionName}\n\n${links.join("")}\n`;
  });

  const page = header + renderedSections.join("");
  await fs.writeFile(outputPath, page.trim());
}
135+
136+
/**
 * Generate a full concatenated file.
 *
 * The output starts with an H1 title and a short notice, followed by the
 * content of every input file in the order given. Each file is preceded by a
 * `<!-- File: ... -->` comment holding its path relative to `repoRoot`.
 * Surrounding whitespace is trimmed before the result is written.
 *
 * @param files Array of file paths
 * @param outputPath Output file path
 * @param repoRoot Repository root path
 * @param title Title for the file
 */
async function generateFullFile(
  files: string[],
  outputPath: string,
  repoRoot: string,
  title: string
): Promise<void> {
  const header = `# ${title}\n\nThis file is auto-generated from all relevant Markdown files in the Cursorless documentation.\n`;

  // The reads are independent of each other, so issue them together;
  // `Promise.all` keeps the results in the order of `files`.
  const parts = await Promise.all(
    files.map(async (filePath) => {
      // Normalise platform separators so the marker is identical everywhere.
      const relativePath = path
        .relative(repoRoot, filePath)
        .split(path.sep)
        .join("/");
      const fileContent = await fs.readFile(filePath, "utf8");
      return `\n\n<!-- File: ${relativePath} -->\n\n${fileContent}`;
    }),
  );

  await fs.writeFile(outputPath, (header + parts.join("")).trim());
}
161+
162+
/**
 * Main function to generate all llms.txt files.
 *
 * Reads the user documentation below
 * `<repo>/packages/cursorless-org-docs/src/docs/user` and writes to
 * `<repo>/packages/cursorless-org/out`:
 *  - `llms.txt`: directory page linking to every generated page,
 *  - `llms-full.txt`: all user docs concatenated,
 *  - `llms/*.md`: one file per documentation page.
 *
 * The repository root is taken from the `CURSORLESS_REPO_ROOT` environment
 * variable. Any failure during generation is logged and terminates the
 * process with exit code 1.
 *
 * @throws Error if `CURSORLESS_REPO_ROOT` is not set
 */
export async function generateLlmsTxt(): Promise<void> {
  type DocLink = { name: string; filename: string; description?: string };
  type UserDocEntry = DocLink & { isRootReadme: boolean };

  // Get repo root from environment variable
  const repoRoot = process.env.CURSORLESS_REPO_ROOT;
  if (!repoRoot) {
    throw new Error("CURSORLESS_REPO_ROOT environment variable must be set");
  }

  const docsDir = path.resolve(repoRoot, "packages/cursorless-org-docs/src/docs");
  const outputDir = path.resolve(repoRoot, "packages/cursorless-org/out");

  console.log("Generating llms.txt files...");

  try {
    // Create the output directory if it doesn't exist
    await fs.mkdir(outputDir, { recursive: true });

    // Find user docs only
    const userDocsDir = path.join(docsDir, "user");
    const userFiles = await findMarkdownFiles(userDocsDir);

    console.log(`Found ${userFiles.length} user docs`);

    // Generate individual files for user docs
    const userIndividualFiles: UserDocEntry[] = [];
    for (const filePath of userFiles) {
      const filename = await generateIndividualFile(filePath, outputDir, repoRoot, userDocsDir);
      const name = await getMarkdownTitle(filePath);

      // The root README is the README.md located directly in the user docs
      // dir. Comparing the path relative to that dir avoids false positives
      // for nested directories whose name merely ends in "user" (e.g.
      // "power-user/README.md") and is independent of the path separator.
      const isRootReadme = path.relative(userDocsDir, filePath) === "README.md";

      userIndividualFiles.push({ name, filename, isRootReadme });
    }

    // Sort files by name
    userIndividualFiles.sort((a, b) => a.name.localeCompare(b.name));

    // Separate root README from other files
    const rootReadme = userIndividualFiles.find((file) => file.isRootReadme);
    const otherFiles: DocLink[] = userIndividualFiles
      .filter((file) => !file.isRootReadme)
      .map(({ name, filename }) => ({ name, filename }));

    // 1. Generate main llms.txt (directory page)
    const sections: Array<{ name: string; files: DocLink[] }> = [];

    if (rootReadme) {
      sections.push({
        name: "Core Documentation",
        files: [{ name: rootReadme.name, filename: rootReadme.filename }],
      });
    }

    sections.push({
      name: "Optional",
      files: [
        ...otherFiles,
        { name: "Complete Documentation", filename: "llms-full.txt", description: "Full concatenated documentation" },
      ],
    });

    await createDirectoryPage(
      "Cursorless",
      "Cursorless is a spoken language for structural navigation and editing. Use voice commands to edit code faster than with a keyboard.",
      sections,
      path.join(outputDir, "llms.txt")
    );

    // 2. Generate llms-full.txt (complete user documentation)
    await generateFullFile(
      userFiles,
      path.join(outputDir, "llms-full.txt"),
      repoRoot,
      "Cursorless Documentation"
    );

    console.log("Successfully generated llms.txt files:");
    console.log("  - llms.txt (main directory page)");
    console.log("  - llms-full.txt (complete documentation)");
    console.log(`  - ${userIndividualFiles.length} individual documentation files`);
  } catch (error) {
    console.error("Error generating llms.txt files:", error);
    process.exit(1);
  }
}

// Run the main function directly
void generateLlmsTxt();

packages/cursorless-org/tsconfig.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323
"references": [
2424
{
2525
"path": "../cheatsheet"
26+
},
27+
{
28+
"path": "../common"
2629
}
2730
],
2831
"exclude": ["node_modules"]

0 commit comments

Comments
 (0)