Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
],
"node": [
"dist/node.d.ts"
],
"extractor": [
"dist/extractor.d.ts"
]
}
},
Expand All @@ -35,6 +38,11 @@
"./node": {
"types": "./dist/node.d.ts",
"import": "./dist/node.js"
},
"./extractor": {
"types": "./dist/extractor.d.ts",
"import": "./dist/extractor.js",
"require": "./dist/extractor.js"
}
},
"scripts": {
Expand Down
41 changes: 41 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ import { Command } from 'commander';
import { Defuddle } from './node';
import { writeFile, readFile } from 'fs/promises';
import { resolve } from 'path';
import { pathToFileURL } from 'url';
import { parseLinkedomHTML } from './utils/linkedom-compat';
import { countWords } from './utils';
import { buildFrontmatter } from './frontmatter';
import { getInitialUA, fetchPage, extractRawMarkdown, cleanMarkdownContent, BOT_UA } from './fetch';
import { ExtractorRegistry } from './extractor-registry';

export interface ParseOptions {
output?: string;
Expand All @@ -19,6 +21,38 @@ export interface ParseOptions {
lang?: string;
userAgent?: string;
frontmatter?: boolean;
extractor?: string[];
}

/**
 * Option collector for the repeatable `--extractor` flag.
 *
 * Produces a new array holding every previously collected path followed by
 * the newly supplied one; the `previous` array itself is left untouched.
 *
 * @param value - The path given for this occurrence of the flag.
 * @param previous - Paths accumulated from earlier occurrences.
 * @returns A fresh array with `value` appended.
 */
function collectExtractor(value: string, previous: string[]): string[] {
  return [...previous, value];
}

// Production CJS builds: `await import(s)` is lowered by tsc (module=CommonJS)
// to a `require(s)` equivalent that cannot resolve file:// URLs or .mjs files.
// Hide the dynamic import inside a Function() so tsc doesn't see it, and the
// expression is parsed at runtime by Node, which honours native ESM dynamic
// import.
//
// Vitest (test runs): the source TS goes through esbuild, which preserves
// `import()`. But the Function()-eval shortcut runs outside vitest's module
// graph and trips its "A dynamic import callback was not specified" check.
// Detect that environment and use the in-source `import()` form so vitest
// can hook it normally.
// True when a `__vitest_worker__` property is defined on `globalThis`;
// evaluated once at module load and used below to pick the import strategy.
const __INSIDE_VITEST__ = (globalThis as Record<string, unknown>).__vitest_worker__ !== undefined;
/**
 * Imports a module by specifier and resolves to its namespace object.
 *
 * One of two implementations is selected once, at module load:
 * - when `__INSIDE_VITEST__` is true, a plain arrow function containing a
 *   literal `import(specifier)` expression;
 * - otherwise, a function built at runtime with `new Function`, whose body
 *   is the string `'return import(specifier)'`.
 *
 * The `as never` casts let either implementation be assigned to the declared
 * function type without further type checking.
 */
const dynamicImport: (specifier: string) => Promise<{ default?: unknown; [k: string]: unknown }> =
__INSIDE_VITEST__
// In-source form of the dynamic import.
? ((specifier: string) => import(specifier)) as never
// Runtime-constructed form: the `import()` text only exists inside a string.
: (new Function('specifier', 'return import(specifier)') as never);

/**
 * Loads a user-supplied extractor module and registers it with
 * `ExtractorRegistry`.
 *
 * The path is resolved against the current working directory, converted to a
 * `file://` URL and imported through `dynamicImport`. The mapping is taken
 * from the module's default export, falling back to the module namespace
 * itself when there is no default export.
 *
 * CommonJS interop: when Node imports a CommonJS file, the namespace's
 * `default` is the whole `module.exports` object. A file compiled from
 * ESM/TypeScript to CommonJS (`exports.default = { ... }`) therefore exposes
 * the real mapping one level deeper, at `default.default`. That extra level
 * is unwrapped here, but only when the first level does not already carry a
 * `patterns` key, so well-formed ESM modules are unaffected.
 *
 * @param extractorPath - Path to the extractor module, absolute or relative
 *   to `process.cwd()`.
 * @throws Error if the resolved export lacks an array `patterns` or a
 *   function `extractor`. Failures from the import itself propagate as-is.
 */
async function loadExtractor(extractorPath: string): Promise<void> {
  const isObject = (candidate: unknown): candidate is Record<string, unknown> =>
    typeof candidate === 'object' && candidate !== null;

  const absPath = resolve(process.cwd(), extractorPath);
  const mod = await dynamicImport(pathToFileURL(absPath).href);

  let exported: unknown = mod.default ?? mod;
  // Transpiled-CJS case: `module.exports = { __esModule: true, default: mapping }`.
  if (isObject(exported) && !('patterns' in exported) && isObject(exported.default)) {
    exported = exported.default;
  }

  const mapping = exported as { patterns?: unknown; extractor?: unknown };
  if (!mapping || !Array.isArray(mapping.patterns) || typeof mapping.extractor !== 'function') {
    throw new Error(`--extractor ${extractorPath}: module must default-export { patterns: (string | RegExp)[], extractor: class }`);
  }
  ExtractorRegistry.register(mapping as { patterns: (string | RegExp)[]; extractor: new (...args: unknown[]) => unknown } as never);
}

interface ParseResult {
Expand Down Expand Up @@ -60,6 +94,12 @@ export async function parseSource(source: string | undefined, options: ParseOpti
language: options.lang,
};

if (options.extractor && options.extractor.length > 0) {
for (const extractorPath of options.extractor) {
await loadExtractor(extractorPath);
}
}

let html: string;
let url: string | undefined;

Expand Down Expand Up @@ -174,6 +214,7 @@ export function createProgram(): Command {
.option('--debug', 'Enable debug mode')
.option('-l, --lang <code>', 'Preferred language (BCP 47, e.g. en, fr, ja)')
.option('-u, --user-agent <string>', 'Custom User-Agent header for HTTP requests (helps with 403/FORBIDDEN responses)')
.option('--extractor <path>', 'Load a custom extractor module (repeatable). The file must default-export { patterns, extractor }.', collectExtractor, [])
.action(async (source: string | undefined, options: ParseOptions) => {
try {
const { output } = await parseSource(source, options);
Expand Down
4 changes: 2 additions & 2 deletions src/extractor-registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ import { LeetCodeExtractor } from './extractors/leetcode';
import { LwnExtractor } from './extractors/lwn';
import { MastodonExtractor } from './extractors/mastodon';

type ExtractorConstructor = new (document: Document, url: string, schemaOrgData?: any, options?: ExtractorOptions) => BaseExtractor;
export type ExtractorConstructor = new (document: Document, url: string, schemaOrgData?: any, options?: ExtractorOptions) => BaseExtractor;

interface ExtractorMapping {
export interface ExtractorMapping {
patterns: (string | RegExp)[];
extractor: ExtractorConstructor;
}
Expand Down
27 changes: 27 additions & 0 deletions src/extractor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Public entry for extractor authors.
//
// Import `BaseExtractor` from here to write a custom site-specific extractor,
// and either load it via the CLI's `--extractor <path>` flag or register it at
// runtime with `ExtractorRegistry.register(...)`.
//
// Example user file (loaded via --extractor):
//
// import { BaseExtractor } from 'defuddle/extractor';
//
// class MyExtractor extends BaseExtractor {
// canExtract() { return false; }
// canExtractAsync() { return true; }
// prefersAsync() { return true; }
// async extractAsync() { /* fetch additional pages, return ExtractorResult */ }
// }
//
// export default {
// patterns: [/^https?:\/\/example\.com\/article\//],
// extractor: MyExtractor,
// };

export { BaseExtractor } from './extractors/_base';
export type { ExtractorOptions } from './extractors/_base';
export type { ExtractorResult, ExtractedContent, ExtractorVariables } from './types/extractors';
export { ExtractorRegistry } from './extractor-registry';
export type { ExtractorMapping, ExtractorConstructor } from './extractor-registry';
33 changes: 33 additions & 0 deletions tests/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,37 @@ describe('CLI parseSource', () => {
// commander camelCases --user-agent → options.userAgent, which parseSource reads.
expect(option?.attributeName()).toBe('userAgent');
});

test('registers the --extractor flag as a repeatable string array', () => {
  // Look up the `parse` subcommand, then its `--extractor` option definition.
  const program = createProgram();
  const parse = program.commands.find((cmd) => cmd.name() === 'parse');
  const extractorOption = parse?.options.find((opt) => opt.long === '--extractor');

  expect(extractorOption).toBeDefined();
  expect(extractorOption?.attributeName()).toBe('extractor');
  // The collector starts from an empty array as the option's default value.
  expect(extractorOption?.defaultValue).toEqual([]);
});

test('--extractor loads the supplied module and registers it with ExtractorRegistry', async () => {
  const { ExtractorRegistry } = await import('../src/extractor-registry');
  const fixturePath = join(__dirname, 'fixtures', 'cli-extractor-custom.mjs');

  // Reads the current number of entries in the registry's `mappings` array.
  const mappingCount = (): number =>
    (ExtractorRegistry as unknown as { mappings: unknown[] }).mappings.length;

  const before = mappingCount();

  // The fixture's patterns are not expected to match here (stdin input, no
  // URL), so this test only checks that one mapping was added, not that the
  // extractor took part in parsing.
  await parseSource(undefined, { extractor: [fixturePath] }, createMockStdin(fixtureHtml));

  expect(mappingCount()).toBe(before + 1);
});

test('--extractor rejects modules that do not default-export the expected shape', async () => {
  const malformedFixture = join(__dirname, 'fixtures', 'cli-extractor-malformed.mjs');

  // Start the parse first, then assert that the returned promise rejects.
  const pending = parseSource(undefined, { extractor: [malformedFixture] }, createMockStdin(fixtureHtml));

  await expect(pending).rejects.toThrow(/must default-export/);
});
});
21 changes: 21 additions & 0 deletions tests/fixtures/cli-extractor-custom.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Test fixture loaded by tests/cli.test.ts via --extractor.
// Duck-types the BaseExtractor interface so the test doesn't depend on `dist/`.

// Minimal stand-in extractor for the CLI tests. It stores the constructor
// arguments, declines every extraction path, and returns an empty result.
class CliTestExtractor {
  constructor(document, url) {
    Object.assign(this, { document, url });
  }

  // Never claims the page, neither synchronously nor asynchronously.
  canExtract() {
    return false;
  }

  canExtractAsync() {
    return false;
  }

  prefersAsync() {
    return false;
  }

  // Empty extraction result.
  extract() {
    const emptyResult = { content: '', contentHtml: '', extractedContent: {} };
    return emptyResult;
  }

  // Async variant; resolves to the same value as `extract()`.
  async extractAsync() {
    return this.extract();
  }
}

export default {
patterns: ['cli-test-only.invalid'],
extractor: CliTestExtractor,
};
4 changes: 4 additions & 0 deletions tests/fixtures/cli-extractor-malformed.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Malformed: missing required `patterns` and `extractor` keys.
// tests/cli.test.ts expects loadExtractor to reject this.

export default { wrong: 'shape' };
2 changes: 1 addition & 1 deletion tsconfig.node.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@
"lib": ["ES2020", "DOM", "DOM.Iterable"],
"types": ["node"]
},
"include": ["src/node.ts", "src/cli.ts"],
"include": ["src/node.ts", "src/cli.ts", "src/extractor.ts"],
"exclude": ["node_modules", "dist"]
}