diff --git a/package.json b/package.json index 8355e1c5a..bcde6af2f 100644 --- a/package.json +++ b/package.json @@ -18,6 +18,9 @@ ], "node": [ "dist/node.d.ts" + ], + "extractor": [ + "dist/extractor.d.ts" ] } }, @@ -35,6 +38,11 @@ "./node": { "types": "./dist/node.d.ts", "import": "./dist/node.js" + }, + "./extractor": { + "types": "./dist/extractor.d.ts", + "import": "./dist/extractor.js", + "require": "./dist/extractor.js" } }, "scripts": { diff --git a/src/cli.ts b/src/cli.ts index a40c7dd08..73299e9c3 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -4,10 +4,12 @@ import { Command } from 'commander'; import { Defuddle } from './node'; import { writeFile, readFile } from 'fs/promises'; import { resolve } from 'path'; +import { pathToFileURL } from 'url'; import { parseLinkedomHTML } from './utils/linkedom-compat'; import { countWords } from './utils'; import { buildFrontmatter } from './frontmatter'; import { getInitialUA, fetchPage, extractRawMarkdown, cleanMarkdownContent, BOT_UA } from './fetch'; +import { ExtractorRegistry } from './extractor-registry'; export interface ParseOptions { output?: string; @@ -19,6 +21,38 @@ export interface ParseOptions { lang?: string; userAgent?: string; frontmatter?: boolean; + extractor?: string[]; +} + +function collectExtractor(value: string, previous: string[]): string[] { + return previous.concat([value]); +} + +// Production CJS builds: `await import(s)` is lowered by tsc (module=CommonJS) +// to a `require(s)` equivalent that cannot resolve file:// URLs or .mjs files. +// Hide the dynamic import inside a Function() so tsc doesn't see it, and the +// expression is parsed at runtime by Node, which honours native ESM dynamic +// import. +// +// Vitest (test runs): the source TS goes through esbuild, which preserves +// `import()`. But the Function()-eval shortcut runs outside vitest's module +// graph and trips its "A dynamic import callback was not specified" check. 
+// Detect that environment and use the in-source `import()` form so vitest +// can hook it normally. +const __INSIDE_VITEST__ = (globalThis as Record<string, unknown>).__vitest_worker__ !== undefined; +const dynamicImport: (specifier: string) => Promise<{ default?: unknown; [k: string]: unknown }> = + __INSIDE_VITEST__ + ? ((specifier: string) => import(specifier)) as never + : (new Function('specifier', 'return import(specifier)') as never); + +async function loadExtractor(extractorPath: string): Promise<void> { + const absPath = resolve(process.cwd(), extractorPath); + const mod = await dynamicImport(pathToFileURL(absPath).href); + const mapping = (mod.default ?? mod) as { patterns?: unknown; extractor?: unknown }; + if (!mapping || !Array.isArray(mapping.patterns) || typeof mapping.extractor !== 'function') { + throw new Error(`--extractor ${extractorPath}: module must default-export { patterns: (string | RegExp)[], extractor: class }`); + } + ExtractorRegistry.register(mapping as { patterns: (string | RegExp)[]; extractor: new (...args: unknown[]) => unknown } as never); } interface ParseResult { @@ -60,6 +94,12 @@ export async function parseSource(source: string | undefined, options: ParseOpti language: options.lang, }; + if (options.extractor && options.extractor.length > 0) { + for (const extractorPath of options.extractor) { + await loadExtractor(extractorPath); + } + } + let html: string; let url: string | undefined; @@ -174,6 +214,7 @@ export function createProgram(): Command { .option('--debug', 'Enable debug mode') .option('-l, --lang ', 'Preferred language (BCP 47, e.g. en, fr, ja)') .option('-u, --user-agent ', 'Custom User-Agent header for HTTP requests (helps with 403/FORBIDDEN responses)') + .option('--extractor <path>', 'Load a custom extractor module (repeatable). 
The file must default-export { patterns, extractor }.', collectExtractor, []) .action(async (source: string | undefined, options: ParseOptions) => { try { const { output } = await parseSource(source, options); diff --git a/src/extractor-registry.ts b/src/extractor-registry.ts index e3df8eaaa..20088248b 100644 --- a/src/extractor-registry.ts +++ b/src/extractor-registry.ts @@ -27,9 +27,9 @@ import { LeetCodeExtractor } from './extractors/leetcode'; import { LwnExtractor } from './extractors/lwn'; import { MastodonExtractor } from './extractors/mastodon'; -type ExtractorConstructor = new (document: Document, url: string, schemaOrgData?: any, options?: ExtractorOptions) => BaseExtractor; +export type ExtractorConstructor = new (document: Document, url: string, schemaOrgData?: any, options?: ExtractorOptions) => BaseExtractor; -interface ExtractorMapping { +export interface ExtractorMapping { patterns: (string | RegExp)[]; extractor: ExtractorConstructor; } diff --git a/src/extractor.ts b/src/extractor.ts new file mode 100644 index 000000000..67d11a8a3 --- /dev/null +++ b/src/extractor.ts @@ -0,0 +1,27 @@ +// Public entry for extractor authors. +// +// Import `BaseExtractor` from here to write a custom site-specific extractor, +// and either load it via the CLI's `--extractor <path>` flag or register it at +// runtime with `ExtractorRegistry.register(...)`. 
+// +// Example user file (loaded via --extractor): +// +// import { BaseExtractor } from 'defuddle/extractor'; +// +// class MyExtractor extends BaseExtractor { +// canExtract() { return false; } +// canExtractAsync() { return true; } +// prefersAsync() { return true; } +// async extractAsync() { /* fetch additional pages, return ExtractorResult */ } +// } +// +// export default { +// patterns: [/^https?:\/\/example\.com\/article\//], +// extractor: MyExtractor, +// }; + +export { BaseExtractor } from './extractors/_base'; +export type { ExtractorOptions } from './extractors/_base'; +export type { ExtractorResult, ExtractedContent, ExtractorVariables } from './types/extractors'; +export { ExtractorRegistry } from './extractor-registry'; +export type { ExtractorMapping, ExtractorConstructor } from './extractor-registry'; diff --git a/tests/cli.test.ts b/tests/cli.test.ts index 4522602c7..44a32d9dc 100644 --- a/tests/cli.test.ts +++ b/tests/cli.test.ts @@ -105,4 +105,37 @@ describe('CLI parseSource', () => { // commander camelCases --user-agent → options.userAgent, which parseSource reads. 
expect(option?.attributeName()).toBe('userAgent'); }); + + test('registers the --extractor flag as a repeatable string array', () => { + const parseCommand = createProgram().commands.find((c) => c.name() === 'parse'); + const option = parseCommand?.options.find((o) => o.long === '--extractor'); + + expect(option).toBeDefined(); + expect(option?.attributeName()).toBe('extractor'); + // commander invokes the collector with the default starting empty array + expect(option?.defaultValue).toEqual([]); + }); + + test('--extractor loads the supplied module and registers it with ExtractorRegistry', async () => { + const { ExtractorRegistry } = await import('../src/extractor-registry'); + const fixturePath = join(__dirname, 'fixtures', 'cli-extractor-custom.mjs'); + + const before = (ExtractorRegistry as unknown as { mappings: unknown[] }).mappings.length; + + // parseSource runs loadExtractor early. The fixture's patterns don't match + // the stdin (no URL) so the registered extractor isn't used for parsing — + // we only assert registration happened. + await parseSource(undefined, { extractor: [fixturePath] }, createMockStdin(fixtureHtml)); + + const after = (ExtractorRegistry as unknown as { mappings: unknown[] }).mappings.length; + expect(after).toBe(before + 1); + }); + + test('--extractor rejects modules that do not default-export the expected shape', async () => { + const fixturePath = join(__dirname, 'fixtures', 'cli-extractor-malformed.mjs'); + + await expect( + parseSource(undefined, { extractor: [fixturePath] }, createMockStdin(fixtureHtml)) + ).rejects.toThrow(/must default-export/); + }); }); diff --git a/tests/fixtures/cli-extractor-custom.mjs b/tests/fixtures/cli-extractor-custom.mjs new file mode 100644 index 000000000..cf19264f5 --- /dev/null +++ b/tests/fixtures/cli-extractor-custom.mjs @@ -0,0 +1,21 @@ +// Test fixture loaded by tests/cli.test.ts via --extractor. +// Duck-types the BaseExtractor interface so the test doesn't depend on `dist/`. 
+ +class CliTestExtractor { + constructor(document, url) { + this.document = document; + this.url = url; + } + canExtract() { return false; } + canExtractAsync() { return false; } + prefersAsync() { return false; } + extract() { + return { content: '', contentHtml: '', extractedContent: {} }; + } + async extractAsync() { return this.extract(); } +} + +export default { + patterns: ['cli-test-only.invalid'], + extractor: CliTestExtractor, +}; diff --git a/tests/fixtures/cli-extractor-malformed.mjs b/tests/fixtures/cli-extractor-malformed.mjs new file mode 100644 index 000000000..47c47f34f --- /dev/null +++ b/tests/fixtures/cli-extractor-malformed.mjs @@ -0,0 +1,4 @@ +// Malformed: missing required `patterns` and `extractor` keys. +// tests/cli.test.ts expects loadExtractor to reject this. + +export default { wrong: 'shape' }; diff --git a/tsconfig.node.json b/tsconfig.node.json index 64f18ced6..a3696f464 100644 --- a/tsconfig.node.json +++ b/tsconfig.node.json @@ -13,6 +13,6 @@ "lib": ["ES2020", "DOM", "DOM.Iterable"], "types": ["node"] }, - "include": ["src/node.ts", "src/cli.ts"], + "include": ["src/node.ts", "src/cli.ts", "src/extractor.ts"], "exclude": ["node_modules", "dist"] }