Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,18 @@ export interface ParseOptions {
lang?: string;
userAgent?: string;
frontmatter?: boolean;
// Removal toggles — default to enabled (the existing pipeline). The
// `--no-*` flag form lets users disable each step individually so they
// can inspect what would otherwise be stripped. Useful for developing
// site-specific extractors against a known-clean baseline.
contentPatterns?: boolean;
lowScoring?: boolean;
exactSelectors?: boolean;
partialSelectors?: boolean;
hiddenElements?: boolean;
smallImages?: boolean;
// `removeImages` defaults to `false` in the API; expose as positive flag.
removeImages?: boolean;
}

interface ParseResult {
Expand All @@ -35,6 +47,13 @@ const ansi = {
// Read version from package.json
const version = require('../package.json').version;

/**
 * Wraps `value` in a one-property object keyed by `key`, or yields an empty
 * object when `value` is `undefined`.
 *
 * Intended to be spread into an options literal so that an unset option
 * contributes no property at all, rather than a property whose value is
 * `undefined` (which would shadow a default during a later object spread).
 * Only `undefined` is treated as "unset"; `null`, `false`, `0` and `''` are
 * passed through unchanged.
 *
 * @param key - Property name to emit.
 * @param value - Value to emit, or `undefined` to emit nothing.
 * @returns `{ [key]: value }` when `value` is defined, otherwise `{}`.
 */
function maybe<K extends string, V>(key: K, value: V | undefined): Partial<Record<K, V>> {
  if (value === undefined) {
    return {};
  }
  // The computed key widens to an index signature, so assert the precise shape.
  const single = { [key]: value } as Record<K, V>;
  return single;
}

export async function readStdin(input: NodeJS.ReadStream = process.stdin): Promise<string> {
return new Promise((resolve, reject) => {
const chunks: string[] = [];
Expand All @@ -53,11 +72,22 @@ export async function parseSource(source: string | undefined, options: ParseOpti
options.markdown = true;
}

// Build defuddle options. Removal toggles are only included when the
// caller actually specified a value — Defuddle's internal merge does
// `{ defaults, ...this.options }`, so an explicit `undefined` would
// shadow the default `true` and silently disable the removal pass.
const defuddleOpts = {
debug: options.debug,
markdown: options.markdown,
separateMarkdown: options.markdown || options.json,
language: options.lang,
...maybe('removeContentPatterns', options.contentPatterns),
...maybe('removeLowScoring', options.lowScoring),
...maybe('removeExactSelectors', options.exactSelectors),
...maybe('removePartialSelectors', options.partialSelectors),
...maybe('removeHiddenElements', options.hiddenElements),
...maybe('removeSmallImages', options.smallImages),
...maybe('removeImages', options.removeImages),
};

let html: string;
Expand Down Expand Up @@ -174,6 +204,16 @@ export function createProgram(): Command {
.option('--debug', 'Enable debug mode')
.option('-l, --lang <code>', 'Preferred language (BCP 47, e.g. en, fr, ja)')
.option('-u, --user-agent <string>', 'Custom User-Agent header for HTTP requests (helps with 403/FORBIDDEN responses)')
// Removal toggles. Each `--no-*` flag disables a single removal pass
// so users (especially extractor authors) can isolate which step is
// stripping a given element.
.option('--no-content-patterns', 'Keep boilerplate patterns (read time, breadcrumb, metadata lists, newsletter signups, etc.)')
.option('--no-low-scoring', 'Keep low-scoring elements (skip the content scoring pass)')
.option('--no-exact-selectors', 'Keep elements matched by the exact-selector denylist (ads, scripts, JW Player, etc.)')
.option('--no-partial-selectors', 'Keep elements matched by the partial-selector denylist (class/id containing "ad-", "sidebar", "comment", etc.)')
.option('--no-hidden-elements', 'Keep CSS-hidden elements (display:none, visibility:hidden, opacity:0)')
.option('--no-small-images', 'Keep small images (skip the <100px image filter)')
.option('--remove-images', 'Remove all images, picture, and figure elements')
.action(async (source: string | undefined, options: ParseOptions) => {
try {
const { output } = await parseSource(source, options);
Expand Down
61 changes: 61 additions & 0 deletions tests/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,65 @@ describe('CLI parseSource', () => {
// commander camelCases --user-agent → options.userAgent, which parseSource reads.
expect(option?.attributeName()).toBe('userAgent');
});

test('registers each removal toggle flag', () => {
  // Look up the `parse` subcommand once, then check every toggle against its
  // option list.
  const program = createProgram();
  const parseCommand = program.commands.find((cmd) => cmd.name() === 'parse');
  const registered = parseCommand?.options;

  [
    '--no-content-patterns',
    '--no-low-scoring',
    '--no-exact-selectors',
    '--no-partial-selectors',
    '--no-hidden-elements',
    '--no-small-images',
    '--remove-images',
  ].forEach((long) => {
    // A missing `parse` command leaves `option` undefined, which fails below.
    const option = registered?.find((candidate) => candidate.long === long);
    expect(option, `option ${long} is registered`).toBeDefined();
  });
});

test('--no-content-patterns keeps content otherwise stripped by the metadata-list heuristic', async () => {
  // Minimal fixture for the blog-metadata-list heuristic: an article that
  // ends with a short <ul> of links, introduced by a sentence that does not
  // end in ':'.
  const html = `<!DOCTYPE html><html><body><main><article>
<h1>Test article</h1>
<p>The body has enough words to make the article extract well and clear the very-short content threshold so the rest of the pipeline runs as it would on a real article. We add a second sentence to make sure the scoring pass keeps this element.</p>
<p>The companion artifacts are available from the following sources.</p>
<ul>
<li><a href="https://example.com/a">Mirror A</a></li>
<li><a href="https://example.com/b">Mirror B</a></li>
</ul>
</article></main></body></html>`;

  // Run the same document twice: once with defaults, once with the
  // content-pattern pass switched off.
  const { output: strippedOutput } = await parseSource('-', {}, createMockStdin(html));
  const { output: keptOutput } = await parseSource('-', { contentPatterns: false }, createMockStdin(html));

  // The default pipeline is expected to drop the trailing link list;
  // disabling content patterns must preserve both of its entries.
  expect(strippedOutput).not.toContain('Mirror A');
  for (const label of ['Mirror A', 'Mirror B']) {
    expect(keptOutput).toContain(label);
  }
});

test('--remove-images strips images from the output', async () => {
  // Fixture: a single large inline image surrounded by body text.
  const html = `<!DOCTYPE html><html><body><main><article>
<h1>Test article</h1>
<p>Article body with enough text content to clear the scoring pass and the very-short threshold so the rest of the pipeline runs normally during extraction.</p>
<p><img src="https://example.com/cover.png" alt="cover" width="800" height="600"></p>
<p>More body content follows the image to keep it inline within the article scope.</p>
</article></main></body></html>`;
  const imageName = 'cover.png';

  // Baseline run with default options, then a run with image removal on.
  const { output: baselineOutput } = await parseSource('-', {}, createMockStdin(html));
  const { output: strippedOutput } = await parseSource('-', { removeImages: true }, createMockStdin(html));

  // The image reference must appear by default and vanish when removal is on.
  expect(baselineOutput).toContain(imageName);
  expect(strippedOutput).not.toContain(imageName);
});
});