Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,42 @@ export const EXACT_SELECTORS = [

// ads
'.ad:not([class*="gradient"])',
'.adsbygoogle',
'ins.adsbygoogle',
'[class^="ad-" i]',
'[class$="-ad" i]',
'[class*="gpt-ad" i]',
'[class*="-ad-" i]:not([class*="gradient"])',
'[class^="ads-" i]',
'[class$="-ads" i]',
'[data-ad-wrapper]',
'[data-ad-client]',
'[data-ad-host]',
'[data-ad-slot]',
'[data-ad-format]',
'[data-full-width-responsive]',
'[id^="ad-" i]',
'[id$="-ad" i]',
'[id*="div-gpt-ad" i]',
'[id*="gpt-ad" i]',
'[id*="-ad-" i]',
'[id^="ads-" i]',
'[id$="-ads" i]',
'[role="banner" i]',
'[alt*="advert" i]',
'[aria-label*="advertisement" i]',
'[aria-label*="advertising" i]',
'[data-sponsored]',
'.promo',
'.Promo',
'[class*="promoted" i]',
'[id*="promoted" i]',
'[class*="outbrain" i]',
'[id*="outbrain" i]',
'[class*="revcontent" i]',
'[id*="revcontent" i]',
'[class*="taboola" i]',
'[id*="taboola" i]',
'#barrier-page', // ft.com
'.alert',
'[rel="sponsored" i]',
Expand Down Expand Up @@ -252,8 +279,19 @@ export const EXACT_SELECTORS = [
// iframes
'instaread-player',
'iframe:not([src])',
'iframe[src*="adnxs.com"]',
'iframe[src*="adservice.google."]',
'iframe[src*="adsystem.com"]',
'iframe[src*="amazon-adsystem.com"]',
'iframe[src*="blink.net"]',
'iframe[src*="doubleclick.net"]',
'iframe[src*="giscus.app"]',
'iframe[src*="googleadservices.com"]',
'iframe[src*="googlesyndication.com"]',
'iframe[src*="googletagservices.com"]',
'iframe[src*="outbrain.com"]',
'iframe[src*="securepubads.g.doubleclick.net"]',
'iframe[src*="taboola.com"]',
'iframe[src*="tinypass.com"]',
'iframe[src*="trinitymedia.ai"]',

Expand All @@ -267,6 +305,12 @@ export const EXACT_SELECTORS = [
'#Newsletter',
'.subscribe',

// ad network widgets
'amp-ad[type="adsense" i]',
'amp-ad[type="doubleclick" i]',
'amp-embed[type="adsense" i]',
'amp-embed[type="doubleclick" i]',

// Substack inline clutter
'[data-component-name="ButtonCreateButton"]',
'[data-component-name="DigestPostEmbed"]',
Expand Down
67 changes: 67 additions & 0 deletions tests/ad-removal.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import { describe, expect, test } from 'vitest';
import { Defuddle } from '../src/node';
import { parseDocument } from './helpers';

describe('Ad removal', () => {
  /**
   * Runs Defuddle over an HTML fixture. The same URL is handed to both
   * `parseDocument` and `Defuddle`, so each test states it only once.
   */
  const extract = (html: string, url: string) => Defuddle(parseDocument(html, url), url);

  test('removes Google ad slots and ad network iframes from extracted content', async () => {
    // Fixture: two article paragraphs interleaved with an AdSense <ins> slot,
    // a GPT ad container wrapping a doubleclick iframe, and a trailing
    // googlesyndication iframe. Template content is kept flush-left on purpose:
    // its whitespace is part of the fixture string.
    const html = `<!DOCTYPE html>
<html>
<head>
<title>Article with ads</title>
<meta name="description" content="A test article with ad slots.">
</head>
<body>
<main>
<article>
<h1>Article with ads</h1>
<p>This article contains a real opening paragraph with enough text for the extractor to identify it as the main content.</p>
<ins class="adsbygoogle" data-ad-client="ca-pub-123" data-ad-slot="456"></ins>
<div id="div-gpt-ad-123">
<iframe src="https://securepubads.g.doubleclick.net/tag/js/gpt.js"></iframe>
</div>
<p>The second paragraph should remain after the advertising slots have been removed from the extracted result.</p>
<iframe src="https://pagead2.googlesyndication.com/pagead/html/r20240501/r20190131/zrt_lookup.html"></iframe>
</article>
</main>
</body>
</html>`;

    const result = await extract(html, 'https://example.com/article-with-ads');

    // Article text on both sides of the ad slots must survive; otherwise an
    // over-broad selector that wipes neighbouring content would go unnoticed.
    expect(result.content).toContain('This article contains a real opening paragraph');
    expect(result.content).toContain('The second paragraph should remain');

    // No ad markup (class names, container ids, iframe hosts) may leak through.
    expect(result.content).not.toContain('adsbygoogle');
    expect(result.content).not.toContain('div-gpt-ad');
    expect(result.content).not.toContain('doubleclick.net');
    expect(result.content).not.toContain('googlesyndication.com');
  });

  test('removes sponsored recommendation widgets without removing unrelated words', async () => {
    // Fixture: "roadmap" and "shadow-adventure" merely contain the letters
    // "ad" and must not be treated as ad markers, while the taboola/outbrain
    // widgets must be dropped.
    const html = `<!DOCTYPE html>
<html>
<head><title>Roadmap article</title></head>
<body>
<article>
<h1>Roadmap article</h1>
<p class="roadmap">Roadmap should stay because this class is part of the article content rather than an ad marker.</p>
<p id="shadow-adventure">Adventure should stay because the id only contains the letters ad inside a larger word.</p>
<div class="taboola-widget">Sponsored links from around the web</div>
<div class="outbrain-module">Promoted stories</div>
<p>The article conclusion should remain in the extracted content after sponsored widgets are removed.</p>
</article>
</body>
</html>`;

    const result = await extract(html, 'https://example.com/roadmap-article');

    // False-positive guards: look-alike class/id values stay in the output.
    expect(result.content).toContain('Roadmap should stay');
    expect(result.content).toContain('Adventure should stay');
    // The paragraph that follows the widgets must not be removed with them.
    expect(result.content).toContain('The article conclusion should remain');

    // Sponsored widgets are gone.
    expect(result.content).not.toContain('Sponsored links from around the web');
    expect(result.content).not.toContain('Promoted stories');
  });
});