kepano · ch040602 · Jun 8, 2026
diff --git a/src/constants.ts b/src/constants.ts
@@ -109,15 +109,42 @@ export const EXACT_SELECTORS = [
 
 	// ads
 	'.ad:not([class*="gradient"])',
+	'.adsbygoogle',
+	'ins.adsbygoogle',
 	'[class^="ad-" i]',
 	'[class$="-ad" i]',
+	'[class*="gpt-ad" i]',
+	'[class*="-ad-" i]:not([class*="gradient"])',
+	'[class^="ads-" i]',
+	'[class$="-ads" i]',
 	'[data-ad-wrapper]',
+	'[data-ad-client]',
+	'[data-ad-host]',
+	'[data-ad-slot]',
+	'[data-ad-format]',
+	'[data-full-width-responsive]',
 	'[id^="ad-" i]',
 	'[id$="-ad" i]',
+	'[id*="div-gpt-ad" i]',
+	'[id*="gpt-ad" i]',
+	'[id*="-ad-" i]',
+	'[id^="ads-" i]',
+	'[id$="-ads" i]',
 	'[role="banner" i]',
 	'[alt*="advert" i]',
+	'[aria-label*="advertisement" i]',
+	'[aria-label*="advertising" i]',
+	'[data-sponsored]',
 	'.promo',
 	'.Promo',
+	'[class*="promoted" i]',
+	'[id*="promoted" i]',
+	'[class*="outbrain" i]',
+	'[id*="outbrain" i]',
+	'[class*="revcontent" i]',
+	'[id*="revcontent" i]',
+	'[class*="taboola" i]',
+	'[id*="taboola" i]',
 	'#barrier-page', // ft.com
 	'.alert',
 	'[rel="sponsored" i]',
@@ -252,8 +279,19 @@ export const EXACT_SELECTORS = [
 	// iframes
 	'instaread-player',
 	'iframe:not([src])',
+	'iframe[src*="adnxs.com"]',
+	'iframe[src*="adservice.google."]',
+	'iframe[src*="adsystem.com"]',
+	'iframe[src*="amazon-adsystem.com"]',
 	'iframe[src*="blink.net"]',
+	'iframe[src*="doubleclick.net"]',
 	'iframe[src*="giscus.app"]',
+	'iframe[src*="googleadservices.com"]',
+	'iframe[src*="googlesyndication.com"]',
+	'iframe[src*="googletagservices.com"]',
+	'iframe[src*="outbrain.com"]',
+	'iframe[src*="securepubads.g.doubleclick.net"]',
+	'iframe[src*="taboola.com"]',
 	'iframe[src*="tinypass.com"]',
 	'iframe[src*="trinitymedia.ai"]',
 
@@ -267,6 +305,12 @@ export const EXACT_SELECTORS = [
 	'#Newsletter',
 	'.subscribe',
 
+	// ad network widgets
+	'amp-ad[type="adsense" i]',
+	'amp-ad[type="doubleclick" i]',
+	'amp-embed[type="adsense" i]',
+	'amp-embed[type="doubleclick" i]',
+
 	// Substack inline clutter
 	'[data-component-name="ButtonCreateButton"]',
 	'[data-component-name="DigestPostEmbed"]',

diff --git a/tests/ad-removal.test.ts b/tests/ad-removal.test.ts
@@ -0,0 +1,67 @@
+import { describe, expect, test } from 'vitest';
+import { Defuddle } from '../src/node';
+import { parseDocument } from './helpers';
+
+describe('Ad removal', () => { // Extraction should drop ad markup while keeping the surrounding article text.
+	test('removes Google ad slots and ad network iframes from extracted content', async () => { // Fixture: an adsbygoogle <ins>, a div-gpt-ad-123 container wrapping a doubleclick iframe, and a googlesyndication iframe.
+		const html = `<!DOCTYPE html>
+<html>
+<head>
+	<title>Article with ads</title>
+	<meta name="description" content="A test article with ad slots.">
+</head>
+<body>
+	<main>
+		<article>
+			<h1>Article with ads</h1>
+			<p>This article contains a real opening paragraph with enough text for the extractor to identify it as the main content.</p>
+			<ins class="adsbygoogle" data-ad-client="ca-pub-123" data-ad-slot="456"></ins>
+			<div id="div-gpt-ad-123">
+				<iframe src="https://securepubads.g.doubleclick.net/tag/js/gpt.js"></iframe>
+			</div>
+			<p>The second paragraph should remain after the advertising slots have been removed from the extracted result.</p>
+			<iframe src="https://pagead2.googlesyndication.com/pagead/html/r20240501/r20190131/zrt_lookup.html"></iframe>
+		</article>
+	</main>
+</body>
+</html>`;
+
+		const result = await Defuddle( // The same URL is handed to parseDocument and to Defuddle.
+			parseDocument(html, 'https://example.com/article-with-ads'),
+			'https://example.com/article-with-ads'
+		);
+
+		expect(result.content).toContain('The second paragraph should remain'); // Article text placed between the ad slots must survive.
+		expect(result.content).not.toContain('adsbygoogle'); // Class name of the <ins> ad slot.
+		expect(result.content).not.toContain('div-gpt-ad'); // Id prefix of the div-gpt-ad-123 container.
+		expect(result.content).not.toContain('doubleclick.net'); // Host of the iframe inside the div-gpt-ad-123 container.
+		expect(result.content).not.toContain('googlesyndication.com'); // Host of the standalone iframe at the end of the article.
+	});
+
+	test('removes sponsored recommendation widgets without removing unrelated words', async () => { // Fixture: "roadmap" class and "shadow-adventure" id merely contain the letters "ad"; the taboola/outbrain divs are the widgets expected to go.
+		const html = `<!DOCTYPE html>
+<html>
+<head><title>Roadmap article</title></head>
+<body>
+	<article>
+		<h1>Roadmap article</h1>
+		<p class="roadmap">Roadmap should stay because this class is part of the article content rather than an ad marker.</p>
+		<p id="shadow-adventure">Adventure should stay because the id only contains the letters ad inside a larger word.</p>
+		<div class="taboola-widget">Sponsored links from around the web</div>
+		<div class="outbrain-module">Promoted stories</div>
+		<p>The article conclusion should remain in the extracted content after sponsored widgets are removed.</p>
+	</article>
+</body>
+</html>`;
+
+		const result = await Defuddle(
+			parseDocument(html, 'https://example.com/roadmap-article'),
+			'https://example.com/roadmap-article'
+		);
+
+		expect(result.content).toContain('Roadmap should stay'); // False-positive guard: class="roadmap" must not be treated as an ad marker.
+		expect(result.content).toContain('Adventure should stay'); // False-positive guard: id="shadow-adventure" must not be treated as an ad marker.
+		expect(result.content).not.toContain('Sponsored links from around the web'); // Text of the taboola-widget div.
+		expect(result.content).not.toContain('Promoted stories'); // Text of the outbrain-module div.
+	});
+});