From 27284aa642f342b4470aeaadbc338802f4b20d73 Mon Sep 17 00:00:00 2001 From: ch040602 Date: Tue, 9 Jun 2026 01:49:28 +0900 Subject: [PATCH] Remove ad network clutter --- src/constants.ts | 44 ++++++++++++++++++++++++++ tests/ad-removal.test.ts | 67 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 tests/ad-removal.test.ts diff --git a/src/constants.ts b/src/constants.ts index 81d2c92b4..4a5a4cbf8 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -109,15 +109,42 @@ export const EXACT_SELECTORS = [ // ads '.ad:not([class*="gradient"])', + '.adsbygoogle', + 'ins.adsbygoogle', '[class^="ad-" i]', '[class$="-ad" i]', + '[class*="gpt-ad" i]', + '[class*="-ad-" i]:not([class*="gradient"])', + '[class^="ads-" i]', + '[class$="-ads" i]', '[data-ad-wrapper]', + '[data-ad-client]', + '[data-ad-host]', + '[data-ad-slot]', + '[data-ad-format]', + '[data-full-width-responsive]', '[id^="ad-" i]', '[id$="-ad" i]', + '[id*="div-gpt-ad" i]', + '[id*="gpt-ad" i]', + '[id*="-ad-" i]', + '[id^="ads-" i]', + '[id$="-ads" i]', '[role="banner" i]', '[alt*="advert" i]', + '[aria-label*="advertisement" i]', + '[aria-label*="advertising" i]', + '[data-sponsored]', '.promo', '.Promo', + '[class*="promoted" i]', + '[id*="promoted" i]', + '[class*="outbrain" i]', + '[id*="outbrain" i]', + '[class*="revcontent" i]', + '[id*="revcontent" i]', + '[class*="taboola" i]', + '[id*="taboola" i]', '#barrier-page', // ft.com '.alert', '[rel="sponsored" i]', @@ -252,8 +279,19 @@ export const EXACT_SELECTORS = [ // iframes 'instaread-player', 'iframe:not([src])', + 'iframe[src*="adnxs.com"]', + 'iframe[src*="adservice.google."]', + 'iframe[src*="adsystem.com"]', + 'iframe[src*="amazon-adsystem.com"]', 'iframe[src*="blink.net"]', + 'iframe[src*="doubleclick.net"]', 'iframe[src*="giscus.app"]', + 'iframe[src*="googleadservices.com"]', + 'iframe[src*="googlesyndication.com"]', + 'iframe[src*="googletagservices.com"]', + 'iframe[src*="outbrain.com"]', + 'iframe[src*="securepubads.g.doubleclick.net"]', + 'iframe[src*="taboola.com"]', 'iframe[src*="tinypass.com"]', 'iframe[src*="trinitymedia.ai"]', @@ -267,6 +305,12 @@ export const EXACT_SELECTORS = [ '#Newsletter', '.subscribe', + // ad network widgets + 'amp-ad[type="adsense" i]', + 'amp-ad[type="doubleclick" i]', + 'amp-embed[type="adsense" i]', + 'amp-embed[type="doubleclick" i]', + // Substack inline clutter '[data-component-name="ButtonCreateButton"]', '[data-component-name="DigestPostEmbed"]', diff --git a/tests/ad-removal.test.ts b/tests/ad-removal.test.ts new file mode 100644 index 000000000..d6e74acdf --- /dev/null +++ b/tests/ad-removal.test.ts @@ -0,0 +1,67 @@ +import { describe, expect, test } from 'vitest'; +import { Defuddle } from '../src/node'; +import { parseDocument } from './helpers'; + +describe('Ad removal', () => { + test('removes Google ad slots and ad network iframes from extracted content', async () => { + const html = ` + + + Article with ads + + + +
+
+

Article with ads

+

This article contains a real opening paragraph with enough text for the extractor to identify it as the main content.

+ +
+ +
+

The second paragraph should remain after the advertising slots have been removed from the extracted result.

+ +
+
+ +`; + + const result = await Defuddle( + parseDocument(html, 'https://example.com/article-with-ads'), + 'https://example.com/article-with-ads' + ); + + expect(result.content).toContain('The second paragraph should remain'); + expect(result.content).not.toContain('adsbygoogle'); + expect(result.content).not.toContain('div-gpt-ad'); + expect(result.content).not.toContain('doubleclick.net'); + expect(result.content).not.toContain('googlesyndication.com'); + }); + + test('removes sponsored recommendation widgets without removing unrelated words', async () => { + const html = ` + +Roadmap article + +
+

Roadmap article

+

Roadmap should stay because this class is part of the article content rather than an ad marker.

+

Adventure should stay because the id only contains the letters ad inside a larger word.

+
Sponsored links from around the web
+
Promoted stories
+

The article conclusion should remain in the extracted content after sponsored widgets are removed.

+
+ +`; + + const result = await Defuddle( + parseDocument(html, 'https://example.com/roadmap-article'), + 'https://example.com/roadmap-article' + ); + + expect(result.content).toContain('Roadmap should stay'); + expect(result.content).toContain('Adventure should stay'); + expect(result.content).not.toContain('Sponsored links from around the web'); + expect(result.content).not.toContain('Promoted stories'); + }); +});