diff --git a/src/constants.ts b/src/constants.ts index 8810d39cf..9c84350c2 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -187,9 +187,11 @@ export const EXACT_SELECTORS = [ // '[href*="/tag/"]', // '[href*="/tags/"]', // '[href*="/topics"]', // see issue #131 - '[href*="/author/"]', - '[href*="/author?"]', - '[href$="/author"]', + // Author links can be legitimate article content (see issue #252). + // Author metadata/widgets are handled by class selectors and content-pattern removals. + // '[href*="/author/"]', + // '[href*="/author?"]', + // '[href$="/author"]', 'a[href*="copyright.com"]', 'a[href*="google.com/preferences"]', '[href="#top"]', diff --git a/src/removals/content-patterns.ts b/src/removals/content-patterns.ts index 75c134a5d..941facd57 100644 --- a/src/removals/content-patterns.ts +++ b/src/removals/content-patterns.ts @@ -452,6 +452,30 @@ export function removeByContentPattern(mainContent: Element, debug: boolean, url break; } + // Remove compact author byline lists near the top of content. The broad + // href-based selector removal is intentionally disabled so body links to + // author pages are preserved; pre-content author lists are metadata. + for (const list of mainContent.querySelectorAll('ul, ol')) { + if (!list.parentNode) continue; + if (!isPreContent(list)) continue; + if (countWords(list.textContent || '') > 10) continue; + if (list.querySelector(CONTENT_ELEMENT_SELECTOR)) continue; + + const links = Array.from(list.querySelectorAll('a[href]')); + if (links.length === 0) continue; + const allAuthorLinks = links.every(link => { + const href = link.getAttribute('href') || ''; + return href.includes('/author/') || href.includes('/author?') || /\/author\/?$/.test(href); + }); + if (!allAuthorLinks) continue; + + const target = walkUpToWrapper(list, list.textContent?.trim() || '', mainContent); + if (debug && debugRemovals) { + debugRemovals.push({ step: 'removeByContentPattern', reason: 'author byline list', text: textPreview(target) }); + } + target.remove(); + } + const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time')); // Single pass over candidates for all metadata-removal checks. diff --git a/tests/expected/issues--252-author-links-preserved.md b/tests/expected/issues--252-author-links-preserved.md new file mode 100644 index 000000000..596f61785 --- /dev/null +++ b/tests/expected/issues--252-author-links-preserved.md @@ -0,0 +1,21 @@ +```json +{ + "title": "Simple Made Clear", + "author": "Jane Smith", + "site": "Example Talks", + "published": "" +} +``` + +Software systems become easier to understand when each part has one reason to change. This talk explains how teams can separate concerns without splitting code into arbitrary fragments. + +The main example follows a reporting service as it grows from a single script into a small set of modules. Each step keeps the public behavior the same while making dependencies visible and easier to test. + +## People mentioned + +- [Alan Perlis](https://en.wikipedia.org/wiki/Alan_Perlis) +- [Grady Booch](https://example.com/author/grady-booch) +- [Edsger Dijkstra](https://en.wikipedia.org/wiki/Edsger_W._Dijkstra) +- [Erik Meijer](https://example.com/author/erik-meijer) + +The point is not that every program needs more layers. The point is that names, data flow, and boundaries should make the important choices visible to the next person reading the code. diff --git a/tests/fixtures/issues--252-author-links-preserved.html b/tests/fixtures/issues--252-author-links-preserved.html new file mode 100644 index 000000000..e127164b4 --- /dev/null +++ b/tests/fixtures/issues--252-author-links-preserved.html @@ -0,0 +1,28 @@ + + + + + + Simple Made Clear + + + + + +
+

Simple Made Clear

+

Software systems become easier to understand when each part has one reason to change. This talk explains how teams can separate concerns without splitting code into arbitrary fragments.

+

The main example follows a reporting service as it grows from a single script into a small set of modules. Each step keeps the public behavior the same while making dependencies visible and easier to test.

+ +

People mentioned

+ + +

The point is not that every program needs more layers. The point is that names, data flow, and boundaries should make the important choices visible to the next person reading the code.

+
+ +