From 73ebdc434867f02b66e2568947a69fc2a22f75e5 Mon Sep 17 00:00:00 2001 From: Benjamin Jasper Date: Sat, 13 Jun 2026 12:33:47 +0200 Subject: [PATCH 1/2] fix: fix srcset normalization when the width/density descriptor is missing --- src/elements/images.ts | 53 +++++++++---------- .../elements--srcset-normalization.md | 10 +++- .../elements--srcset-normalization.html | 16 ++++++ 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/src/elements/images.ts b/src/elements/images.ts index 57499009e..a8eda13dc 100644 --- a/src/elements/images.ts +++ b/src/elements/images.ts @@ -903,46 +903,41 @@ function processSourceElement(element: Element, doc: Document): Element { * Extract the first URL from a srcset attribute. * Handles URLs that contain commas (e.g., Substack CDN URLs like * https://substackcdn.com/image/fetch/$s_!YemM!,w_424,c_limit,f_webp/...) - * by parsing based on width/density descriptors rather than splitting on commas. + * by parsing width/density descriptors and only splitting candidate separators. */ function extractFirstUrlFromSrcset(srcset: string): string | null { if (!srcset || !srcset.trim()) return null; const trimmed = srcset.trim(); - - // Match srcset entries by finding URL + descriptor pairs. - // Each entry ends with a width descriptor (e.g., "424w") or density descriptor (e.g., "2x"). - // The URL is everything before the whitespace that precedes the descriptor. - // This handles URLs containing commas (which would break a simple comma-split). 
- const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g; - let match; - let lastIndex = 0; - - while ((match = entryPattern.exec(trimmed)) !== null) { - // Extract URL from this entry, trimming any leading comma+whitespace from previous entry - let url = match[1].trim(); - if (lastIndex > 0) { - // Remove leading comma separator from previous entry - url = url.replace(/^,\s*/, ''); + const descriptorPattern = /\s+\d+(?:\.\d+)?[wx](?=\s*(?:,|$))/g; + const candidateSeparatorPattern = /(?:,\s*(?=(?:https?:)?\/\/|\/(?!\/)|\.{1,2}\/)|,\s+(?=[^,\s]+\.(?:jpg|jpeg|png|webp|gif|avif|svg)(?:[?#\s,]|$)))/i; + const extractUrl = (candidate: string): string | null => { + const normalized = candidate.replace(/^,\s*/, '').trim(); + if (!normalized) return null; + + // Split only on commas that look like candidate separators. Some CDN URLs + // contain commas in the URL itself, e.g. "/image/fetch/...,w_424,...". + for (const part of normalized.split(candidateSeparatorPattern)) { + const urlMatch = part.trim().match(urlPattern); + if (urlMatch && urlMatch[1] && !isSvgDataUrl(urlMatch[1])) { + return urlMatch[1]; + } } - lastIndex = entryPattern.lastIndex; - - if (!url) continue; - - // Skip SVG data URLs - if (isSvgDataUrl(url)) continue; + return null; + }; - return url; - } + let match; + let start = 0; - // Fallback: try extracting URL before first whitespace (for srcset with single entry and no descriptor) - const urlMatch = trimmed.match(urlPattern); - if (urlMatch && urlMatch[1] && !isSvgDataUrl(urlMatch[1])) { - return urlMatch[1]; + while ((match = descriptorPattern.exec(trimmed)) !== null) { + const url = extractUrl(trimmed.slice(start, match.index)); + start = descriptorPattern.lastIndex; + if (url) return url; } - return null; + // Fallback: handle srcset values with no descriptors. 
+ return extractUrl(trimmed.slice(start)); } /** diff --git a/tests/expected/elements--srcset-normalization.md b/tests/expected/elements--srcset-normalization.md index 6b1446c46..a71a56469 100644 --- a/tests/expected/elements--srcset-normalization.md +++ b/tests/expected/elements--srcset-normalization.md @@ -15,6 +15,14 @@ This article tests normalization of React SSR camelCase srcSet attributes to sta Hero image with React SSR attributes. Photo credit. +![Image whose first srcset candidate has no descriptor.](https://img.zeit.de/zeit-magazin/2026/26/test-image/square__360x360__mobile) + +Image whose first srcset candidate has no descriptor. + +![Image with CDN commas in srcset URL.](https://substackcdn.com/image/fetch/$s_!test!,w_848,c_limit,f_webp/https%3A%2F%2Fexample.com%2Fphoto.webp) + +Image with CDN commas in srcset URL. + Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident. \ No newline at end of file +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident. diff --git a/tests/fixtures/elements--srcset-normalization.html b/tests/fixtures/elements--srcset-normalization.html index 65ea808f0..9032586ff 100644 --- a/tests/fixtures/elements--srcset-normalization.html +++ b/tests/fixtures/elements--srcset-normalization.html @@ -18,6 +18,22 @@

Article with React SSR Images

Hero image with React SSR attributes. Photo credit.
+
+ + + Image whose first srcset candidate has no descriptor. + +
Image whose first srcset candidate has no descriptor.
+
+ +
+ + + Image with CDN commas in srcset URL. + +
Image with CDN commas in srcset URL.
+
+

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident.

From dc6cbe668a9c781c59b7d7d09a9ee9730bcfe039 Mon Sep 17 00:00:00 2001 From: Benjamin Jasper Date: Sat, 13 Jun 2026 16:17:40 +0200 Subject: [PATCH 2/2] feat: consolidate srcset parsing/selection --- src/defuddle.ts | 56 +++------------------------ src/elements/images.ts | 43 +++------------------ src/extractors/substack.ts | 17 +------- src/markdown.ts | 37 +++--------------- src/utils/srcset.ts | 79 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 97 insertions(+), 135 deletions(-) create mode 100644 src/utils/srcset.ts diff --git a/src/defuddle.ts b/src/defuddle.ts index a513b30d0..8b81e8342 100644 --- a/src/defuddle.ts +++ b/src/defuddle.ts @@ -20,6 +20,7 @@ import { removeByContentPattern, removeEyebrowLabel } from './removals/content-p import { removeMetadataBlock } from './removals/metadata-block'; import { getComputedStyle, textPreview, countWords } from './utils'; import { parseHTML, serializeHTML, decodeHTMLEntities, isDangerousUrl, getClassName } from './utils/dom'; +import { formatSrcset, getLargestWidthSrcsetUrl, parseSrcset } from './utils/srcset'; interface StyleChange { selector: string; @@ -581,31 +582,7 @@ export class Defuddle { */ private _getLargestImageSrc(img: Element): string { const srcset = img.getAttribute('srcset') || ''; - if (!srcset) return img.getAttribute('src') || ''; - - // Parse srcset entries: each ends with a width descriptor (e.g. "424w") - // URLs may contain commas (e.g. 
Substack CDN), so split on width descriptors - const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?)w/g; - let bestUrl = ''; - let bestWidth = 0; - let match; - let lastIndex = 0; - - while ((match = entryPattern.exec(srcset)) !== null) { - let url = match[1].trim(); - if (lastIndex > 0) { - url = url.replace(/^,\s*/, ''); - } - lastIndex = entryPattern.lastIndex; - - const width = parseFloat(match[2]); - if (url && width > bestWidth) { - bestWidth = width; - bestUrl = url; - } - } - - let url = bestUrl || img.getAttribute('src') || ''; + let url = (srcset && getLargestWidthSrcsetUrl(srcset)) || img.getAttribute('src') || ''; // Strip CDN width/crop constraints to get the full resolution image // (e.g. Cloudinary-style params: ,w_852,c_limit → removed) @@ -1362,32 +1339,9 @@ export class Defuddle { element.querySelectorAll('[srcset]').forEach(el => { const srcset = el.getAttribute('srcset'); if (srcset) { - // Parse srcset using width/density descriptors as delimiters, - // not commas — URLs may contain commas (e.g. 
CDN transform params) - const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g; - const entries: string[] = []; - let match; - let lastIdx = 0; - - while ((match = entryPattern.exec(srcset)) !== null) { - let url = match[1].trim(); - if (lastIdx > 0) { - url = url.replace(/^,\s*/, ''); - } - lastIdx = entryPattern.lastIndex; - entries.push(`${resolve(url)} ${match[2]}`); - } - - if (entries.length > 0) { - el.setAttribute('srcset', entries.join(', ')); - } else { - // Fallback: simple comma split for srcsets without descriptors - const resolved = srcset.split(',').map(entry => { - const parts = entry.trim().split(/\s+/); - if (parts[0]) parts[0] = resolve(parts[0]); - return parts.join(' '); - }).join(', '); - el.setAttribute('srcset', resolved); + const candidates = parseSrcset(srcset); + if (candidates.length > 0) { + el.setAttribute('srcset', formatSrcset(candidates, resolve)); } } }); diff --git a/src/elements/images.ts b/src/elements/images.ts index a8eda13dc..ba905e211 100644 --- a/src/elements/images.ts +++ b/src/elements/images.ts @@ -4,6 +4,7 @@ import { isElement, isTextNode } from '../utils'; import { transferContent, parseHTML, serializeHTML } from '../utils/dom'; +import { getFirstSrcsetUrl, parseSrcset } from '../utils/srcset'; import { BLOCK_LEVEL_ELEMENTS } from '../constants'; // Pre-compile regular expressions @@ -11,9 +12,7 @@ const b64DataUrlRegex = /^data:image\/([^;]+);base64,/; const srcsetPattern = /\.(jpg|jpeg|png|webp)\s+\d/; const srcPattern = /^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/; const imageUrlPattern = /\.(jpg|jpeg|png|webp|gif|avif)(\?.*)?$/i; -const widthPattern = /\s(\d+)w/; const dprPattern = /dpr=(\d+(?:\.\d+)?)/; -const urlPattern = /^([^\s]+)/; const absoluteUrlPattern = /^https?:\/\//; const filenamePattern = /^[\w\-\.\/\\]+\.(jpg|jpeg|png|gif|webp|svg)$/i; const datePattern = /^\d{4}-\d{2}-\d{2}$/; @@ -903,41 +902,10 @@ function processSourceElement(element: Element, doc: Document): Element { * Extract the first URL from a srcset 
attribute. * Handles URLs that contain commas (e.g., Substack CDN URLs like * https://substackcdn.com/image/fetch/$s_!YemM!,w_424,c_limit,f_webp/...) - * by parsing width/density descriptors and only splitting candidate separators. + * by parsing based on width/density descriptors rather than splitting on commas. */ function extractFirstUrlFromSrcset(srcset: string): string | null { - if (!srcset || !srcset.trim()) return null; - - const trimmed = srcset.trim(); - const descriptorPattern = /\s+\d+(?:\.\d+)?[wx](?=\s*(?:,|$))/g; - const candidateSeparatorPattern = /(?:,\s*(?=(?:https?:)?\/\/|\/(?!\/)|\.{1,2}\/)|,\s+(?=[^,\s]+\.(?:jpg|jpeg|png|webp|gif|avif|svg)(?:[?#\s,]|$)))/i; - const extractUrl = (candidate: string): string | null => { - const normalized = candidate.replace(/^,\s*/, '').trim(); - if (!normalized) return null; - - // Split only on commas that look like candidate separators. Some CDN URLs - // contain commas in the URL itself, e.g. "/image/fetch/...,w_424,...". - for (const part of normalized.split(candidateSeparatorPattern)) { - const urlMatch = part.trim().match(urlPattern); - if (urlMatch && urlMatch[1] && !isSvgDataUrl(urlMatch[1])) { - return urlMatch[1]; - } - } - - return null; - }; - - let match; - let start = 0; - - while ((match = descriptorPattern.exec(trimmed)) !== null) { - const url = extractUrl(trimmed.slice(start, match.index)); - start = descriptorPattern.lastIndex; - if (url) return url; - } - - // Fallback: handle srcset values with no descriptors. 
- return extractUrl(trimmed.slice(start)); + return getFirstSrcsetUrl(srcset, { skipSvgDataUrls: true }); } /** @@ -972,11 +940,10 @@ function selectBestSource(sources: NodeListOf): Element | null { if (!srcset) continue; // Extract width and DPR from srcset - const widthMatch = srcset.match(widthPattern); + const width = parseSrcset(srcset).reduce((max, candidate) => Math.max(max, candidate.width || 0), 0); const dprMatch = srcset.match(dprPattern); - if (widthMatch && widthMatch[1]) { - const width = parseInt(widthMatch[1], 10); + if (width > 0) { const dpr = dprMatch ? parseFloat(dprMatch[1]) : 1; // Calculate effective resolution (width * DPR) diff --git a/src/extractors/substack.ts b/src/extractors/substack.ts index 8d4f82be0..918999092 100644 --- a/src/extractors/substack.ts +++ b/src/extractors/substack.ts @@ -1,6 +1,7 @@ import { BaseExtractor } from './_base'; import { ExtractorResult } from '../types/extractors'; import { parseHTML } from '../utils/dom'; +import { getLargestWidthSrcsetUrl } from '../utils/srcset'; const INJECTED_ATTR = 'data-defuddle-substack-post'; @@ -189,21 +190,7 @@ export class SubstackExtractor extends BaseExtractor { private getLargestSrc(img: Element): string { const srcset = img.getAttribute('srcset') || ''; if (srcset) { - const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?)w/g; - let bestUrl = ''; - let bestWidth = 0; - let match; - let lastIndex = 0; - while ((match = entryPattern.exec(srcset)) !== null) { - let url = match[1].trim(); - if (lastIndex > 0) url = url.replace(/^,\s*/, ''); - lastIndex = entryPattern.lastIndex; - const width = parseFloat(match[2]); - if (url && width > bestWidth) { - bestWidth = width; - bestUrl = url; - } - } + const bestUrl = getLargestWidthSrcsetUrl(srcset); if (bestUrl) return bestUrl.replace(/,w_\d+/g, '').replace(/,c_\w+/g, ''); } return img.getAttribute('src') || ''; diff --git a/src/markdown.ts b/src/markdown.ts index 0fed552e5..e2ee940a0 100644 --- a/src/markdown.ts +++ b/src/markdown.ts @@ -1,6 
+1,7 @@ import TurndownService from 'turndown'; import { isElement, isTextNode } from './utils'; import { parseHTML, serializeHTML, isDirectTableChild } from './utils/dom'; +import { getFirstSrcsetUrl, getLargestWidthSrcsetUrl } from './utils/srcset'; import type { DefuddleResponse, DefuddleOptions } from './types'; // Define a type that works for both JSDOM and browser environments @@ -40,10 +41,6 @@ export function asGenericElement(node: any): GenericElement { return node as unknown as GenericElement; } - -const WIDTH_DESCRIPTOR_RE = /^(\d+)w,?$/; -const DENSITY_DESCRIPTOR_RE = /^\d+(?:\.\d+)?x,?$/; - // MathML element names, used to detect whether a has real MathML to fall // back on (vs. only a rendered-text annotation). Hoisted so the sets aren't // rebuilt on every math element during conversion. @@ -75,35 +72,13 @@ function formatMarkdownLinkTitle(title: string | null): string { function getBestImageSrc(node: GenericElement): string { const srcset = node.getAttribute('srcset'); if (srcset) { - let bestUrl = ''; - let bestWidth = 0; - // Tokenize by whitespace instead of splitting on commas, because CDN - // image URLs (e.g. Substack) can contain commas in the URL path - // (e.g. `w_424,c_limit,f_webp`). We scan tokens and treat any token - // matching `Nw` as a width descriptor; the preceding tokens form the URL. - const tokens = srcset.trim().split(/\s+/); - let urlParts: string[] = []; - for (const token of tokens) { - const widthMatch = token.match(WIDTH_DESCRIPTOR_RE); - if (widthMatch) { - const width = parseInt(widthMatch[1], 10); - if (urlParts.length > 0 && width > bestWidth) { - const url = urlParts.join(' ').replace(/^,\s*/, ''); - if (url) { - bestWidth = width; - bestUrl = url; - } - } - urlParts = []; - } else if (DENSITY_DESCRIPTOR_RE.test(token)) { - // Density descriptor (e.g. 
2x) — skip, not used for selection - urlParts = []; - } else { - urlParts.push(token); - } - } + const bestUrl = getLargestWidthSrcsetUrl(srcset); if (bestUrl) return bestUrl; + + const firstUrl = getFirstSrcsetUrl(srcset); + if (firstUrl) return firstUrl; } + return node.getAttribute('src') || ''; } diff --git a/src/utils/srcset.ts b/src/utils/srcset.ts new file mode 100644 index 000000000..cb63f3649 --- /dev/null +++ b/src/utils/srcset.ts @@ -0,0 +1,79 @@ +export interface SrcsetCandidate { + url: string; + descriptor?: string; + width?: number; + density?: number; +} + +const IMAGE_EXTENSION_RE = '(?:jpg|jpeg|png|webp|gif|avif|svg)'; +const CANDIDATE_SEPARATOR_RE = new RegExp( + `(?:,\\s*(?=(?:https?:)?//|/(?!/)|\\.{1,2}/)|,\\s+(?=[^,\\s]+\\.${IMAGE_EXTENSION_RE}(?:[?#\\s,]|$))|,(?=[^,/\\s]+\\.${IMAGE_EXTENSION_RE}(?:[?#\\s,]|$)))`, + 'i' +); +const WIDTH_DESCRIPTOR_RE = /^(\d+)w$/i; +const DENSITY_DESCRIPTOR_RE = /^(\d+(?:\.\d+)?)x$/i; + +function parseDescriptor(value: string): Pick { + const descriptor = value.replace(/,$/, ''); + const widthMatch = descriptor.match(WIDTH_DESCRIPTOR_RE); + if (widthMatch) { + return { descriptor, width: parseInt(widthMatch[1], 10) }; + } + + const densityMatch = descriptor.match(DENSITY_DESCRIPTOR_RE); + if (densityMatch) { + return { descriptor, density: parseFloat(densityMatch[1]) }; + } + + return {}; +} + +export function parseSrcset(srcset: string): SrcsetCandidate[] { + if (!srcset || !srcset.trim()) return []; + + const candidates: SrcsetCandidate[] = []; + for (const rawCandidate of srcset.trim().split(CANDIDATE_SEPARATOR_RE)) { + const candidate = rawCandidate.replace(/^,\s*/, '').trim(); + if (!candidate) continue; + + const parts = candidate.split(/\s+/); + const descriptor = parts.length > 1 ? parseDescriptor(parts[parts.length - 1]) : {}; + const urlParts = descriptor.descriptor ? 
parts.slice(0, -1) : parts; + const url = urlParts.join(' ').trim(); + + if (url) { + candidates.push({ url, ...descriptor }); + } + } + + return candidates; +} + +export function getFirstSrcsetUrl(srcset: string, options: { skipSvgDataUrls?: boolean } = {}): string | null { + for (const candidate of parseSrcset(srcset)) { + if (options.skipSvgDataUrls && candidate.url.startsWith('data:image/svg+xml')) continue; + return candidate.url; + } + + return null; +} + +export function getLargestWidthSrcsetUrl(srcset: string): string | null { + let bestUrl: string | null = null; + let bestWidth = 0; + + for (const candidate of parseSrcset(srcset)) { + if (candidate.width && candidate.width > bestWidth) { + bestWidth = candidate.width; + bestUrl = candidate.url; + } + } + + return bestUrl; +} + +export function formatSrcset(candidates: SrcsetCandidate[], resolveUrl: (url: string) => string): string { + return candidates + .map(candidate => `${resolveUrl(candidate.url)}${candidate.descriptor ? ` ${candidate.descriptor}` : ''}`) + .join(', '); +}