From 73ebdc434867f02b66e2568947a69fc2a22f75e5 Mon Sep 17 00:00:00 2001
From: Benjamin Jasper <jasper.benjamin@icloud.com>
Date: Sat, 13 Jun 2026 12:33:47 +0200
Subject: [PATCH 1/2] fix: fix srcset normalization when the width/density
 descriptor is missing

---
 src/elements/images.ts                        | 53 +++++++++----------
 .../elements--srcset-normalization.md         | 10 +++-
 .../elements--srcset-normalization.html       | 16 ++++++
 3 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/src/elements/images.ts b/src/elements/images.ts
index 57499009e..a8eda13dc 100644
--- a/src/elements/images.ts
+++ b/src/elements/images.ts
@@ -903,46 +903,41 @@ function processSourceElement(element: Element, doc: Document): Element {
  * Extract the first URL from a srcset attribute.
  * Handles URLs that contain commas (e.g., Substack CDN URLs like
  * https://substackcdn.com/image/fetch/$s_!YemM!,w_424,c_limit,f_webp/...)
- * by parsing based on width/density descriptors rather than splitting on commas.
+ * by parsing width/density descriptors and only splitting candidate separators.
  */
 function extractFirstUrlFromSrcset(srcset: string): string | null {
 	if (!srcset || !srcset.trim()) return null;
 
 	const trimmed = srcset.trim();
-
-	// Match srcset entries by finding URL + descriptor pairs.
-	// Each entry ends with a width descriptor (e.g., "424w") or density descriptor (e.g., "2x").
-	// The URL is everything before the whitespace that precedes the descriptor.
-	// This handles URLs containing commas (which would break a simple comma-split).
-	const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g;
-	let match;
-	let lastIndex = 0;
-
-	while ((match = entryPattern.exec(trimmed)) !== null) {
-		// Extract URL from this entry, trimming any leading comma+whitespace from previous entry
-		let url = match[1].trim();
-		if (lastIndex > 0) {
-			// Remove leading comma separator from previous entry
-			url = url.replace(/^,\s*/, '');
+	const descriptorPattern = /\s+\d+(?:\.\d+)?[wx](?=\s*(?:,|$))/g;
+	const candidateSeparatorPattern = /(?:,\s*(?=(?:https?:)?\/\/|\/(?!\/)|\.{1,2}\/)|,\s+(?=[^,\s]+\.(?:jpg|jpeg|png|webp|gif|avif|svg)(?:[?#\s,]|$)))/i;
+	const extractUrl = (candidate: string): string | null => {
+		const normalized = candidate.replace(/^,\s*/, '').trim();
+		if (!normalized) return null;
+
+		// Split only on commas that look like candidate separators. Some CDN URLs
+		// contain commas in the URL itself, e.g. "/image/fetch/...,w_424,...".
+		for (const part of normalized.split(candidateSeparatorPattern)) {
+			const urlMatch = part.trim().match(urlPattern);
+			if (urlMatch && urlMatch[1] && !isSvgDataUrl(urlMatch[1])) {
+				return urlMatch[1];
+			}
 		}
 
-		lastIndex = entryPattern.lastIndex;
-
-		if (!url) continue;
-
-		// Skip SVG data URLs
-		if (isSvgDataUrl(url)) continue;
+		return null;
+	};
 
-		return url;
-	}
+	let match;
+	let start = 0;
 
-	// Fallback: try extracting URL before first whitespace (for srcset with single entry and no descriptor)
-	const urlMatch = trimmed.match(urlPattern);
-	if (urlMatch && urlMatch[1] && !isSvgDataUrl(urlMatch[1])) {
-		return urlMatch[1];
+	while ((match = descriptorPattern.exec(trimmed)) !== null) {
+		const url = extractUrl(trimmed.slice(start, match.index));
+		start = descriptorPattern.lastIndex;
+		if (url) return url;
 	}
 
-	return null;
+	// Fallback: handle srcset values with no descriptors.
+	return extractUrl(trimmed.slice(start));
 }
 
 /**
diff --git a/tests/expected/elements--srcset-normalization.md b/tests/expected/elements--srcset-normalization.md
index 6b1446c46..a71a56469 100644
--- a/tests/expected/elements--srcset-normalization.md
+++ b/tests/expected/elements--srcset-normalization.md
@@ -15,6 +15,14 @@ This article tests normalization of React SSR camelCase srcSet attributes to sta
 
 Hero image with React SSR attributes. Photo credit.
 
+![Image whose first srcset candidate has no descriptor.](https://img.zeit.de/zeit-magazin/2026/26/test-image/square__360x360__mobile)
+
+Image whose first srcset candidate has no descriptor.
+
+![Image with CDN commas in srcset URL.](https://substackcdn.com/image/fetch/$s_!test!,w_848,c_limit,f_webp/https%3A%2F%2Fexample.com%2Fphoto.webp)
+
+Image with CDN commas in srcset URL.
+
 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.
 
-Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident.
\ No newline at end of file
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident.
diff --git a/tests/fixtures/elements--srcset-normalization.html b/tests/fixtures/elements--srcset-normalization.html
index 65ea808f0..9032586ff 100644
--- a/tests/fixtures/elements--srcset-normalization.html
+++ b/tests/fixtures/elements--srcset-normalization.html
@@ -18,6 +18,22 @@ <h1>Article with React SSR Images</h1>
 			<figcaption>Hero image with React SSR attributes. Photo credit.</figcaption>
 		</figure>
 
+		<figure>
+			<picture>
+				<source type="image/webp" srcset="https://img.zeit.de/zeit-magazin/2026/26/test-image/square__360x360__mobile, https://img.zeit.de/zeit-magazin/2026/26/test-image/square__360x360__mobile__scale_2 2x">
+				<img src="https://www.example.com/images/fallback.jpg" alt="Image whose first srcset candidate has no descriptor.">
+			</picture>
+			<figcaption>Image whose first srcset candidate has no descriptor.</figcaption>
+		</figure>
+
+		<figure>
+			<picture>
+				<source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!test!,w_424,c_limit,f_webp/https%3A%2F%2Fexample.com%2Fphoto.webp 424w, https://substackcdn.com/image/fetch/$s_!test!,w_848,c_limit,f_webp/https%3A%2F%2Fexample.com%2Fphoto.webp 848w">
+				<img src="https://www.example.com/images/fallback.jpg" alt="Image with CDN commas in srcset URL.">
+			</picture>
+			<figcaption>Image with CDN commas in srcset URL.</figcaption>
+		</figure>
+
 		<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.</p>
 		<p>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident.</p>
 	</article>

From dc6cbe668a9c781c59b7d7d09a9ee9730bcfe039 Mon Sep 17 00:00:00 2001
From: Benjamin Jasper <jasper.benjamin@icloud.com>
Date: Sat, 13 Jun 2026 16:17:40 +0200
Subject: [PATCH 2/2] feat: consolidate srcset parsing/selection

---
 src/defuddle.ts            | 56 +++------------------------
 src/elements/images.ts     | 43 +++------------------
 src/extractors/substack.ts | 17 +-------
 src/markdown.ts            | 37 +++---------------
 src/utils/srcset.ts        | 79 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 97 insertions(+), 135 deletions(-)
 create mode 100644 src/utils/srcset.ts

diff --git a/src/defuddle.ts b/src/defuddle.ts
index a513b30d0..8b81e8342 100644
--- a/src/defuddle.ts
+++ b/src/defuddle.ts
@@ -20,6 +20,7 @@ import { removeByContentPattern, removeEyebrowLabel } from './removals/content-p
 import { removeMetadataBlock } from './removals/metadata-block';
 import { getComputedStyle, textPreview, countWords } from './utils';
 import { parseHTML, serializeHTML, decodeHTMLEntities, isDangerousUrl, getClassName } from './utils/dom';
+import { formatSrcset, getLargestWidthSrcsetUrl, parseSrcset } from './utils/srcset';
 
 interface StyleChange {
 	selector: string;
@@ -581,31 +582,7 @@ export class Defuddle {
 	 */
 	private _getLargestImageSrc(img: Element): string {
 		const srcset = img.getAttribute('srcset') || '';
-		if (!srcset) return img.getAttribute('src') || '';
-
-		// Parse srcset entries: each ends with a width descriptor (e.g. "424w")
-		// URLs may contain commas (e.g. Substack CDN), so split on width descriptors
-		const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?)w/g;
-		let bestUrl = '';
-		let bestWidth = 0;
-		let match;
-		let lastIndex = 0;
-
-		while ((match = entryPattern.exec(srcset)) !== null) {
-			let url = match[1].trim();
-			if (lastIndex > 0) {
-				url = url.replace(/^,\s*/, '');
-			}
-			lastIndex = entryPattern.lastIndex;
-
-			const width = parseFloat(match[2]);
-			if (url && width > bestWidth) {
-				bestWidth = width;
-				bestUrl = url;
-			}
-		}
-
-		let url = bestUrl || img.getAttribute('src') || '';
+		let url = (srcset && getLargestWidthSrcsetUrl(srcset)) || img.getAttribute('src') || '';
 
 		// Strip CDN width/crop constraints to get the full resolution image
 		// (e.g. Cloudinary-style params: ,w_852,c_limit → removed)
@@ -1362,32 +1339,9 @@ export class Defuddle {
 		element.querySelectorAll('[srcset]').forEach(el => {
 			const srcset = el.getAttribute('srcset');
 			if (srcset) {
-				// Parse srcset using width/density descriptors as delimiters,
-				// not commas — URLs may contain commas (e.g. CDN transform params)
-				const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g;
-				const entries: string[] = [];
-				let match;
-				let lastIdx = 0;
-
-				while ((match = entryPattern.exec(srcset)) !== null) {
-					let url = match[1].trim();
-					if (lastIdx > 0) {
-						url = url.replace(/^,\s*/, '');
-					}
-					lastIdx = entryPattern.lastIndex;
-					entries.push(`${resolve(url)} ${match[2]}`);
-				}
-
-				if (entries.length > 0) {
-					el.setAttribute('srcset', entries.join(', '));
-				} else {
-					// Fallback: simple comma split for srcsets without descriptors
-					const resolved = srcset.split(',').map(entry => {
-						const parts = entry.trim().split(/\s+/);
-						if (parts[0]) parts[0] = resolve(parts[0]);
-						return parts.join(' ');
-					}).join(', ');
-					el.setAttribute('srcset', resolved);
+				const candidates = parseSrcset(srcset);
+				if (candidates.length > 0) {
+					el.setAttribute('srcset', formatSrcset(candidates, resolve));
 				}
 			}
 		});
diff --git a/src/elements/images.ts b/src/elements/images.ts
index a8eda13dc..ba905e211 100644
--- a/src/elements/images.ts
+++ b/src/elements/images.ts
@@ -4,6 +4,7 @@
 
 import { isElement, isTextNode } from '../utils';
 import { transferContent, parseHTML, serializeHTML } from '../utils/dom';
+import { getFirstSrcsetUrl, parseSrcset } from '../utils/srcset';
 import { BLOCK_LEVEL_ELEMENTS } from '../constants';
 
 // Pre-compile regular expressions
@@ -11,9 +12,7 @@ const b64DataUrlRegex = /^data:image\/([^;]+);base64,/;
 const srcsetPattern = /\.(jpg|jpeg|png|webp)\s+\d/;
 const srcPattern = /^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/;
 const imageUrlPattern = /\.(jpg|jpeg|png|webp|gif|avif)(\?.*)?$/i;
-const widthPattern = /\s(\d+)w/;
 const dprPattern = /dpr=(\d+(?:\.\d+)?)/;
-const urlPattern = /^([^\s]+)/;
 const absoluteUrlPattern = /^https?:\/\//;
 const filenamePattern = /^[\w\-\.\/\\]+\.(jpg|jpeg|png|gif|webp|svg)$/i;
 const datePattern = /^\d{4}-\d{2}-\d{2}$/;
@@ -903,41 +902,10 @@ function processSourceElement(element: Element, doc: Document): Element {
  * Extract the first URL from a srcset attribute.
  * Handles URLs that contain commas (e.g., Substack CDN URLs like
  * https://substackcdn.com/image/fetch/$s_!YemM!,w_424,c_limit,f_webp/...)
- * by parsing width/density descriptors and only splitting candidate separators.
+ * by splitting only on commas that look like candidate separators.
  */
 function extractFirstUrlFromSrcset(srcset: string): string | null {
-	if (!srcset || !srcset.trim()) return null;
-
-	const trimmed = srcset.trim();
-	const descriptorPattern = /\s+\d+(?:\.\d+)?[wx](?=\s*(?:,|$))/g;
-	const candidateSeparatorPattern = /(?:,\s*(?=(?:https?:)?\/\/|\/(?!\/)|\.{1,2}\/)|,\s+(?=[^,\s]+\.(?:jpg|jpeg|png|webp|gif|avif|svg)(?:[?#\s,]|$)))/i;
-	const extractUrl = (candidate: string): string | null => {
-		const normalized = candidate.replace(/^,\s*/, '').trim();
-		if (!normalized) return null;
-
-		// Split only on commas that look like candidate separators. Some CDN URLs
-		// contain commas in the URL itself, e.g. "/image/fetch/...,w_424,...".
-		for (const part of normalized.split(candidateSeparatorPattern)) {
-			const urlMatch = part.trim().match(urlPattern);
-			if (urlMatch && urlMatch[1] && !isSvgDataUrl(urlMatch[1])) {
-				return urlMatch[1];
-			}
-		}
-
-		return null;
-	};
-
-	let match;
-	let start = 0;
-
-	while ((match = descriptorPattern.exec(trimmed)) !== null) {
-		const url = extractUrl(trimmed.slice(start, match.index));
-		start = descriptorPattern.lastIndex;
-		if (url) return url;
-	}
-
-	// Fallback: handle srcset values with no descriptors.
-	return extractUrl(trimmed.slice(start));
+	return getFirstSrcsetUrl(srcset, { skipSvgDataUrls: true });
 }
 
 /**
@@ -972,11 +940,10 @@ function selectBestSource(sources: NodeListOf<Element>): Element | null {
 		if (!srcset) continue;
 		
 		// Extract width and DPR from srcset
-		const widthMatch = srcset.match(widthPattern);
+		const width = parseSrcset(srcset).reduce((max, candidate) => Math.max(max, candidate.width || 0), 0);
 		const dprMatch = srcset.match(dprPattern);
 		
-		if (widthMatch && widthMatch[1]) {
-			const width = parseInt(widthMatch[1], 10);
+		if (width > 0) {
 			const dpr = dprMatch ? parseFloat(dprMatch[1]) : 1;
 			
 			// Calculate effective resolution (width * DPR)
diff --git a/src/extractors/substack.ts b/src/extractors/substack.ts
index 8d4f82be0..918999092 100644
--- a/src/extractors/substack.ts
+++ b/src/extractors/substack.ts
@@ -1,6 +1,7 @@
 import { BaseExtractor } from './_base';
 import { ExtractorResult } from '../types/extractors';
 import { parseHTML } from '../utils/dom';
+import { getLargestWidthSrcsetUrl } from '../utils/srcset';
 
 const INJECTED_ATTR = 'data-defuddle-substack-post';
 
@@ -189,21 +190,7 @@ export class SubstackExtractor extends BaseExtractor {
 	private getLargestSrc(img: Element): string {
 		const srcset = img.getAttribute('srcset') || '';
 		if (srcset) {
-			const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?)w/g;
-			let bestUrl = '';
-			let bestWidth = 0;
-			let match;
-			let lastIndex = 0;
-			while ((match = entryPattern.exec(srcset)) !== null) {
-				let url = match[1].trim();
-				if (lastIndex > 0) url = url.replace(/^,\s*/, '');
-				lastIndex = entryPattern.lastIndex;
-				const width = parseFloat(match[2]);
-				if (url && width > bestWidth) {
-					bestWidth = width;
-					bestUrl = url;
-				}
-			}
+			const bestUrl = getLargestWidthSrcsetUrl(srcset);
 			if (bestUrl) return bestUrl.replace(/,w_\d+/g, '').replace(/,c_\w+/g, '');
 		}
 		return img.getAttribute('src') || '';
diff --git a/src/markdown.ts b/src/markdown.ts
index 0fed552e5..e2ee940a0 100644
--- a/src/markdown.ts
+++ b/src/markdown.ts
@@ -1,6 +1,7 @@
 import TurndownService from 'turndown';
 import { isElement, isTextNode } from './utils';
 import { parseHTML, serializeHTML, isDirectTableChild } from './utils/dom';
+import { getFirstSrcsetUrl, getLargestWidthSrcsetUrl } from './utils/srcset';
 import type { DefuddleResponse, DefuddleOptions } from './types';
 
 // Define a type that works for both JSDOM and browser environments
@@ -40,10 +41,6 @@ export function asGenericElement(node: any): GenericElement {
 	return node as unknown as GenericElement;
 }
 
-
-const WIDTH_DESCRIPTOR_RE = /^(\d+)w,?$/;
-const DENSITY_DESCRIPTOR_RE = /^\d+(?:\.\d+)?x,?$/;
-
 // MathML element names, used to detect whether a <math> has real MathML to fall
 // back on (vs. only a rendered-text annotation). Hoisted so the sets aren't
 // rebuilt on every math element during conversion.
@@ -75,35 +72,13 @@ function formatMarkdownLinkTitle(title: string | null): string {
 function getBestImageSrc(node: GenericElement): string {
 	const srcset = node.getAttribute('srcset');
 	if (srcset) {
-		let bestUrl = '';
-		let bestWidth = 0;
-		// Tokenize by whitespace instead of splitting on commas, because CDN
-		// image URLs (e.g. Substack) can contain commas in the URL path
-		// (e.g. `w_424,c_limit,f_webp`). We scan tokens and treat any token
-		// matching `Nw` as a width descriptor; the preceding tokens form the URL.
-		const tokens = srcset.trim().split(/\s+/);
-		let urlParts: string[] = [];
-		for (const token of tokens) {
-			const widthMatch = token.match(WIDTH_DESCRIPTOR_RE);
-			if (widthMatch) {
-				const width = parseInt(widthMatch[1], 10);
-				if (urlParts.length > 0 && width > bestWidth) {
-					const url = urlParts.join(' ').replace(/^,\s*/, '');
-					if (url) {
-						bestWidth = width;
-						bestUrl = url;
-					}
-				}
-				urlParts = [];
-			} else if (DENSITY_DESCRIPTOR_RE.test(token)) {
-				// Density descriptor (e.g. 2x) — skip, not used for selection
-				urlParts = [];
-			} else {
-				urlParts.push(token);
-			}
-		}
+		const bestUrl = getLargestWidthSrcsetUrl(srcset);
 		if (bestUrl) return bestUrl;
+
+		const firstUrl = getFirstSrcsetUrl(srcset);
+		if (firstUrl) return firstUrl;
 	}
+
 	return node.getAttribute('src') || '';
 }
 
diff --git a/src/utils/srcset.ts b/src/utils/srcset.ts
new file mode 100644
index 000000000..cb63f3649
--- /dev/null
+++ b/src/utils/srcset.ts
@@ -0,0 +1,79 @@
+export interface SrcsetCandidate { // One image candidate parsed out of a srcset attribute value.
+	url: string; // Candidate text with the recognised descriptor (if any) removed; never empty.
+	descriptor?: string; // Recognised descriptor text such as "424w" or "2x"; absent when none matched.
+	width?: number; // Integer parsed from a width descriptor ("<n>w").
+	density?: number; // Number parsed from a density descriptor ("<n>x", optionally fractional).
+}
+
+const IMAGE_EXTENSION_RE = '(?:jpg|jpeg|png|webp|gif|avif|svg)'; // Regex source fragment (a string, not a RegExp) interpolated below.
+const CANDIDATE_SEPARATOR_RE = new RegExp( // Matches only commas that look like candidate separators, so commas inside a URL are left alone.
+	`(?:,\\s*(?=(?:https?:)?//|/(?!/)|\\.{1,2}/)|,\\s+(?=[^,\\s]+\\.${IMAGE_EXTENSION_RE}(?:[?#\\s,]|$))|,(?=[^,/\\s]+\\.${IMAGE_EXTENSION_RE}(?:[?#\\s,]|$)))`, // (1) comma + optional whitespace before "http(s)://", "//", a single "/", "./" or "../"; (2) comma + whitespace before a token containing an image extension; (3) bare comma before a slash-free token containing an image extension.
+	'i' // Case-insensitive matching.
+);
+const WIDTH_DESCRIPTOR_RE = /^(\d+)w$/i; // Whole-token width descriptor; group 1 is the integer width.
+const DENSITY_DESCRIPTOR_RE = /^(\d+(?:\.\d+)?)x$/i; // Whole-token density descriptor; group 1 is the optionally fractional density.
+
+/**
+ * Classifies a token as a width ("424w") or density ("2.5x") descriptor, ignoring one trailing comma; returns {} if neither matches.
+ */
+function parseDescriptor(value: string): Pick<SrcsetCandidate, 'descriptor' | 'width' | 'density'> {
+	const token = value.replace(/,$/, '');
+
+	const asWidth = WIDTH_DESCRIPTOR_RE.exec(token);
+	if (asWidth !== null) return { descriptor: token, width: parseInt(asWidth[1], 10) };
+
+	const asDensity = DENSITY_DESCRIPTOR_RE.exec(token);
+	if (asDensity === null) return {};
+
+	return { descriptor: token, density: parseFloat(asDensity[1]) };
+}
+
+/** Parses a srcset value into candidates. A comma directly after a width/density descriptor always ends a candidate. */
+export function parseSrcset(srcset: string): SrcsetCandidate[] {
+	if (!srcset || !srcset.trim()) return [];
+	const candidates: SrcsetCandidate[] = [];
+	// Split after descriptors first so a following URL the separator heuristic does not recognise (e.g. "hero-b 2x")
+	// still starts a new candidate. The capture group keeps each descriptor at the odd indexes; it is re-attached below.
+	const pieces = srcset.trim().split(/(\s\d+(?:\.\d+)?[wx])\s*,/i);
+	for (let i = 0; i < pieces.length; i += 2) {
+		for (const rawCandidate of (pieces[i] + (pieces[i + 1] || '')).split(CANDIDATE_SEPARATOR_RE)) {
+			const candidate = rawCandidate.replace(/^,\s*/, '').trim();
+			if (!candidate) continue;
+			const parts = candidate.split(/\s+/);
+			const descriptor = parts.length > 1 ? parseDescriptor(parts[parts.length - 1]) : {};
+			const urlParts = descriptor.descriptor ? parts.slice(0, -1) : parts;
+			const url = urlParts.join(' ').trim();
+			if (url) candidates.push({ url, ...descriptor });
+		}
+	}
+	return candidates;
+}
+
+/** Returns the URL of the first srcset candidate, or null when none qualifies. */
+export function getFirstSrcsetUrl(srcset: string, options: { skipSvgDataUrls?: boolean } = {}): string | null {
+	// With skipSvgDataUrls set, candidates whose URL starts with "data:image/svg+xml" are passed over.
+	const isUsable = ({ url }: SrcsetCandidate): boolean =>
+		!(options.skipSvgDataUrls && url.startsWith('data:image/svg+xml'));
+	const first = parseSrcset(srcset).find(isUsable);
+	return first ? first.url : null;
+}
+
+/**
+ * Returns the URL of the candidate with the greatest width descriptor, or null
+ * when no candidate carries a positive width. On ties the earliest candidate wins.
+ */
+export function getLargestWidthSrcsetUrl(srcset: string): string | null {
+	const widest = parseSrcset(srcset).reduce<SrcsetCandidate | null>((best, current) => {
+		const currentWidth = current.width || 0;
+		const bestWidth = best ? best.width || 0 : 0;
+		return currentWidth > bestWidth ? current : best;
+	}, null);
+
+	return widest ? widest.url : null;
+}
+
+/** Serialises candidates as "url[ descriptor]" entries joined by ", "; every URL goes through resolveUrl. */
+export function formatSrcset(candidates: SrcsetCandidate[], resolveUrl: (url: string) => string): string {
+	const toEntry = ({ url, descriptor }: SrcsetCandidate): string => resolveUrl(url) + (descriptor ? ' ' + descriptor : '');
+	return candidates.map(toEntry).join(', ');
+}