Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 5 additions & 51 deletions src/defuddle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import { removeByContentPattern, removeEyebrowLabel } from './removals/content-p
import { removeMetadataBlock } from './removals/metadata-block';
import { getComputedStyle, textPreview, countWords } from './utils';
import { parseHTML, serializeHTML, decodeHTMLEntities, isDangerousUrl, getClassName } from './utils/dom';
import { formatSrcset, getLargestWidthSrcsetUrl, parseSrcset } from './utils/srcset';

interface StyleChange {
selector: string;
Expand Down Expand Up @@ -581,31 +582,7 @@ export class Defuddle {
*/
private _getLargestImageSrc(img: Element): string {
const srcset = img.getAttribute('srcset') || '';
if (!srcset) return img.getAttribute('src') || '';

// Parse srcset entries: each ends with a width descriptor (e.g. "424w")
// URLs may contain commas (e.g. Substack CDN), so split on width descriptors
const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?)w/g;
let bestUrl = '';
let bestWidth = 0;
let match;
let lastIndex = 0;

while ((match = entryPattern.exec(srcset)) !== null) {
let url = match[1].trim();
if (lastIndex > 0) {
url = url.replace(/^,\s*/, '');
}
lastIndex = entryPattern.lastIndex;

const width = parseFloat(match[2]);
if (url && width > bestWidth) {
bestWidth = width;
bestUrl = url;
}
}

let url = bestUrl || img.getAttribute('src') || '';
let url = (srcset && getLargestWidthSrcsetUrl(srcset)) || img.getAttribute('src') || '';

// Strip CDN width/crop constraints to get the full resolution image
// (e.g. Cloudinary-style params: ,w_852,c_limit → removed)
Expand Down Expand Up @@ -1362,32 +1339,9 @@ export class Defuddle {
element.querySelectorAll('[srcset]').forEach(el => {
const srcset = el.getAttribute('srcset');
if (srcset) {
// Parse srcset using width/density descriptors as delimiters,
// not commas — URLs may contain commas (e.g. CDN transform params)
const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g;
const entries: string[] = [];
let match;
let lastIdx = 0;

while ((match = entryPattern.exec(srcset)) !== null) {
let url = match[1].trim();
if (lastIdx > 0) {
url = url.replace(/^,\s*/, '');
}
lastIdx = entryPattern.lastIndex;
entries.push(`${resolve(url)} ${match[2]}`);
}

if (entries.length > 0) {
el.setAttribute('srcset', entries.join(', '));
} else {
// Fallback: simple comma split for srcsets without descriptors
const resolved = srcset.split(',').map(entry => {
const parts = entry.trim().split(/\s+/);
if (parts[0]) parts[0] = resolve(parts[0]);
return parts.join(' ');
}).join(', ');
el.setAttribute('srcset', resolved);
const candidates = parseSrcset(srcset);
if (candidates.length > 0) {
el.setAttribute('srcset', formatSrcset(candidates, resolve));
}
}
});
Expand Down
46 changes: 4 additions & 42 deletions src/elements/images.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@

import { isElement, isTextNode } from '../utils';
import { transferContent, parseHTML, serializeHTML } from '../utils/dom';
import { getFirstSrcsetUrl, parseSrcset } from '../utils/srcset';
import { BLOCK_LEVEL_ELEMENTS } from '../constants';

// Pre-compile regular expressions
const b64DataUrlRegex = /^data:image\/([^;]+);base64,/;
const srcsetPattern = /\.(jpg|jpeg|png|webp)\s+\d/;
const srcPattern = /^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/;
const imageUrlPattern = /\.(jpg|jpeg|png|webp|gif|avif)(\?.*)?$/i;
const widthPattern = /\s(\d+)w/;
const dprPattern = /dpr=(\d+(?:\.\d+)?)/;
const urlPattern = /^([^\s]+)/;
const absoluteUrlPattern = /^https?:\/\//;
const filenamePattern = /^[\w\-\.\/\\]+\.(jpg|jpeg|png|gif|webp|svg)$/i;
const datePattern = /^\d{4}-\d{2}-\d{2}$/;
Expand Down Expand Up @@ -906,43 +905,7 @@ function processSourceElement(element: Element, doc: Document): Element {
* by parsing based on width/density descriptors rather than splitting on commas.
*/
function extractFirstUrlFromSrcset(srcset: string): string | null {
if (!srcset || !srcset.trim()) return null;

const trimmed = srcset.trim();

// Match srcset entries by finding URL + descriptor pairs.
// Each entry ends with a width descriptor (e.g., "424w") or density descriptor (e.g., "2x").
// The URL is everything before the whitespace that precedes the descriptor.
// This handles URLs containing commas (which would break a simple comma-split).
const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g;
let match;
let lastIndex = 0;

while ((match = entryPattern.exec(trimmed)) !== null) {
// Extract URL from this entry, trimming any leading comma+whitespace from previous entry
let url = match[1].trim();
if (lastIndex > 0) {
// Remove leading comma separator from previous entry
url = url.replace(/^,\s*/, '');
}

lastIndex = entryPattern.lastIndex;

if (!url) continue;

// Skip SVG data URLs
if (isSvgDataUrl(url)) continue;

return url;
}

// Fallback: try extracting URL before first whitespace (for srcset with single entry and no descriptor)
const urlMatch = trimmed.match(urlPattern);
if (urlMatch && urlMatch[1] && !isSvgDataUrl(urlMatch[1])) {
return urlMatch[1];
}

return null;
return getFirstSrcsetUrl(srcset, { skipSvgDataUrls: true });
}

/**
Expand Down Expand Up @@ -977,11 +940,10 @@ function selectBestSource(sources: NodeListOf<Element>): Element | null {
if (!srcset) continue;

// Extract width and DPR from srcset
const widthMatch = srcset.match(widthPattern);
const width = parseSrcset(srcset).reduce((max, candidate) => Math.max(max, candidate.width || 0), 0);
const dprMatch = srcset.match(dprPattern);

if (widthMatch && widthMatch[1]) {
const width = parseInt(widthMatch[1], 10);
if (width > 0) {
const dpr = dprMatch ? parseFloat(dprMatch[1]) : 1;

// Calculate effective resolution (width * DPR)
Expand Down
17 changes: 2 additions & 15 deletions src/extractors/substack.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { BaseExtractor } from './_base';
import { ExtractorResult } from '../types/extractors';
import { parseHTML } from '../utils/dom';
import { getLargestWidthSrcsetUrl } from '../utils/srcset';

const INJECTED_ATTR = 'data-defuddle-substack-post';

Expand Down Expand Up @@ -189,21 +190,7 @@ export class SubstackExtractor extends BaseExtractor {
private getLargestSrc(img: Element): string {
const srcset = img.getAttribute('srcset') || '';
if (srcset) {
const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?)w/g;
let bestUrl = '';
let bestWidth = 0;
let match;
let lastIndex = 0;
while ((match = entryPattern.exec(srcset)) !== null) {
let url = match[1].trim();
if (lastIndex > 0) url = url.replace(/^,\s*/, '');
lastIndex = entryPattern.lastIndex;
const width = parseFloat(match[2]);
if (url && width > bestWidth) {
bestWidth = width;
bestUrl = url;
}
}
const bestUrl = getLargestWidthSrcsetUrl(srcset);
if (bestUrl) return bestUrl.replace(/,w_\d+/g, '').replace(/,c_\w+/g, '');
}
return img.getAttribute('src') || '';
Expand Down
37 changes: 6 additions & 31 deletions src/markdown.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import TurndownService from 'turndown';
import { isElement, isTextNode } from './utils';
import { parseHTML, serializeHTML, isDirectTableChild } from './utils/dom';
import { getFirstSrcsetUrl, getLargestWidthSrcsetUrl } from './utils/srcset';
import type { DefuddleResponse, DefuddleOptions } from './types';

// Define a type that works for both JSDOM and browser environments
Expand Down Expand Up @@ -40,10 +41,6 @@ export function asGenericElement(node: any): GenericElement {
return node as unknown as GenericElement;
}


const WIDTH_DESCRIPTOR_RE = /^(\d+)w,?$/;
const DENSITY_DESCRIPTOR_RE = /^\d+(?:\.\d+)?x,?$/;

// MathML element names, used to detect whether a <math> has real MathML to fall
// back on (vs. only a rendered-text annotation). Hoisted so the sets aren't
// rebuilt on every math element during conversion.
Expand Down Expand Up @@ -75,35 +72,13 @@ function formatMarkdownLinkTitle(title: string | null): string {
function getBestImageSrc(node: GenericElement): string {
const srcset = node.getAttribute('srcset');
if (srcset) {
let bestUrl = '';
let bestWidth = 0;
// Tokenize by whitespace instead of splitting on commas, because CDN
// image URLs (e.g. Substack) can contain commas in the URL path
// (e.g. `w_424,c_limit,f_webp`). We scan tokens and treat any token
// matching `Nw` as a width descriptor; the preceding tokens form the URL.
const tokens = srcset.trim().split(/\s+/);
let urlParts: string[] = [];
for (const token of tokens) {
const widthMatch = token.match(WIDTH_DESCRIPTOR_RE);
if (widthMatch) {
const width = parseInt(widthMatch[1], 10);
if (urlParts.length > 0 && width > bestWidth) {
const url = urlParts.join(' ').replace(/^,\s*/, '');
if (url) {
bestWidth = width;
bestUrl = url;
}
}
urlParts = [];
} else if (DENSITY_DESCRIPTOR_RE.test(token)) {
// Density descriptor (e.g. 2x) — skip, not used for selection
urlParts = [];
} else {
urlParts.push(token);
}
}
const bestUrl = getLargestWidthSrcsetUrl(srcset);
if (bestUrl) return bestUrl;

const firstUrl = getFirstSrcsetUrl(srcset);
if (firstUrl) return firstUrl;
}

return node.getAttribute('src') || '';
}

Expand Down
79 changes: 79 additions & 0 deletions src/utils/srcset.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/**
 * One image candidate parsed from a `srcset` attribute value.
 */
export interface SrcsetCandidate {
	/** Candidate URL as written in the attribute (not resolved). */
	url: string;
	/** Raw descriptor token such as `424w` or `2x`; absent when the candidate has none. */
	descriptor?: string;
	/** Numeric value of a width descriptor (`424w` -> 424). */
	width?: number;
	/** Numeric value of a density descriptor (`1.5x` -> 1.5). */
	density?: number;
}

/** The descriptor-related subset of a candidate, as produced by `parseDescriptor`. */
type ParsedDescriptor = Pick<SrcsetCandidate, 'descriptor' | 'width' | 'density'>;

const IMAGE_EXTENSION_RE = '(?:jpg|jpeg|png|webp|gif|avif|svg)';
// Matches only the commas that clearly start a new candidate, so commas inside
// a URL (e.g. CDN transform params like `w_424,c_limit`) are left alone:
//   1. comma + optional whitespace before `http(s)://`, `//`, `/`, `./` or `../`
//   2. comma + whitespace before a token ending in a known image extension
//   3. comma directly before a slash-less file name with a known image extension
const CANDIDATE_SEPARATOR_RE = new RegExp(
	`(?:,\\s*(?=(?:https?:)?//|/(?!/)|\\.{1,2}/)|,\\s+(?=[^,\\s]+\\.${IMAGE_EXTENSION_RE}(?:[?#\\s,]|$))|,(?=[^,/\\s]+\\.${IMAGE_EXTENSION_RE}(?:[?#\\s,]|$)))`,
	'i'
);
const WIDTH_DESCRIPTOR_RE = /^(\d+)w$/i;
const DENSITY_DESCRIPTOR_RE = /^(\d+(?:\.\d+)?)x$/i;

/**
 * Interprets a single whitespace-delimited token as a srcset descriptor.
 *
 * @param value - Token to inspect; one trailing comma (candidate separator) is ignored.
 * @returns The descriptor with its parsed width or density, or an empty object
 * when the token is neither an integer width (`Nw`) nor a density (`Nx`).
 */
function parseDescriptor(value: string): ParsedDescriptor {
	const descriptor = value.replace(/,$/, '');
	const widthMatch = descriptor.match(WIDTH_DESCRIPTOR_RE);
	if (widthMatch) {
		return { descriptor, width: parseInt(widthMatch[1], 10) };
	}

	const densityMatch = descriptor.match(DENSITY_DESCRIPTOR_RE);
	if (densityMatch) {
		return { descriptor, density: parseFloat(densityMatch[1]) };
	}

	return {};
}

/**
 * Splits one comma-separated chunk into candidates by scanning its tokens.
 * A descriptor token, or a URL token that ends with a comma, closes the
 * current candidate; whatever remains at the end is a descriptor-less candidate.
 */
function parseCandidateChunk(chunk: string): SrcsetCandidate[] {
	const found: SrcsetCandidate[] = [];
	let urlParts: string[] = [];

	const flush = (descriptor: ParsedDescriptor): void => {
		// A separator comma can be left in front of the URL when the previous
		// candidate was written as `url 1x ,next`.
		const url = urlParts.join(' ').replace(/^,\s*/, '').trim();
		urlParts = [];
		if (url) found.push({ url, ...descriptor });
	};

	for (const token of chunk.split(/\s+/)) {
		// A descriptor is only meaningful after a URL; a leading `2x` is a URL.
		const descriptor: ParsedDescriptor = urlParts.length > 0 ? parseDescriptor(token) : {};
		if (descriptor.descriptor) {
			flush(descriptor);
		} else if (token.endsWith(',')) {
			// A URL token ending in a comma terminates a descriptor-less candidate.
			urlParts.push(token.replace(/,+$/, ''));
			flush({});
		} else {
			urlParts.push(token);
		}
	}
	flush({});

	return found;
}

/**
 * Parses a `srcset` attribute value into its image candidates.
 *
 * Candidates are first separated on commas that unambiguously start a new URL,
 * then each chunk is scanned token by token so that descriptors also act as
 * delimiters. This keeps commas inside URLs intact while still separating
 * candidates whose URLs have no recognisable prefix or image extension
 * (e.g. `image?w=400 400w, image?w=800 800w` or `data:` URLs).
 *
 * @param srcset - Raw attribute value; empty or whitespace-only input is allowed.
 * @returns Candidates in source order; an empty array when nothing can be parsed.
 */
export function parseSrcset(srcset: string): SrcsetCandidate[] {
	if (!srcset || !srcset.trim()) return [];

	const candidates: SrcsetCandidate[] = [];
	for (const rawChunk of srcset.trim().split(CANDIDATE_SEPARATOR_RE)) {
		const chunk = rawChunk.replace(/^,\s*/, '').trim();
		if (chunk) candidates.push(...parseCandidateChunk(chunk));
	}

	return candidates;
}

/**
 * Returns the URL of the first usable candidate in a `srcset` value.
 *
 * @param srcset - Raw `srcset` attribute value.
 * @param options - `skipSvgDataUrls`: when true, candidates whose URL starts
 * with `data:image/svg+xml` are passed over.
 * @returns The first remaining candidate URL, or `null` when there is none.
 */
export function getFirstSrcsetUrl(srcset: string, options: { skipSvgDataUrls?: boolean } = {}): string | null {
	const firstUsable = parseSrcset(srcset).find(
		({ url }) => !(options.skipSvgDataUrls && url.startsWith('data:image/svg+xml'))
	);
	return firstUsable ? firstUsable.url : null;
}

/**
 * Picks the candidate with the largest width descriptor from a `srcset` value.
 *
 * Candidates without a width descriptor (density-only or bare URLs) are
 * ignored; on equal widths the earliest candidate wins.
 *
 * @param srcset - Raw `srcset` attribute value.
 * @returns The URL of the widest candidate, or `null` when no candidate has a
 * positive width.
 */
export function getLargestWidthSrcsetUrl(srcset: string): string | null {
	const widest = parseSrcset(srcset).reduce<{ url: string | null; width: number }>(
		(best, { url, width }) => (width && width > best.width ? { url, width } : best),
		{ url: null, width: 0 }
	);
	return widest.url;
}

/**
 * Serialises candidates back into a `srcset` attribute value.
 *
 * @param candidates - Candidates to write, in output order.
 * @param resolveUrl - Applied to every candidate URL before it is written.
 * @returns Entries of the form `url` or `url descriptor`, joined with `, `.
 */
export function formatSrcset(candidates: SrcsetCandidate[], resolveUrl: (url: string) => string): string {
	const entries: string[] = [];
	for (const { url, descriptor } of candidates) {
		const resolved = resolveUrl(url);
		entries.push(descriptor ? `${resolved} ${descriptor}` : `${resolved}`);
	}
	return entries.join(', ');
}
10 changes: 9 additions & 1 deletion tests/expected/elements--srcset-normalization.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@ This article tests normalization of React SSR camelCase srcSet attributes to sta

Hero image with React SSR attributes. Photo credit.

![Image whose first srcset candidate has no descriptor.](https://img.zeit.de/zeit-magazin/2026/26/test-image/square__360x360__mobile)

Image whose first srcset candidate has no descriptor.

![Image with CDN commas in srcset URL.](https://substackcdn.com/image/fetch/$s_!test!,w_848,c_limit,f_webp/https%3A%2F%2Fexample.com%2Fphoto.webp)

Image with CDN commas in srcset URL.

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident.
16 changes: 16 additions & 0 deletions tests/fixtures/elements--srcset-normalization.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,22 @@ <h1>Article with React SSR Images</h1>
<figcaption>Hero image with React SSR attributes. Photo credit.</figcaption>
</figure>

<figure>
<picture>
<source type="image/webp" srcset="https://img.zeit.de/zeit-magazin/2026/26/test-image/square__360x360__mobile, https://img.zeit.de/zeit-magazin/2026/26/test-image/square__360x360__mobile__scale_2 2x">
<img src="https://www.example.com/images/fallback.jpg" alt="Image whose first srcset candidate has no descriptor.">
</picture>
<figcaption>Image whose first srcset candidate has no descriptor.</figcaption>
</figure>

<figure>
<picture>
<source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!test!,w_424,c_limit,f_webp/https%3A%2F%2Fexample.com%2Fphoto.webp 424w, https://substackcdn.com/image/fetch/$s_!test!,w_848,c_limit,f_webp/https%3A%2F%2Fexample.com%2Fphoto.webp 848w">
<img src="https://www.example.com/images/fallback.jpg" alt="Image with CDN commas in srcset URL.">
</picture>
<figcaption>Image with CDN commas in srcset URL.</figcaption>
</figure>

<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.</p>
<p>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident.</p>
</article>
Expand Down