@@ -254,37 +254,55 @@ export async function getRemoteDocument(
254254 contentType === "application/xhtml+xml" ||
255255 contentType ?. startsWith ( "application/xhtml+xml;" ) )
256256 ) {
257- const p =
258- / < ( a | l i n k ) ( ( \s + [ a - z ] [ a - z : _ - ] * = ( " [ ^ " ] * " | ' [ ^ ' ] * ' | [ ^ \s > ] + ) ) + ) \s * \/ ? > / ig;
259- const p2 = / \s + ( [ a - z ] [ a - z : _ - ] * ) = ( " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / ig;
257+ // Security: Limit HTML response size to mitigate ReDoS attacks
258+ const MAX_HTML_SIZE = 1024 * 1024 ; // 1MB
260259 const html = await response . text ( ) ;
261- let m : RegExpExecArray | null ;
262- const rawAttribs : string [ ] = [ ] ;
263- while ( ( m = p . exec ( html ) ) !== null ) rawAttribs . push ( m [ 2 ] ) ;
264- for ( const rawAttrs of rawAttribs ) {
265- let m2 : RegExpExecArray | null ;
266- const attribs : Record < string , string > = { } ;
267- while ( ( m2 = p2 . exec ( rawAttrs ) ) !== null ) {
268- const key = m2 [ 1 ] . toLowerCase ( ) ;
269- const value = m2 [ 3 ] ?? m2 [ 4 ] ?? m2 [ 5 ] ?? "" ;
270- attribs [ key ] = value ;
271- }
272- if (
273- attribs . rel === "alternate" && "type" in attribs && (
274- attribs . type === "application/activity+json" ||
275- attribs . type === "application/ld+json" ||
276- attribs . type . startsWith ( "application/ld+json;" )
277- ) && "href" in attribs &&
278- new URL ( attribs . href , docUrl ) . href !== docUrl . href
279- ) {
280- logger . debug (
281- "Found alternate document: {alternateUrl} from {url}" ,
282- { alternateUrl : attribs . href , url : documentUrl } ,
283- ) ;
284- return await fetch ( new URL ( attribs . href , docUrl ) . href ) ;
260+ if ( html . length > MAX_HTML_SIZE ) {
261+ logger . warn (
262+ "HTML response too large, skipping alternate link discovery: {url}" ,
263+ { url : documentUrl , size : html . length } ,
264+ ) ;
265+ document = JSON . parse ( html ) ;
266+ } else {
267+ // Safe regex patterns without nested quantifiers to prevent ReDoS
268+ // (CVE-2025-68475)
269+ // Step 1: Extract <a ...> or <link ...> tags
270+ const tagPattern = / < ( a | l i n k ) \s + ( [ ^ > ] * ?) \s * \/ ? > / gi;
271+ // Step 2: Parse attributes
272+ const attrPattern =
273+ / ( [ a - z ] [ a - z : _ - ] * ) = (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / gi;
274+
275+ let tagMatch : RegExpExecArray | null ;
276+ while ( ( tagMatch = tagPattern . exec ( html ) ) !== null ) {
277+ const tagContent = tagMatch [ 2 ] ;
278+ let attrMatch : RegExpExecArray | null ;
279+ const attribs : Record < string , string > = { } ;
280+
281+ // Reset regex state for attribute parsing
282+ attrPattern . lastIndex = 0 ;
283+ while ( ( attrMatch = attrPattern . exec ( tagContent ) ) !== null ) {
284+ const key = attrMatch [ 1 ] . toLowerCase ( ) ;
285+ const value = attrMatch [ 2 ] ?? attrMatch [ 3 ] ?? attrMatch [ 4 ] ?? "" ;
286+ attribs [ key ] = value ;
287+ }
288+
289+ if (
290+ attribs . rel === "alternate" && "type" in attribs && (
291+ attribs . type === "application/activity+json" ||
292+ attribs . type === "application/ld+json" ||
293+ attribs . type . startsWith ( "application/ld+json;" )
294+ ) && "href" in attribs &&
295+ new URL ( attribs . href , docUrl ) . href !== docUrl . href
296+ ) {
297+ logger . debug (
298+ "Found alternate document: {alternateUrl} from {url}" ,
299+ { alternateUrl : attribs . href , url : documentUrl } ,
300+ ) ;
301+ return await fetch ( new URL ( attribs . href , docUrl ) . href ) ;
302+ }
285303 }
304+ document = JSON . parse ( html ) ;
286305 }
287- document = JSON . parse ( html ) ;
288306 } else {
289307 document = await response . json ( ) ;
290308 }
0 commit comments