@@ -19,6 +19,7 @@ interface ScanArgs {
19
19
filePaths : string [ ]
20
20
enhancedScanning ?: boolean
21
21
omitValuesFromEnhancedScan ?: unknown [ ]
22
+ useMinimalChunks : boolean
22
23
}
23
24
24
25
interface MatchResult {
@@ -146,54 +147,49 @@ const likelySecretRegex = new RegExp(
146
147
)
147
148
148
149
/**
149
- * Checks a line of text for likely secrets based on known prefixes and patterns.
150
+ * Checks a chunk of text for likely secrets based on known prefixes and patterns.
150
151
* The function works by:
151
- * 1. Splitting the line into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
152
+ * 1. Splitting the chunk into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
152
153
* 2. For each token, checking if it matches our secret pattern:
153
154
* - Must start (^) with one of our known prefixes (e.g. aws_, github_pat_, etc)
154
155
* - Must be followed by at least MIN_CHARS_AFTER_PREFIX non-whitespace characters
155
156
* - Must extend to the end ($) of the token
156
157
*
157
- * For example, given the line : secretKey='aws_123456789012345678'
158
+ * For example, given the chunk : secretKey='aws_123456789012345678'
158
159
* 1. It's split into tokens: ['secretKey', 'aws_123456789012345678']
159
160
* 2. Each token is checked against the regex pattern:
160
161
* - 'secretKey' doesn't match (doesn't start with a known prefix)
161
162
* - 'aws_123456789012345678' matches (starts with 'aws_' and has sufficient length)
162
163
*
163
- * @param line The line of text to check
164
- * @param file The file path where this line was found
165
- * @param lineNumber The line number in the file
166
- * @param omitValuesFromEnhancedScan Optional array of values to exclude from matching
167
- * @returns Array of matches found in the line
168
164
*/
169
165
export function findLikelySecrets ( {
170
- line,
171
- file,
172
- lineNumber,
166
+ chunk,
173
167
omitValuesFromEnhancedScan = [ ] ,
174
168
} : {
175
- line : string
176
- file : string
177
- lineNumber : number
169
+ /**
170
+ * The chunk of text to check
171
+ */
172
+ chunk : string
173
+ /**
174
+ * Optional array of values to exclude from matching
175
+ */
178
176
omitValuesFromEnhancedScan ?: unknown [ ]
179
- } ) : MatchResult [ ] {
180
- if ( ! line ) return [ ]
177
+ } ) : { index : number ; prefix : string } [ ] {
178
+ if ( ! chunk ) return [ ]
181
179
182
- const matches : MatchResult [ ] = [ ]
180
+ const matches : ReturnType < typeof findLikelySecrets > = [ ]
183
181
let match : RegExpExecArray | null
184
182
const allOmittedValues = [ ...omitValuesFromEnhancedScan , ...SAFE_LISTED_VALUES ]
185
183
186
- while ( ( match = likelySecretRegex . exec ( line ) ) !== null ) {
184
+ while ( ( match = likelySecretRegex . exec ( chunk ) ) !== null ) {
187
185
const token = match . groups ?. token
188
186
const prefix = match . groups ?. prefix
189
187
if ( ! token || ! prefix || allOmittedValues . includes ( token ) ) {
190
188
continue
191
189
}
192
190
matches . push ( {
193
- file,
194
- lineNumber,
195
- key : prefix ,
196
- enhancedMatch : true ,
191
+ prefix,
192
+ index : match . index ,
197
193
} )
198
194
}
199
195
@@ -279,6 +275,7 @@ export async function scanFilesForKeyValues({
279
275
base,
280
276
enhancedScanning,
281
277
omitValuesFromEnhancedScan = [ ] ,
278
+ useMinimalChunks = false ,
282
279
} : ScanArgs ) : Promise < ScanResults > {
283
280
const scanResults : ScanResults = {
284
281
matches : [ ] ,
@@ -309,6 +306,8 @@ export async function scanFilesForKeyValues({
309
306
310
307
let settledPromises : PromiseSettledResult < MatchResult [ ] > [ ] = [ ]
311
308
309
+ const searchStream = useMinimalChunks ? searchStreamMinimalChunks : searchStreamReadline
310
+
312
311
// process the scanning in batches to not run into memory issues by
313
312
// processing all files at the same time.
314
313
while ( filePaths . length > 0 ) {
@@ -333,19 +332,24 @@ export async function scanFilesForKeyValues({
333
332
return scanResults
334
333
}
335
334
336
- const searchStream = ( {
337
- basePath,
338
- file,
339
- keyValues,
340
- enhancedScanning,
341
- omitValuesFromEnhancedScan = [ ] ,
342
- } : {
335
+ type SearchStreamOptions = {
343
336
basePath : string
344
337
file : string
345
338
keyValues : Record < string , string [ ] >
346
339
enhancedScanning ?: boolean
347
340
omitValuesFromEnhancedScan ?: unknown [ ]
348
- } ) : Promise < MatchResult [ ] > => {
341
+ }
342
+
343
+ /**
344
+ * Search stream implementation using node:readline
345
+ */
346
+ const searchStreamReadline = ( {
347
+ basePath,
348
+ file,
349
+ keyValues,
350
+ enhancedScanning,
351
+ omitValuesFromEnhancedScan = [ ] ,
352
+ } : SearchStreamOptions ) : Promise < MatchResult [ ] > => {
349
353
return new Promise ( ( resolve , reject ) => {
350
354
const filePath = path . resolve ( basePath , file )
351
355
@@ -382,7 +386,14 @@ const searchStream = ({
382
386
lineNumber ++
383
387
if ( typeof line === 'string' ) {
384
388
if ( enhancedScanning ) {
385
- matches . push ( ...findLikelySecrets ( { line, file, lineNumber, omitValuesFromEnhancedScan } ) )
389
+ matches . push (
390
+ ...findLikelySecrets ( { chunk : line , omitValuesFromEnhancedScan } ) . map ( ( { prefix } ) => ( {
391
+ key : prefix ,
392
+ file,
393
+ lineNumber,
394
+ enhancedMatch : true ,
395
+ } ) ) ,
396
+ )
386
397
}
387
398
if ( maxMultiLineCount > 1 ) {
388
399
lines . push ( line )
@@ -472,6 +483,160 @@ const searchStream = ({
472
483
} )
473
484
}
474
485
486
/**
 * Search stream implementation that scans a file through a plain read stream and keeps
 * only a small sliding window of text in memory instead of buffering whole lines.
 *
 * How it works:
 * 1. Every decoded chunk is appended to `buffer`.
 * 2. `buffer` is searched for each explicit secret value; a hit is reported once per
 *    (value, line number) pair. The reported line is the one the value starts on.
 * 3. `buffer` is then trimmed to its last `maxValLength` characters, so a value split
 *    across two chunks is still found, while `fileIndex` / `processedLines` remember how
 *    many characters / line breaks have been dropped so far.
 *
 * NOTE(review): `enhancedScanning` and `omitValuesFromEnhancedScan` only influence the size
 * of the retained window here - this implementation does not itself report likely-secret
 * (`enhancedMatch: true`) results. Confirm whether that is intended.
 *
 * @param options Scan options; `file` is resolved against `basePath`, and `keyValues` maps
 *   a secret key name to the value permutations to look for.
 * @returns Promise resolving with the matches collected for this file. It resolves (with
 *   whatever was collected) when the path turns out to be a directory, and rejects on any
 *   other stream error.
 */
const searchStreamMinimalChunks = ({
  basePath,
  file,
  keyValues,
  enhancedScanning,
  omitValuesFromEnhancedScan = [],
}: SearchStreamOptions): Promise<MatchResult[]> => {
  return new Promise((resolve, reject) => {
    const filePath = path.resolve(basePath, file)
    const matches: MatchResult[] = []

    // Empty strings are dropped: they are never a meaningful match, and `indexOf('')` never
    // returns -1, which would make the search loop below spin forever.
    const keyVals: string[] = ([] as string[])
      .concat(...Object.values(keyValues))
      .filter((value) => value.length > 0)

    // determine longest value that we will search for - needed to determine minimal size of our buffer
    const maxValLength = Math.max(
      0,
      // explicit secrets
      ...keyVals.map((v) => v.length),
      ...(enhancedScanning
        ? [
            // omitted likely secrets (after finding likely secret we check if it should be omitted,
            // so we need to capture at least size of omitted values)
            ...omitValuesFromEnhancedScan.map((v) => (typeof v === 'string' ? v.length : 0)),
            // minimum length needed to find likely secret
            ...LIKELY_SECRET_PREFIXES.map((v) => v.length + MIN_CHARS_AFTER_PREFIX),
          ]
        : []),
    )

    if (maxValLength === 0) {
      // no non-empty values to scan for - settle right away, without even opening the file
      resolve(matches)
      return
    }

    // `Infinity` when there are no explicit values, which disables the explicit search below
    const minValLength = Math.min(...keyVals.map((v) => v.length))

    // Returns the name of the (last) key whose permutations contain `val`, or '' if none does.
    function getKeyForValue(val: string): string {
      let key = ''
      for (const [secretKeyName, valuePermutations] of Object.entries(keyValues)) {
        if (valuePermutations.includes(val)) {
          key = secretKeyName
        }
      }
      return key
    }

    // sliding window over the decoded file content
    let buffer = ''

    // Collects the positions of all line breaks currently held in `buffer`.
    function getCurrentBufferNewLineIndexes(): number[] {
      const newLinesIndexesInCurrentBuffer: number[] = []
      let newLineIndex = -1
      while ((newLineIndex = buffer.indexOf('\n', newLineIndex + 1)) !== -1) {
        newLinesIndexesInCurrentBuffer.push(newLineIndex)
      }

      return newLinesIndexesInCurrentBuffer
    }

    // number of characters already dropped from the front of `buffer`
    let fileIndex = 0
    // number of line breaks already dropped from the front of `buffer`
    let processedLines = 0
    // per value: absolute positions / line numbers that were already handled
    const foundIndexes = new Map<string, Set<number>>()
    const foundLines = new Map<string, Set<number>>()

    // Decode inside the stream so multi-byte characters split across chunk boundaries are
    // reassembled instead of being turned into replacement characters.
    const inStream = createReadStream(filePath, { encoding: 'utf8' })

    inStream.on('data', function (chunk) {
      buffer += typeof chunk === 'string' ? chunk : chunk.toString()

      // line break positions are computed lazily, at most once per chunk
      let cachedNewLineIndexes: number[] | undefined
      const getNewLineIndexes = (): number[] => {
        if (cachedNewLineIndexes === undefined) {
          cachedNewLineIndexes = getCurrentBufferNewLineIndexes()
        }
        return cachedNewLineIndexes
      }

      // `>=` so that content which is exactly as long as the shortest value is still scanned
      if (buffer.length >= minValLength) {
        for (const valVariant of keyVals) {
          let valVariantIndex = -1
          while ((valVariantIndex = buffer.indexOf(valVariant, valVariantIndex + 1)) !== -1) {
            const pos = fileIndex + valVariantIndex
            let foundIndexesForValVariant = foundIndexes.get(valVariant)
            if (foundIndexesForValVariant?.has(pos)) {
              // this occurrence was already seen in the retained tail of a previous chunk
              continue
            }

            // line number = lines already dropped + line breaks in front of the hit
            let lineNumber = processedLines + 1
            for (const newLineIndex of getNewLineIndexes()) {
              if (valVariantIndex > newLineIndex) {
                lineNumber++
              } else {
                break
              }
            }

            let foundLinesForValVariant = foundLines.get(valVariant)
            if (!foundLinesForValVariant?.has(lineNumber)) {
              matches.push({
                file,
                lineNumber,
                key: getKeyForValue(valVariant),
                enhancedMatch: false,
              })

              if (!foundLinesForValVariant) {
                foundLinesForValVariant = new Set<number>()
                foundLines.set(valVariant, foundLinesForValVariant)
              }
              foundLinesForValVariant.add(lineNumber)
            }

            if (!foundIndexesForValVariant) {
              foundIndexesForValVariant = new Set<number>()
              foundIndexes.set(valVariant, foundIndexesForValVariant)
            }
            foundIndexesForValVariant.add(pos)
          }
        }
      }

      if (buffer.length > maxValLength) {
        const lengthDiff = buffer.length - maxValLength
        fileIndex += lengthDiff

        // advance processed lines by the line breaks that are about to be dropped
        for (const newLineIndex of getNewLineIndexes()) {
          if (newLineIndex < lengthDiff) {
            processedLines++
          } else {
            break
          }
        }

        // Keep the last part of the buffer to handle split values across chunks
        buffer = buffer.slice(-maxValLength)
      }
    })

    inStream.on('error', function (error: Error & { code?: string }) {
      if (error?.code === 'EISDIR') {
        // file path is a directory - do nothing
        resolve(matches)
      } else {
        reject(error)
      }
    })

    inStream.on('close', function () {
      resolve(matches)
    })
  })
}
639
+
475
640
/**
476
641
* ScanResults are all of the finds for all keys and their disparate locations. Scanning is
477
642
* async in streams so order can change a lot. Some matches are the result of an env var explicitly being marked as secret,
0 commit comments