Skip to content

Commit 2207554

Browse files
committed
feat: allow to scan secrets without buffering whole lines
1 parent e89d4ae commit 2207554

File tree

3 files changed

+679
-390
lines changed

3 files changed

+679
-390
lines changed

packages/build/src/plugins_core/secrets_scanning/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ const coreStep: CoreStepFunction = async function ({
3030
netlifyConfig,
3131
explicitSecretKeys,
3232
enhancedSecretScan,
33+
featureFlags,
3334
systemLog,
3435
deployId,
3536
api,
@@ -38,6 +39,7 @@ const coreStep: CoreStepFunction = async function ({
3839

3940
const passedSecretKeys = (explicitSecretKeys || '').split(',')
4041
const envVars = netlifyConfig.build.environment as Record<string, unknown>
42+
const useMinimalChunks = featureFlags?.secret_scanning_minimal_chunks
4143

4244
systemLog?.({ passedSecretKeys, buildDir })
4345

@@ -109,6 +111,7 @@ const coreStep: CoreStepFunction = async function ({
109111
filePaths,
110112
enhancedScanning: enhancedSecretScan && enhancedScanningEnabledInEnv,
111113
omitValuesFromEnhancedScan: getOmitValuesFromEnhancedScanForEnhancedScanFromEnv(envVars),
114+
useMinimalChunks,
112115
})
113116

114117
secretMatches = scanResults.matches.filter((match) => !match.enhancedMatch)

packages/build/src/plugins_core/secrets_scanning/utils.ts

Lines changed: 196 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ interface ScanArgs {
1919
filePaths: string[]
2020
enhancedScanning?: boolean
2121
omitValuesFromEnhancedScan?: unknown[]
22+
useMinimalChunks: boolean
2223
}
2324

2425
interface MatchResult {
@@ -146,54 +147,49 @@ const likelySecretRegex = new RegExp(
146147
)
147148

148149
/**
149-
* Checks a line of text for likely secrets based on known prefixes and patterns.
150+
* Checks a chunk of text for likely secrets based on known prefixes and patterns.
150151
* The function works by:
151-
* 1. Splitting the line into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
152+
* 1. Splitting the chunk into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
152153
* 2. For each token, checking if it matches our secret pattern:
153154
* - Must start (^) with one of our known prefixes (e.g. aws_, github_pat_, etc)
154155
* - Must be followed by at least MIN_CHARS_AFTER_PREFIX non-whitespace characters
155156
* - Must extend to the end ($) of the token
156157
*
157-
* For example, given the line: secretKey='aws_123456789012345678'
158+
* For example, given the chunk: secretKey='aws_123456789012345678'
158159
* 1. It's split into tokens: ['secretKey', 'aws_123456789012345678']
159160
* 2. Each token is checked against the regex pattern:
160161
* - 'secretKey' doesn't match (doesn't start with a known prefix)
161162
* - 'aws_123456789012345678' matches (starts with 'aws_' and has sufficient length)
162163
*
163-
* @param line The line of text to check
164-
* @param file The file path where this line was found
165-
* @param lineNumber The line number in the file
166-
* @param omitValuesFromEnhancedScan Optional array of values to exclude from matching
167-
* @returns Array of matches found in the line
168164
*/
169165
export function findLikelySecrets({
170-
line,
171-
file,
172-
lineNumber,
166+
chunk,
173167
omitValuesFromEnhancedScan = [],
174168
}: {
175-
line: string
176-
file: string
177-
lineNumber: number
169+
/**
170+
* The chunk of text to check
171+
*/
172+
chunk: string
173+
/**
174+
* Optional array of values to exclude from matching
175+
*/
178176
omitValuesFromEnhancedScan?: unknown[]
179-
}): MatchResult[] {
180-
if (!line) return []
177+
}): { index: number; prefix: string }[] {
178+
if (!chunk) return []
181179

182-
const matches: MatchResult[] = []
180+
const matches: ReturnType<typeof findLikelySecrets> = []
183181
let match: RegExpExecArray | null
184182
const allOmittedValues = [...omitValuesFromEnhancedScan, ...SAFE_LISTED_VALUES]
185183

186-
while ((match = likelySecretRegex.exec(line)) !== null) {
184+
while ((match = likelySecretRegex.exec(chunk)) !== null) {
187185
const token = match.groups?.token
188186
const prefix = match.groups?.prefix
189187
if (!token || !prefix || allOmittedValues.includes(token)) {
190188
continue
191189
}
192190
matches.push({
193-
file,
194-
lineNumber,
195-
key: prefix,
196-
enhancedMatch: true,
191+
prefix,
192+
index: match.index,
197193
})
198194
}
199195

@@ -279,6 +275,7 @@ export async function scanFilesForKeyValues({
279275
base,
280276
enhancedScanning,
281277
omitValuesFromEnhancedScan = [],
278+
useMinimalChunks = false,
282279
}: ScanArgs): Promise<ScanResults> {
283280
const scanResults: ScanResults = {
284281
matches: [],
@@ -309,6 +306,8 @@ export async function scanFilesForKeyValues({
309306

310307
let settledPromises: PromiseSettledResult<MatchResult[]>[] = []
311308

309+
const searchStream = useMinimalChunks ? searchStreamMinimalChunks : searchStreamReadline
310+
312311
// process the scanning in batches to not run into memory issues by
313312
// processing all files at the same time.
314313
while (filePaths.length > 0) {
@@ -333,19 +332,24 @@ export async function scanFilesForKeyValues({
333332
return scanResults
334333
}
335334

336-
const searchStream = ({
337-
basePath,
338-
file,
339-
keyValues,
340-
enhancedScanning,
341-
omitValuesFromEnhancedScan = [],
342-
}: {
335+
type SearchStreamOptions = {
343336
basePath: string
344337
file: string
345338
keyValues: Record<string, string[]>
346339
enhancedScanning?: boolean
347340
omitValuesFromEnhancedScan?: unknown[]
348-
}): Promise<MatchResult[]> => {
341+
}
342+
343+
/**
344+
* Search stream implementation using node:readline
345+
*/
346+
const searchStreamReadline = ({
347+
basePath,
348+
file,
349+
keyValues,
350+
enhancedScanning,
351+
omitValuesFromEnhancedScan = [],
352+
}: SearchStreamOptions): Promise<MatchResult[]> => {
349353
return new Promise((resolve, reject) => {
350354
const filePath = path.resolve(basePath, file)
351355

@@ -382,7 +386,14 @@ const searchStream = ({
382386
lineNumber++
383387
if (typeof line === 'string') {
384388
if (enhancedScanning) {
385-
matches.push(...findLikelySecrets({ line, file, lineNumber, omitValuesFromEnhancedScan }))
389+
matches.push(
390+
...findLikelySecrets({ chunk: line, omitValuesFromEnhancedScan }).map(({ prefix }) => ({
391+
key: prefix,
392+
file,
393+
lineNumber,
394+
enhancedMatch: true,
395+
})),
396+
)
386397
}
387398
if (maxMultiLineCount > 1) {
388399
lines.push(line)
@@ -472,6 +483,160 @@ const searchStream = ({
472483
})
473484
}
474485

486+
/**
 * Search stream implementation that reads the file through a plain read stream and keeps
 * only a small sliding window of text in memory instead of buffering whole lines.
 *
 * After every chunk the window is searched for each value in `keyValues`; afterwards the
 * window is trimmed to the length of the longest searched value so that a value split
 * across two chunks is still found once the next chunk arrives.
 *
 * At most one match is reported per value permutation and line. The reported line number
 * is the 1-based line on which the value starts.
 *
 * NOTE(review): `enhancedScanning` and `omitValuesFromEnhancedScan` only influence the size
 * of the retained window here - this implementation does not run the likely-secret search
 * itself. Confirm whether that is intended.
 *
 * @param options.basePath Directory that `file` is resolved against
 * @param options.file File path, relative to `basePath`; echoed back in every match
 * @param options.keyValues Map of secret key name to the value permutations to search for
 * @param options.enhancedScanning Whether enhanced scanning is enabled (affects window size only)
 * @param options.omitValuesFromEnhancedScan Values excluded from enhanced scanning (affects window size only)
 * @returns Promise resolving to the matches found in the file. Resolves with no matches when
 *   the path is a directory (EISDIR) and rejects on any other stream error.
 */
const searchStreamMinimalChunks = ({
  basePath,
  file,
  keyValues,
  enhancedScanning,
  omitValuesFromEnhancedScan = [],
}: SearchStreamOptions): Promise<MatchResult[]> => {
  return new Promise((resolve, reject) => {
    const filePath = path.resolve(basePath, file)
    const matches: MatchResult[] = []

    // Map every value permutation to its key name. When the same value is listed under
    // several keys the last key wins. Empty values are dropped: `indexOf('', i)` never
    // returns -1, so searching for them would loop forever.
    const keyByValue = new Map<string, string>()
    for (const [secretKeyName, valuePermutations] of Object.entries(keyValues)) {
      for (const value of valuePermutations) {
        if (value.length > 0) {
          keyByValue.set(value, secretKeyName)
        }
      }
    }
    const keyVals = [...keyByValue.keys()]

    // determine longest value that we will search for - needed to determine minimal size of our buffer
    const maxValLength = Math.max(
      0,
      // explicit secrets
      ...keyVals.map((v) => v.length),
      ...(enhancedScanning
        ? [
            // omitted likely secrets (after finding a likely secret we check if it should be omitted,
            // so we need to capture at least the size of omitted values)
            ...omitValuesFromEnhancedScan.map((v) => (typeof v === 'string' ? v.length : 0)),
            // minimum length needed to find likely secret
            ...LIKELY_SECRET_PREFIXES.map((v) => v.length + MIN_CHARS_AFTER_PREFIX),
          ]
        : []),
    )

    if (maxValLength === 0) {
      // No non-empty values to scan for. The return value of a Promise executor is ignored,
      // so the promise has to be settled explicitly; the file is never opened in this case.
      resolve(matches)
      return
    }

    // `Infinity` when there are no explicit values, which disables the value search below
    const minValLength = Math.min(...keyVals.map((v) => v.length))

    // Let the stream decode UTF-8 so multi-byte characters split across chunks stay intact
    const inStream = createReadStream(filePath, { encoding: 'utf8' })

    // sliding window over the file content
    let buffer = ''
    // offset of `buffer[0]` within the decoded file content
    let fileIndex = 0
    // number of newlines that were already trimmed off the front of `buffer`
    let processedLines = 0
    // per value: absolute offsets that were already handled (the retained tail is searched again)
    const foundIndexes = new Map<string, Set<number>>()
    // per value: line numbers that were already reported
    const foundLines = new Map<string, Set<number>>()

    const getCurrentBufferNewLineIndexes = (): number[] => {
      const newLineIndexes: number[] = []
      let newLineIndex = -1
      while ((newLineIndex = buffer.indexOf('\n', newLineIndex + 1)) !== -1) {
        newLineIndexes.push(newLineIndex)
      }
      return newLineIndexes
    }

    inStream.on('data', (chunk: string | Buffer) => {
      buffer += typeof chunk === 'string' ? chunk : chunk.toString()

      // computed lazily, at most once per chunk
      let newLineIndexes: number[] | null = null

      // `>=` so that a window holding exactly the shortest value is searched as well
      if (buffer.length >= minValLength) {
        for (const valVariant of keyVals) {
          let valVariantIndex = -1
          while ((valVariantIndex = buffer.indexOf(valVariant, valVariantIndex + 1)) !== -1) {
            const pos = fileIndex + valVariantIndex

            let seenIndexes = foundIndexes.get(valVariant)
            if (seenIndexes?.has(pos)) {
              // occurrence sits in the tail retained from the previous chunk
              continue
            }
            if (!seenIndexes) {
              seenIndexes = new Set<number>()
              foundIndexes.set(valVariant, seenIndexes)
            }
            seenIndexes.add(pos)

            if (newLineIndexes === null) {
              newLineIndexes = getCurrentBufferNewLineIndexes()
            }

            // count the newlines in front of the occurrence
            let lineNumber = processedLines + 1
            for (const newLineIndex of newLineIndexes) {
              if (valVariantIndex > newLineIndex) {
                lineNumber++
              } else {
                break
              }
            }

            let seenLines = foundLines.get(valVariant)
            if (!seenLines) {
              seenLines = new Set<number>()
              foundLines.set(valVariant, seenLines)
            }
            if (!seenLines.has(lineNumber)) {
              seenLines.add(lineNumber)
              matches.push({
                file,
                lineNumber,
                key: keyByValue.get(valVariant) ?? '',
                enhancedMatch: false,
              })
            }
          }
        }
      }

      if (buffer.length > maxValLength) {
        const lengthDiff = buffer.length - maxValLength
        fileIndex += lengthDiff
        if (newLineIndexes === null) {
          newLineIndexes = getCurrentBufferNewLineIndexes()
        }

        // advance processed lines by the newlines that are about to be trimmed
        for (const newLineIndex of newLineIndexes) {
          if (newLineIndex < lengthDiff) {
            processedLines++
          } else {
            break
          }
        }

        // Keep the last part of the buffer to handle values split across chunks
        buffer = buffer.slice(-maxValLength)
      }
    })

    inStream.on('error', (error: NodeJS.ErrnoException) => {
      if (error?.code === 'EISDIR') {
        // file path is a directory - do nothing
        resolve(matches)
      } else {
        reject(error)
      }
    })

    inStream.on('close', () => {
      resolve(matches)
    })
  })
}
639+
475640
/**
476641
* ScanResults are all of the finds for all keys and their disparate locations. Scanning is
477642
* async in streams so order can change a lot. Some matches are the result of an env var explicitly being marked as secret,

0 commit comments

Comments
 (0)