feat: allow to scan secrets without buffering whole lines #6318

Merged
6 commits merged on May 29, 2025
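The change adds an alternative secret-scanning stream that does not read files line by line. Instead of buffering entire lines (which can be very long in generated or minified output), it keeps a rolling buffer only as large as the longest value it needs to find. A minimal sketch of that rolling-buffer idea, with hypothetical names and simplified to a single secret (the actual implementation is searchStreamMinimalChunks in utils.ts below):

import { createReadStream } from 'node:fs'

// Sketch only: report whether `secret` occurs in the file while never holding more
// than one chunk plus `secret.length - 1` carried-over characters in memory.
async function fileContainsSecret(filePath: string, secret: string): Promise<boolean> {
  let carry = ''
  for await (const chunk of createReadStream(filePath, { encoding: 'utf8' })) {
    const buffer = carry + chunk
    if (buffer.includes(secret)) return true
    // Keep only a tail long enough to catch a secret split across chunk boundaries.
    carry = buffer.slice(Math.max(0, buffer.length - secret.length + 1))
  }
  return false
}
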
3 changes: 3 additions & 0 deletions packages/build/src/plugins_core/secrets_scanning/index.ts
@@ -30,6 +30,7 @@ const coreStep: CoreStepFunction = async function ({
netlifyConfig,
explicitSecretKeys,
enhancedSecretScan,
featureFlags,
systemLog,
deployId,
api,
@@ -38,6 +39,7 @@ const coreStep: CoreStepFunction = async function ({

const passedSecretKeys = (explicitSecretKeys || '').split(',')
const envVars = netlifyConfig.build.environment as Record<string, unknown>
const useMinimalChunks = featureFlags?.secret_scanning_minimal_chunks

systemLog?.({ passedSecretKeys, buildDir })

@@ -109,6 +111,7 @@ const coreStep: CoreStepFunction = async function ({
filePaths,
enhancedScanning: enhancedSecretScan && enhancedScanningEnabledInEnv,
omitValuesFromEnhancedScan: getOmitValuesFromEnhancedScanForEnhancedScanFromEnv(envVars),
useMinimalChunks,
})

secretMatches = scanResults.matches.filter((match) => !match.enhancedMatch)
285 changes: 254 additions & 31 deletions packages/build/src/plugins_core/secrets_scanning/utils.ts
@@ -19,6 +19,7 @@ interface ScanArgs {
filePaths: string[]
enhancedScanning?: boolean
omitValuesFromEnhancedScan?: unknown[]
useMinimalChunks: boolean
}

interface MatchResult {
@@ -146,54 +147,49 @@ const likelySecretRegex = new RegExp(
)

/**
* Checks a line of text for likely secrets based on known prefixes and patterns.
* Checks a chunk of text for likely secrets based on known prefixes and patterns.
* The function works by:
* 1. Splitting the line into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
* 1. Splitting the chunk into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
* 2. For each token, checking if it matches our secret pattern:
* - Must start (^) with one of our known prefixes (e.g. aws_, github_pat_, etc)
* - Must be followed by at least MIN_CHARS_AFTER_PREFIX non-whitespace characters
* - Must extend to the end ($) of the token
*
* For example, given the line: secretKey='aws_123456789012345678'
* For example, given the chunk: secretKey='aws_123456789012345678'
* 1. It's split into tokens: ['secretKey', 'aws_123456789012345678']
* 2. Each token is checked against the regex pattern:
* - 'secretKey' doesn't match (doesn't start with a known prefix)
* - 'aws_123456789012345678' matches (starts with 'aws_' and has sufficient length)
*
* @param line The line of text to check
* @param file The file path where this line was found
* @param lineNumber The line number in the file
* @param omitValuesFromEnhancedScan Optional array of values to exclude from matching
* @returns Array of matches found in the line
*/
export function findLikelySecrets({
line,
file,
lineNumber,
text,
omitValuesFromEnhancedScan = [],
}: {
line: string
file: string
lineNumber: number
/**
* Text to check
*/
text: string
/**
* Optional array of values to exclude from matching
*/
omitValuesFromEnhancedScan?: unknown[]
}): MatchResult[] {
if (!line) return []
}): { index: number; prefix: string }[] {
if (!text) return []

const matches: MatchResult[] = []
const matches: ReturnType<typeof findLikelySecrets> = []
let match: RegExpExecArray | null
const allOmittedValues = [...omitValuesFromEnhancedScan, ...SAFE_LISTED_VALUES]

while ((match = likelySecretRegex.exec(line)) !== null) {
while ((match = likelySecretRegex.exec(text)) !== null) {
const token = match.groups?.token
const prefix = match.groups?.prefix
if (!token || !prefix || allOmittedValues.includes(token)) {
continue
}
matches.push({
file,
lineNumber,
key: prefix,
enhancedMatch: true,
prefix,
index: match.index,
})
}
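
With this change findLikelySecrets no longer needs file or line-number context: it only reports which known prefix matched and the offset of the match within the supplied text, and each caller turns that into a MatchResult itself. A hedged usage sketch of the new shape (illustrative values only; 'aws_' stands in for one of the known LIKELY_SECRET_PREFIXES):

// Illustrative only: mirrors the example from the doc comment above.
const hits = findLikelySecrets({ text: "secretKey='aws_123456789012345678'" })
// => one entry whose prefix is 'aws_' and whose index is the match offset within the text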

@@ -279,6 +275,7 @@ export async function scanFilesForKeyValues({
base,
enhancedScanning,
omitValuesFromEnhancedScan = [],
useMinimalChunks = false,
}: ScanArgs): Promise<ScanResults> {
const scanResults: ScanResults = {
matches: [],
@@ -309,6 +306,8 @@

let settledPromises: PromiseSettledResult<MatchResult[]>[] = []

const searchStream = useMinimalChunks ? searchStreamMinimalChunks : searchStreamReadline

// process the scanning in batches to not run into memory issues by
// processing all files at the same time.
while (filePaths.length > 0) {
@@ -333,19 +332,24 @@
return scanResults
}

const searchStream = ({
basePath,
file,
keyValues,
enhancedScanning,
omitValuesFromEnhancedScan = [],
}: {
type SearchStreamOptions = {
basePath: string
file: string
keyValues: Record<string, string[]>
enhancedScanning?: boolean
omitValuesFromEnhancedScan?: unknown[]
}): Promise<MatchResult[]> => {
}

/**
* Search stream implementation using node:readline
*/
const searchStreamReadline = ({
basePath,
file,
keyValues,
enhancedScanning,
omitValuesFromEnhancedScan = [],
}: SearchStreamOptions): Promise<MatchResult[]> => {
return new Promise((resolve, reject) => {
const filePath = path.resolve(basePath, file)

@@ -382,7 +386,14 @@ const searchStream = ({
lineNumber++
if (typeof line === 'string') {
if (enhancedScanning) {
matches.push(...findLikelySecrets({ line, file, lineNumber, omitValuesFromEnhancedScan }))
matches.push(
...findLikelySecrets({ text: line, omitValuesFromEnhancedScan }).map(({ prefix }) => ({
key: prefix,
file,
lineNumber,
enhancedMatch: true,
})),
)
}
if (maxMultiLineCount > 1) {
lines.push(line)
Expand Down Expand Up @@ -472,6 +483,218 @@ const searchStream = ({
})
}

/**
* Search stream implementation using a plain read stream, which allows buffering less content
*/
const searchStreamMinimalChunks = ({
basePath,
file,
keyValues,
enhancedScanning,
omitValuesFromEnhancedScan = [],
}: SearchStreamOptions): Promise<MatchResult[]> => {
return new Promise((resolve, reject) => {
const matches: MatchResult[] = []

const keyVals: string[] = ([] as string[]).concat(...Object.values(keyValues))

// determine the longest value that we will search for - needed to determine the minimal size of the rolling buffer
const maxValLength = Math.max(
0,
// explicit secrets
...keyVals.map((v) => v.length),
...(enhancedScanning
? [
// omitted likely secrets (after finding a likely secret we check whether it should be omitted, so we need to capture at least the length of the omitted values)
...omitValuesFromEnhancedScan.map((v) => (typeof v === 'string' ? v.length : 0)),
// minimum length needed to find likely secret
...LIKELY_SECRET_PREFIXES.map((v) => v.length + MIN_CHARS_AFTER_PREFIX),
]
: []),
)

if (maxValLength === 0) {
// no non-empty values to scan for
resolve(matches)
return
}

const filePath = path.resolve(basePath, file)

const inStream = createReadStream(filePath)

function getKeyForValue(val) {
let key = ''
for (const [secretKeyName, valuePermutations] of Object.entries(keyValues)) {
if (valuePermutations.includes(val)) {
key = secretKeyName
}
}
return key
}

let buffer = ''

let newLinesIndexesInCurrentBuffer: number[] | null = null
function getCurrentBufferNewLineIndexes() {
if (newLinesIndexesInCurrentBuffer === null) {
newLinesIndexesInCurrentBuffer = [] as number[]
let newLineIndex = -1
while ((newLineIndex = buffer.indexOf('\n', newLineIndex + 1)) !== -1) {
newLinesIndexesInCurrentBuffer.push(newLineIndex)
}
}

return newLinesIndexesInCurrentBuffer
}

/**
* Number of characters that have been fully processed. Used to determine the absolute position of the current
* rolling buffer in the file.
*/
let processedCharacters = 0
/**
* Number of lines that have been fully processed. Used to determine the absolute line number of matches in the current rolling buffer.
*/
let processedLines = 0
/**
* Map keeping track of secrets found in the current file. Used to prevent reporting the same secret+position multiple times.
* Needed because the rolling buffer might retain the same secret across multiple passes.
*/
const foundIndexes = new Map<string, Set<number>>()
/**
* We report a given secret at most once per line, so we keep track of the lines we already reported for each secret.
*/
const foundLines = new Map<string, Set<number>>()

/**
* Calculate the absolute line number in the file for a given match in the current rolling buffer.
*/
function getLineNumberForMatchInTheBuffer({ indexInBuffer, key }: { indexInBuffer: number; key: string }) {
const absolutePositionInFile = processedCharacters + indexInBuffer

// check if we already handled match for given key in this position
let foundIndexesForKey = foundIndexes.get(key)
if (!foundIndexesForKey?.has(absolutePositionInFile)) {
// ensure we track match for this key and position to not report it again in future passes
if (!foundIndexesForKey) {
foundIndexesForKey = new Set<number>()
foundIndexes.set(key, foundIndexesForKey)
}
foundIndexesForKey.add(absolutePositionInFile)

// calculate the line number based on the number of fully processed lines and the positions of line breaks in the current buffer
let lineNumber = processedLines + 1
for (const newLineIndex of getCurrentBufferNewLineIndexes()) {
if (indexInBuffer > newLineIndex) {
lineNumber++
} else {
break
}
}

// check if we already handled match for given key in this line
let foundLinesForKey = foundLines.get(key)
if (!foundLinesForKey?.has(lineNumber)) {
if (!foundLinesForKey) {
foundLinesForKey = new Set<number>()
foundLines.set(key, foundLinesForKey)
}
foundLinesForKey.add(lineNumber)

// only report line number if we didn't report it yet for this key
return lineNumber
}
}
}

function processBuffer() {
for (const valVariant of keyVals) {
let indexInBuffer = -1
while ((indexInBuffer = buffer.indexOf(valVariant, indexInBuffer + 1)) !== -1) {
const key = getKeyForValue(valVariant)
const lineNumber = getLineNumberForMatchInTheBuffer({
indexInBuffer,
key,
})

if (typeof lineNumber === 'number') {
matches.push({
file,
lineNumber,
key,
enhancedMatch: false,
})
}
}
}

if (enhancedScanning) {
const likelySecrets = findLikelySecrets({ text: buffer, omitValuesFromEnhancedScan })
for (const { index, prefix } of likelySecrets) {
const lineNumber = getLineNumberForMatchInTheBuffer({
indexInBuffer: index,
key: prefix,
})

if (typeof lineNumber === 'number') {
matches.push({
file,
lineNumber,
key: prefix,
enhancedMatch: true,
})
}
}
}
}

inStream.on('data', function (chunk) {
buffer += chunk.toString()

// reset new line positions in current buffer
newLinesIndexesInCurrentBuffer = null

if (buffer.length > maxValLength) {
// only process if the buffer is large enough to contain the longest secret; if the final chunk isn't large
// enough, it will be processed in the `close` event handler
processBuffer()

// we will keep maxValLength characters in the buffer; any surplus characters at this point are fully processed
const charactersInBufferThatWereFullyProcessed = buffer.length - maxValLength
processedCharacters += charactersInBufferThatWereFullyProcessed

// advance processed lines
for (const newLineIndex of getCurrentBufferNewLineIndexes()) {
if (newLineIndex < charactersInBufferThatWereFullyProcessed) {
processedLines++
} else {
break
}
}

// Keep the last part of the buffer to handle split values across chunks
buffer = buffer.slice(charactersInBufferThatWereFullyProcessed)
}
})

inStream.on('error', function (error: any) {
if (error?.code === 'EISDIR') {
// file path is a directory - do nothing
resolve(matches)
} else {
reject(error)
}
})

inStream.on('close', function () {
// process any remaining buffer content
processBuffer()
resolve(matches)
})
})
}
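
To make the line-number bookkeeping in getLineNumberForMatchInTheBuffer concrete, here is a small worked example with hypothetical values (not part of the diff):

// Suppose two full lines have already been sliced off the rolling buffer:
const processedLines = 2
const buffer = 'foo\nbar_secret' // one newline left in the buffer, at index 3
const indexInBuffer = 4 // a match starting at 'bar_secret'
// lineNumber starts at processedLines + 1 = 3; the newline at index 3 lies before index 4,
// so it is counted once and the match is reported on line 4 of the file.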

/**
* ScanResults are all of the finds for all keys and their disparate locations. Scanning is
* async in streams so order can change a lot. Some matches are the result of an env var explicitly being marked as secret,