diff --git a/.env.example b/.env.example index 759964fc6..b02159071 100644 --- a/.env.example +++ b/.env.example @@ -65,17 +65,17 @@ GITTENSORY_REVIEW_ENRICHMENT=false # Current analyzer names: # dependency,lockfileDrift,secret,license,installScript,heavyDependency,actionPin,eol,redos # provenance,codeowners,secretLog,assetWeight,typosquat,commitSignature,iacMisconfig,nativeBuild -# history,docCommentDrift +# history,docCommentDrift,duplication # # Profile defaults: # fast: dependency,lockfileDrift,secret,license,installScript,heavyDependency,actionPin,eol # redos,provenance,secretLog,typosquat,iacMisconfig,nativeBuild # balanced (default): dependency,lockfileDrift,secret,license,installScript,heavyDependency # actionPin,eol,redos,provenance,codeowners,secretLog,assetWeight,typosquat,commitSignature -# iacMisconfig,nativeBuild,history,docCommentDrift +# iacMisconfig,nativeBuild,history,docCommentDrift,duplication # deep: dependency,lockfileDrift,secret,license,installScript,heavyDependency,actionPin,eol # redos,provenance,codeowners,secretLog,assetWeight,typosquat,commitSignature,iacMisconfig -# nativeBuild,history,docCommentDrift +# nativeBuild,history,docCommentDrift,duplication # END GENERATED REES ANALYZERS # Submitter-reputation spend control (internal-only): downgrades new/burst/low-rep diff --git a/apps/gittensory-ui/src/lib/rees-analyzers.ts b/apps/gittensory-ui/src/lib/rees-analyzers.ts index a73ff5eec..e84023a25 100644 --- a/apps/gittensory-ui/src/lib/rees-analyzers.ts +++ b/apps/gittensory-ui/src/lib/rees-analyzers.ts @@ -505,6 +505,34 @@ export const REES_ANALYZERS = [ "Conservative: only named function declarations with confidently-enumerable params; non-parameter signature edits are not reported.", }, }, + { + name: "duplication", + title: "Near-verbatim duplicated code", + category: "quality", + cost: "github-light", + defaultEnabled: true, + profiles: ["balanced", "deep"], + requires: ["files", "github-token", "head-sha"], + limits: { + minRun: 8, + maxCandidates: 40, + maxFetches: 30, + maxFindings: 25, + maxFileBytes: 500000, + }, + docs: { + summary: + "Flags added code that is a near-verbatim duplicate of a block already present elsewhere in the repo.", + looksAt: + "Added diff hunks in changed source files compared against same-extension repo files fetched from the git tree at headSha.", + reports: + "The head file:line, the existing source file:line it duplicates, and the matched line count.", + network: + "Calls the GitHub API for the git tree and candidate blobs. Requires headSha and token forwarding for private repos.", + notes: + "Conservative: trivial/boilerplate lines are dropped and a long contiguous run is required, so incidental overlap is not flagged. Never returns code content.", + }, + }, ] as const satisfies readonly ReesAnalyzerDoc[]; export const REES_ANALYZER_NAMES = REES_ANALYZERS.map((analyzer) => analyzer.name); diff --git a/review-enrichment/analyzer-metadata.json b/review-enrichment/analyzer-metadata.json index 9ad7e58ad..98d05a2e4 100644 --- a/review-enrichment/analyzer-metadata.json +++ b/review-enrichment/analyzer-metadata.json @@ -579,6 +579,36 @@ "network": "Calls the GitHub API for changed file contents. Requires headSha and token forwarding for private repos.", "notes": "Conservative: only named function declarations with confidently-enumerable params; non-parameter signature edits are not reported." } + }, + { + "name": "duplication", + "title": "Near-verbatim duplicated code", + "category": "quality", + "cost": "github-light", + "defaultEnabled": true, + "profiles": [ + "balanced", + "deep" + ], + "requires": [ + "files", + "github-token", + "head-sha" + ], + "limits": { + "minRun": 8, + "maxCandidates": 40, + "maxFetches": 30, + "maxFindings": 25, + "maxFileBytes": 500000 + }, + "docs": { + "summary": "Flags added code that is a near-verbatim duplicate of a block already present elsewhere in the repo.", + "looksAt": "Added diff hunks in changed source files compared against same-extension repo files fetched from the git tree at headSha.", + "reports": "The head file:line, the existing source file:line it duplicates, and the matched line count.", + "network": "Calls the GitHub API for the git tree and candidate blobs. Requires headSha and token forwarding for private repos.", + "notes": "Conservative: trivial/boilerplate lines are dropped and a long contiguous run is required, so incidental overlap is not flagged. Never returns code content." + } } ] } diff --git a/review-enrichment/src/analyzers/duplication-scan.ts b/review-enrichment/src/analyzers/duplication-scan.ts new file mode 100644 index 000000000..79c15b0bb --- /dev/null +++ b/review-enrichment/src/analyzers/duplication-scan.ts @@ -0,0 +1,533 @@ +// Full-file / near-verbatim duplication scan (#1520). Flags code a PR ADDS that is a near-verbatim duplicate of a +// block that already exists elsewhere in the repo — i.e. copy-paste instead of importing the existing helper. The +// no-checkout reviewer sees only the diff, so it cannot know the added lines already live in another file; this +// analyzer fetches the repo's git tree at headSha (ONE recursive call) plus a bounded set of candidate blobs and +// looks for a contiguous run of significant added lines that reappears verbatim (after whitespace normalization) in +// a candidate file. It is deliberately CONSERVATIVE — trivial/boilerplate lines are dropped and a long contiguous +// run is required — so it does not flag incidental overlap. It reports ONLY locations (`head:path:line` vs +// `source:path:line`) + the matched line count — never the code content. Fail-safe: returns [] without a token / +// headSha, on a bad repoFullName, or when the tree fetch fails; a single malformed candidate is skipped, never +// aborting the scan. +import type { + AnalyzerDiagnostics, + EnrichRequest, + DuplicationFinding, +} from "../types.js"; +import type { AnalysisContext } from "../analysis-context.js"; +import { boundedFetchJson } from "../external-fetch.js"; + +const GITHUB_API = "https://api.github.com"; +const GITHUB_API_VERSION = "2022-11-28"; + +const MIN_RUN = 8; // a contiguous run of >= this many significant normalized lines is required to flag a duplicate +const MAX_CANDIDATES = 40; // cap candidate files (closest-by-path first) we consider per scan +const MAX_FETCHES = 30; // global cap on candidate blob fetches per scan +const MAX_FINDINGS = 25; // keep the brief bounded +const MIN_SIGNIFICANT_LEN = 12; // lines shorter than this (after trim) are treated as trivial and dropped +const MAX_FILE_BYTES = 500_000; // skip an oversized candidate blob so one huge (likely generated) file can't eat the budget +const MAX_TREE_JSON_BYTES = 4 * 1024 * 1024; // recursive git tree can be large; bound it like asset-weight does +const MAX_BLOB_JSON_BYTES = 1024 * 1024; // a base64 blob payload is ~4/3 of the file; bound the JSON we read + +// Source-code extensions whose copy-paste is meaningful. Text data / config / lockfiles are intentionally excluded. +const SOURCE_EXTS = new Set([ + "ts", "tsx", "js", "jsx", "mjs", "cjs", + "py", "go", "rb", "java", "kt", "rs", + "c", "cc", "cpp", "h", "hpp", "cs", "php", "swift", "scala", +]); + +// A single repo path segment (owner or name): word chars, dot, dash only. Whole-segment `.`/`..` are rejected +// separately so a hostile repoFullName can't traverse or redirect the token-bearing request to another repository. +const REPO_SEGMENT = /^[A-Za-z0-9._-]+$/; +const SHA_RE = /^[0-9a-fA-F]{7,64}$/; + +interface ScanOptions { + signal?: AbortSignal; + analysis?: Pick; + diagnostics?: AnalyzerDiagnostics; +} + +function githubHeaders(token: string): Record { + return { + Authorization: `Bearer ${token}`, + Accept: "application/vnd.github+json", + "X-GitHub-Api-Version": GITHUB_API_VERSION, + "User-Agent": "gittensory-review-enrichment", + }; +} + +/** Parse `owner/repo`, rejecting anything that isn't exactly two safe segments (no traversal, no extra slashes) so a + * hostile `repoFullName` cannot redirect the token-bearing request elsewhere. Returns null when unsafe. */ +function parseRepo( + repoFullName: string, +): { owner: string; repo: string } | null { + const parts = repoFullName.split("/"); + if (parts.length !== 2) return null; + const [owner, repo] = parts; + for (const seg of [owner, repo]) { + if (!seg || seg === "." || seg === ".." || !REPO_SEGMENT.test(seg)) { + return null; + } + } + return { owner: owner!, repo: repo! }; +} + +/** Fetch JSON from a GitHub git endpoint through the shared bounded helper. Mirrors asset-weight: when an analysis + * context is supplied its caching/metered `fetchJson` is used, otherwise the bare `boundedFetchJson`. Returns the + * parsed body on a 2xx with valid JSON, or null on any non-OK / malformed / over-budget / network outcome so the + * caller fails safe. */ +async function fetchGithubJson( + url: string, + token: string, + fetchImpl: typeof fetch, + options: ScanOptions, + endpointCategory: "github-trees" | "github-blobs", +): Promise { + const fetchOptions = { + endpointCategory, + headers: githubHeaders(token), + signal: options.signal, + fetchImpl, + diagnostics: options.diagnostics, + phase: "duplication", + subcall: endpointCategory, + maxBytes: + endpointCategory === "github-trees" + ? MAX_TREE_JSON_BYTES + : MAX_BLOB_JSON_BYTES, + maxCallsPerCategory: + endpointCategory === "github-blobs" ? MAX_FETCHES : 1, + }; + const response = options.analysis + ? await options.analysis.fetchJson(url, fetchOptions) + : await boundedFetchJson(url, fetchOptions); + return response.ok ? response.data : null; +} + +function extOf(path: string): string | null { + const slash = path.lastIndexOf("/"); + const dot = path.lastIndexOf("."); + if (dot <= slash) return null; // no extension, or the dot is in a directory segment + return path.slice(dot + 1).toLowerCase(); +} + +function isSourceExt(path: string): boolean { + const ext = extOf(path); + return ext !== null && SOURCE_EXTS.has(ext); +} + +/** Paths that are generated / vendored / minified / type-declaration: copy-paste there is noise, not a defect. */ +function isExcludedPath(path: string): boolean { + if (path.endsWith(".d.ts")) return true; + if (path.includes(".min.")) return true; + const lower = path.toLowerCase(); + for (const seg of lower.split("/")) { + if ( + seg === "node_modules" || + seg === "dist" || + seg === "build" || + seg === "vendor" + ) { + return true; + } + } + return false; +} + +/** Normalize a source line for comparison: trim + collapse internal whitespace to single spaces. Returns null when + * the line is blank or "trivial" (too short, pure punctuation/braces, or bare boilerplate like a lone `import`) so + * incidental matches on closing braces / import keywords cannot accumulate into a false-positive run. */ +export function normalizeLine(raw: string): string | null { + const collapsed = raw.trim().replace(/\s+/g, " "); + if (!collapsed) return null; + if (collapsed.length < MIN_SIGNIFICANT_LEN) return null; + // Pure punctuation / brackets (e.g. `});`, `} else {`-free closers) carry no copy-paste signal. + if (!/[A-Za-z0-9]/.test(collapsed)) return null; + // Bare import/include/use boilerplate that legitimately recurs across files. `export` is intentionally NOT dropped + // (an `export function`/`export const` declaration is meaningful and part of a copy-pasted block); only re-exports + // of the form `export ... from ...` are import-like. + if (/^(import|from|using|use|#include|package|require)\b/.test(collapsed)) { + return null; + } + if (/^export\b.*\bfrom\b/.test(collapsed)) return null; // re-export, e.g. `export { x } from "./y"` + return collapsed; +} + +interface NormBlock { + /** Normalized significant lines, in order. */ + norm: string[]; + /** Parallel array: the ORIGINAL new-file line number for each entry of `norm`. */ + lineNos: number[]; +} + +/** Parse a unified-diff `patch` into blocks of ADDED significant lines with their new-file line numbers. Lines are + * grouped into a fresh block whenever the added run is broken (a context/removed line, or a trivial line dropped by + * normalization), so a contiguous run requirement is meaningful. */ +export function extractAddedBlocks(patch: string | undefined): NormBlock[] { + if (!patch) return []; + const blocks: NormBlock[] = []; + let current: NormBlock | null = null; + let newLine = 0; + let inHunk = false; + + const flush = () => { + if (current && current.norm.length) blocks.push(current); + current = null; + }; + + for (const line of patch.split("\n")) { + const header = /^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@/.exec(line); + if (header) { + flush(); + newLine = Number(header[1]); + inHunk = true; + continue; + } + if (!inHunk) continue; + if (line.startsWith("+++")) continue; // file header inside the patch, not an added line + if (line.startsWith("+")) { + const norm = normalizeLine(line.slice(1)); + if (norm === null) { + // A trivial added line breaks the contiguous significant run. + flush(); + } else { + if (!current) current = { norm: [], lineNos: [] }; + current.norm.push(norm); + current.lineNos.push(newLine); + } + newLine++; + continue; + } + if (line.startsWith("-")) { + // Removed line: does not advance the new-file counter, breaks the added run. + flush(); + continue; + } + if (line.startsWith("\\")) { + // "\ No newline at end of file" — not a real line. + continue; + } + // Context line (starts with a space, or an empty line in the patch): advances new-file counter, breaks the run. + flush(); + newLine++; + } + flush(); + return blocks; +} + +/** Split a full file's text into blocks of significant lines (with 1-based source line numbers), breaking a block + * at every blank/trivial line — the same gap-aware grouping `extractAddedBlocks` uses. Indexing each block + * separately means a matched run can never bridge across a blank or trivial line in the source file. */ +function normalizeFileBlocks(text: string): NormBlock[] { + const blocks: NormBlock[] = []; + let current: NormBlock | null = null; + const lines = text.split("\n"); + for (let i = 0; i < lines.length; i += 1) { + const n = normalizeLine(lines[i]!); + if (n === null) { + if (current && current.norm.length) blocks.push(current); + current = null; + } else { + if (!current) current = { norm: [], lineNos: [] }; + current.norm.push(n); + current.lineNos.push(i + 1); + } + } + if (current && current.norm.length) blocks.push(current); + return blocks; +} + +interface GramIndex { + block: NormBlock; + /** Map of joined MIN_RUN-window key → list of window start indices in `block.norm`. */ + windows: Map; +} + +/** Build a rolling MIN_RUN-line window index over a candidate file's significant lines, for fast run lookup. */ +function buildGramIndex(block: NormBlock): GramIndex { + const windows = new Map(); + for (let i = 0; i + MIN_RUN <= block.norm.length; i++) { + const key = block.norm.slice(i, i + MIN_RUN).join("\n"); + const list = windows.get(key); + if (list) list.push(i); + else windows.set(key, [i]); + } + return { block, windows }; +} + +/** Find the LONGEST contiguous run shared between an added block and an indexed candidate, anchored on a matching + * MIN_RUN window then extended forward. Returns the head + source line numbers of the run start and its length, + * or null when no run of >= MIN_RUN significant lines is shared. */ +function longestSharedRun( + added: NormBlock, + index: GramIndex, +): { headLine: number; sourceLine: number; length: number } | null { + const cand = index.block; + let best: { headLine: number; sourceLine: number; length: number } | null = + null; + for (let a = 0; a + MIN_RUN <= added.norm.length; a++) { + const key = added.norm.slice(a, a + MIN_RUN).join("\n"); + const starts = index.windows.get(key); + if (!starts) continue; + for (const c of starts) { + // Verify + extend the run forward from the matched window. + let len = MIN_RUN; + while ( + a + len < added.norm.length && + c + len < cand.norm.length && + added.norm[a + len] === cand.norm[c + len] + ) { + len++; + } + if (!best || len > best.length) { + best = { + headLine: added.lineNos[a]!, + sourceLine: cand.lineNos[c]!, + length: len, + }; + } + } + } + return best; +} + +/** Fetch the recursive git tree at `sha`. Returns the blob entries (path + blob sha) or null on a non-OK / malformed + * reply / network error so the caller fails safe. */ +async function fetchTree( + owner: string, + repo: string, + sha: string, + token: string, + fetchImpl: typeof fetch, + options: ScanOptions, +): Promise | null> { + if (!SHA_RE.test(sha)) return null; + const url = `${GITHUB_API}/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/git/trees/${encodeURIComponent(sha)}?recursive=1`; + const json = await fetchGithubJson<{ + tree?: Array<{ path?: string; type?: string; sha?: string }>; + truncated?: boolean; + }>(url, token, fetchImpl, options, "github-trees"); + if (!json || !Array.isArray(json.tree)) return null; + // `json.truncated` (very large repos) is intentionally not treated as an error: this analyzer is best-effort and + // additive, so a partial tree only means fewer candidate files to compare against. That can at worst MISS a + // duplicate (a false negative), never produce a wrong finding, so scanning the partial tree is acceptable. + const out: Array<{ path: string; sha: string }> = []; + for (const entry of json.tree) { + if ( + entry.type === "blob" && + typeof entry.path === "string" && + typeof entry.sha === "string" + ) { + out.push({ path: entry.path, sha: entry.sha }); + } + } + return out; +} + +/** Fetch a single git blob's decoded UTF-8 content, or null on a non-OK / malformed reply / network error. Only + * base64-encoded string content is decoded — anything else fails safe to null. */ +async function fetchBlob( + owner: string, + repo: string, + blobSha: string, + token: string, + fetchImpl: typeof fetch, + options: ScanOptions, +): Promise { + if (!SHA_RE.test(blobSha)) return null; + const url = `${GITHUB_API}/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/git/blobs/${encodeURIComponent(blobSha)}`; + const json = await fetchGithubJson<{ + content?: string; + encoding?: string; + }>(url, token, fetchImpl, options, "github-blobs"); + if (!json) return null; + if (json.encoding !== "base64" || typeof json.content !== "string") { + return null; + } + // Cap on the encoded size first so a huge blob never burns CPU on decode, then on the true decoded byte length. + if (json.content.length > MAX_FILE_BYTES * 2) return null; + const decoded = decodeBase64Utf8(json.content); + if (!decoded || decoded.byteLength > MAX_FILE_BYTES) return null; + return decoded.text; +} + +/** Worker-safe base64 → UTF-8 decode (no Node `Buffer` dependency, so the Cloudflare Worker deployment path works, + * not only Node). GitHub wraps blob `content` with newlines, which `atob` rejects, so whitespace is stripped first. + * Returns the decoded text plus its true UTF-8 byte length, or null on malformed base64 (fail safe, never throws). */ +export function decodeBase64Utf8(content: string): { text: string; byteLength: number } | null { + try { + const binary = atob(content.replace(/\s+/g, "")); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i += 1) bytes[i] = binary.charCodeAt(i); + // `fatal: true` throws on invalid UTF-8, so a binary/non-text blob fails safe to null (skipped) instead of being + // turned into replacement-character garbage that could be compared as if it were source. + return { text: new TextDecoder("utf-8", { fatal: true }).decode(bytes), byteLength: bytes.length }; + } catch { + return null; + } +} + +/** Score a candidate path by how close it sits to any changed path: more shared leading directory segments ⇒ higher + * score (a copy-pasted helper most often lives near the file that should have imported it). */ +function proximityScore(candidate: string, changedDirs: string[][]): number { + const candSegs = candidate.split("/").slice(0, -1); + let best = 0; + for (const dir of changedDirs) { + let shared = 0; + const max = Math.min(candSegs.length, dir.length); + while (shared < max && candSegs[shared] === dir[shared]) shared++; + if (shared > best) best = shared; + } + return best; +} + +/** Analyzer entrypoint: flag added code that is a near-verbatim duplicate of an existing block elsewhere in the repo. + * Fail-safe (returns [] without a token/headSha, on a bad repoFullName, or on a failed tree fetch). */ +export async function scanDuplication( + req: EnrichRequest, + fetchImpl: typeof fetch = fetch, + options: ScanOptions = {}, +): Promise { + const token = req.githubToken; + if (!token || !req.headSha) return []; + if (options.signal?.aborted) return []; + const repo = parseRepo(req.repoFullName); + if (!repo) return []; + + // Changed source files (kept) and their added significant blocks. Test/spec files are KEPT — copy-pasted tests are + // a real maintenance smell too — only generated/vendored/minified/declaration files are excluded. + const changed = (req.files ?? []).filter( + (f) => + f.status !== "removed" && + isSourceExt(f.path) && + !isExcludedPath(f.path), + ); + if (!changed.length) return []; + + const changedPaths = new Set(changed.map((f) => f.path)); + const changedExts = new Set(); + for (const f of changed) { + const ext = extOf(f.path); + if (ext) changedExts.add(ext); + } + + const addedByFile: Array<{ path: string; ext: string | null; blocks: NormBlock[] }> = []; + for (const f of changed) { + const blocks = extractAddedBlocks(f.patch).filter( + (b) => b.norm.length >= MIN_RUN, + ); + if (blocks.length) addedByFile.push({ path: f.path, ext: extOf(f.path), blocks }); + } + if (!addedByFile.length) return []; + + const tree = await fetchTree( + repo.owner, + repo.repo, + req.headSha, + token, + fetchImpl, + options, + ); + if (tree === null) return []; + + // Group candidate blobs BY extension, then proximity-sort + cap WITHIN each extension bucket (MAX_CANDIDATES each). + // Scanning is then round-robin across buckets under one global fetch budget (below), so no single extension's + // candidates can starve another's, while the TOTAL blob fetches stay bounded by MAX_FETCHES. + const changedDirs = [...changedPaths].map((p) => p.split("/").slice(0, -1)); + const candidatesByExt = new Map>(); + for (const entry of tree) { + if (changedPaths.has(entry.path) || isExcludedPath(entry.path)) continue; + const ext = extOf(entry.path); + if (ext === null || !changedExts.has(ext)) continue; + const bucket = candidatesByExt.get(ext); + if (bucket) bucket.push(entry); + else candidatesByExt.set(ext, [entry]); + } + for (const [ext, bucket] of candidatesByExt) { + bucket.sort((a, b) => { + const sa = proximityScore(a.path, changedDirs); + const sb = proximityScore(b.path, changedDirs); + return sb !== sa ? sb - sa : a.path.localeCompare(b.path); + }); + candidatesByExt.set(ext, bucket.slice(0, MAX_CANDIDATES)); + } + + // Added blocks grouped by extension, so each extension is scanned only against its own candidate bucket. + const addedByExt = new Map>(); + for (const f of addedByFile) { + if (f.ext === null) continue; + const arr = addedByExt.get(f.ext); + if (arr) arr.push({ path: f.path, blocks: f.blocks }); + else addedByExt.set(f.ext, [{ path: f.path, blocks: f.blocks }]); + } + + const findings: DuplicationFinding[] = []; + const seen = new Set(); + + // Scan round-robin across extensions under ONE global MAX_FETCHES budget: fair (each changed extension gets its + // turn, so none is starved) AND bounded (total blob fetches never exceed MAX_FETCHES, regardless of how many + // extensions changed). One pointer per extension bucket; we cycle until the budget is spent or every bucket drains. + const buckets = [...addedByExt.entries()] + .map(([ext, addedFiles]) => ({ + addedFiles, + cands: candidatesByExt.get(ext) ?? [], + cursor: 0, + })) + .filter((bk) => bk.cands.length > 0); + + let fetches = 0; + let aborted = false; + let progressed = true; + while (fetches < MAX_FETCHES && progressed && !aborted) { + progressed = false; + for (const bk of buckets) { + if (fetches >= MAX_FETCHES) break; + if (bk.cursor >= bk.cands.length) continue; + if (options.signal?.aborted) { + aborted = true; + break; + } + const cand = bk.cands[bk.cursor]; + bk.cursor += 1; + if (!cand) continue; + progressed = true; + fetches += 1; + const text = await fetchBlob( + repo.owner, + repo.repo, + cand.sha, + token, + fetchImpl, + options, + ); + if (text === null) continue; // bad/empty/oversized candidate — skip, never abort (size capped in fetchBlob) + // Index each gap-delimited block of the candidate separately so a run cannot bridge a blank/trivial line. + const indices = normalizeFileBlocks(text) + .filter((b) => b.norm.length >= MIN_RUN) + .map(buildGramIndex); + if (!indices.length) continue; + + for (const file of bk.addedFiles) { + for (const block of file.blocks) { + let best: { headLine: number; sourceLine: number; length: number } | null = null; + for (const index of indices) { + const run = longestSharedRun(block, index); + if (run && (!best || run.length > best.length)) best = run; + } + if (!best) continue; + const key = `${file.path}:${best.headLine}|${cand.path}:${best.sourceLine}`; + if (seen.has(key)) continue; + seen.add(key); + findings.push({ + file: file.path, + line: best.headLine, + sourceFile: cand.path, + sourceLine: best.sourceLine, + lines: best.length, + }); + } + } + } + } + + return findings.sort((a, b) => b.lines - a.lines).slice(0, MAX_FINDINGS); +} diff --git a/review-enrichment/src/analyzers/registry.ts b/review-enrichment/src/analyzers/registry.ts index 9d8967055..33252a53b 100644 --- a/review-enrichment/src/analyzers/registry.ts +++ b/review-enrichment/src/analyzers/registry.ts @@ -4,6 +4,7 @@ import { scanCodeowners } from "./codeowners.js"; import { scanCommitSignature } from "./commit-signature.js"; import { dependencyAnalyzer } from "./dependency/descriptor.js"; import { scanDocCommentDrift } from "./doc-comment-drift.js"; +import { scanDuplication } from "./duplication-scan.js"; import { scanEol } from "./eol-check.js"; import { scanHeavyDependencies } from "./heavy-dependency.js"; import { scanHistory } from "./history.js"; @@ -388,6 +389,35 @@ export const ANALYZER_DESCRIPTORS = [ }, run: (req, { signal }) => scanDocCommentDrift(req, fetch, { signal }), }), + descriptor({ + name: "duplication", + title: "Near-verbatim duplicated code", + category: "quality", + cost: "github-light", + defaultEnabled: true, + requires: ["files", "github-token", "head-sha"], + limits: { + minRun: 8, + maxCandidates: 40, + maxFetches: 30, + maxFindings: 25, + maxFileBytes: 500_000, + }, + docs: { + summary: + "Flags added code that is a near-verbatim duplicate of a block already present elsewhere in the repo.", + looksAt: + "Added diff hunks in changed source files compared against same-extension repo files fetched from the git tree at headSha.", + reports: + "The head file:line, the existing source file:line it duplicates, and the matched line count.", + network: + "Calls the GitHub API for the git tree and candidate blobs. Requires headSha and token forwarding for private repos.", + notes: + "Conservative: trivial/boilerplate lines are dropped and a long contiguous run is required, so incidental overlap is not flagged. Never returns code content.", + }, + run: (req, { signal, analysis, diagnostics }) => + scanDuplication(req, fetch, { signal, analysis, diagnostics }), + }), ] as const satisfies readonly AnyAnalyzerDescriptor[]; export const ANALYZER_NAMES = ANALYZER_DESCRIPTORS.map( diff --git a/review-enrichment/src/render.ts b/review-enrichment/src/render.ts index c90461c43..47ed5ae95 100644 --- a/review-enrichment/src/render.ts +++ b/review-enrichment/src/render.ts @@ -353,6 +353,18 @@ export function renderBrief( } } + const duplication = findings.duplication ?? []; + if (duplication.length) { + lines.push( + "### Near-verbatim duplicated code (prefer importing the existing implementation)", + ); + for (const item of duplication) { + lines.push( + `- ${safeCodeSpan(`${item.file}:${item.line}`)} duplicates ${safeCodeSpan(`${item.sourceFile}:${item.sourceLine}`)} (~${item.lines} lines)`, + ); + } + } + if (!lines.length) return { promptSection: "", systemSuffix: "" }; const header = diff --git a/review-enrichment/src/types.ts b/review-enrichment/src/types.ts index d3e4bd61e..29bd606dc 100644 --- a/review-enrichment/src/types.ts +++ b/review-enrichment/src/types.ts @@ -291,6 +291,7 @@ export interface BriefFindings { nativeBuild?: NativeBuildFinding[]; history?: HistoryFinding[]; docCommentDrift?: DocCommentDriftFinding[]; + duplication?: DuplicationFinding[]; } /** A JSDoc/TSDoc block whose `@param` tags name parameters the adjacent function no longer declares — a @@ -305,6 +306,22 @@ export interface DocCommentDriftFinding { staleParams: string[]; } +/** Added code that is a near-verbatim duplicate of a contiguous block already present elsewhere in the repo — a + * copy-paste the no-checkout reviewer cannot see, where importing the existing implementation is usually better. + * Reports the head location, the matching source location, and the matched line count only — never the code. (#1520) */ +export interface DuplicationFinding { + /** Path of the changed file that ADDED the duplicated block. */ + file: string; + /** New-file line where the duplicated run begins in the changed file. */ + line: number; + /** Path of the existing repo file that already contains the same block. */ + sourceFile: string; + /** 1-based line where the run begins in the existing source file. */ + sourceLine: number; + /** Number of contiguous significant lines that matched verbatim (after whitespace normalization). */ + lines: number; +} + export type AnalyzerStatus = "ok" | "degraded" | "skipped" | "capped" | "timeout"; /** Internal, public-safe analyzer diagnostics for Sentry. Never attach request bodies, diffs, tokens, or raw prompts. */ diff --git a/review-enrichment/test/analyzer-registry.test.ts b/review-enrichment/test/analyzer-registry.test.ts index 6ec736433..f4f44de07 100644 --- a/review-enrichment/test/analyzer-registry.test.ts +++ b/review-enrichment/test/analyzer-registry.test.ts @@ -29,6 +29,7 @@ const EXPECTED_ANALYZERS = [ "nativeBuild", "history", "docCommentDrift", + "duplication", ]; test("analyzer descriptors cover the runtime registry in stable order", () => { diff --git a/review-enrichment/test/duplication-scan.test.ts b/review-enrichment/test/duplication-scan.test.ts new file mode 100644 index 000000000..2d5be4264 --- /dev/null +++ b/review-enrichment/test/duplication-scan.test.ts @@ -0,0 +1,528 @@ +// Units for the near-verbatim duplication-scan analyzer (#1520). Own file (not enrichment.test.ts) so concurrent +// analyzer PRs don't collide. Runs against the compiled dist/. +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { + scanDuplication, + normalizeLine, + extractAddedBlocks, + decodeBase64Utf8, +} from "../dist/analyzers/duplication-scan.js"; +import { renderBrief } from "../dist/render.js"; + +// ── helpers ──────────────────────────────────────────────────────────────────── + +// A run of >= MIN_RUN (8) significant, non-trivial, non-import lines shared between the patch and a source file. +const SHARED = [ + "const totalRewardScaled = baseReward * decayFactor(epochAge)", + "const clampedScore = Math.min(maxScore, Math.max(minScore, rawScore))", + "const weightedAverage = sumOfWeights === 0 ? 0 : weightedTotal / sumOfWeights", + "const normalizedVector = values.map((value) => value / vectorMagnitude)", + "const adjustedPenalty = penaltyBase * Math.log2(violationCount + 1)", + "const finalEmission = totalRewardScaled - adjustedPenalty + bonusAmount", + "const roundedEmission = Math.round(finalEmission * 1000000) / 1000000", + "const settledEmission = Number.isFinite(roundedEmission) ? roundedEmission : 0", + "const persistedEmission = await store.write(minerHotkey, settledEmission)", +]; + +// Build a +-prefixed unified-diff patch that adds the given lines starting at new-file line `start`. +const addedPatch = (addedLines, start = 10) => { + const body = addedLines.map((l) => `+${l}`).join("\n"); + return `@@ -1,0 +${start},${addedLines.length} @@\n${body}`; +}; + +// A source file's text where the SHARED block begins at line `at` (1-based), padded above with filler lines. +const sourceWithBlock = (block, at = 5) => { + const filler = Array.from( + { length: at - 1 }, + (_, i) => `const fillerVariableNumber${i} = computeFillerValueFromIndex(${i})`, + ); + return [...filler, ...block].join("\n"); +}; + +const b64 = (text) => Buffer.from(text, "utf8").toString("base64"); +const b64Bytes = (arr) => Buffer.from(Uint8Array.from(arr)).toString("base64"); +// Build a real Response so the shared boundedFetchJson body reader (used when no analysis context is injected) +// parses it exactly as it would a live GitHub reply. +const jsonResponse = (body, init) => new Response(JSON.stringify(body), init); + +// Mock fetch that switches on the URL: git/trees → the tree; git/blobs/{sha} → that blob's content. +const makeFetch = ({ + tree, + blobs, + truncated = false, + treeOk = true, + treeBadJson = false, + blobStatus = {}, + blobThrow = new Set(), + counter, +}) => { + return async (url) => { + if (counter) counter.calls.push(url); + if (url.includes("/git/trees/")) { + if (counter) counter.tree++; + if (!treeOk) return jsonResponse({}, { status: 500 }); + if (treeBadJson) return jsonResponse({ nope: 1 }); + return jsonResponse({ + tree: tree.map((t) => ({ path: t.path, type: "blob", sha: t.sha })), + truncated, + }); + } + const m = /\/git\/blobs\/([0-9a-fA-F]+)/.exec(url); + if (m) { + const sha = m[1]; + if (counter) counter.blob++; + if (blobThrow.has(sha)) throw new Error("network down"); + const code = blobStatus[sha]; + if (code && !(code >= 200 && code < 300)) { + return jsonResponse({}, { status: code }); + } + const content = blobs[sha]; + if (content === undefined) { + return jsonResponse({ encoding: "utf8", content: "x" }); + } + return jsonResponse({ encoding: "base64", content: b64(content) }); + } + throw new Error(`unexpected url ${url}`); + }; +}; + +const baseReq = (overrides = {}) => ({ + repoFullName: "o/r", + prNumber: 1, + headSha: "a".repeat(40), + githubToken: "tok", + files: [{ path: "src/new-scorer.ts", status: "added", patch: addedPatch(SHARED) }], + ...overrides, +}); + +// ── normalizeLine / extractAddedBlocks units ──────────────────────────────────── + +test("normalizeLine: drops blanks, trivial, punctuation-only and bare imports; keeps significant code", () => { + assert.equal(normalizeLine(" "), null); + assert.equal(normalizeLine("})"), null); // punctuation only + assert.equal(normalizeLine("} else {"), null); // < MIN_SIGNIFICANT_LEN after trim + assert.equal(normalizeLine("short"), null); // too short + assert.equal(normalizeLine("import { foo } from './bar'"), null); // bare import + assert.equal(normalizeLine(" const x = computeWeightedScore(a, b) "), "const x = computeWeightedScore(a, b)"); + assert.equal( + normalizeLine("const y = 1234567890 + somethingLong"), + "const y = 1234567890 + somethingLong", + ); // internal whitespace collapsed +}); + +test("extractAddedBlocks: groups consecutive added lines, breaks on context/removed/trivial, tracks new-file line numbers", () => { + const patch = [ + "@@ -1,2 +5,6 @@", + "+const firstSignificantLineHere = makeValue(1)", + "+const secondSignificantLineHere = makeValue(2)", + " contextLineThatBreaksTheRun", + "+const thirdSignificantLineHere = makeValue(3)", + "-removedLineDoesNotCount", + "+const fourthSignificantLineHere = makeValue(4)", + ].join("\n"); + const blocks = extractAddedBlocks(patch); + // block1: lines 5,6 ; block2: line 8 (after one context line at 7) ; block3: line 9 (removed doesn't advance) + assert.equal(blocks.length, 3); + assert.deepEqual(blocks[0].lineNos, [5, 6]); + assert.deepEqual(blocks[1].lineNos, [8]); + assert.deepEqual(blocks[2].lineNos, [9]); +}); + +test("decodeBase64Utf8: decodes UTF-8 blob content without globalThis.Buffer (Cloudflare Worker deployment path)", () => { + // The production decode must not depend on Node's Buffer. Encode while Buffer exists, then remove it and decode. + const text = "const computedScore = weight * factor + base\nconst total = computedScore + bonusAmount + café"; + const encoded = b64(text); + const savedBuffer = globalThis.Buffer; + globalThis.Buffer = undefined; // simulate a runtime with no Node Buffer (Worker) + try { + const out = decodeBase64Utf8(encoded); + assert.equal(out?.text, text); // atob/TextDecoder path, multi-byte UTF-8 (é) preserved + assert.ok(out.byteLength >= text.length); // byte length, not UTF-16 code-unit length + } finally { + globalThis.Buffer = savedBuffer; + } +}); + +test("decodeBase64Utf8: malformed base64 fails safe to null (never throws)", () => { + assert.equal(decodeBase64Utf8("!!!not base64!!!"), null); +}); + +test("decodeBase64Utf8: invalid UTF-8 (binary) content fails safe to null (fatal decode)", () => { + // base64 of the bytes 0xFF 0xFE 0xFD — not valid UTF-8; a binary blob must decode to null, not garbage. + const invalidUtf8B64 = b64Bytes([0xff, 0xfe, 0xfd]); + assert.equal(decodeBase64Utf8(invalidUtf8B64), null); +}); + +test("scanDuplication: a match cannot bridge a blank/trivial line in the source (per-block indexing)", async () => { + // Added: 8 contiguous significant lines. Candidate: the same 8 lines, but split by a blank line into two 4-line + // blocks. With per-block indexing neither candidate block reaches MIN_RUN (8) contiguously, so there is NO match. + const eight = SHARED.slice(0, 8); + const splitCandidate = [...eight.slice(0, 4), "", ...eight.slice(4)].join("\n"); + const fetchImpl = makeFetch({ + tree: [{ path: "src/other.ts", sha: "c".repeat(40) }], + blobs: { ["c".repeat(40)]: splitCandidate }, + }); + const req = baseReq({ files: [{ path: "src/new.ts", status: "added", patch: addedPatch(eight) }] }); + assert.deepEqual(await scanDuplication(req, fetchImpl), []); +}); + +test("scanDuplication: mixed-extension PR does not cross-match (a .ts block is not compared to a .py candidate)", async () => { + // A changed .ts file whose added block lives VERBATIM in a .py candidate file. Cross-extension matching is a bug; + // the per-extension guard must prevent any finding here. + const req = { + repoFullName: "o/r", + prNumber: 1, + headSha: "a".repeat(40), + githubToken: "tok", + files: [ + { path: "src/scorer.ts", status: "added", patch: addedPatch(SHARED) }, + { path: "src/util.py", status: "added", patch: "@@ -1,0 +1,1 @@\n+def helper(value): return value + 1" }, + ], + }; + const fetchImpl = makeFetch({ + tree: [ + { path: "lib/copy.py", sha: "b".repeat(40) }, // .py file containing the .ts block — must NOT match + { path: "lib/other.ts", sha: "c".repeat(40) }, // unrelated .ts + ], + blobs: { + ["b".repeat(40)]: sourceWithBlock(SHARED), + ["c".repeat(40)]: "const unrelatedConstantValue = computeSomethingDifferentEntirely(42)", + }, + }); + assert.deepEqual(await scanDuplication(req, fetchImpl), []); +}); + +test("scanDuplication: per-extension budget — a .py duplicate is still found despite >40 unrelated .ts candidates", async () => { + // 8 significant Python lines for the .py side. + const PY_SHARED = [ + "total_reward_scaled = base_reward * decay_factor(epoch_age)", + "clamped_score = min(max_score, max(min_score, raw_score))", + "weighted_average = 0 if sum_of_weights == 0 else weighted_total / sum_of_weights", + "normalized_vector = [value / vector_magnitude for value in values]", + "adjusted_penalty = penalty_base * math.log2(violation_count + 1)", + "final_emission = total_reward_scaled - adjusted_penalty + bonus_amount", + "rounded_emission = round(final_emission * 1000000) / 1000000", + "settled_emission = rounded_emission if math.isfinite(rounded_emission) else 0", + ]; + // 45 unrelated .ts candidates that would overflow a single global slice(40) and starve the .py side. + const tsFillers = Array.from({ length: 45 }, (_, i) => ({ + path: `lib/ts/filler${i}.ts`, + sha: "a".repeat(38) + i.toString().padStart(2, "0"), + })); + const pyDup = { path: "lib/py/copy.py", sha: "f".repeat(40) }; + const blobs = { [pyDup.sha]: sourceWithBlock(PY_SHARED) }; + for (const f of tsFillers) { + blobs[f.sha] = `const noMatchFiller${f.sha.slice(-2)} = computeUnrelatedFillerValue(123456)`; + } + const req = { + repoFullName: "o/r", + prNumber: 1, + headSha: "a".repeat(40), + githubToken: "tok", + files: [ + { path: "src/scorer.ts", status: "added", patch: addedPatch(SHARED) }, + { path: "src/scorer.py", status: "added", patch: addedPatch(PY_SHARED) }, + ], + }; + const findings = await scanDuplication(req, makeFetch({ tree: [...tsFillers, pyDup], blobs })); + assert.ok( + findings.some((x) => x.sourceFile === "lib/py/copy.py" && x.file === "src/scorer.py"), + JSON.stringify(findings), + ); +}); + +test("scanDuplication: total blob fetches across all changed extensions never exceed the global MAX_FETCHES budget", async () => { + // Two extensions, each with 25 candidates (50 total). Round-robin under ONE global budget must cap total fetches. + const PY = [ + "py_total_reward = base_reward * decay_factor_for_epoch(epoch_age)", + "py_clamped_score = min(max_score, max(min_score, raw_score_value))", + "py_weighted_avg = 0 if total_weight == 0 else weighted_sum / total_weight", + "py_normalized = [value / vector_magnitude for value in raw_values]", + "py_adjusted_penalty = penalty_base * log2_of(violation_count + 1)", + "py_final_emission = py_total_reward - py_adjusted_penalty + bonus_amount", + "py_rounded = round(py_final_emission * 1000000) / 1000000.0", + "py_settled = py_rounded if is_finite_number(py_rounded) else 0.0", + ]; + const tsFillers = Array.from({ length: 25 }, (_, i) => ({ path: `lib/ts/f${i}.ts`, sha: "a".repeat(38) + i.toString().padStart(2, "0") })); + const pyFillers = Array.from({ length: 25 }, (_, i) => ({ path: `lib/py/f${i}.py`, sha: "b".repeat(38) + i.toString().padStart(2, "0") })); + const blobs = {}; + for (const f of [...tsFillers, ...pyFillers]) { + blobs[f.sha] = `const unrelatedFiller${f.sha.slice(-2)} = computeUnrelatedFillerValue(987654)`; + } + const counter = { calls: [], tree: 0, blob: 0 }; + const req = { + repoFullName: "o/r", + prNumber: 1, + headSha: "a".repeat(40), + githubToken: "tok", + files: [ + { path: "src/a.ts", status: "added", patch: addedPatch(SHARED) }, + { path: "src/b.py", status: "added", patch: addedPatch(PY) }, + ], + }; + await scanDuplication(req, makeFetch({ tree: [...tsFillers, ...pyFillers], blobs, counter })); + assert.ok(counter.blob <= 30, `blob fetches=${counter.blob} exceeded MAX_FETCHES (30)`); + assert.ok(counter.blob > 0, "expected some candidate fetches"); +}); + +test("extractAddedBlocks: empty/undefined patch → no blocks", () => { + assert.deepEqual(extractAddedBlocks(undefined), []); + assert.deepEqual(extractAddedBlocks(""), []); +}); + +test("scanDuplication: an oversized candidate blob is skipped so it cannot eat the budget", async () => { + // The SHARED block IS present in this candidate, but the file exceeds MAX_FILE_BYTES → never scanned → no finding. + const oversized = sourceWithBlock(SHARED) + "\n" + "x".repeat(500_001); + const fetch = makeFetch({ + tree: [{ path: "src/huge-generated.ts", sha: "b".repeat(40) }], + blobs: { ["b".repeat(40)]: oversized }, + }); + assert.deepEqual(await scanDuplication(baseReq(), fetch), []); +}); + +// ── scanDuplication detection ──────────────────────────────────────────────────── + +test("scanDuplication: detects a near-verbatim duplicate with correct head vs source location and line count", async () => { + const fetchImpl = makeFetch({ + tree: [{ path: "src/existing-scorer.ts", sha: "b".repeat(40) }], + blobs: { ["b".repeat(40)]: sourceWithBlock(SHARED, 5) }, + }); + const findings = await scanDuplication(baseReq(), fetchImpl); + assert.equal(findings.length, 1); + assert.equal(findings[0].file, "src/new-scorer.ts"); + assert.equal(findings[0].line, 10); // patch starts the added block at new-file line 10 + assert.equal(findings[0].sourceFile, "src/existing-scorer.ts"); + assert.equal(findings[0].sourceLine, 5); // SHARED block begins at line 5 in the source + assert.equal(findings[0].lines, SHARED.length); // entire 9-line run matched +}); + +test("scanDuplication: a run shorter than MIN_RUN is not flagged (no false positive)", async () => { + const short = SHARED.slice(0, 7); // 7 < MIN_RUN (8) + const req = baseReq({ + files: [{ path: "src/new-scorer.ts", status: "added", patch: addedPatch(short) }], + }); + const fetchImpl = makeFetch({ + tree: [{ path: "src/existing-scorer.ts", sha: "b".repeat(40) }], + blobs: { ["b".repeat(40)]: sourceWithBlock(short, 5) }, + }); + assert.deepEqual(await scanDuplication(req, fetchImpl), []); +}); + +test("scanDuplication: boilerplate-only / import-only added lines are never flagged", async () => { + const boilerplate = Array.from({ length: 12 }, (_, i) => `import { thing${i} } from './m${i}'`); + const req = baseReq({ + files: [{ path: "src/new-scorer.ts", status: "added", patch: addedPatch(boilerplate) }], + }); + const fetchImpl = makeFetch({ + tree: [{ path: "src/existing-scorer.ts", sha: "b".repeat(40) }], + blobs: { ["b".repeat(40)]: boilerplate.join("\n") }, + }); + assert.deepEqual(await scanDuplication(req, fetchImpl), []); +}); + +test("scanDuplication: the changed file itself is excluded as a candidate", async () => { + // Only candidate in the tree IS the changed file → no other source to match against → no finding. + const counter = { calls: [], tree: 0, blob: 0 }; + const fetchImpl = makeFetch({ + tree: [{ path: "src/new-scorer.ts", sha: "b".repeat(40) }], + blobs: { ["b".repeat(40)]: sourceWithBlock(SHARED, 5) }, + counter, + }); + assert.deepEqual(await scanDuplication(baseReq(), fetchImpl), []); + assert.equal(counter.blob, 0); // no blob fetched — the only tree entry was the changed file +}); + +test("scanDuplication: only same-extension candidates are considered", async () => { + const counter = { calls: [], tree: 0, blob: 0 }; + const fetchImpl = makeFetch({ + tree: [{ path: "src/existing-scorer.py", sha: "b".repeat(40) }], // .py vs changed .ts + blobs: { ["b".repeat(40)]: sourceWithBlock(SHARED, 5) }, + counter, + }); + assert.deepEqual(await scanDuplication(baseReq(), fetchImpl), []); + assert.equal(counter.blob, 0); // wrong extension → never fetched +}); + +// ── fail-safe guards ───────────────────────────────────────────────────────────── + +test("scanDuplication: fails safe with no githubToken", async () => { + assert.deepEqual(await scanDuplication(baseReq({ githubToken: undefined }), async () => { + throw new Error("should not fetch"); + }), []); +}); + +test("scanDuplication: fails safe with no headSha", async () => { + assert.deepEqual(await scanDuplication(baseReq({ headSha: undefined }), async () => { + throw new Error("should not fetch"); + }), []); +}); + +test("scanDuplication: fails safe on a bad repoFullName", async () => { + assert.deepEqual(await scanDuplication(baseReq({ repoFullName: "../evil" }), async () => { + throw new Error("should not fetch"); + }), []); +}); + +test("scanDuplication: fails safe when the tree fetch is non-OK", async () => { + const fetchImpl = makeFetch({ tree: [], blobs: {}, treeOk: false }); + assert.deepEqual(await scanDuplication(baseReq(), fetchImpl), []); +}); + +test("scanDuplication: fails safe on malformed tree json (no tree array)", async () => { + const fetchImpl = makeFetch({ tree: [], blobs: {}, treeBadJson: true }); + assert.deepEqual(await scanDuplication(baseReq(), fetchImpl), []); +}); + +test("scanDuplication: a candidate whose blob fetch is non-OK is skipped, scan continues", async () => { + const fetchImpl = makeFetch({ + tree: [ + { path: "src/broken.ts", sha: "b".repeat(40) }, // non-OK blob + { path: "src/existing-scorer.ts", sha: "c".repeat(40) }, // good duplicate + ], + blobs: { ["c".repeat(40)]: sourceWithBlock(SHARED, 5) }, + blobStatus: { ["b".repeat(40)]: 404 }, + }); + const findings = await scanDuplication(baseReq(), fetchImpl); + assert.equal(findings.length, 1); + assert.equal(findings[0].sourceFile, "src/existing-scorer.ts"); +}); + +test("scanDuplication: a candidate whose blob fetch throws is skipped, scan continues", async () => { + const fetchImpl = makeFetch({ + tree: [ + { path: "src/broken.ts", sha: "b".repeat(40) }, + { path: "src/existing-scorer.ts", sha: "c".repeat(40) }, + ], + blobs: { ["c".repeat(40)]: sourceWithBlock(SHARED, 5) }, + blobThrow: new Set(["b".repeat(40)]), + }); + const findings = await scanDuplication(baseReq(), fetchImpl); + assert.equal(findings.length, 1); + assert.equal(findings[0].sourceFile, "src/existing-scorer.ts"); +}); + +test("scanDuplication: a candidate blob with non-base64 encoding is skipped", async () => { + // "src/plain.ts" returns encoding utf8 (not base64) → decoded to null → skipped; the real dup still matches. + const fetchImpl = makeFetch({ + tree: [ + { path: "src/plain.ts", sha: "b".repeat(40) }, + { path: "src/existing-scorer.ts", sha: "c".repeat(40) }, + ], + blobs: { ["c".repeat(40)]: sourceWithBlock(SHARED, 5) }, // b-sha has no entry → utf8 fallback in mock + }); + const findings = await scanDuplication(baseReq(), fetchImpl); + assert.equal(findings.length, 1); + assert.equal(findings[0].sourceFile, "src/existing-scorer.ts"); +}); + +test("scanDuplication: a truncated tree still works on the returned entries", async () => { + const fetchImpl = makeFetch({ + tree: [{ path: "src/existing-scorer.ts", sha: "b".repeat(40) }], + blobs: { ["b".repeat(40)]: sourceWithBlock(SHARED, 5) }, + truncated: true, + }); + const findings = await scanDuplication(baseReq(), fetchImpl); + assert.equal(findings.length, 1); +}); + +// ── bounding ────────────────────────────────────────────────────────────────────── + +test("scanDuplication: respects MAX_FETCHES (caps candidate blob fetches at 30)", async () => { + // 50 same-extension non-matching candidates → only MAX_FETCHES (30) blobs fetched. + const tree = Array.from({ length: 50 }, (_, i) => ({ + path: `src/zzz/cand${String(i).padStart(3, "0")}.ts`, + sha: (i + 16).toString(16).padStart(40, "0"), + })); + const blobs = {}; + for (const t of tree) blobs[t.sha] = "const irrelevantNonMatchingContentLine = makeUniqueValue()"; + const counter = { calls: [], tree: 0, blob: 0 }; + const fetchImpl = makeFetch({ tree, blobs, counter }); + await scanDuplication(baseReq(), fetchImpl); + assert.equal(counter.blob, 30); +}); + +test("scanDuplication: respects MAX_CANDIDATES (only the closest 40 candidates are considered)", async () => { + // 100 candidates but MAX_CANDIDATES=40 caps the set; combined with MAX_FETCHES=30 only 30 blobs fetched, and the + // proximity sort means the in-directory candidate (sharing src/) is preferred and the match is still found. + const tree = [ + { path: "src/existing-scorer.ts", sha: "b".repeat(40) }, // shares src/ → high proximity → fetched first + ]; + for (let i = 0; i < 100; i++) { + tree.push({ + path: `far/away/dir/cand${String(i).padStart(3, "0")}.ts`, + sha: (i + 32).toString(16).padStart(40, "0"), + }); + } + const blobs = { ["b".repeat(40)]: sourceWithBlock(SHARED, 5) }; + for (const t of tree) if (!(t.sha in blobs)) blobs[t.sha] = "const unrelatedFillerContentLine = uniqueValue()"; + const counter = { calls: [], tree: 0, blob: 0 }; + const fetchImpl = makeFetch({ tree, blobs, counter }); + const findings = await scanDuplication(baseReq(), fetchImpl); + assert.equal(findings.length, 1); + assert.equal(findings[0].sourceFile, "src/existing-scorer.ts"); + assert.ok(counter.blob <= 30); // MAX_FETCHES still bounds the work +}); + +test("scanDuplication: an already-aborted signal yields [] without fetching", async () => { + const counter = { calls: [], tree: 0, blob: 0 }; + const fetchImpl = makeFetch({ + tree: [{ path: "src/existing-scorer.ts", sha: "b".repeat(40) }], + blobs: { ["b".repeat(40)]: sourceWithBlock(SHARED, 5) }, + counter, + }); + const findings = await scanDuplication(baseReq(), fetchImpl, { signal: AbortSignal.abort() }); + assert.deepEqual(findings, []); + assert.equal(counter.tree, 0); // never even fetched the tree +}); + +test("scanDuplication: no changed source files → [] without fetching", async () => { + const req = baseReq({ + files: [{ path: "README.md", status: "modified", patch: addedPatch(SHARED) }], + }); + let called = false; + await scanDuplication(req, async () => { + called = true; + throw new Error("nope"); + }); + assert.equal(called, false); +}); + +test("scanDuplication: removed files and no added-block files are ignored", async () => { + const req = baseReq({ + files: [ + { path: "src/gone.ts", status: "removed", patch: addedPatch(SHARED) }, // removed → skipped + ], + }); + let called = false; + await scanDuplication(req, async () => { + called = true; + throw new Error("nope"); + }); + assert.equal(called, false); +}); + +// ── render ────────────────────────────────────────────────────────────────────── + +test("renderBrief emits a public-safe duplication block with file:line, escaping paths, never the code", () => { + const { promptSection } = renderBrief({ + duplication: [ + { file: "src/new-scorer.ts", line: 10, sourceFile: "src/existing-scorer.ts", sourceLine: 5, lines: 9 }, + ], + }); + assert.match(promptSection, /Near-verbatim duplicated code/); + assert.match(promptSection, /`src\/new-scorer\.ts:10`/); + assert.match(promptSection, /`src\/existing-scorer\.ts:5`/); + assert.match(promptSection, /~9 lines/); + // No code content from SHARED ever leaks into the rendered brief. + assert.ok(!promptSection.includes("totalRewardScaled")); +}); + +test("renderBrief escapes a backtick in a duplication path (no code-span breakout)", () => { + const { promptSection } = renderBrief({ + duplication: [ + { file: "src/we`ird.ts", line: 1, sourceFile: "src/o`ther.ts", sourceLine: 2, lines: 8 }, + ], + }); + assert.ok(!promptSection.includes("we`ird")); // raw backtick neutralized by safeCodeSpan +});