diff --git a/.github/workflows/indexer-backfill.yml b/.github/workflows/indexer-backfill.yml index f67852023..3f8ddd01d 100644 --- a/.github/workflows/indexer-backfill.yml +++ b/.github/workflows/indexer-backfill.yml @@ -62,10 +62,15 @@ on: default: '50' type: string path_prefix: - description: 'Scope crawl to one path prefix (e.g. .agents/skills); empty = all prefixes' + description: 'Scope crawl to one path prefix (e.g. .agents/skills); empty = broad query (all depths)' required: false default: '' type: string + max_ranges: + description: 'SMI-5286 1c: per-dispatch size-(sub)range budget before a checkpoint+exit (default: 150)' + required: false + default: '150' + type: string supabase_env: description: 'Target Supabase environment' required: false @@ -181,6 +186,14 @@ jobs: RESUME_FROM: ${{ github.event.inputs.resume_from || 'latest' }} BACKFILL_MAX_SKILLS_PER_REPO: ${{ github.event.inputs.max_skills_per_repo || '50' }} BACKFILL_PATH_PREFIX: ${{ github.event.inputs.path_prefix || '' }} + # SMI-5286 1c: run ONLY Phase 3 (the size-faceted subdirectory crawl) + + # finalize each dispatch — topic/high-trust are the cron's job, so this + # keeps every backfill dispatch focused and resumable on the facet cursor. + DISCOVERY_PHASE: '3' + # SMI-5286 1c: per-dispatch (sub)range budget — the facet driver writes a + # checkpoint after this many ranges so the run fits the GHA cap; re-dispatch + # with resume_from=latest until current_facet=done. + BACKFILL_MAX_RANGES: ${{ github.event.inputs.max_ranges || '150' }} # Raised caps for backfill mode (per SPARC section #3). # These override the conservative cron defaults.
CODE_SEARCH_MAX_PAGES: '10' @@ -224,6 +237,11 @@ jobs: CAP_SATURATED=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.cap_saturated // false') TRUNCATED_REPO_COUNT=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.truncated_repo_count // 0') TOKEN_SOURCE=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.token_source // "unknown"') + # SMI-5286 1c (M-2): true crawl position. current_facet == 'done' is the + # AUTHORITATIVE terminal signal (facets_remaining alone reads 0 while the + # last facet's bisected sub-ranges still drain). + CURRENT_FACET=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.current_facet // "unknown"') + PENDING_SUBRANGES=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.pending_subrange_count // 0') # CRITICAL: token_source must be exactly "pat" on every backfill dispatch. # An "app" value means the App env entries leaked in (consuming the cron's @@ -247,15 +265,19 @@ jobs: echo "| Facets Total | $FACETS_TOTAL |" echo "| Facets Completed | $FACETS_COMPLETED |" echo "| Facets Remaining | $FACETS_REMAINING |" + echo "| Current Facet | $CURRENT_FACET |" + echo "| Pending Sub-ranges | $PENDING_SUBRANGES |" echo "| Checkpoint ID | $CHECKPOINT_ID |" echo "| Cap Saturated | $CAP_SATURATED |" echo "| Truncated Repo Count | $TRUNCATED_REPO_COUNT |" echo "| Token Source | $TOKEN_SOURCE |" echo "" - if [ "$FACETS_REMAINING" = "0" ]; then - echo "**TERMINAL CONDITION MET**: facets_remaining == 0. Backfill loop is complete." + # current_facet == 'done' is authoritative: it is set only when the + # ladder AND the bisection frontier are both exhausted (SMI-5286 1c C-1/M-2). + if [ "$CURRENT_FACET" = "done" ]; then + echo "**TERMINAL CONDITION MET**: current_facet == 'done'. Backfill loop is complete." else - echo "**Backfill continues.** Re-dispatch with resume_from=latest to pick up from checkpoint $CHECKPOINT_ID." + echo "**Backfill continues** (current_facet=$CURRENT_FACET, pending_subranges=$PENDING_SUBRANGES). 
Re-dispatch with resume_from=latest to pick up from checkpoint $CHECKPOINT_ID." fi } >> "$GITHUB_STEP_SUMMARY" diff --git a/CHANGELOG.md b/CHANGELOG.md index 69a49303c..4143b2bbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- **Indexer Backfill Facet Driver** (2026-06-18, SMI-5286 sub-wave 1c): the + out-of-band backfill (`indexer-backfill.yml`) now crawls the full + `filename:SKILL.md` universe past GitHub code-search's 1000-result-per-query + cap by partitioning the broad query into a fixed `size:` byte-range ladder + (`code-search.facets.ts`) with adaptive bisect-on-saturation: any facet whose + `total_count` exceeds the cap is split and its halves crawled before the next + facet, so every file is reachable. The depth-first frontier (facet index + + bisection stack + page) is fully captured by the checkpoint cursor — extended + with `pending_subranges` — so a dispatch boundary mid-bisection resumes + losslessly across the 6h GHA cap. `per_page` raised 30→100; `BACKFILL_PATH_PREFIX` + scopes a one-ecosystem DRY_RUN; `DISCOVERY_PHASE=3` focuses each dispatch on the + Phase-3b crawl + finalize. Also fixes a latent root-`SKILL.md` drop in + `fetchSkillPathsFromTree` (`trees-search.ts`) — repos whose only skill is a root + `SKILL.md` are now emitted as `path:''` instead of silently lost. Gated: the + live (`DRY_RUN=false`) crawl requires explicit operator sign-off. 
- **Vendor-Org Trust Tier** (2026-05-02, SMI-4651): GitHub-verified vendor organizations (Stripe, Notion, Atlassian, Figma, Canva, Zapier, Cloudflare, and any future verified org) are now auto-promoted to the `curated` trust diff --git a/scripts/indexer/backfill-checkpoint.ts b/scripts/indexer/backfill-checkpoint.ts index 1c66f2a63..c157de2b7 100644 --- a/scripts/indexer/backfill-checkpoint.ts +++ b/scripts/indexer/backfill-checkpoint.ts @@ -22,21 +22,174 @@ */ import type { SupabaseClient } from '@supabase/supabase-js' +import { type SizeFacet, buildSizeFacets, facetId, bisectFacet } from './code-search.facets.ts' /** `event_type` discriminator for backfill checkpoint rows in `audit_logs`. */ export const BACKFILL_CHECKPOINT_EVENT_TYPE = 'indexer_backfill_checkpoint' +/** + * A persisted size sub-range `[lo, hi]`. `hi` is `null` when the range is + * open-ended (`Infinity`) — `Infinity` does NOT survive `JSON.stringify` + * (it serializes to `null`), so the cursor uses an explicit `null` sentinel and + * {@link deserializeRange} maps it back to `Number.POSITIVE_INFINITY`. + */ +export type PersistedSubrange = [number, number | null] + /** * Resume cursor. `(path, facet, last_page)` lets a re-dispatch resume mid-facet, * not just at facet boundaries (SPARC §#5 facet-AND-page granularity). */ export interface BackfillCursor { - /** The path-prefix facet being crawled (e.g. '.agents/skills'). */ + /** The path-prefix being crawled ('' = the broad, no-`path:` query). */ path: string - /** The active facet window within that path (e.g. a date/size bucket; Wave 1c). */ + /** Stable id of the active size facet/sub-range ({@link facetId}); 'done' when complete. */ facet: string - /** Last code-search page consumed within the current facet (1-based). */ + /** Last code-search page consumed within the current (sub)range (1-based; 0 = none yet). 
*/ + last_page: number + /** + * SMI-5286 1c: 0-based index of the next top-level facet to process in the + * static {@link buildSizeFacets} ladder. Incremented when a top-level facet is + * RETIRED — either fully drained OR bisected (its sub-ranges, tracked in + * `pending_subranges`, then cover it). So it counts top-level facets whose + * coverage is committed, NOT necessarily finished crawling; use `current_facet` + * / `pending_subrange_count` (in the run summary) to tell 'bisecting' from 'done'. + */ + facet_index?: number + /** + * SMI-5286 1c: the in-progress bisection frontier — sub-ranges of the current + * facet not yet fully crawled (DFS stack; the LAST element is crawled next). + * Persisted so a dispatch boundary mid-bisection resumes without losing + * not-yet-crawled sub-ranges (the bare `(path,facet,last_page)` cursor cannot + * represent a partial bisection tree, C-2). + */ + pending_subranges?: PersistedSubrange[] +} + +/** Map a runtime {@link SizeFacet} to its JSON-safe persisted form (`Infinity` → `null`). */ +function serializeRange(facet: SizeFacet): PersistedSubrange { + return [facet.lo, Number.isFinite(facet.hi) ? facet.hi : null] +} + +/** Map a persisted sub-range back to a runtime {@link SizeFacet} (`null` → `Infinity`). */ +function deserializeRange([lo, hi]: PersistedSubrange): SizeFacet { + return { lo, hi: hi == null ? Number.POSITIVE_INFINITY : hi } +} + +/** + * Runtime crawl frontier reconstructed from a {@link BackfillCursor}. The facet + * driver is a depth-first walk of the static size ladder: each top-level facet + * that saturates the 1000-result cap is bisected into `pendingSubranges`, which + * are drained (themselves bisecting further) before the next top-level facet is crawled. + */ +export interface FacetCrawlState { + /** Index into {@link buildSizeFacets} of the top-level facet crawled when `pendingSubranges` is empty (already past any bisected facet). */ + facetIndex: number + /** DFS stack of sub-ranges still to crawl for the current facet; the LAST element is crawled next.
*/ + pendingSubranges: SizeFacet[] + /** Last page consumed within the current (sub)range (0 = none). */ + lastPage: number +} + +/** Reconstruct the crawl frontier from a persisted cursor (or a cold start). */ +export function cursorToFacetState(cursor: BackfillCursor | null | undefined): FacetCrawlState { + if (!cursor) return { facetIndex: 0, pendingSubranges: [], lastPage: 0 } + return { + facetIndex: cursor.facet_index ?? 0, + pendingSubranges: (cursor.pending_subranges ?? []).map(deserializeRange), + lastPage: cursor.last_page ?? 0, + } +} + +/** + * The range currently being crawled: the top (last element) of the bisection stack, else the + * top-level facet at `facetIndex`. `null` once the ladder is exhausted. + */ +export function currentFacetRange( + state: FacetCrawlState, + facets: SizeFacet[] = buildSizeFacets() +): SizeFacet | null { + if (state.pendingSubranges.length > 0) { + return state.pendingSubranges[state.pendingSubranges.length - 1] + } + if (state.facetIndex < facets.length) return facets[state.facetIndex] + return null +} + +/** + * Replace the current saturated range with its two halves (the first half is + * crawled next). Resets the page cursor. Returns false when the range cannot + * subdivide (the caller then records truncation and advances). + * + * Retirement: a saturated range is REPLACED by its halves, so it must never be + * revisited. If it was a sub-range (stack non-empty) we pop it; if it was the + * TOP-LEVEL facet (stack empty) we advance `facetIndex` past it before pushing — + * otherwise, once the halves drain, `currentFacetRange` would return the same + * top-level facet again, it would re-saturate, and the crawl would loop forever + * without advancing `facets_completed` (governance C-1).
+ */ +export function bisectCurrentFacet(state: FacetCrawlState, range: SizeFacet): boolean { + const halves = bisectFacet(range) + if (!halves) return false + if (state.pendingSubranges.length > 0) { + state.pendingSubranges.pop() // retire the sub-range being bisected + } else { + state.facetIndex++ // retire the top-level facet — its halves now cover it + } + // Push so halves[0] ends up on top (LIFO) → the lower sub-range is crawled next. + state.pendingSubranges.push(halves[1], halves[0]) + state.lastPage = 0 + return true +} + +/** + * Advance past the current exhausted (or unbisectable-saturated) range: pop the + * bisection stack if non-empty, else advance the top-level facet index. Resets + * the page cursor. + */ +export function advanceFacet(state: FacetCrawlState): void { + if (state.pendingSubranges.length > 0) state.pendingSubranges.pop() + else state.facetIndex++ + state.lastPage = 0 +} + +/** True when every top-level facet AND its bisection frontier are exhausted. */ +export function isFacetCrawlDone( + state: FacetCrawlState, + facets: SizeFacet[] = buildSizeFacets() +): boolean { + return state.facetIndex >= facets.length && state.pendingSubranges.length === 0 +} + +/** Serialize the crawl frontier back into a persisted {@link BackfillCursor}. */ +export function facetStateToCursor( + state: FacetCrawlState, + pathPrefix: string, + facets: SizeFacet[] = buildSizeFacets() +): BackfillCursor { + const range = currentFacetRange(state, facets) + return { + path: pathPrefix, + facet: range ? facetId(range) : 'done', + last_page: state.lastPage, + facet_index: state.facetIndex, + pending_subranges: state.pendingSubranges.map(serializeRange), + } +} + +/** + * The outcome of one dispatch's facet crawl: the advanced cursor to persist, a + * terminal flag, and the operator-observable counters. Lives here (not in + * `subdirectory-search.ts`) so `indexer-types.ts` can reference it without + * importing the search module. 
+ */ +export interface BackfillCrawlOutcome { + cursor: BackfillCursor + done: boolean + cap_saturated: boolean + truncated_repo_count: number + facets_completed: number + facets_total: number + ranges_crawled: number } /** @@ -113,6 +266,15 @@ export interface BackfillSummary { facets_remaining: number cap_saturated: boolean truncated_repo_count: number + /** + * SMI-5286 1c (M-2): true crawl position. `facets_remaining` is coarse — it + * reads 0 once the last top-level facet is retired even while its bisected + * sub-ranges are still draining. `current_facet` (the active (sub)range id, or + * 'done') + `pending_subrange_count` (bisection-frontier depth) let the operator + * distinguish "finished" from "still bisecting". + */ + current_facet?: string + pending_subrange_count?: number } /** diff --git a/scripts/indexer/code-search.facets.ts b/scripts/indexer/code-search.facets.ts new file mode 100644 index 000000000..854715c35 --- /dev/null +++ b/scripts/indexer/code-search.facets.ts @@ -0,0 +1,139 @@ +/** + * Size-facet partitioner for the broad `filename:SKILL.md` code-search backfill + * @module scripts/indexer/code-search.facets + * + * SMI-5286 Wave 1c: the broad community code-search query + * (`filename:SKILL.md`) saturates GitHub's hard 1000-result ceiling, so a single + * paginated pass can never reach the long tail. This module partitions that one + * query by file SIZE into a fixed ladder of disjoint, exhaustive byte-size + * buckets so each sub-query returns < 1000 results, with adaptive + * bisect-on-saturation for the dense low buckets. + * + * Why size and not date: GitHub /search/code's `size:` qualifier IS a real, + * probe-verified filter, whereas `created:`/`pushed:` are tokenized as + * free-text content (SMI-5176) and crush results to files that literally + * contain the date string. Size is therefore the only viable partitioner. 
+ * + * This module is pure (no I/O, no GitHub dependency): it produces facet ranges, + * stable labels, the `size:` qualifier string, and a bisection helper. The + * caller (the facet driver) owns dispatch, pagination, and the checkpoint + * cursor; it passes the already-formatted qualifier string into + * `code-search.ts` so that file stays free of any facet dependency. + */ + +/** A disjoint, inclusive byte-size bucket over the SKILL.md blob size. */ +export interface SizeFacet { + /** Inclusive lower byte bound (>= 0). */ + lo: number + /** Inclusive upper byte bound; Number.POSITIVE_INFINITY for the open-ended final bucket. */ + hi: number +} + +/** + * The fixed, pre-enumerated size-bucket ladder. Returns the SAME array every + * call so `facets_total = buildSizeFacets().length` is STATIC across dispatches + * (the checkpoint cursor's `facets_completed` count is meaningless if the ladder + * can change). Buckets are disjoint and EXHAUSTIVELY cover [0, ∞): + * facets[0].lo === 0; facets[i+1].lo === facets[i].hi + 1; last.hi === Infinity. + * + * The ladder doubles each bucket's width as size grows. SKILL.md files are + * small, so the low buckets are dense and WILL bisect at runtime — that is the + * expected, designed behaviour of the adaptive split. + * + * @returns The frozen 9-bucket size ladder (stable identity across calls) + */ +export function buildSizeFacets(): SizeFacet[] { + return SIZE_FACETS +} + +/** + * The canonical size-bucket ladder, enumerated once at module load so + * `buildSizeFacets()` returns a stable array identity. Buckets are + * inclusive-inclusive, disjoint, contiguous, and exhaustively cover [0, ∞): + * the first bucket starts at 0, each subsequent `lo` is the prior `hi + 1`, the + * width doubles each step, and the final bucket is open-ended. 
+ */ +const SIZE_FACETS: SizeFacet[] = Object.freeze([ + { lo: 0, hi: 127 }, + { lo: 128, hi: 255 }, + { lo: 256, hi: 511 }, + { lo: 512, hi: 1023 }, + { lo: 1024, hi: 2047 }, + { lo: 2048, hi: 4095 }, + { lo: 4096, hi: 8191 }, + { lo: 8192, hi: 16383 }, + { lo: 16384, hi: Number.POSITIVE_INFINITY }, +]) as SizeFacet[] + +/** + * Stable label for a facet, used as the checkpoint cursor `facet` string. + * Finite: `${lo}-${hi}`. Open-ended: `${lo}+`. + * + * @param facet - The size bucket to label + * @returns A stable, human-readable facet identifier + */ +export function facetId(facet: SizeFacet): string { + return facet.hi === Number.POSITIVE_INFINITY ? `${facet.lo}+` : `${facet.lo}-${facet.hi}` +} + +/** + * The GitHub /search/code size qualifier for this facet. INCLUSIVE-INCLUSIVE: + * finite → `size:${lo}..${hi}`; open-ended (hi === Infinity) → `size:>=${lo}`. + * (Off-by-one boundaries double-count — buckets are already inclusive-inclusive.) + * + * @param facet - The size bucket to render + * @returns The `size:` qualifier string to append to the code-search query + */ +export function facetToQualifier(facet: SizeFacet): string { + return facet.hi === Number.POSITIVE_INFINITY + ? `size:>=${facet.lo}` + : `size:${facet.lo}..${facet.hi}` +} + +/** + * Upper ceiling for open-ended bisection (bytes). A SKILL.md larger than 4 MiB is + * not a real skill, so once an open-ended bucket's lower bound passes this the + * tail is treated as unsplittable (the caller records it truncated rather than + * bisecting forever — doubling an open-ended range never reaches `lo === hi`, so + * a persistently-saturating open-ended facet would otherwise loop infinitely). + */ +const OPEN_ENDED_BISECT_CEILING = 4 * 1024 * 1024 + +/** + * Split a facet into two disjoint, contiguous, inclusive halves that together + * cover the SAME range (used when a facet saturates the 1000-result cap). + * Finite: mid = lo + floor((hi - lo) / 2) → [{lo, hi: mid}, {lo: mid+1, hi}]. 
+ * Open-ended (hi === Infinity): pivot by doubling → + * [{lo, hi: lo*2 - 1}, {lo: lo*2, hi: Infinity}] (requires 0 < lo < ceiling). + * Returns null when the facet CANNOT subdivide (finite with lo >= hi; open-ended + * with lo === 0; or open-ended past {@link OPEN_ENDED_BISECT_CEILING}). + * + * @param facet - The saturated size bucket to bisect + * @returns A two-element tuple of contiguous halves, or null when unsplittable + */ +export function bisectFacet(facet: SizeFacet): [SizeFacet, SizeFacet] | null { + if (facet.hi === Number.POSITIVE_INFINITY) { + // Open-ended bucket: pivot by doubling the lower bound. A bucket starting at + // 0 cannot double (0 * 2 === 0), and past the ceiling there are no real + // skills left to partition — both are unsplittable, guard them so a + // persistently-saturating open-ended tail terminates instead of doubling + // forever. + if (facet.lo <= 0 || facet.lo >= OPEN_ENDED_BISECT_CEILING) { + return null + } + const pivot = facet.lo * 2 + return [ + { lo: facet.lo, hi: pivot - 1 }, + { lo: pivot, hi: Number.POSITIVE_INFINITY }, + ] + } + // Finite bucket: a single-byte (or inverted) range cannot subdivide. + if (facet.lo >= facet.hi) { + return null + } + const mid = facet.lo + Math.floor((facet.hi - facet.lo) / 2) + return [ + { lo: facet.lo, hi: mid }, + { lo: mid + 1, hi: facet.hi }, + ] +} diff --git a/scripts/indexer/code-search.ts b/scripts/indexer/code-search.ts index fdd644ff9..c25e0376a 100644 --- a/scripts/indexer/code-search.ts +++ b/scripts/indexer/code-search.ts @@ -75,7 +75,10 @@ const RETRY_DELAYS = [1000, 2000, 4000] */ export async function searchCodeForSkillMd( page: number, - perPage = 30, + // SMI-5286 1c: default per_page raised 30 → 100 (GitHub max) so each page + // drains the 1000-result ceiling in fewer requests. The root phase stays + // disabled in 1c, so no size facet is threaded here. 
+ perPage = 100, telemetry: RateLimitTelemetry ): Promise<{ repos: GitHubRepository[]; total: number; retries: number; error?: string }> { // Build query: find root-level SKILL.md files. @@ -215,8 +218,14 @@ export function extractSkillPath(itemPath: string): string { export async function searchCodeForSkillMdInSubdirectory( pathPrefix: string | undefined, page: number, - perPage = 30, - telemetry: RateLimitTelemetry + // SMI-5286 1c: default per_page raised 30 → 100 (GitHub max). + perPage = 100, + telemetry: RateLimitTelemetry, + // SMI-5286 1c: optional pre-formatted GitHub `size:` qualifier (e.g. + // `size:0..127`) appended to the query so the broad backfill can partition the + // 1000-result-capped query by file size. The caller (the facet driver) formats + // it via code-search.facets.ts; this file stays free of the facet dependency. + sizeQualifier?: string ): Promise<{ repos: GitHubRepository[] total: number @@ -238,7 +247,11 @@ export async function searchCodeForSkillMdInSubdirectory( // Build query: broad (no path constraint) or scoped to pathPrefix. // SMI-5176: date qualifiers (created:>/pushed:>) are NOT functional on GitHub // code search — they are tokenized as free-text content. No freshness qualifier. - const queryStr = pathPrefix ? `filename:SKILL.md path:${pathPrefix}` : 'filename:SKILL.md' + const baseQuery = pathPrefix ? `filename:SKILL.md path:${pathPrefix}` : 'filename:SKILL.md' + // SMI-5286 1c: append the size facet qualifier (already INCLUSIVE-INCLUSIVE, + // e.g. `size:0..127`) BEFORE encoding so the partitioned backfill stays under + // the 1000-result ceiling. The qualifier is part of queryStr pre-encode. + const queryStr = sizeQualifier ? 
`${baseQuery} ${sizeQualifier}` : baseQuery const query = encodeURIComponent(queryStr) const url = `https://api.github.com/search/code?q=${query}&per_page=${perPage}&page=${page}` diff --git a/scripts/indexer/discovery-orchestrator.ts b/scripts/indexer/discovery-orchestrator.ts index e14d4a4da..5d510f3ba 100644 --- a/scripts/indexer/discovery-orchestrator.ts +++ b/scripts/indexer/discovery-orchestrator.ts @@ -25,7 +25,7 @@ import { type RateLimitTelemetry, } from './_shared/rate-limit.ts' import { type SkillMdValidation } from './skill-processor.ts' -import { runSubdirectorySearch } from './subdirectory-search.ts' +import { runSubdirectorySearchPhase, type BackfillFacetPlan } from './subdirectory-search.ts' import { runCategorization, runCodeSearch, runUpsertPhase } from './indexer-runners.ts' import { applyTreeHashTouches, type TreeHashTouchEntry } from './tree-hash-touch.ts' import type { RotationSource } from './topic-rotation.ts' @@ -124,6 +124,13 @@ export interface RunDiscoveryParams { * real skills). Default false → byte-identical cron path. */ backfillMode?: boolean + /** + * SMI-5286 1c: when set (backfill dispatches), Phase 3b runs the resumable + * size-faceted crawl from this plan's cursor instead of the legacy broad+ + * fallback loop. `run.ts` builds it from the checkpoint; the advanced cursor + * returns on `result.backfill_crawl`. + */ + backfillFacetPlan?: BackfillFacetPlan } export async function runDiscovery(params: RunDiscoveryParams): Promise { @@ -148,6 +155,7 @@ export async function runDiscovery(params: RunDiscoveryParams): Promise { const treeHashCache = (result as { tree_hash_cache?: { hits?: number; misses?: number } } | null) ?.tree_hash_cache - // SMI-5286 Wave 1b (§#2): on a backfill dispatch, attach a `backfill` sub-object - // onto `data` so `indexer-backfill.yml` can read `data.backfill.token_source` - // (its guardian fails the run if it reads 'app', proving PAT-bucket isolation). 
- // Facet counters are 0 in 1b (facet partitioning lands in 1c); token_source is - // the load-bearing field. Spread keeps the existing IndexerResult fields under - // `data` intact. Only emitted when BACKFILL_MODE is true. + // SMI-5286 (§#2): on a backfill dispatch, attach a `backfill` sub-object onto + // `data` so `indexer-backfill.yml` can read `data.backfill.token_source` (its + // guardian fails the run if it reads 'app', proving PAT-bucket isolation). + // 1c: the facet counters are sourced from `result.backfill_crawl` (the advanced + // cursor outcome from Phase 3b); `current_facet == 'done'` is the terminal + // condition the operator loop watches. Spread keeps the existing IndexerResult + // fields under `data` intact. Only emitted when BACKFILL_MODE is true. let data: unknown = result if (env.BACKFILL_MODE && result && typeof result === 'object') { + const crawl = (result as { backfill_crawl?: IndexerResult['backfill_crawl'] }).backfill_crawl const backfill: BackfillSummary = { token_source: resolveTokenSource(), checkpoint_id: checkpointId, - facets_total: 0, - facets_completed: 0, - facets_remaining: 0, - cap_saturated: false, - truncated_repo_count: 0, + facets_total: crawl?.facets_total ?? 0, + facets_completed: crawl?.facets_completed ?? 0, + facets_remaining: crawl ? crawl.facets_total - crawl.facets_completed : 0, + cap_saturated: crawl?.cap_saturated ?? false, + truncated_repo_count: crawl?.truncated_repo_count ?? 0, + // M-2: honest crawl position — 'done' only when the bisection frontier is + // also empty (facets_remaining alone reads 0 while sub-ranges still drain). + current_facet: crawl?.cursor.facet, + pending_subrange_count: crawl?.cursor.pending_subranges?.length ??
0, } data = { ...(result as Record), backfill } } diff --git a/scripts/indexer/subdirectory-search.helpers.ts b/scripts/indexer/subdirectory-search.helpers.ts new file mode 100644 index 000000000..e205f640c --- /dev/null +++ b/scripts/indexer/subdirectory-search.helpers.ts @@ -0,0 +1,308 @@ +/** + * Subdirectory-search helpers: shared per-skill result processor + the SMI-5286 + * 1c size-faceted backfill crawl. + * @module scripts/indexer/subdirectory-search.helpers + * + * Extracted from `subdirectory-search.ts` to keep that entrypoint under the + * 500-line CI gate (SMI-5286 1c). `processSearchResults` is shared by the legacy + * broad/fallback loop AND the backfill crawl; `runBackfillFacetCrawl` is the + * size-faceted depth-first driver. The dependency is one-way + * (`subdirectory-search.ts` → this file) — this file never imports the entrypoint. + * + * NOT parity-guarded (`parity.test.ts` exempts the subdirectory surface, C-2), + * so divergence from the Deno copy is safe and intended (the backfill engine is + * the Node GHA runner only). 
+ */ + +import { delay, type RateLimitTelemetry } from './_shared/rate-limit.ts' +import { searchCodeForSkillMdInSubdirectory } from './code-search.ts' +import { checkSkillMdExists } from './skill-processor.ts' +import { fetchRepoLicense, isPermissiveLicense } from './license-filter.ts' +import { enumerateRepoSkillPaths, type EnumerateTelemetry } from './trees-enumerate.ts' +import { buildSkillTreeUrl } from './skill-url.ts' +import { buildSizeFacets, facetId, facetToQualifier } from './code-search.facets.ts' +import { + type BackfillCursor, + type BackfillCrawlOutcome, + advanceFacet, + bisectCurrentFacet, + cursorToFacetState, + currentFacetRange, + facetStateToCursor, + isFacetCrawlDone, +} from './backfill-checkpoint.ts' +import type { GitHubRepository } from './topic-search.ts' +import type { SkillMdValidation } from './skill-processor.ts' + +/** + * Process code search results: deduplicate, license-gate, validate, and collect repos. + * Shared by both broad and fallback search paths. + * + * SMI-4852: Threads `telemetry` to downstream `fetchRepoLicense` and + * `checkSkillMdExists` calls so every GitHub API hit lands in the shared + * collector. + * + * SMI-5286 Wave 1a (§#1, C-1): per-skill (collection) extraction. Each candidate + * repo is enumerated ONCE via the Trees API (`enumerateRepoSkillPaths`) and EVERY + * valid SKILL.md parent dir becomes its own `GitHubRepository`, with a DISTINCT + * per-skill tree URL (`buildSkillTreeUrl`) so N skills in one repo yield N distinct + * `repo_url` rows that never collide on `onConflict: 'repo_url'`. Each per-path row + * is validated independently (§#4 strict gate) before it is collected; validated + * rows are `installable:true` (`skill-processor.ts:440` then persists the non-null + * tree URL). Edit E: only the enumeration loop changed — the dedup-key / + * freshness-qualifier lines (`:89`) are byte-stable for the SMI-5176 rebase. 
+ */ +export async function processSearchResults( + resultRepos: GitHubRepository[], + seenUrls: Set, + validationCache: Map, + validationOptions: { strictValidation?: boolean; minContentLength?: number }, + repos: GitHubRepository[], + stats: { licenseFiltered: number; licenseFetchFailed: number }, + telemetry: RateLimitTelemetry, + enumerateTelemetry: EnumerateTelemetry, + enumeratedRepos: Set +): Promise { + for (const repo of resultRepos) { + // Deduplication key includes skillPath: one repo can have multiple skills + const dedupKey = repo.skillPath ? `${repo.url}/${repo.skillPath}` : repo.url + if (seenUrls.has(dedupKey)) continue + + // License gate: fetch SPDX from GitHub API (not included in code search response) + const { license: spdxId, fetchFailed } = await fetchRepoLicense( + repo.owner, + repo.repoName, + telemetry + ) + + if (fetchFailed) { + // API failure — skip this run but don't count as license-filtered. + // NOT added to seenUrls so the repo is retried on the next indexer run. + console.log(`[BroadDiscovery] License fetch failed (will retry next run): ${repo.fullName}`) + stats.licenseFetchFailed++ + await delay(200) + continue + } + + if (!isPermissiveLicense(spdxId)) { + // Confirmed non-permissive license — permanently excluded. + console.log(`[BroadDiscovery] License excluded: ${repo.fullName} spdx=${spdxId ?? 'null'}`) + stats.licenseFiltered++ + await delay(200) + continue + } + + // Mark this code-search result consumed (per-repo+skillPath identity) so the + // same surfaced file is not re-processed across pages/prefixes. + seenUrls.add(dedupKey) + + // SMI-5286 Wave 1a (§#1): enumerate the repo's full tree ONCE. The broad query + // can surface the same repo via multiple SKILL.md hits; guard re-enumeration. 
+ const repoKey = `${repo.owner}/${repo.repoName}` + if (enumeratedRepos.has(repoKey)) { + await delay(50) + continue + } + enumeratedRepos.add(repoKey) + + const { entries, truncatedByApi, truncatedByCap } = await enumerateRepoSkillPaths( + repo.owner, + repo.repoName, + repo.defaultBranch, + telemetry, + enumerateTelemetry + ) + + if (truncatedByApi) { + // Trees API truncated — do NOT emit a partial set (deterministic skip). + console.log( + `[BroadDiscovery] Trees truncated, skipping for manual handling: ${repo.fullName}` + ) + await delay(50) + continue + } + if (truncatedByCap) { + console.log(`[BroadDiscovery] Per-repo cap reached, taking first N: ${repo.fullName}`) + } + + // Validate each enumerated SKILL.md independently (§#4 strict gate) and emit + // one per-skill GitHubRepository with a distinct tree URL (C-1) per valid path. + for (const entry of entries) { + const skillPath = entry.path + const installable = await checkSkillMdExists( + repo.owner, + repo.repoName, + repo.defaultBranch, + validationCache, + telemetry, + skillPath, + validationOptions + ) + + // C-1: build the per-skill tree URL from the BARE repo html_url + // (reconstructed from owner/repoName), NOT from `repo.url` — by this point + // `repo.url` is already the code-search mapper's tree URL, so reusing it + // would double the `/tree/` segment. `skillUrl` already encodes + // `skillPath`, so it alone is the dedup key. + const skillUrl = buildSkillTreeUrl( + `https://github.com/${repo.owner}/${repo.repoName}`, + repo.defaultBranch, + skillPath + ) + if (seenUrls.has(skillUrl)) continue + seenUrls.add(skillUrl) + + repos.push({ + ...repo, + url: skillUrl, + installable, + skillPath, + treeHash: entry.blobSha, + license: spdxId, + }) + await delay(50) + } + } +} + +/** + * SMI-5286 1c: a single dispatch's facet-crawl plan. 
The driver in `run.ts` + * builds this from the resumed checkpoint cursor + raised caps and hands it to + * `runSubdirectorySearch`; the returned {@link BackfillCrawlOutcome} carries the + * advanced cursor back for the next checkpoint write. + */ +export interface BackfillFacetPlan { + /** Cursor to resume from (null = cold start at facet 0, page 0). */ + startCursor: BackfillCursor | null + /** + * Restrict the crawl to this single `path:` prefix (the `BACKFILL_PATH_PREFIX` + * one-ecosystem DRY_RUN / targeted-recovery mode). `undefined` = the broad + * `filename:SKILL.md` query (no `path:` constraint), which subsumes root + + * every subdirectory. + */ + pathPrefix: string | undefined + /** Results per code-search page (GitHub max 100). */ + perPage: number + /** Pages to crawl per (sub)range before treating it as exhausted (≈ ceil(1000 / perPage)). */ + maxPagesPerRange: number + /** Dispatch budget: stop after this many (sub)ranges so the run fits the GHA cap. */ + maxRangesPerDispatch: number +} + +/** GitHub code-search retrievable-results ceiling per query (any query caps here). */ +const CODE_SEARCH_RESULT_CAP = 1000 + +/** + * SMI-5286 1c: depth-first size-faceted crawl of the broad `filename:SKILL.md` + * query (or a single `path:` prefix). Pages each size (sub)range to the 1000-cap; + * a range whose `total_count` exceeds the cap is BISECTED (its halves crawled + * before the next top-level facet) so every file is reachable. A range that + * saturates but cannot subdivide further (≥1000 identical-byte-size files — + * almost always denylist-caught boilerplate) is recorded as truncated, logged, + * and skipped (never silently dropped). The frontier (facet index + bisection + * stack + page) is fully captured by the returned cursor so a dispatch boundary + * mid-bisection resumes losslessly. Reuses {@link processSearchResults} (license + * gate + Trees per-skill enumeration + per-path validation) unchanged. 
+ */ +export async function runBackfillFacetCrawl( + plan: BackfillFacetPlan, + seenUrls: Set, + validationCache: Map, + validationOptions: { strictValidation?: boolean; minContentLength?: number }, + repos: GitHubRepository[], + stats: { licenseFiltered: number; licenseFetchFailed: number }, + telemetry: RateLimitTelemetry, + enumerateTelemetry: EnumerateTelemetry, + enumeratedRepos: Set, + errors: string[] +): Promise { + const facets = buildSizeFacets() + const state = cursorToFacetState(plan.startCursor) + const pathLabel = plan.pathPrefix ?? 'broad' + let capSaturated = false + let truncatedRanges = 0 + let rangesCrawled = 0 + + while (rangesCrawled < plan.maxRangesPerDispatch) { + const range = currentFacetRange(state, facets) + if (!range) break // ladder exhausted + const qualifier = facetToQualifier(range) + + let saturated = false + let errored = false + for (let page = state.lastPage + 1; page <= plan.maxPagesPerRange; page++) { + const result = await searchCodeForSkillMdInSubdirectory( + plan.pathPrefix, + page, + plan.perPage, + telemetry, + qualifier + ) + if (result.error) { + errors.push(`[backfill ${pathLabel} ${facetId(range)} p${page}] ${result.error}`) + errored = true + break + } + // The 1000-cap is detected from total_count on the first page: rather than + // waste pages on the unreachable tail, bisect immediately — the sub-ranges + // (each < cap, or bisected further) cover the same files. 
+ if (page === 1 && result.total > CODE_SEARCH_RESULT_CAP) { + saturated = true + break + } + await processSearchResults( + result.repos, + seenUrls, + validationCache, + validationOptions, + repos, + stats, + telemetry, + enumerateTelemetry, + enumeratedRepos + ) + state.lastPage = page + if (result.repos.length < plan.perPage) break // short page → range exhausted + await delay(6000) // 10 code-search req/min → 6s between pages + } + + rangesCrawled++ + + if (errored) { + // M-1: a page error (rate-limiter already retried transient 403/429, so a + // returned error is exceptional) — count it as truncated so it surfaces in + // the dispatch summary + errors[], then advance past the range rather than + // re-crawl it forever this dispatch. The operator can re-run the facet under + // a narrower BACKFILL_PATH_PREFIX once the cause is cleared (SPARC §#3). + truncatedRanges++ + console.warn( + `[Backfill] facet ${facetId(range)} (${pathLabel}) errored — recorded as truncated, advancing` + ) + advanceFacet(state) + } else if (saturated) { + capSaturated = true + if (!bisectCurrentFacet(state, range)) { + // Saturated AND unbisectable: record + skip (never silent). The operator + // can re-run this facet under a narrower BACKFILL_PATH_PREFIX (SPARC §#3). + truncatedRanges++ + console.warn( + `[Backfill] facet ${facetId(range)} (${pathLabel}) saturated at the 1000-cap and cannot subdivide — recorded as truncated, skipping` + ) + advanceFacet(state) + } + } else { + // Range exhausted (short page, or page cap reached with total <= cap). + advanceFacet(state) + } + } + + return { + cursor: facetStateToCursor(state, plan.pathPrefix ?? 
'', facets), + done: isFacetCrawlDone(state, facets), + cap_saturated: capSaturated, + truncated_repo_count: truncatedRanges, + facets_completed: state.facetIndex, + facets_total: facets.length, + ranges_crawled: rangesCrawled, + } +} diff --git a/scripts/indexer/subdirectory-search.ts b/scripts/indexer/subdirectory-search.ts index 5102a762a..fa025714b 100644 --- a/scripts/indexer/subdirectory-search.ts +++ b/scripts/indexer/subdirectory-search.ts @@ -5,23 +5,28 @@ * SMI-4852: Node-flavored sibling of * `supabase/functions/indexer/subdirectory-search.ts`. This module performs no * direct GitHub fetches; it dispatches to `searchCodeForSkillMdInSubdirectory` - * (already wrapped per Hard Rule 1) and the sibling-module helpers - * `checkSkillMdExists` / `fetchRepoLicense` (each wrapped in their own - * cluster). Telemetry is threaded through to every downstream call so the - * single run-scoped collector aggregates header data from every consumer. - * Parity is guarded by `scripts/indexer/tests/parity.test.ts`. + * (already wrapped per Hard Rule 1) and the sibling-module helper + * `processSearchResults` (license-gate + Trees per-skill enumeration). Telemetry + * is threaded through to every downstream call so the single run-scoped collector + * aggregates header data from every consumer. NOTE (SMI-5286 1c): this surface is + * NOT parity-guarded (`parity.test.ts` exempts subdirectory-search), so the Node + * copy may diverge from the Deno parent. * * Original module docs: * * SMI-2660: Phase 3b of the indexer — finds SKILL.md files via GitHub Code Search. * SMI-3229: Replaced hardcoded path-prefix loop with broad `filename:SKILL.md` query. * - * Extracted from index.ts to satisfy the 500-line CI gate. + * Extracted from index.ts to satisfy the 500-line CI gate. SMI-5286 1c moved the + * shared `processSearchResults` + the size-faceted backfill crawl to + * `subdirectory-search.helpers.ts` to stay under that gate. * * Strategy: * 1. 
Primary: broad query (no path: constraint) — discovers SKILL.md at any depth * 2. Fallback: if any page returns incomplete_results, re-runs with 7 path-scoped * queries to ensure known ecosystems are fully covered + * 3. SMI-5286 1c backfill: when a `BackfillFacetPlan` is supplied, the legacy loop + * is replaced by a resumable size-faceted depth-first crawl. * * Rate limit: 10 code search requests/minute (separate from main API). * Gated by SKILLSMITH_ENABLE_SUBDIRECTORY_SEARCH=true env var to prevent @@ -35,12 +40,25 @@ import { GITHUB_API_DELAY, delay, type RateLimitTelemetry } from './_shared/rate-limit.ts' import { searchCodeForSkillMdInSubdirectory } from './code-search.ts' -import { checkSkillMdExists } from './skill-processor.ts' -import { fetchRepoLicense, isPermissiveLicense } from './license-filter.ts' -import { enumerateRepoSkillPaths, type EnumerateTelemetry } from './trees-enumerate.ts' -import { buildSkillTreeUrl } from './skill-url.ts' +import { type EnumerateTelemetry } from './trees-enumerate.ts' +import { + processSearchResults, + runBackfillFacetCrawl, + type BackfillFacetPlan, +} from './subdirectory-search.helpers.ts' +import type { BackfillCrawlOutcome } from './backfill-checkpoint.ts' import type { GitHubRepository } from './topic-search.ts' import type { SkillMdValidation } from './skill-processor.ts' +import type { IndexerResult } from './indexer-types.ts' + +export type { BackfillFacetPlan } from './subdirectory-search.helpers.ts' + +/** + * Results per code-search page. SMI-5286 1c (C-5): GitHub allows 100 (was a + * hardcoded 30 → 3.3x fewer requests for the same coverage). The cron leaves + * Phase 3b disabled, so this only affects manual-enable + backfill runs. + */ +const BROAD_QUERY_PER_PAGE = 100 /** * Fallback path prefixes used when broad query returns incomplete results. 
@@ -69,137 +87,6 @@ export const FALLBACK_PATH_PREFIXES = [ '.windsurf/skills', // Windsurf (native, since 2026-03) ] -/** - * Process code search results: deduplicate, license-gate, validate, and collect repos. - * Shared by both broad and fallback search paths. - * - * SMI-4852: Threads `telemetry` to downstream `fetchRepoLicense` and - * `checkSkillMdExists` calls so every GitHub API hit lands in the shared - * collector. - * - * SMI-5286 Wave 1a (§#1, C-1): per-skill (collection) extraction. Each candidate - * repo is enumerated ONCE via the Trees API (`enumerateRepoSkillPaths`) and EVERY - * valid SKILL.md parent dir becomes its own `GitHubRepository`, with a DISTINCT - * per-skill tree URL (`buildSkillTreeUrl`) so N skills in one repo yield N distinct - * `repo_url` rows that never collide on `onConflict: 'repo_url'`. Each per-path row - * is validated independently (§#4 strict gate) before it is collected; validated - * rows are `installable:true` (`skill-processor.ts:440` then persists the non-null - * tree URL). Edit E: only the enumeration loop changed — the dedup-key / - * freshness-qualifier lines (`:89`) are byte-stable for the SMI-5176 rebase. - */ -async function processSearchResults( - resultRepos: GitHubRepository[], - seenUrls: Set, - validationCache: Map, - validationOptions: { strictValidation?: boolean; minContentLength?: number }, - repos: GitHubRepository[], - stats: { licenseFiltered: number; licenseFetchFailed: number }, - telemetry: RateLimitTelemetry, - enumerateTelemetry: EnumerateTelemetry, - enumeratedRepos: Set -): Promise { - for (const repo of resultRepos) { - // Deduplication key includes skillPath: one repo can have multiple skills - const dedupKey = repo.skillPath ? 
`${repo.url}/${repo.skillPath}` : repo.url - if (seenUrls.has(dedupKey)) continue - - // License gate: fetch SPDX from GitHub API (not included in code search response) - const { license: spdxId, fetchFailed } = await fetchRepoLicense( - repo.owner, - repo.repoName, - telemetry - ) - - if (fetchFailed) { - // API failure — skip this run but don't count as license-filtered. - // NOT added to seenUrls so the repo is retried on the next indexer run. - console.log(`[BroadDiscovery] License fetch failed (will retry next run): ${repo.fullName}`) - stats.licenseFetchFailed++ - await delay(200) - continue - } - - if (!isPermissiveLicense(spdxId)) { - // Confirmed non-permissive license — permanently excluded. - console.log(`[BroadDiscovery] License excluded: ${repo.fullName} spdx=${spdxId ?? 'null'}`) - stats.licenseFiltered++ - await delay(200) - continue - } - - // Mark this code-search result consumed (per-repo+skillPath identity) so the - // same surfaced file is not re-processed across pages/prefixes. - seenUrls.add(dedupKey) - - // SMI-5286 Wave 1a (§#1): enumerate the repo's full tree ONCE. The broad query - // can surface the same repo via multiple SKILL.md hits; guard re-enumeration. - const repoKey = `${repo.owner}/${repo.repoName}` - if (enumeratedRepos.has(repoKey)) { - await delay(50) - continue - } - enumeratedRepos.add(repoKey) - - const { entries, truncatedByApi, truncatedByCap } = await enumerateRepoSkillPaths( - repo.owner, - repo.repoName, - repo.defaultBranch, - telemetry, - enumerateTelemetry - ) - - if (truncatedByApi) { - // Trees API truncated — do NOT emit a partial set (deterministic skip). 
- console.log( - `[BroadDiscovery] Trees truncated, skipping for manual handling: ${repo.fullName}` - ) - await delay(50) - continue - } - if (truncatedByCap) { - console.log(`[BroadDiscovery] Per-repo cap reached, taking first N: ${repo.fullName}`) - } - - // Validate each enumerated SKILL.md independently (§#4 strict gate) and emit - // one per-skill GitHubRepository with a distinct tree URL (C-1) per valid path. - for (const entry of entries) { - const skillPath = entry.path - const installable = await checkSkillMdExists( - repo.owner, - repo.repoName, - repo.defaultBranch, - validationCache, - telemetry, - skillPath, - validationOptions - ) - - // C-1: build the per-skill tree URL from the BARE repo html_url - // (reconstructed from owner/repoName), NOT from `repo.url` — by this point - // `repo.url` is already the code-search mapper's tree URL, so reusing it - // would double the `/tree/` segment. `skillUrl` already encodes - // `skillPath`, so it alone is the dedup key. - const skillUrl = buildSkillTreeUrl( - `https://github.com/${repo.owner}/${repo.repoName}`, - repo.defaultBranch, - skillPath - ) - if (seenUrls.has(skillUrl)) continue - seenUrls.add(skillUrl) - - repos.push({ - ...repo, - url: skillUrl, - installable, - skillPath, - treeHash: entry.blobSha, - license: spdxId, - }) - await delay(50) - } - } -} - /** * SMI-3229: Run Phase 3b broad SKILL.md discovery with incomplete_results fallback. * @@ -222,13 +109,17 @@ async function processSearchResults( * @param validationOptions - Strict validation and minimum content length options * @param maxPages - Maximum pages per query (capped by caller) * @param telemetry - Shared rate-limit telemetry collector. + * @param backfillPlan - SMI-5286 1c: when present, run the size-faceted backfill + * crawl instead of the legacy broad+fallback loop. Optional → every existing + * 5-arg caller (the cron Phase-3b path + tests) is byte-stable. 
*/ export async function runSubdirectorySearch( seenUrls: Set, validationCache: Map, validationOptions: { strictValidation?: boolean; minContentLength?: number }, maxPages: number, - telemetry: RateLimitTelemetry + telemetry: RateLimitTelemetry, + backfillPlan?: BackfillFacetPlan ): Promise<{ repos: GitHubRepository[] totalFound: number @@ -238,6 +129,8 @@ export async function runSubdirectorySearch( incompleteResults: number searchMode: 'broad' | 'prefix-fallback' errors: string[] + /** SMI-5286 1c: present only when `backfillPlan` was supplied. */ + backfill?: BackfillCrawlOutcome }> { const repos: GitHubRepository[] = [] const errors: string[] = [] @@ -245,21 +138,56 @@ export async function runSubdirectorySearch( let totalRetries = 0 const stats = { licenseFiltered: 0, licenseFetchFailed: 0 } let incompleteResults = 0 - let searchMode: 'broad' | 'prefix-fallback' = 'broad' + const searchMode: 'broad' | 'prefix-fallback' = 'broad' // SMI-5286 Wave 1a: run-scoped per-skill extraction state. `enumerateTelemetry` // accumulates denylist/cap/truncation counters across the whole run; // `enumeratedRepos` guards one Trees call per repo across pages and prefixes. 
const enumerateTelemetry: EnumerateTelemetry = {} const enumeratedRepos = new Set() + // ── SMI-5286 1c: size-faceted backfill crawl (replaces the legacy loop) ── + if (backfillPlan) { + const backfill = await runBackfillFacetCrawl( + backfillPlan, + seenUrls, + validationCache, + validationOptions, + repos, + stats, + telemetry, + enumerateTelemetry, + enumeratedRepos, + errors + ) + console.log( + `[Backfill] Facet crawl: ${repos.length} skills added, ${backfill.facets_completed}/${backfill.facets_total} facets, ${backfill.ranges_crawled} ranges this dispatch, ` + + `${stats.licenseFiltered} license-filtered, cap_saturated=${backfill.cap_saturated}, truncated=${backfill.truncated_repo_count}, done=${backfill.done}` + ) + console.log( + `[Backfill] Per-skill extraction: ${enumeratedRepos.size} repos enumerated, ${enumerateTelemetry.denylistSkipped ?? 0} denylist-skipped, ${enumerateTelemetry.cappedRepoCount ?? 0} capped, ${enumerateTelemetry.truncatedRepoCount ?? 0} api-truncated` + ) + return { + repos, + totalFound: repos.length, + retries: totalRetries, + licenseFiltered: stats.licenseFiltered, + licenseFetchFailed: stats.licenseFetchFailed, + incompleteResults, + searchMode, + errors, + backfill, + } + } + // ── Primary: broad query (no path constraint) ──────────────────────── console.log('[BroadDiscovery] Running broad filename:SKILL.md query...') + let primaryMode: 'broad' | 'prefix-fallback' = 'broad' for (let page = 1; page <= maxPages; page++) { const result = await searchCodeForSkillMdInSubdirectory( undefined, // no pathPrefix → broad query page, - 30, + BROAD_QUERY_PER_PAGE, telemetry ) @@ -291,14 +219,14 @@ export async function runSubdirectorySearch( enumeratedRepos ) - if (result.repos.length < 30) break + if (result.repos.length < BROAD_QUERY_PER_PAGE) break // Code search rate limit: 10 req/min → 6s between pages await delay(6000) } // ── Fallback: path-scoped queries if broad had incomplete results ──── if (incompleteResults > 0) { - searchMode 
= 'prefix-fallback' + primaryMode = 'prefix-fallback' console.log( `[BroadDiscovery] ${incompleteResults} page(s) had incomplete results — falling back to path-scoped queries` ) @@ -307,7 +235,12 @@ export async function runSubdirectorySearch( console.log(`[BroadDiscovery] Fallback searching path:${pathPrefix}...`) for (let page = 1; page <= maxPages; page++) { - const result = await searchCodeForSkillMdInSubdirectory(pathPrefix, page, 30, telemetry) + const result = await searchCodeForSkillMdInSubdirectory( + pathPrefix, + page, + BROAD_QUERY_PER_PAGE, + telemetry + ) totalRetries += result.retries @@ -339,7 +272,7 @@ export async function runSubdirectorySearch( enumeratedRepos ) - if (result.repos.length < 30) break + if (result.repos.length < BROAD_QUERY_PER_PAGE) break // Code search rate limit: 10 req/min → 6s between pages await delay(6000) } @@ -350,7 +283,7 @@ export async function runSubdirectorySearch( } console.log( - `[BroadDiscovery] Complete (${searchMode}): ${repos.length} added, ${stats.licenseFiltered} license-filtered, ${stats.licenseFetchFailed} fetch-failed, ${incompleteResults} incomplete, ${totalRetries} retries` + `[BroadDiscovery] Complete (${primaryMode}): ${repos.length} added, ${stats.licenseFiltered} license-filtered, ${stats.licenseFetchFailed} fetch-failed, ${incompleteResults} incomplete, ${totalRetries} retries` ) // SMI-5286 Wave 1a: per-skill extraction observability (§#1, Edit D). console.log( @@ -372,7 +305,63 @@ export async function runSubdirectorySearch( licenseFiltered: stats.licenseFiltered, licenseFetchFailed: stats.licenseFetchFailed, incompleteResults, - searchMode, + searchMode: primaryMode, errors, } } + +/** + * Phase 3b wrapper: runs {@link runSubdirectorySearch}, folds its repos/errors/ + * stats into the orchestrator's accumulators, and (SMI-5286 1c) surfaces the + * backfill cursor on `result.backfill_crawl`. Extracted here so + * `discovery-orchestrator.ts` stays under the 500-line gate. 
Never throws — a + * Phase-3b failure records a zeroed `subdirectory_search` and is swallowed + * (one phase must not abort the cycle), matching the prior inline behavior. + */ +export async function runSubdirectorySearchPhase(args: { + seenUrls: Set + validationCache: Map + validationOptions: { strictValidation?: boolean; minContentLength?: number } + codeSearchMaxPages: number + telemetry: RateLimitTelemetry + repositories: GitHubRepository[] + result: IndexerResult + backfillFacetPlan?: BackfillFacetPlan +}): Promise { + try { + const subdirResult = await runSubdirectorySearch( + args.seenUrls, + args.validationCache, + args.validationOptions, + args.codeSearchMaxPages, + args.telemetry, + args.backfillFacetPlan + ) + for (const repo of subdirResult.repos) { + args.repositories.push(repo) + } + args.result.errors.push(...subdirResult.errors) + args.result.subdirectory_search = { + repos_found: subdirResult.repos.length, + total_found: subdirResult.totalFound, + retries: subdirResult.retries, + license_filtered: subdirResult.licenseFiltered, + license_fetch_failed: subdirResult.licenseFetchFailed, + incomplete_results: subdirResult.incompleteResults, + search_mode: subdirResult.searchMode, + } + if (subdirResult.backfill) { + args.result.backfill_crawl = subdirResult.backfill + } + } catch (err) { + console.warn(`[CodeSearch] Phase 3b failed: ${err instanceof Error ? 
err.message : 'Unknown'}`) + args.result.subdirectory_search = { + repos_found: 0, + total_found: 0, + retries: 0, + license_filtered: 0, + license_fetch_failed: 0, + error: 'phase_failed', + } + } +} diff --git a/scripts/indexer/trees-search.ts b/scripts/indexer/trees-search.ts index 4fd1eb57c..d372391ce 100644 --- a/scripts/indexer/trees-search.ts +++ b/scripts/indexer/trees-search.ts @@ -110,8 +110,12 @@ export async function fetchSkillPathsFromTree( // Match SKILL.md case-insensitively at any depth if (!entry.path.endsWith('/SKILL.md') && entry.path.toUpperCase() !== 'SKILL.MD') continue const slashIdx = entry.path.lastIndexOf('/') - if (slashIdx < 0) continue // root SKILL.md — no parent dir to extract - skillEntries.push({ path: entry.path.slice(0, slashIdx), blobSha: entry.sha }) + // SMI-5286 1c (C-4): a root-level SKILL.md has no parent dir → emit path:'' + // (buildSkillTreeUrl maps '' → …/tree/). Previously dropped, which + // silently lost repos whose only skill is a root SKILL.md once Phase 3a (the + // only other root-skill emitter) was disabled. + const skillPath = slashIdx < 0 ? '' : entry.path.slice(0, slashIdx) + skillEntries.push({ path: skillPath, blobSha: entry.sha }) } if (data.truncated) { diff --git a/scripts/tests/indexer/backfill-checkpoint.statemachine.test.ts b/scripts/tests/indexer/backfill-checkpoint.statemachine.test.ts new file mode 100644 index 000000000..a28846f4d --- /dev/null +++ b/scripts/tests/indexer/backfill-checkpoint.statemachine.test.ts @@ -0,0 +1,200 @@ +/** + * Facet driver state-machine tests (SMI-5286 1c) + * @module scripts/tests/indexer/backfill-checkpoint.statemachine + * + * The cursor <-> crawl-frontier state machine that drives the size-faceted + * backfill: a depth-first walk of the static `buildSizeFacets()` ladder where a + * saturated facet is bisected (its halves drained before the next facet) and the + * frontier (facetIndex + bisection stack + page) round-trips losslessly through + * the JSON checkpoint. 
Split out of backfill-checkpoint.test.ts to keep each file + * focused + under the 500-line convention. + */ + +import { describe, it, expect } from 'vitest' +import { + cursorToFacetState, + currentFacetRange, + bisectCurrentFacet, + advanceFacet, + isFacetCrawlDone, + facetStateToCursor, + type FacetCrawlState, +} from '../../indexer/backfill-checkpoint.ts' +import { buildSizeFacets } from '../../indexer/code-search.facets.ts' + +const FACETS = buildSizeFacets() + +describe('facet driver state machine (SMI-5286 1c)', () => { + it('cursorToFacetState cold-starts on null/undefined', () => { + expect(cursorToFacetState(null)).toEqual({ facetIndex: 0, pendingSubranges: [], lastPage: 0 }) + expect(cursorToFacetState(undefined)).toEqual({ + facetIndex: 0, + pendingSubranges: [], + lastPage: 0, + }) + }) + + it('cursorToFacetState reconstructs facet_index, last_page, and pending_subranges', () => { + const state = cursorToFacetState({ + path: '', + facet: '0-63', + last_page: 2, + facet_index: 3, + pending_subranges: [ + [0, 63], + [64, 127], + ], + }) + expect(state.facetIndex).toBe(3) + expect(state.lastPage).toBe(2) + expect(state.pendingSubranges).toEqual([ + { lo: 0, hi: 63 }, + { lo: 64, hi: 127 }, + ]) + }) + + it('cursorToFacetState maps a null upper bound back to Infinity', () => { + const state = cursorToFacetState({ + path: '', + facet: '16384+', + last_page: 0, + facet_index: 8, + pending_subranges: [[16384, null]], + }) + expect(state.pendingSubranges[0]).toEqual({ lo: 16384, hi: Number.POSITIVE_INFINITY }) + }) + + it('currentFacetRange returns the top-level facet when the stack is empty', () => { + const state: FacetCrawlState = { facetIndex: 0, pendingSubranges: [], lastPage: 0 } + expect(currentFacetRange(state, FACETS)).toEqual(FACETS[0]) + }) + + it('currentFacetRange returns the stack head (LIFO) when a bisection is in progress', () => { + const state: FacetCrawlState = { + facetIndex: 0, + pendingSubranges: [ + { lo: 64, hi: 127 }, + { lo: 0, hi: 63 
}, + ], + lastPage: 0, + } + expect(currentFacetRange(state, FACETS)).toEqual({ lo: 0, hi: 63 }) + }) + + it('currentFacetRange returns null once the ladder is exhausted', () => { + const state: FacetCrawlState = { + facetIndex: FACETS.length, + pendingSubranges: [], + lastPage: 0, + } + expect(currentFacetRange(state, FACETS)).toBeNull() + }) + + it('bisectCurrentFacet RETIRES the top-level facet (facetIndex++) before pushing halves', () => { + // C-1 regression: a top-level bisection must advance facetIndex so the facet + // is never re-queried after its halves drain (else it re-saturates forever). + const state: FacetCrawlState = { facetIndex: 0, pendingSubranges: [], lastPage: 4 } + const ok = bisectCurrentFacet(state, { lo: 0, hi: 127 }) + expect(ok).toBe(true) + expect(state.pendingSubranges).toEqual([ + { lo: 64, hi: 127 }, + { lo: 0, hi: 63 }, + ]) + expect(currentFacetRange(state, FACETS)).toEqual({ lo: 0, hi: 63 }) // crawled next + expect(state.lastPage).toBe(0) + expect(state.facetIndex).toBe(1) // top-level facet retired + }) + + it('C-1: after a top-level facet bisects and both halves drain, the NEXT facet is reached (no re-crawl)', () => { + const state: FacetCrawlState = { facetIndex: 0, pendingSubranges: [], lastPage: 0 } + bisectCurrentFacet(state, FACETS[0]) // facet 0 saturated → halves on stack, facetIndex=1 + expect(currentFacetRange(state, FACETS)).toEqual({ lo: 0, hi: 63 }) + advanceFacet(state) // lower half done → pop + expect(currentFacetRange(state, FACETS)).toEqual({ lo: 64, hi: 127 }) + advanceFacet(state) // upper half done → pop; stack now empty + // The frontier must move to facet 1, NOT back to the (saturating) facet 0. 
+ expect(state.pendingSubranges).toEqual([]) + expect(currentFacetRange(state, FACETS)).toEqual(FACETS[1]) + expect(state.facetIndex).toBe(1) + }) + + it('bisectCurrentFacet replaces the stack head with its halves (sub-range bisection, facetIndex unchanged)', () => { + const state: FacetCrawlState = { + facetIndex: 2, + pendingSubranges: [{ lo: 0, hi: 63 }], + lastPage: 1, + } + bisectCurrentFacet(state, { lo: 0, hi: 63 }) + expect(state.pendingSubranges).toEqual([ + { lo: 32, hi: 63 }, + { lo: 0, hi: 31 }, + ]) + expect(state.facetIndex).toBe(2) // a sub-range bisected — top-level index does not move + }) + + it('bisectCurrentFacet returns false for an unsplittable range (lo === hi)', () => { + const state: FacetCrawlState = { facetIndex: 0, pendingSubranges: [], lastPage: 0 } + expect(bisectCurrentFacet(state, { lo: 5, hi: 5 })).toBe(false) + expect(state.pendingSubranges).toEqual([]) + expect(state.facetIndex).toBe(0) // no retirement on a failed bisect + }) + + it('advanceFacet pops the stack when bisecting, else increments facetIndex', () => { + const withStack: FacetCrawlState = { + facetIndex: 1, + pendingSubranges: [{ lo: 0, hi: 63 }], + lastPage: 3, + } + advanceFacet(withStack) + expect(withStack.pendingSubranges).toEqual([]) + expect(withStack.facetIndex).toBe(1) // unchanged — a sub-range finished, not the facet + expect(withStack.lastPage).toBe(0) + + const noStack: FacetCrawlState = { facetIndex: 1, pendingSubranges: [], lastPage: 3 } + advanceFacet(noStack) + expect(noStack.facetIndex).toBe(2) + expect(noStack.lastPage).toBe(0) + }) + + it('isFacetCrawlDone is true only when the ladder AND the bisection frontier are empty', () => { + expect( + isFacetCrawlDone({ facetIndex: FACETS.length, pendingSubranges: [], lastPage: 0 }, FACETS) + ).toBe(true) + expect( + isFacetCrawlDone( + { facetIndex: FACETS.length, pendingSubranges: [{ lo: 0, hi: 63 }], lastPage: 0 }, + FACETS + ) + ).toBe(false) + expect(isFacetCrawlDone({ facetIndex: 3, pendingSubranges: 
[], lastPage: 0 }, FACETS)).toBe( + false + ) + }) + + it('facetStateToCursor → cursorToFacetState round-trips through JSON (Infinity survives as null)', () => { + const state: FacetCrawlState = { + facetIndex: 8, + pendingSubranges: [{ lo: 16384, hi: Number.POSITIVE_INFINITY }], + lastPage: 2, + } + const cursor = facetStateToCursor(state, '.agents/skills', FACETS) + // The open-ended upper bound is persisted as null (JSON-safe). + expect(cursor.pending_subranges).toEqual([[16384, null]]) + expect(cursor.path).toBe('.agents/skills') + + // Survive a real JSON round-trip (the audit_logs metadata path). + const roundTripped = JSON.parse(JSON.stringify(cursor)) + const restored = cursorToFacetState(roundTripped) + expect(restored).toEqual(state) + }) + + it("facetStateToCursor reports facet 'done' when the ladder is exhausted", () => { + const cursor = facetStateToCursor( + { facetIndex: FACETS.length, pendingSubranges: [], lastPage: 0 }, + '', + FACETS + ) + expect(cursor.facet).toBe('done') + expect(cursor.facet_index).toBe(FACETS.length) + }) +}) diff --git a/scripts/tests/indexer/backfill-facet-crawl.test.ts b/scripts/tests/indexer/backfill-facet-crawl.test.ts new file mode 100644 index 000000000..8a45aa543 --- /dev/null +++ b/scripts/tests/indexer/backfill-facet-crawl.test.ts @@ -0,0 +1,434 @@ +/** + * Size-faceted backfill crawl tests (SMI-5286 1c) + * + * Drives the public entry `runSubdirectorySearch(..., backfillPlan)` to prove the + * size-faceted depth-first crawl in `subdirectory-search.helpers.ts` + * (`runBackfillFacetCrawl`) behaves per the SPARC §#3/§#5 contract: + * 1. Exhausts the static 9-facet ladder when no range saturates → done, all 9 + * facets completed, cursor.facet === 'done'. + * 2. A facet whose page-1 total exceeds the 1000-cap is BISECTED (its page-1 + * repos are NOT collected); the bisected sub-ranges still crawl to + * completion → cap_saturated true, done true. + * 3. 
Budget + resume round-trip: maxRangesPerDispatch bounds one dispatch; the + * returned cursor resumes losslessly across dispatches until done, with + * facets_completed monotonically advancing. + * 4. The crawl threads a `size:` qualifier as the 5th arg of every code-search + * call. + * + * Strategy: mirrors `subdirectory-search.perskill.test.ts` exactly — mock every + * I/O boundary (rate-limit delay, github-auth, code-search, license-filter, + * skill-processor, trees-enumerate) at the module level, import the SUT AFTER the + * mocks, and let `buildSkillTreeUrl` (pure) and the facet ladder run real. + */ + +import { describe, it, expect, vi, beforeEach } from 'vitest' +import type { RateLimitTelemetry } from '../../indexer/_shared/rate-limit.ts' + +// --------------------------------------------------------------------------- +// Module-level mocks — declared before any import of the SUT +// (identical shape to subdirectory-search.perskill.test.ts) +// --------------------------------------------------------------------------- + +// Mock delay so the 6s inter-page sleeps don't actually wait. 
+vi.mock('../../indexer/_shared/rate-limit.ts', () => ({ + GITHUB_API_DELAY: 0, + delay: vi.fn(async () => undefined), + withRateLimitTracking: vi.fn(), + withBackoff: vi.fn(async (fn: () => Promise) => fn()), + newRateLimitTelemetry: vi.fn(() => ({})), +})) + +vi.mock('../../indexer/_shared/github-auth.ts', () => ({ + buildGitHubHeaders: vi.fn(async () => ({})), +})) + +const mockSearchCode = vi.fn() +vi.mock('../../indexer/code-search.ts', async (importOriginal) => { + const actual = await importOriginal() + return { + ...actual, + searchCodeForSkillMdInSubdirectory: (...args: unknown[]) => mockSearchCode(...args), + } +}) + +const mockFetchRepoLicense = vi.fn() +vi.mock('../../indexer/license-filter.ts', async (importOriginal) => { + const actual = await importOriginal() + return { + ...actual, + fetchRepoLicense: (...args: unknown[]) => mockFetchRepoLicense(...args), + } +}) + +const mockCheckSkillMdExists = vi.fn() +vi.mock('../../indexer/skill-processor.ts', async (importOriginal) => { + const actual = await importOriginal() + return { + ...actual, + checkSkillMdExists: (...args: unknown[]) => mockCheckSkillMdExists(...args), + } +}) + +const mockEnumerateRepoSkillPaths = vi.fn() +vi.mock('../../indexer/trees-enumerate.ts', async (importOriginal) => { + const actual = await importOriginal() + return { + ...actual, + enumerateRepoSkillPaths: (...args: unknown[]) => mockEnumerateRepoSkillPaths(...args), + } +}) + +// Imported AFTER mocks so the SUT binds the stubs. +import { runSubdirectorySearch, type BackfillFacetPlan } from '../../indexer/subdirectory-search.ts' + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +const noTelemetry: RateLimitTelemetry = {} as RateLimitTelemetry + +/** Static ladder length — kept literal to assert against the SUT, not derive from it. 
/** Static ladder length — kept literal to assert against the SUT, not derive from it. */
const LADDER_SIZE = 9

/** Number of fixture repos built so far; drives the unique `ownerN` names. */
let repoCounter = 0

/**
 * Build a minimal GitHubRepository-shaped code-search hit.
 *
 * The owner is varied via `repoCounter` so per-repo dedup (`enumeratedRepos`)
 * never swallows a later facet's single repo — though every assertion in this
 * file keys on facet COUNTERS, not collected-repo counts, so this is
 * belt-and-suspenders.
 *
 * @param overrides Fields spread last, replacing (or adding to) the defaults.
 * @returns A fresh hit object; each call increments `repoCounter`.
 */
function makeCodeSearchRepo(overrides: Record<string, unknown> = {}) {
  repoCounter += 1
  const owner = `owner${repoCounter}`
  return {
    owner,
    name: 'skills-repo',
    fullName: `${owner}/skills-repo`,
    description: 'test',
    url: `https://github.com/${owner}/skills-repo/tree/main/skills/x`,
    stars: 5,
    forks: 0,
    topics: ['claude-code-skill'],
    updatedAt: new Date().toISOString(),
    defaultBranch: 'main',
    installable: false,
    repoName: 'skills-repo',
    skillPath: 'skills/x',
    discoveryPath: 'subdirectory_search:broad',
    ...overrides,
  }
}

/**
 * A non-saturating facet: page 1 returns ONE repo (total well under the cap, and
 * repos.length === 1 < perPage → short page → range exhausted in a single page).
 * Any other page returns an empty short page as a defensive backstop.
 *
 * @param page 1-based page number requested by the crawl.
 */
function nonSaturatingPage(page: number) {
  // Only page 1 builds a repo, so `repoCounter` advances once per range.
  const repos = page === 1 ? [makeCodeSearchRepo()] : []
  return { repos, total: 5, retries: 0, incomplete_results: false }
}

/**
 * A saturated facet: the page-1 total exceeds the 1000-result code-search cap.
 * The single repo carries a sentinel skillPath so a test can prove it was never
 * collected (the crawl is expected to bisect before processing this page).
 */
function saturatedPage1() {
  return {
    repos: [makeCodeSearchRepo({ skillPath: 'skills/should-not-collect' })],
    total: 5000,
    retries: 0,
    incomplete_results: false,
  }
}
*/ +function makePlan(overrides: Partial = {}): BackfillFacetPlan { + return { + startCursor: null, + pathPrefix: undefined, + perPage: 100, + maxPagesPerRange: 20, + maxRangesPerDispatch: 100, + ...overrides, + } +} + +beforeEach(() => { + repoCounter = 0 + mockSearchCode.mockReset() + mockFetchRepoLicense.mockReset() + mockCheckSkillMdExists.mockReset() + mockEnumerateRepoSkillPaths.mockReset() + + // Default I/O behaviour: permissive license, validation passes, one skill per repo. + mockFetchRepoLicense.mockResolvedValue({ license: 'MIT', fetchFailed: false }) + mockCheckSkillMdExists.mockResolvedValue(true) + mockEnumerateRepoSkillPaths.mockResolvedValue({ + entries: [{ path: 'skills/x', blobSha: 'sha1' }], + truncatedByCap: false, + truncatedByApi: false, + }) +}) + +// --------------------------------------------------------------------------- + +describe('runSubdirectorySearch — size-faceted backfill crawl (SMI-5286 1c)', () => { + it('Case 1: exhausts the full 9-facet ladder when no range saturates', async () => { + // Every facet/sub-range is non-saturating and exhausts in a single short page. + mockSearchCode.mockImplementation(async (_pathPrefix: unknown, page: number) => + nonSaturatingPage(page) + ) + + const result = await runSubdirectorySearch( + new Set(), + new Map(), + {}, + // maxPages is ignored on the backfill path (plan.maxPagesPerRange governs). + 1, + noTelemetry, + makePlan({ maxRangesPerDispatch: 100 }) + ) + + expect(result.backfill).toBeDefined() + const backfill = result.backfill! + expect(backfill.done).toBe(true) + expect(backfill.facets_completed).toBe(LADDER_SIZE) + expect(backfill.facets_total).toBe(LADDER_SIZE) + expect(backfill.cap_saturated).toBe(false) + // With no saturation, every top-level facet is one range → 9 ranges crawled. + expect(backfill.ranges_crawled).toBe(LADDER_SIZE) + // Terminal cursor: ladder exhausted → facet sentinel 'done', index at the end. 
+ expect(backfill.cursor.facet).toBe('done') + expect(backfill.cursor.facet_index).toBe(LADDER_SIZE) + expect(backfill.cursor.pending_subranges).toEqual([]) + }) + + it('Case 2: a saturated facet bisects (page-1 repos not collected) and still completes', async () => { + // FIRST top-level facet (size:0..127) saturates on page 1; every later range + // (including the saturated facet's bisected sub-ranges) is non-saturating. + let firstCall = true + mockSearchCode.mockImplementation(async (_pathPrefix: unknown, page: number) => { + if (firstCall) { + firstCall = false + // page 1 of the very first facet → saturated. + return saturatedPage1() + } + return nonSaturatingPage(page) + }) + + const result = await runSubdirectorySearch( + new Set(), + new Map(), + {}, + 1, + noTelemetry, + makePlan({ maxRangesPerDispatch: 100 }) + ) + + const backfill = result.backfill! + expect(backfill.cap_saturated).toBe(true) + // The crawl still drains the whole ladder because the saturated facet's + // sub-ranges (which return total:5) get crawled before facet 0 advances. + expect(backfill.done).toBe(true) + expect(backfill.facets_completed).toBe(LADDER_SIZE) + expect(backfill.cursor.facet).toBe('done') + + // No repo was collected from the saturated page-1: that page returned a repo + // whose skillPath was 'should-not-collect', but the crawl bisected BEFORE + // calling processSearchResults on it. Every collected row therefore comes from + // the (bisected) sub-ranges / later facets, never the capped page. + const collectedPaths = result.repos.map((r) => r.skillPath) + expect(collectedPaths).not.toContain('skills/should-not-collect') + // enumerateRepoSkillPaths is only reached via processSearchResults, so the + // saturated repo was never enumerated. + const enumeratedOwners = mockEnumerateRepoSkillPaths.mock.calls.map((c) => c[0]) + // The saturated repo is owner1 (first makeCodeSearchRepo call). It must NOT + // appear among enumerated owners. 
+ expect(enumeratedOwners).not.toContain('owner1') + }) + + it('Case 3: budget + resume round-trip resumes losslessly across dispatches', async () => { + // Non-saturating for the whole run: each facet exhausts in one range. + mockSearchCode.mockImplementation(async (_pathPrefix: unknown, page: number) => + nonSaturatingPage(page) + ) + + // --- Dispatch 1: budget of 2 ranges. --- + const seen = new Set() + const cache = new Map() + const first = await runSubdirectorySearch( + seen, + cache, + {}, + 1, + noTelemetry, + makePlan({ maxRangesPerDispatch: 2 }) + ) + const firstBackfill = first.backfill! + + expect(firstBackfill.done).toBe(false) + expect(firstBackfill.ranges_crawled).toBe(2) + // Two non-saturating facets completed; cursor is partway through the ladder. + expect(firstBackfill.facets_completed).toBe(2) + expect(firstBackfill.cursor.facet_index).toBe(2) + expect(firstBackfill.cursor.facet_index).toBeLessThan(LADDER_SIZE) + expect(firstBackfill.cursor.facet).not.toBe('done') + + // --- Dispatch 2: resume from the returned cursor (fresh seenUrls; the cursor, + // not the dedup set, carries crawl position — documenting the choice). --- + const second = await runSubdirectorySearch( + new Set(), + new Map(), + {}, + 1, + noTelemetry, + makePlan({ startCursor: firstBackfill.cursor, maxRangesPerDispatch: 2 }) + ) + const secondBackfill = second.backfill! + + // Resumed: strictly MORE facets done than dispatch 1 (no facets lost / redone). + expect(secondBackfill.facets_completed).toBeGreaterThan(firstBackfill.facets_completed) + expect(secondBackfill.facets_completed).toBe(4) + expect(secondBackfill.cursor.facet_index).toBe(4) + + // --- Drain the rest in a loop until done; assert it reaches the full ladder. 
--- + let cursor = secondBackfill.cursor + let done = secondBackfill.done + let lastCompleted = secondBackfill.facets_completed + let guard = 0 + while (!done) { + if (guard++ > 20) throw new Error('resume loop did not converge') + const next = await runSubdirectorySearch( + new Set(), + new Map(), + {}, + 1, + noTelemetry, + makePlan({ startCursor: cursor, maxRangesPerDispatch: 2 }) + ) + const nb = next.backfill! + // Monotonic non-regression of completed facets across every dispatch. + expect(nb.facets_completed).toBeGreaterThanOrEqual(lastCompleted) + lastCompleted = nb.facets_completed + cursor = nb.cursor + done = nb.done + } + + expect(lastCompleted).toBe(LADDER_SIZE) + expect(cursor.facet).toBe('done') + }) + + it('Case 4: passes a size: qualifier as the 5th arg to the code-search call', async () => { + mockSearchCode.mockImplementation(async (_pathPrefix: unknown, page: number) => + nonSaturatingPage(page) + ) + + await runSubdirectorySearch( + new Set(), + new Map(), + {}, + 1, + noTelemetry, + makePlan({ maxRangesPerDispatch: 3 }) + ) + + expect(mockSearchCode).toHaveBeenCalled() + // Every call carries a 5th arg that is a `size:` qualifier string. + const sizeArgs = mockSearchCode.mock.calls.map((c) => c[4]) + expect(sizeArgs.length).toBeGreaterThan(0) + for (const arg of sizeArgs) { + expect(typeof arg).toBe('string') + expect(arg as string).toMatch(/^size:/) + } + // The first facet (size:0..127) renders as `size:0..127`. + expect(sizeArgs).toContain('size:0..127') + }) + + // A mock where facet 0 (size:0..127) AND every bisected descendant (a finite + // range with hi <= 127) ALWAYS saturates; every other facet (lo >= 128, plus + // the open-ended tail) is non-saturating. 
+ function facet0AlwaysSaturates() { + return async ( + _pathPrefix: unknown, + page: number, + _perPage: unknown, + _telemetry: unknown, + sizeQualifier: string + ) => { + const m = /^size:(\d+)\.\.(\d+)$/.exec(sizeQualifier) + const withinFacet0 = m !== null && Number(m[2]) <= 127 + if (withinFacet0 && page === 1) { + return { repos: [makeCodeSearchRepo()], total: 5000, retries: 0, incomplete_results: false } + } + return nonSaturatingPage(page) + } + } + + it('Case 5 (C-1 regression): a PERSISTENTLY-saturating top-level facet is retired and the crawl still terminates', async () => { + // Pre-C-1-fix this looped forever re-crawling facet 0 (facets_completed stuck + // at 0). Post-fix facet 0 is retired on its first bisect, bisects down to + // single-byte truncation, then facets 1-8 complete to a clean terminal state. + mockSearchCode.mockImplementation(facet0AlwaysSaturates()) + + const result = await runSubdirectorySearch( + new Set(), + new Map(), + {}, + 1, + noTelemetry, + makePlan({ maxRangesPerDispatch: 1000 }) + ) + const backfill = result.backfill! + expect(backfill.done).toBe(true) // terminates — no infinite loop + expect(backfill.facets_completed).toBe(LADDER_SIZE) + expect(backfill.cap_saturated).toBe(true) + // facet 0 bisects to single-byte buckets that can't subdivide → recorded truncated. + expect(backfill.truncated_repo_count).toBeGreaterThan(0) + expect(backfill.cursor.facet).toBe('done') + }) + + it('Case 6 (C-1 regression, budget-bounded): a saturating top-level facet advances facets_completed past 0', async () => { + // The decisive C-1 check: with a tiny budget and facet 0 always saturating, + // the top-level facet must RETIRE (facets_completed >= 1) — pre-fix it stayed + // at 0 across every dispatch, an infinite re-crawl that never made progress. 
+ mockSearchCode.mockImplementation(facet0AlwaysSaturates()) + + const result = await runSubdirectorySearch( + new Set(), + new Map(), + {}, + 1, + noTelemetry, + makePlan({ maxRangesPerDispatch: 3 }) + ) + expect(result.backfill!.facets_completed).toBeGreaterThanOrEqual(1) + }) + + it('M-1: a page error on a range is recorded as truncated and the crawl advances', async () => { + // First range errors on page 1; everything else is non-saturating. + let firstCall = true + mockSearchCode.mockImplementation(async (_pathPrefix: unknown, page: number) => { + if (firstCall) { + firstCall = false + return { repos: [], total: 0, retries: 0, incomplete_results: false, error: 'rate limited' } + } + return nonSaturatingPage(page) + }) + + const result = await runSubdirectorySearch( + new Set(), + new Map(), + {}, + 1, + noTelemetry, + makePlan({ maxRangesPerDispatch: 100 }) + ) + const backfill = result.backfill! + // The errored range is surfaced (counted + in errors[]), not silently dropped, + // and the crawl advanced past it to complete the rest of the ladder. + expect(backfill.truncated_repo_count).toBeGreaterThanOrEqual(1) + expect(backfill.done).toBe(true) + expect(result.errors.some((e) => e.includes('rate limited'))).toBe(true) + }) +}) diff --git a/scripts/tests/indexer/code-search.facets.test.ts b/scripts/tests/indexer/code-search.facets.test.ts new file mode 100644 index 000000000..d9e8ff971 --- /dev/null +++ b/scripts/tests/indexer/code-search.facets.test.ts @@ -0,0 +1,131 @@ +/** + * Tests for the size-facet partitioner (SMI-5286 Wave 1c) + * + * The facet ladder partitions the broad `filename:SKILL.md` code-search query by + * file SIZE so each sub-query stays under GitHub's 1000-result ceiling. 
These + * tests pin the load-bearing invariants: exhaustive+disjoint+contiguous coverage, + * a STABLE ladder length across calls (so `facets_total` is static for the + * checkpoint cursor), inclusive-inclusive `size:` qualifiers (no off-by-one), and + * the adaptive bisection contract (including its unsplittable guards). + */ + +import { describe, it, expect } from 'vitest' +import { + buildSizeFacets, + facetId, + facetToQualifier, + bisectFacet, + type SizeFacet, +} from '../../indexer/code-search.facets.ts' + +describe('SMI-5286 Wave 1c: buildSizeFacets', () => { + it('is exhaustive, disjoint, and contiguous over [0, ∞)', () => { + const facets = buildSizeFacets() + + // Starts at 0. + expect(facets[0].lo).toBe(0) + + // Each subsequent lo is exactly the prior hi + 1 (disjoint + contiguous, no gaps). + for (let i = 0; i < facets.length - 1; i++) { + expect(facets[i + 1].lo).toBe(facets[i].hi + 1) + // Every interior bucket is finite and well-ordered. + expect(facets[i].hi).toBeGreaterThanOrEqual(facets[i].lo) + expect(Number.isFinite(facets[i].hi)).toBe(true) + } + + // Final bucket is open-ended. + expect(facets[facets.length - 1].hi).toBe(Number.POSITIVE_INFINITY) + }) + + it('returns a STABLE ladder length across two calls (facets_total must be static)', () => { + const first = buildSizeFacets() + const second = buildSizeFacets() + expect(first.length).toBe(second.length) + // Same stable identity so facets_total never drifts mid-backfill. 
+ expect(first).toBe(second) + }) + + it('uses the exact 9-bucket doubling ladder', () => { + const facets = buildSizeFacets() + expect(facets).toEqual([ + { lo: 0, hi: 127 }, + { lo: 128, hi: 255 }, + { lo: 256, hi: 511 }, + { lo: 512, hi: 1023 }, + { lo: 1024, hi: 2047 }, + { lo: 2048, hi: 4095 }, + { lo: 4096, hi: 8191 }, + { lo: 8192, hi: 16383 }, + { lo: 16384, hi: Number.POSITIVE_INFINITY }, + ]) + }) +}) + +describe('SMI-5286 Wave 1c: facetToQualifier', () => { + it('renders the first (finite) bucket as inclusive-inclusive size:0..127', () => { + const facets = buildSizeFacets() + expect(facetToQualifier(facets[0])).toBe('size:0..127') + }) + + it('renders the open-ended bucket as size:>=16384 (no off-by-one)', () => { + const facets = buildSizeFacets() + expect(facetToQualifier(facets[facets.length - 1])).toBe('size:>=16384') + }) +}) + +describe('SMI-5286 Wave 1c: facetId', () => { + it('labels a finite bucket as `${lo}-${hi}`', () => { + expect(facetId({ lo: 0, hi: 127 })).toBe('0-127') + }) + + it('labels the open-ended bucket as `${lo}+`', () => { + expect(facetId({ lo: 16384, hi: Number.POSITIVE_INFINITY })).toBe('16384+') + }) +}) + +describe('SMI-5286 Wave 1c: bisectFacet', () => { + it('splits a finite bucket into disjoint, contiguous halves covering the same union', () => { + const halves = bisectFacet({ lo: 0, hi: 127 }) + expect(halves).not.toBeNull() + const [left, right] = halves as [SizeFacet, SizeFacet] + expect(left).toEqual({ lo: 0, hi: 63 }) + expect(right).toEqual({ lo: 64, hi: 127 }) + // Disjoint + contiguous: right.lo === left.hi + 1. + expect(right.lo).toBe(left.hi + 1) + // Same union: spans the original [0, 127]. 
+ expect(left.lo).toBe(0) + expect(right.hi).toBe(127) + }) + + it('splits the open-ended bucket by doubling the pivot', () => { + const halves = bisectFacet({ lo: 16384, hi: Number.POSITIVE_INFINITY }) + expect(halves).not.toBeNull() + const [left, right] = halves as [SizeFacet, SizeFacet] + expect(left).toEqual({ lo: 16384, hi: 32767 }) + expect(right).toEqual({ lo: 32768, hi: Number.POSITIVE_INFINITY }) + // Disjoint + contiguous at the pivot. + expect(right.lo).toBe(left.hi + 1) + }) + + it('returns null for an unsplittable finite facet (lo >= hi)', () => { + expect(bisectFacet({ lo: 5, hi: 5 })).toBeNull() + // Inverted/degenerate range is also unsplittable. + expect(bisectFacet({ lo: 10, hi: 9 })).toBeNull() + }) + + it('returns null for an open-ended facet anchored at 0 (cannot double)', () => { + expect(bisectFacet({ lo: 0, hi: Number.POSITIVE_INFINITY })).toBeNull() + }) + + it('returns null for an open-ended facet past the 4 MiB ceiling (terminates persistent saturation)', () => { + // A SKILL.md larger than 4 MiB is not a real skill; an open-ended bucket + // doubling forever would never reach lo === hi, so the ceiling makes a + // persistently-saturating open-ended tail terminate (recorded truncated) + // instead of bisecting infinitely (SMI-5286 1c C-1 follow-up). + const ceiling = 4 * 1024 * 1024 + expect(bisectFacet({ lo: ceiling, hi: Number.POSITIVE_INFINITY })).toBeNull() + expect(bisectFacet({ lo: ceiling + 1, hi: Number.POSITIVE_INFINITY })).toBeNull() + // Just below the ceiling still splits. 
+ expect(bisectFacet({ lo: ceiling / 2, hi: Number.POSITIVE_INFINITY })).not.toBeNull() + }) +}) diff --git a/scripts/tests/indexer/community-url-fork.test.ts b/scripts/tests/indexer/community-url-fork.test.ts index b6d859dd0..bd7976c4c 100644 --- a/scripts/tests/indexer/community-url-fork.test.ts +++ b/scripts/tests/indexer/community-url-fork.test.ts @@ -451,3 +451,71 @@ describe('SMI-5176 freshness qualifier contracts', () => { expect(decodedQ).not.toContain('pushed:') }) }) + +// --------------------------------------------------------------------------- +// SMI-5286 1c: size-facet qualifier + per_page=100 contracts +// Asserts the size: qualifier reaches the emitted fetch URL (encoded) and that +// the new default per_page=100 is present. Mirrors the freshness-qualifier +// harness above (capture URL via fetch mock, then inspect q= and per_page=). +// --------------------------------------------------------------------------- + +describe('SMI-5286 1c size-facet qualifier + per_page contracts', () => { + beforeEach(() => vi.restoreAllMocks()) + + it('appends a sizeQualifier (URL-encoded) to the code-search query', async () => { + let capturedUrl = '' + vi.spyOn(globalThis, 'fetch').mockImplementationOnce(async (url) => { + capturedUrl = String(url) + return makeFetchOk({ total_count: 0, incomplete_results: false, items: [] }) + }) + + await searchCodeForSkillMdInSubdirectory(undefined, 1, 100, noTelemetry, 'size:0..127') + + // Raw URL carries the percent-encoded colon (size%3A0..127). + expect(capturedUrl).toContain('size%3A0..127') + // Decoded query contains the literal qualifier alongside the base filter. + const decodedQ = decodeURIComponent(capturedUrl.split('q=')[1]?.split('&')[0] ?? 
'') + expect(decodedQ).toContain('size:0..127') + expect(decodedQ).toContain('filename:SKILL.md') + }) + + it('omits any size: qualifier when sizeQualifier is not supplied', async () => { + let capturedUrl = '' + vi.spyOn(globalThis, 'fetch').mockImplementationOnce(async (url) => { + capturedUrl = String(url) + return makeFetchOk({ total_count: 0, incomplete_results: false, items: [] }) + }) + + await searchCodeForSkillMdInSubdirectory(undefined, 1, 100, noTelemetry) + + const decodedQ = decodeURIComponent(capturedUrl.split('q=')[1]?.split('&')[0] ?? '') + expect(decodedQ).not.toContain('size:') + }) + + it('emits per_page=100 (GitHub max) for searchCodeForSkillMdInSubdirectory', async () => { + let capturedUrl = '' + vi.spyOn(globalThis, 'fetch').mockImplementationOnce(async (url) => { + capturedUrl = String(url) + return makeFetchOk({ total_count: 0, incomplete_results: false, items: [] }) + }) + + // telemetry is positional after perPage, so the default cannot be exercised + // by omitting perPage; pass the new default (100) explicitly to assert it + // reaches the URL. 
+ await searchCodeForSkillMdInSubdirectory(undefined, 1, 100, noTelemetry) + + expect(capturedUrl).toContain('per_page=100') + }) + + it('emits per_page=100 (GitHub max) for searchCodeForSkillMd (root phase)', async () => { + let capturedUrl = '' + vi.spyOn(globalThis, 'fetch').mockImplementationOnce(async (url) => { + capturedUrl = String(url) + return makeFetchOk({ total_count: 0, incomplete_results: false, items: [] }) + }) + + await searchCodeForSkillMd(1, 100, noTelemetry) + + expect(capturedUrl).toContain('per_page=100') + }) +}) diff --git a/scripts/tests/indexer/parse-env.backfill.test.ts b/scripts/tests/indexer/parse-env.backfill.test.ts index 2de218614..68fa26b60 100644 --- a/scripts/tests/indexer/parse-env.backfill.test.ts +++ b/scripts/tests/indexer/parse-env.backfill.test.ts @@ -84,3 +84,62 @@ describe('parseEnv — BACKFILL_MODE (SMI-5286 Wave 1b)', () => { expect('BACKFILL_MODE' in env).toBe(true) }) }) + +describe('parseEnv — SMI-5286 1c backfill levers', () => { + let originalEnv: NodeJS.ProcessEnv + + beforeEach(() => { + originalEnv = { ...process.env } + for (const k of Object.keys(process.env)) { + delete process.env[k] + } + Object.assign(process.env, BASE_ENV) + }) + + afterEach(() => { + process.env = originalEnv + }) + + it('BACKFILL_PATH_PREFIX is undefined when absent or empty', () => { + delete process.env.BACKFILL_PATH_PREFIX + expect(parseEnv().BACKFILL_PATH_PREFIX).toBeUndefined() + process.env.BACKFILL_PATH_PREFIX = '' + expect(parseEnv().BACKFILL_PATH_PREFIX).toBeUndefined() + }) + + it('BACKFILL_PATH_PREFIX passes a non-empty prefix through verbatim', () => { + process.env.BACKFILL_PATH_PREFIX = '.agents/skills' + expect(parseEnv().BACKFILL_PATH_PREFIX).toBe('.agents/skills') + }) + + it('BACKFILL_MAX_RANGES defaults to 150 and honors an override', () => { + delete process.env.BACKFILL_MAX_RANGES + expect(parseEnv().BACKFILL_MAX_RANGES).toBe(150) + process.env.BACKFILL_MAX_RANGES = '40' + 
expect(parseEnv().BACKFILL_MAX_RANGES).toBe(40) + }) + + it('raises the cap DEFAULTS only when BACKFILL_MODE is set (C-5)', () => { + // Cron defaults (backfill off) + const cron = parseEnv() + expect(cron.MAX_PAGES).toBe(5) + expect(cron.MAX_REPOS).toBe(100) + expect(cron.CODE_SEARCH_MAX_PAGES).toBe(1) + + // Backfill defaults (no explicit caps set) + process.env.BACKFILL_MODE = 'true' + const backfill = parseEnv() + expect(backfill.MAX_PAGES).toBe(10) + expect(backfill.MAX_REPOS).toBe(500) + expect(backfill.CODE_SEARCH_MAX_PAGES).toBe(10) + }) + + it('explicit cap env vars still override the backfill defaults', () => { + process.env.BACKFILL_MODE = 'true' + process.env.CODE_SEARCH_MAX_PAGES = '3' + process.env.MAX_PAGES = '7' + const env = parseEnv() + expect(env.CODE_SEARCH_MAX_PAGES).toBe(3) + expect(env.MAX_PAGES).toBe(7) + }) +}) diff --git a/scripts/tests/indexer/trees-search.test.ts b/scripts/tests/indexer/trees-search.test.ts new file mode 100644 index 000000000..a959da6e1 --- /dev/null +++ b/scripts/tests/indexer/trees-search.test.ts @@ -0,0 +1,113 @@ +/** + * Unit tests for trees-search.ts — fetchSkillPathsFromTree (SMI-5286 1c §C-4) + * + * Asserts the root-level SKILL.md handling: a blob at repo-root `SKILL.md` + * must enumerate to a TreeSkillEntry with path:'' (buildSkillTreeUrl maps '' → + * …/tree/), NOT be silently dropped. A nested `tools/foo/SKILL.md` + * yields its parent dir, and a `use-skill.md` blob must NOT match (suffix gate). + * + * Mocks the network layer at globalThis.fetch so the production HTTP plumbing + * is exercised but no real requests are made. Matches the mock pattern used in + * community-url-fork.test.ts (vi.mock rate-limit passthrough + vi.spyOn fetch). 
+ */ + +import { describe, it, expect, vi, afterEach, beforeEach } from 'vitest' +import type { RateLimitTelemetry } from '../../indexer/_shared/rate-limit.ts' + +// --------------------------------------------------------------------------- +// Keep rate-limit helpers fast: mock delay + withRateLimitTracking to forward +// the fetch call directly so we can spy on globalThis.fetch. +// --------------------------------------------------------------------------- + +vi.mock('../../indexer/_shared/rate-limit.ts', () => ({ + GITHUB_API_DELAY: 0, + delay: vi.fn(async () => undefined), + withBackoff: vi.fn(async (fn: () => Promise) => fn()), + // Let withRateLimitTracking call globalThis.fetch directly (transparent). + withRateLimitTracking: vi.fn(async (_telemetry: unknown, url: string, opts?: RequestInit) => { + const init = opts ? { headers: opts.headers } : {} + return globalThis.fetch(url, init) + }), +})) + +vi.mock('../../indexer/_shared/github-auth.ts', () => ({ + buildGitHubHeaders: vi.fn(async () => ({ Authorization: 'Bearer test-token' })), +})) + +// Imported AFTER mocks so the SUT binds the stub. 
import { fetchSkillPathsFromTree } from '../../indexer/trees-search.ts'

// Drop every fetch spy once a test has finished.
afterEach(() => vi.restoreAllMocks())

/** Empty telemetry placeholder passed to the SUT. */
const noTelemetry: RateLimitTelemetry = {} as RateLimitTelemetry

// ---------------------------------------------------------------------------
// Builders for minimal GitHub Trees API payloads
// ---------------------------------------------------------------------------

/** One blob entry of a Trees API `tree` array, at `path` with object id `sha`. */
function makeBlob(path: string, sha: string) {
  const url = `https://api.github.com/repos/acme/my-skills/git/blobs/${sha}`
  return { path, mode: '100644', type: 'blob', sha, size: 123, url }
}

/** A 200-OK stand-in for `Response` whose `json()` resolves to `body`. */
function makeFetchOk(body: unknown): Response {
  const stub = {
    ok: true,
    status: 200,
    headers: { get: () => null },
    json: async () => body,
  }
  return stub as unknown as Response
}

// ---------------------------------------------------------------------------
// Root-level SKILL.md handling (SMI-5286 1c §C-4)
// ---------------------------------------------------------------------------

describe('fetchSkillPathsFromTree — root-level SKILL.md (SMI-5286 1c §C-4)', () => {
  beforeEach(() => vi.restoreAllMocks())

  /** Queue exactly one successful, non-truncated tree response holding `blobs`. */
  const stubTreeOnce = (blobs: ReturnType<typeof makeBlob>[]) =>
    vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(
      makeFetchOk({
        sha: 'treesha',
        url: 'https://api.github.com/repos/acme/my-skills/git/trees/main',
        tree: blobs,
        truncated: false,
      })
    )

  it('emits path:"" for a root SKILL.md and the parent dir for a nested one; ignores use-skill.md', async () => {
    stubTreeOnce([
      makeBlob('SKILL.md', 'rootsha'), // repo root → path ''
      makeBlob('tools/foo/SKILL.md', 'nestedsha'), // nested → its parent directory
      makeBlob('docs/use-skill.md', 'usesha'), // suffix gate: must NOT match
    ])

    const { entries, truncated, errors } = await fetchSkillPathsFromTree(
      'acme',
      'my-skills',
      'main',
      noTelemetry
    )

    expect(entries).toEqual([
      { path: '', blobSha: 'rootsha' },
      { path: 'tools/foo', blobSha: 'nestedsha' },
    ])
    expect(truncated).toBe(false)
    expect(errors).toHaveLength(0)
  })

  it('does NOT match use-skill.md even when it is the only blob (suffix gate)', async () => {
    stubTreeOnce([makeBlob('use-skill.md', 'usesha')])

    const outcome = await fetchSkillPathsFromTree('acme', 'my-skills', 'main', noTelemetry)

    expect(outcome.entries).toHaveLength(0)
  })
})