Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions .github/workflows/indexer-backfill.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,15 @@ on:
default: '50'
type: string
path_prefix:
description: 'Scope crawl to one path prefix (e.g. .agents/skills); empty = all prefixes'
description: 'Scope crawl to one path prefix (e.g. .agents/skills); empty = broad query (all depths)'
required: false
default: ''
type: string
max_ranges:
description: 'SMI-5286 1c: per-dispatch size-(sub)range budget before a checkpoint+exit (default: 150)'
required: false
default: '150'
type: string
supabase_env:
description: 'Target Supabase environment'
required: false
Expand Down Expand Up @@ -181,6 +186,14 @@ jobs:
RESUME_FROM: ${{ github.event.inputs.resume_from || 'latest' }}
BACKFILL_MAX_SKILLS_PER_REPO: ${{ github.event.inputs.max_skills_per_repo || '50' }}
BACKFILL_PATH_PREFIX: ${{ github.event.inputs.path_prefix || '' }}
# SMI-5286 1c: run ONLY Phase 3 (the size-faceted subdirectory crawl) +
# finalize each dispatch — topic/high-trust are the cron's job, so this
# keeps every backfill dispatch focused and resumable on the facet cursor.
DISCOVERY_PHASE: '3'
# SMI-5286 1c: per-dispatch (sub)range budget — the facet driver writes a
# checkpoint after this many ranges so the run fits the GHA cap; re-dispatch
# with resume_from=latest until facets_remaining=0.
BACKFILL_MAX_RANGES: ${{ github.event.inputs.max_ranges || '150' }}
# Raised caps for backfill mode (per SPARC section #3).
# These override the conservative cron defaults.
CODE_SEARCH_MAX_PAGES: '10'
Expand Down Expand Up @@ -224,6 +237,11 @@ jobs:
CAP_SATURATED=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.cap_saturated // false')
TRUNCATED_REPO_COUNT=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.truncated_repo_count // 0')
TOKEN_SOURCE=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.token_source // "unknown"')
# SMI-5286 1c (M-2): true crawl position. current_facet == 'done' is the
# AUTHORITATIVE terminal signal (facets_remaining alone reads 0 while the
# last facet's bisected sub-ranges still drain).
CURRENT_FACET=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.current_facet // "unknown"')
PENDING_SUBRANGES=$(printf '%s' "$RESPONSE" | jq -r '.data.backfill.pending_subrange_count // 0')

# CRITICAL: token_source must be exactly "pat" on every backfill dispatch.
# An "app" value means the App env entries leaked in (consuming the cron's
Expand All @@ -247,15 +265,19 @@ jobs:
echo "| Facets Total | $FACETS_TOTAL |"
echo "| Facets Completed | $FACETS_COMPLETED |"
echo "| Facets Remaining | $FACETS_REMAINING |"
echo "| Current Facet | $CURRENT_FACET |"
echo "| Pending Sub-ranges | $PENDING_SUBRANGES |"
echo "| Checkpoint ID | $CHECKPOINT_ID |"
echo "| Cap Saturated | $CAP_SATURATED |"
echo "| Truncated Repo Count | $TRUNCATED_REPO_COUNT |"
echo "| Token Source | $TOKEN_SOURCE |"
echo ""
if [ "$FACETS_REMAINING" = "0" ]; then
echo "**TERMINAL CONDITION MET**: facets_remaining == 0. Backfill loop is complete."
# current_facet == 'done' is authoritative: it is set only when the
# ladder AND the bisection frontier are both exhausted (SMI-5286 1c C-1/M-2).
if [ "$CURRENT_FACET" = "done" ]; then
echo "**TERMINAL CONDITION MET**: current_facet == 'done'. Backfill loop is complete."
else
echo "**Backfill continues.** Re-dispatch with resume_from=latest to pick up from checkpoint $CHECKPOINT_ID."
echo "**Backfill continues** (current_facet=$CURRENT_FACET, pending_subranges=$PENDING_SUBRANGES). Re-dispatch with resume_from=latest to pick up from checkpoint $CHECKPOINT_ID."
fi
} >> "$GITHUB_STEP_SUMMARY"

Expand Down
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- **Indexer Backfill Facet Driver** (2026-06-18, SMI-5286 sub-wave 1c): the
out-of-band backfill (`indexer-backfill.yml`) now crawls the full
`filename:SKILL.md` universe past GitHub code-search's 1000-result-per-query
cap by partitioning the broad query into a fixed `size:` byte-range ladder
(`code-search.facets.ts`) with adaptive bisect-on-saturation: any facet whose
`total_count` exceeds the cap is split and its halves crawled before the next
facet, so every file is reachable. The depth-first frontier (facet index +
bisection stack + page) is fully captured by the checkpoint cursor — extended
with `pending_subranges` — so a dispatch boundary mid-bisection resumes
losslessly across the 6h GHA cap. `per_page` raised 30→100; `BACKFILL_PATH_PREFIX`
scopes a one-ecosystem DRY_RUN; `DISCOVERY_PHASE=3` focuses each dispatch on the
Phase-3b crawl + finalize. Also fixes a latent root-`SKILL.md` drop in
`fetchSkillPathsFromTree` (`trees-search.ts`) — repos whose only skill is a root
`SKILL.md` are now emitted as `path:''` instead of silently lost. Gated: the
live (`DRY_RUN=false`) crawl requires explicit operator sign-off.
- **Vendor-Org Trust Tier** (2026-05-02, SMI-4651): GitHub-verified vendor
organizations (Stripe, Notion, Atlassian, Figma, Canva, Zapier, Cloudflare,
and any future verified org) are now auto-promoted to the `curated` trust
Expand Down
168 changes: 165 additions & 3 deletions scripts/indexer/backfill-checkpoint.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,174 @@
*/

import type { SupabaseClient } from '@supabase/supabase-js'
import { type SizeFacet, buildSizeFacets, facetId, bisectFacet } from './code-search.facets.ts'

/**
 * `event_type` discriminator for backfill checkpoint rows in `audit_logs`.
 * Exported as a single constant so the literal is spelled in exactly one place.
 */
export const BACKFILL_CHECKPOINT_EVENT_TYPE = 'indexer_backfill_checkpoint'

/**
 * A persisted size sub-range as a `[lo, hi]` tuple (position 0 = `lo`,
 * position 1 = `hi`). `hi` is `null` when the range is open-ended
 * (`Infinity`) — `Infinity` does NOT survive `JSON.stringify` (it serializes
 * to `null`), so the cursor uses an explicit `null` sentinel and
 * {@link deserializeRange} maps it back to `Number.POSITIVE_INFINITY`.
 * {@link serializeRange} produces this form from a runtime `SizeFacet`.
 */
export type PersistedSubrange = [number, number | null]

/**
 * Resume cursor. `(path, facet, last_page)` lets a re-dispatch resume mid-facet,
 * not just at facet boundaries (SPARC §#5 facet-AND-page granularity).
 * The optional `facet_index` / `pending_subranges` fields additionally capture
 * the bisection frontier; both are optional, and {@link cursorToFacetState}
 * defaults them to `0` / `[]` when absent.
 */
export interface BackfillCursor {
/** The path-prefix being crawled ('' = the broad, no-`path:` query). */
path: string
/** Stable id of the active size facet/sub-range ({@link facetId}); 'done' when complete. */
facet: string
/** Last code-search page consumed within the current (sub)range (1-based; 0 = none yet). */
last_page: number
/**
 * SMI-5286 1c: 0-based index of the next top-level facet to process in the
 * static {@link buildSizeFacets} ladder. Incremented when a top-level facet is
 * RETIRED — either fully drained OR bisected (its sub-ranges, tracked in
 * `pending_subranges`, then cover it). So it counts top-level facets whose
 * coverage is committed, NOT necessarily finished crawling; use `current_facet`
 * / `pending_subrange_count` (in the run summary) to tell 'bisecting' from 'done'.
 */
facet_index?: number
/**
 * SMI-5286 1c: the in-progress bisection frontier — sub-ranges of the current
 * facet not yet fully crawled (DFS stack; the LAST element is crawled next).
 * Persisted so a dispatch boundary mid-bisection resumes without losing
 * not-yet-crawled sub-ranges (the bare `(path,facet,last_page)` cursor cannot
 * represent a partial bisection tree, C-2).
 */
pending_subranges?: PersistedSubrange[]
}

/**
 * Convert a runtime {@link SizeFacet} into the JSON-safe tuple stored in the
 * cursor. A non-finite upper bound (the open-ended `Infinity` case) is written
 * as `null`, because `JSON.stringify` cannot represent `Infinity`.
 *
 * @param facet - The runtime range to persist.
 * @returns `[lo, hi]`, with `hi` replaced by `null` when it is not finite.
 */
function serializeRange(facet: SizeFacet): PersistedSubrange {
  const { lo, hi } = facet
  if (!Number.isFinite(hi)) return [lo, null]
  return [lo, hi]
}

/**
 * Rebuild a runtime {@link SizeFacet} from its persisted tuple form. A missing
 * upper bound (`null`, the sentinel for an open-ended range) is restored to
 * `Number.POSITIVE_INFINITY`; any other value, including `0`, is kept as-is.
 *
 * @param persisted - The `[lo, hi]` tuple read back from the cursor.
 * @returns The equivalent runtime range.
 */
function deserializeRange(persisted: PersistedSubrange): SizeFacet {
  const [lo, upper] = persisted
  return { lo, hi: upper ?? Number.POSITIVE_INFINITY }
}

/**
 * Runtime crawl frontier reconstructed from a {@link BackfillCursor}. The facet
 * driver is a depth-first walk of the static size ladder: each top-level facet
 * that saturates the 1000-result cap is bisected into `pendingSubranges`, which
 * are drained (themselves bisecting further) before the top-level facet at
 * `facetIndex` is crawled.
 */
export interface FacetCrawlState {
/**
 * Index into {@link buildSizeFacets}. While `pendingSubranges` is empty this is
 * the top-level facet being crawled; once a top-level facet has been bisected
 * the index has already moved past it (see {@link bisectCurrentFacet}), so it
 * then names the NEXT top-level facet to crawl after the stack drains.
 */
facetIndex: number
/**
 * DFS stack of sub-ranges still to crawl; the LAST element (top of the stack)
 * is crawled next (see {@link currentFacetRange}).
 */
pendingSubranges: SizeFacet[]
/** Last page consumed within the current (sub)range (0 = none). */
lastPage: number
}

/**
 * Build the runtime crawl frontier for a dispatch.
 *
 * With no cursor (cold start) the walk begins at the first ladder facet with an
 * empty bisection stack and no pages consumed. With a cursor, each field is
 * restored from its persisted counterpart; fields the cursor lacks fall back to
 * the cold-start value.
 *
 * @param cursor - The persisted cursor, or `null`/`undefined` for a cold start.
 * @returns A fresh, mutable {@link FacetCrawlState}.
 */
export function cursorToFacetState(cursor: BackfillCursor | null | undefined): FacetCrawlState {
  const state: FacetCrawlState = { facetIndex: 0, pendingSubranges: [], lastPage: 0 }
  if (cursor) {
    state.facetIndex = cursor.facet_index ?? 0
    const persistedStack = cursor.pending_subranges ?? []
    state.pendingSubranges = persistedStack.map(deserializeRange)
    state.lastPage = cursor.last_page ?? 0
  }
  return state
}

/**
 * Resolve which range the crawl is positioned on.
 *
 * The bisection stack takes priority: if any sub-range is pending, the one on
 * top of the stack (its last element) is returned. Otherwise the top-level
 * ladder facet at `facetIndex` is returned, or `null` when the index has run
 * past the end of the ladder.
 *
 * @param state - The crawl frontier.
 * @param facets - The top-level ladder; defaults to {@link buildSizeFacets}.
 * @returns The active range, or `null` once nothing is left to crawl.
 */
export function currentFacetRange(
  state: FacetCrawlState,
  facets: SizeFacet[] = buildSizeFacets()
): SizeFacet | null {
  const { pendingSubranges: stack, facetIndex } = state
  const depth = stack.length
  if (depth > 0) return stack[depth - 1]
  return facetIndex < facets.length ? facets[facetIndex] : null
}

/**
 * Split the current saturated range in two and queue both halves, lower half
 * first. The page cursor is reset because a new range is about to be crawled.
 *
 * The range being split is retired so it can never be crawled again:
 * - if the bisection stack is non-empty, the top entry (the sub-range being
 *   split) is popped;
 * - if the stack is empty, the range was the top-level facet, so `facetIndex`
 *   moves past it. Without that step {@link currentFacetRange} would hand the
 *   same top-level facet back once the halves drained, it would saturate again,
 *   and the crawl would loop forever without advancing `facets_completed`
 *   (governance C-1).
 *
 * @param state - The crawl frontier; mutated in place on success.
 * @param range - The saturated range to split.
 * @returns `true` when the range was split; `false` when `bisectFacet` yields
 *   no halves, in which case `state` is untouched (the caller then records
 *   truncation and advances).
 */
export function bisectCurrentFacet(state: FacetCrawlState, range: SizeFacet): boolean {
  const split = bisectFacet(range)
  if (split) {
    const stack = state.pendingSubranges
    if (stack.length === 0) {
      // Top-level facet: its halves now cover it, so step the ladder index past it.
      state.facetIndex++
    } else {
      // Sub-range: drop it from the stack before its halves replace it.
      stack.pop()
    }
    const lower = split[0]
    const upper = split[1]
    // LIFO: pushing the upper half first leaves the lower half on top, so it is crawled next.
    stack.push(upper, lower)
    state.lastPage = 0
    return true
  }
  return false
}

/**
 * Step past the range that was just finished (exhausted, or saturated but not
 * bisectable). A pending sub-range is removed from the top of the bisection
 * stack; with an empty stack the top-level facet index moves forward instead.
 * Either way the page cursor returns to 0 for the next range.
 *
 * @param state - The crawl frontier; mutated in place.
 */
export function advanceFacet(state: FacetCrawlState): void {
  const stack = state.pendingSubranges
  if (stack.length === 0) {
    state.facetIndex++
  } else {
    stack.pop()
  }
  state.lastPage = 0
}

/**
 * Report whether the crawl has nothing left to do: the ladder index has run off
 * the end of the top-level facets AND no bisected sub-range is still pending.
 *
 * @param state - The crawl frontier.
 * @param facets - The top-level ladder; defaults to {@link buildSizeFacets}.
 * @returns `true` only when both the ladder and the bisection stack are exhausted.
 */
export function isFacetCrawlDone(
  state: FacetCrawlState,
  facets: SizeFacet[] = buildSizeFacets()
): boolean {
  const ladderExhausted = state.facetIndex >= facets.length
  if (!ladderExhausted) return false
  return state.pendingSubranges.length === 0
}

/**
 * Turn the runtime crawl frontier into the {@link BackfillCursor} that gets
 * persisted.
 *
 * The cursor's `facet` is the id of the range the crawl is positioned on, or
 * the literal 'done' when {@link currentFacetRange} reports nothing left. The
 * bisection stack is converted entry by entry to its JSON-safe form.
 *
 * @param state - The crawl frontier to persist.
 * @param pathPrefix - Stored verbatim as the cursor's `path`.
 * @param facets - The top-level ladder; defaults to {@link buildSizeFacets}.
 * @returns The cursor describing `state`.
 */
export function facetStateToCursor(
  state: FacetCrawlState,
  pathPrefix: string,
  facets: SizeFacet[] = buildSizeFacets()
): BackfillCursor {
  const active = currentFacetRange(state, facets)
  const activeId = active ? facetId(active) : 'done'
  const frontier = state.pendingSubranges.map(serializeRange)
  return {
    path: pathPrefix,
    facet: activeId,
    last_page: state.lastPage,
    facet_index: state.facetIndex,
    pending_subranges: frontier,
  }
}

/**
 * The outcome of one dispatch's facet crawl: the advanced cursor to persist, a
 * terminal flag, and the operator-observable counters. Lives here (not in
 * `subdirectory-search.ts`) so `indexer-types.ts` can reference it without
 * importing the search module.
 */
export interface BackfillCrawlOutcome {
/** The advanced cursor to persist for the next dispatch. */
cursor: BackfillCursor
/** Terminal flag for the crawl. */
done: boolean
/** Operator-observable flag. NOTE(review): presumably set when a range hit the result cap and could not be split — confirm against the driver. */
cap_saturated: boolean
/** Operator-observable counter. NOTE(review): exact counting rule is defined by the driver, not visible here. */
truncated_repo_count: number
/** Operator-observable counter. NOTE(review): presumably mirrors the cursor's `facet_index` (retired top-level facets) — confirm. */
facets_completed: number
/** Operator-observable counter. NOTE(review): presumably the length of the top-level facet ladder — confirm. */
facets_total: number
/** Operator-observable counter. NOTE(review): presumably the number of (sub)ranges crawled in this dispatch — confirm. */
ranges_crawled: number
}

/**
Expand Down Expand Up @@ -113,6 +266,15 @@ export interface BackfillSummary {
facets_remaining: number
cap_saturated: boolean
truncated_repo_count: number
/**
* SMI-5286 1c (M-2): true crawl position. `facets_remaining` is coarse — it
* reads 0 once the last top-level facet is retired even while its bisected
* sub-ranges are still draining. `current_facet` (the active (sub)range id, or
* 'done') + `pending_subrange_count` (bisection-frontier depth) let the operator
* distinguish "finished" from "still bisecting".
*/
current_facet?: string
pending_subrange_count?: number
}

/**
Expand Down
Loading
Loading