diff --git a/batteries/skills/kata-close/SKILL.md b/batteries/skills/kata-close/SKILL.md index 2fd101b..12aa763 100644 --- a/batteries/skills/kata-close/SKILL.md +++ b/batteries/skills/kata-close/SKILL.md @@ -84,7 +84,27 @@ gh issue comment {issue_number} --body "{comment_body}" Commit and push only. Tests are not required for research mode — skip step 2. -No PR creation, no issue update. +No PR creation. + +Create a GitHub issue to capture the research findings and any follow-up work: + +```bash +gh issue create \ + --title "{research_title}" \ + --body "## Summary +{research_summary} + +## Findings +{key_findings} + +## Follow-up +{followup_items} + +Research doc: {research_doc_path}" \ + --label research +``` + +Use the research document's title and top-level summary for `{research_title}` and `{research_summary}`. Link the created issue number back in the commit message or as a follow-up comment if needed. ### If in planning mode diff --git a/eval/assertions.test.ts b/eval/assertions.test.ts index 42f8590..a8c6913 100644 --- a/eval/assertions.test.ts +++ b/eval/assertions.test.ts @@ -6,7 +6,8 @@ */ import { describe, it, expect, afterAll } from 'bun:test' -import { mkdirSync, writeFileSync, rmSync } from 'node:fs' +import { mkdirSync, writeFileSync, rmSync, mkdtempSync } from 'node:fs' +import { execSync } from 'node:child_process' import { join } from 'node:path' import { homedir, tmpdir } from 'node:os' import type { EvalContext } from './harness.js' @@ -52,6 +53,8 @@ import { assertSkillReadOrder, assertSkillNotRead, skillActivationPresets, + assertTwoCommitsSinceStart, + assertCommitsScopedToEachSession, } from './assertions.js' import type { SessionState } from '../src/state/schema.js' @@ -65,6 +68,7 @@ function mockContext(overrides: { baselineRef?: string | null sessionId?: string | null transcriptPath?: string | null + startSha?: string | null }): EvalContext { const files = overrides.files ?? {} const dirs = overrides.dirs ?? 
// ─── Multi-Session Commit Scoping Assertions ────────────────────────────────

/** Temp repositories created by initGitRepo(); removed by the afterAll hook below. */
const TRACKER_TMP_DIRS: string[] = []

/**
 * EvalContext backed by a real directory on disk.
 *
 * `run` executes the command with `projectDir` as cwd and returns its stdout,
 * or '' when the command fails. `fileExists` / `readFile` shell out to
 * `test -e` / `cat` on the path joined onto `projectDir`. Session state is
 * always null and `listDir` always returns an empty list.
 */
function realCtx(projectDir: string, startSha: string | null): EvalContext {
  // JSON-quote the joined path before interpolating it into a shell command.
  const quoted = (rel: string): string => JSON.stringify(join(projectDir, rel))

  const run = (cmd: string): string => {
    try {
      return execSync(cmd, {
        cwd: projectDir,
        encoding: 'utf-8',
        stdio: ['pipe', 'pipe', 'pipe'],
      })
    } catch {
      return ''
    }
  }

  const fileExists = (rel: string): boolean => {
    try {
      const flag = execSync(`test -e ${quoted(rel)} && echo 1 || echo 0`, {
        encoding: 'utf-8',
      })
      return flag.trim() === '1'
    } catch {
      return false
    }
  }

  const readFile = (rel: string): string => {
    try {
      return execSync(`cat ${quoted(rel)}`, { encoding: 'utf-8' })
    } catch {
      return ''
    }
  }

  return {
    projectDir,
    baselineRef: null,
    sessionId: null,
    transcriptPath: null,
    startSha,
    getSessionState: () => null,
    run,
    fileExists,
    readFile,
    listDir: () => [],
  }
}

/**
 * Create a throwaway git repository on branch `main` holding a single empty
 * "init" commit, register it for cleanup, and return its path.
 */
function initGitRepo(): string {
  const repo = mkdtempSync(join(tmpdir(), 'kata-assert-'))
  TRACKER_TMP_DIRS.push(repo)
  const setupCommands = [
    'git init -b main',
    'git config user.email "test@t.t"',
    'git config user.name "t"',
    'git config commit.gpgsign false',
    'git commit --allow-empty -m "init"',
  ]
  for (const cmd of setupCommands) {
    execSync(cmd, { cwd: repo, stdio: 'pipe' })
  }
  return repo
}

/** Resolve the SHA that HEAD currently points at in `dir`. */
function getHeadSha(dir: string): string {
  const out = execSync('git rev-parse HEAD', { cwd: dir, encoding: 'utf-8' })
  return out.trim()
}

/**
 * Write `content` to `relPath` inside `dir` (creating parent directories),
 * stage that one path, commit it with `msg`, and return the new HEAD SHA.
 */
function writeAndCommit(dir: string, relPath: string, content: string, msg: string): string {
  const target = join(dir, relPath)
  mkdirSync(join(target, '..'), { recursive: true })
  writeFileSync(target, content)

  const git = (args: string): void => {
    execSync(`git ${args}`, { cwd: dir, stdio: 'pipe' })
  }
  git(`add ${JSON.stringify(relPath)}`)
  git(`commit -m ${JSON.stringify(msg)}`)
  return getHeadSha(dir)
}

/**
 * Write a .kata/sessions/<id>/ directory containing state.json and edits.jsonl.
 * Every entry of `editFiles` becomes one `Write` record stamped with `startedAt`.
 */
function writeSessionDir(
  projectDir: string,
  id: string,
  startedAt: string,
  editFiles: string[],
  currentMode: string = 'task',
): void {
  const sessionDir = join(projectDir, '.kata', 'sessions', id)
  mkdirSync(sessionDir, { recursive: true })

  const state = {
    sessionId: id,
    currentMode,
    startedAt,
    modeState: { [currentMode]: { enteredAt: startedAt } },
    modeHistory: [{ mode: currentMode, enteredAt: startedAt }],
  }
  writeFileSync(join(sessionDir, 'state.json'), JSON.stringify(state, null, 2))

  // One JSON record per line, each newline-terminated; no edits yields an empty file.
  const jsonl = editFiles
    .map((file) => `${JSON.stringify({ file, tool: 'Write', ts: startedAt })}\n`)
    .join('')
  writeFileSync(join(sessionDir, 'edits.jsonl'), jsonl)
}

afterAll(() => {
  TRACKER_TMP_DIRS.forEach((dir) => {
    rmSync(dir, { recursive: true, force: true })
  })
})

describe('assertTwoCommitsSinceStart', () => {
  /** Evaluate the checkpoint against a git-backed context rooted at `dir`. */
  const check = (dir: string, startSha: string | null) =>
    assertTwoCommitsSinceStart().assert(realCtx(dir, startSha))

  it('fails when startSha is null', async () => {
    const repo = initGitRepo()
    const outcome = await check(repo, null)
    expect(outcome).toContain('No startSha set')
  })

  it('fails when zero commits since start', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    const outcome = await check(repo, startSha)
    expect(outcome).toContain('Expected 2 non-merge commits')
    expect(outcome).toContain('got 0')
  })

  it('fails when only one commit since start', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    writeAndCommit(repo, 'a.txt', 'a', 'add a')
    const outcome = await check(repo, startSha)
    expect(outcome).toContain('got 1')
  })

  it('passes when exactly two non-merge commits since start (tc2)', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    writeAndCommit(repo, 'a.txt', 'a', 'add a')
    writeAndCommit(repo, 'b.txt', 'b', 'add b')
    const outcome = await check(repo, startSha)
    expect(outcome).toBeNull()
  })

  it('fails when three commits since start (tc2)', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    writeAndCommit(repo, 'a.txt', 'a', 'add a')
    writeAndCommit(repo, 'b.txt', 'b', 'add b')
    writeAndCommit(repo, 'c.txt', 'c', 'add c')
    const outcome = await check(repo, startSha)
    expect(outcome).toContain('got 3')
  })

  it('passes with two feature commits + one merge commit (tc5)', async () => {
    // main gets a.txt, a `side` branch cut from the start commit gets b.txt,
    // then `side` is merged into main with --no-ff so a merge commit sits on top.
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    const quiet = { cwd: repo, stdio: 'pipe' } as const

    writeAndCommit(repo, 'a.txt', 'a', 'add a')
    execSync(`git checkout -b side ${startSha}`, quiet)
    writeAndCommit(repo, 'b.txt', 'b', 'add b')
    execSync('git checkout main', quiet)
    execSync('git merge --no-ff side -m "merge side"', quiet)

    const outcome = await check(repo, startSha)
    expect(outcome).toBeNull()
  })
})
describe('assertCommitsScopedToEachSession', () => {
  /** Evaluate the checkpoint against a git-backed context rooted at `dir`. */
  const check = (dir: string, startSha: string | null) =>
    assertCommitsScopedToEachSession().assert(realCtx(dir, startSha))

  /** ISO timestamp `offsetMs` away from the committer time of `sha`. */
  const isoFromCommit = (dir: string, sha: string, offsetMs: number): string => {
    const committedIso = execSync(`git show -s --format=%cI ${sha}`, {
      cwd: dir,
      encoding: 'utf-8',
    }).trim()
    return new Date(Date.parse(committedIso) + offsetMs).toISOString()
  }

  const ONE_MINUTE_MS = 60_000

  it('fails when startSha is null', async () => {
    const repo = initGitRepo()
    const outcome = await check(repo, null)
    expect(outcome).toContain('No startSha set')
  })

  it('passes with two sessions and two disjoint commits (tc1)', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    writeAndCommit(repo, 'src/foo.ts', 'export const foo = 42', 'add foo')
    writeAndCommit(repo, 'src/bar.ts', 'export const bar = "hi"', 'add bar')

    // Both sessions start one minute after the scenario-start commit.
    const startedAt = isoFromCommit(repo, startSha, ONE_MINUTE_MS)
    writeSessionDir(repo, 'sess-a', startedAt, ['src/foo.ts'])
    writeSessionDir(repo, 'sess-b', startedAt, ['src/bar.ts'])

    const outcome = await check(repo, startSha)
    expect(outcome).toBeNull()
  })

  it('fails with diagnostic naming foreign path (tc7)', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    const quiet = { cwd: repo, stdio: 'pipe' } as const

    // One commit carries foo.ts (tracked by sess-a) and naughty.ts (tracked by nobody).
    mkdirSync(join(repo, 'src'), { recursive: true })
    writeFileSync(join(repo, 'src/foo.ts'), 'export const foo = 42')
    writeFileSync(join(repo, 'src/naughty.ts'), 'export const naughty = true')
    execSync('git add src/foo.ts src/naughty.ts', quiet)
    execSync('git commit -m "leaked naughty"', quiet)
    // The other session's commit.
    writeAndCommit(repo, 'src/bar.ts', 'b', 'add bar')

    const startedAt = isoFromCommit(repo, startSha, ONE_MINUTE_MS)
    writeSessionDir(repo, 'sess-a', startedAt, ['src/foo.ts'])
    writeSessionDir(repo, 'sess-b', startedAt, ['src/bar.ts'])

    const outcome = await check(repo, startSha)
    expect(outcome).toContain('foreign path')
    expect(outcome).toContain('src/naughty.ts')
    expect(outcome).toContain('sess-a')
  })

  it('fails when a session edits-set intersects zero commits (tc3a)', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    writeAndCommit(repo, 'src/foo.ts', 'foo', 'add foo')
    writeAndCommit(repo, 'src/bar.ts', 'bar', 'add bar')

    // sess-c only edited a file that no commit contains.
    const startedAt = isoFromCommit(repo, startSha, ONE_MINUTE_MS)
    writeSessionDir(repo, 'sess-c', startedAt, ['src/orphan.ts'])

    const outcome = await check(repo, startSha)
    expect(outcome).toContain('matched 0 commit')
    expect(outcome).toContain('sess-c')
    expect(outcome).toContain('Candidate commits')
    expect(outcome).toContain('src/foo.ts')
    expect(outcome).toContain('src/bar.ts')
  })

  it('fails when a session edits-set intersects multiple commits (tc3b)', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    // Both commits touch the same file.
    writeAndCommit(repo, 'shared.ts', 'v1', 'first')
    writeAndCommit(repo, 'shared.ts', 'v2', 'second')

    const startedAt = isoFromCommit(repo, startSha, ONE_MINUTE_MS)
    writeSessionDir(repo, 'sess-multi', startedAt, ['shared.ts'])

    const outcome = await check(repo, startSha)
    expect(outcome).toContain('matched 2 commit')
    expect(outcome).toContain('sess-multi')
  })

  it('allows *.tsbuildinfo via ALLOWLIST glob (tc4)', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    const quiet = { cwd: repo, stdio: 'pipe' } as const

    // One commit carries the tracked file plus a *.tsbuildinfo companion.
    mkdirSync(join(repo, 'src'), { recursive: true })
    writeFileSync(join(repo, 'src/api.ts'), 'export const api = true')
    writeFileSync(join(repo, 'src/api.tsbuildinfo'), '{"buildInfo":true}')
    execSync('git add src/api.ts src/api.tsbuildinfo', quiet)
    execSync('git commit -m "api + buildinfo"', quiet)
    // Second commit, so the two-commit invariant would hold as well.
    writeAndCommit(repo, 'src/other.ts', 'o', 'other')

    // edits.jsonl lists api.ts only; the tsbuildinfo file is covered by the glob.
    const startedAt = isoFromCommit(repo, startSha, ONE_MINUTE_MS)
    writeSessionDir(repo, 'sess-api', startedAt, ['src/api.ts'])
    writeSessionDir(repo, 'sess-other', startedAt, ['src/other.ts'])

    const outcome = await check(repo, startSha)
    expect(outcome).toBeNull()
  })

  it('drops stale session whose startedAt is before scenario start (tc6)', async () => {
    const repo = initGitRepo()
    const startSha = getHeadSha(repo)
    writeAndCommit(repo, 'src/foo.ts', 'foo', 'add foo')
    writeAndCommit(repo, 'src/bar.ts', 'bar', 'add bar')

    const startedAt = isoFromCommit(repo, startSha, ONE_MINUTE_MS)
    const staleStartedAt = isoFromCommit(repo, startSha, -7 * 24 * 60 * 60 * 1000) // 7 days before

    writeSessionDir(repo, 'sess-a', startedAt, ['src/foo.ts'])
    writeSessionDir(repo, 'sess-b', startedAt, ['src/bar.ts'])
    // The stale session references a file that is in no commit. Were it not
    // filtered out, its edits-set would intersect zero commits and fail the check.
    writeSessionDir(repo, 'sess-stale', staleStartedAt, ['src/ancient.ts'])

    const outcome = await check(repo, startSha)
    expect(outcome).toBeNull()
  })
})
/**
 * Checkpoint: exactly 2 non-merge commits exist in `ctx.startSha..HEAD`.
 *
 * Fails when `ctx.startSha` is unset or when the count differs from 2; the
 * failure message lists the one-line log of the commits that were found.
 * An unparsable count (e.g. empty output from `ctx.run`) is reported as 0.
 */
export function assertTwoCommitsSinceStart(): EvalCheckpoint {
  return {
    name: 'git: exactly 2 non-merge commits since scenario start',
    assert(ctx: EvalContext) {
      const { startSha } = ctx
      if (!startSha) {
        return fail('No startSha set — this assertion requires the scenario-start SHA')
      }

      const countRaw = ctx.run(`git rev-list --count --no-merges ${startSha}..HEAD`)
      const count = Number.parseInt((countRaw ?? '').trim(), 10)
      if (count === 2) return pass()

      const subjects = ctx.run(`git log --oneline --no-merges ${startSha}..HEAD`)?.trim() ?? ''
      return fail(
        `Expected 2 non-merge commits since ${startSha.slice(0, 8)}, got ${Number.isNaN(count) ? 0 : count}.\n` +
          `Recent commits:\n${subjects || '(none)'}`,
      )
    },
  }
}

/**
 * Minimal glob matcher for ALLOWLIST patterns.
 *
 * `*` is the only wildcard and matches any substring, path separators
 * included. Every other character — including `?`, `.`, `+` and brackets —
 * is matched literally. A pattern without `*` must equal the file exactly.
 *
 * @param file     Path to test.
 * @param patterns Allowlist entries; an empty list matches nothing.
 * @returns true when at least one pattern matches the whole of `file`.
 */
function matchesAllowlist(file: string, patterns: readonly string[]): boolean {
  return patterns.some((pattern) => {
    if (!pattern.includes('*')) return file === pattern
    // Escape every regex metacharacter except `*` (`?` included, so it stays a
    // literal rather than a quantifier), then expand `*` into `.*`.
    const source = pattern.replace(/[.+?^${}()|[\]\\]/g, '\\$&').replace(/\*/g, '.*')
    return new RegExp(`^${source}$`).test(file)
  })
}

/**
 * Files permitted in a session's commit that aren't in the session's edits.jsonl.
 * Glob syntax: `*` matches any substring.
 */
const COMMIT_SCOPE_ALLOWLIST: readonly string[] = [
  'bun.lockb',
  'bun.lock',
  'package-lock.json',
  '*.tsbuildinfo',
]
/** A session kept by the scenario-start filter, paired with the paths in its edits log. */
interface SurvivingSession {
  id: string
  edits: Set<string>
}

/** Narrow a parsed JSON value to a non-null object so its keys can be read safely. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null
}

/** Split command output into trimmed, non-empty lines. */
function splitNonEmptyLines(raw: string): string[] {
  return raw
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean)
}

/**
 * Session start in epoch milliseconds: top-level `startedAt` when it is a
 * non-empty string, otherwise `modeState[currentMode].enteredAt`. Returns
 * undefined when neither is a parsable timestamp string.
 */
function readSessionStartMs(state: Record<string, unknown>): number | undefined {
  const { startedAt, currentMode, modeState } = state
  let iso: string | undefined
  if (typeof startedAt === 'string' && startedAt !== '') {
    iso = startedAt
  } else if (typeof currentMode === 'string' && isJsonObject(modeState)) {
    const entry = modeState[currentMode]
    if (isJsonObject(entry) && typeof entry.enteredAt === 'string') {
      iso = entry.enteredAt
    }
  }
  if (!iso) return undefined
  const ms = Date.parse(iso)
  return Number.isNaN(ms) ? undefined : ms
}

/**
 * Keep the session directories whose state.json is readable and whose start
 * time is at or after `startMs`; entries that are not directories, lack a
 * state.json, or hold unusable JSON are skipped silently.
 */
function collectSurvivingSessions(
  sessionsRoot: string,
  ids: readonly string[],
  startMs: number,
): SurvivingSession[] {
  const surviving: SurvivingSession[] = []
  for (const id of ids) {
    const dir = join(sessionsRoot, id)
    try {
      if (!statSync(dir).isDirectory()) continue
    } catch {
      continue
    }

    const statePath = join(dir, 'state.json')
    if (!existsSync(statePath)) continue
    let state: unknown
    try {
      state = JSON.parse(readFileSync(statePath, 'utf-8'))
    } catch {
      continue
    }
    // state.json may hold valid JSON that is not an object (e.g. `null`).
    if (!isJsonObject(state)) continue

    const sessionMs = readSessionStartMs(state)
    // Drop stale sessions whose start is BEFORE the scenario-start.
    if (sessionMs === undefined || sessionMs < startMs) continue

    // Sessions with an empty edits set are kept on purpose: the matching loop
    // then reports them by name together with the candidate commits.
    surviving.push({ id, edits: readEditsSet(dir) })
  }
  return surviving
}

/** Map each commit SHA to the paths it changed, as listed by `git show --name-only`. */
function changedFilesBySha(ctx: EvalContext, shas: readonly string[]): Map<string, string[]> {
  const filesBySha = new Map<string, string[]>()
  for (const sha of shas) {
    // core.quotepath=off keeps non-ASCII paths verbatim so they can equal
    // entries of the edits set instead of arriving octal-escaped and quoted.
    const raw = ctx.run(`git -c core.quotepath=off show --name-only --format= ${sha}`) ?? ''
    filesBySha.set(sha, splitNonEmptyLines(raw))
  }
  return filesBySha
}

/**
 * Assert that each active session's commit contains only files belonging to
 * that session's edits.jsonl (∪ framework allowlist).
 *
 * Algorithm:
 *   1. For each session dir under .kata/sessions/, read state.json; keep those
 *      whose startedAt (fallback: modeState[currentMode].enteredAt) is ≥
 *      the committer timestamp of ctx.startSha.
 *   2. For each surviving session S, read E_S = readEditsSet(sessionDir).
 *   3. Enumerate candidate SHAs via `git rev-list --no-merges startSha..HEAD`.
 *      For each SHA, compute F = files changed via `git show --name-only`.
 *   4. Require that exactly one commit's file-set intersects E_S.
 *   5. Require F ⊆ E_S ∪ ALLOWLIST for that commit.
 *
 * Sessions are checked in directory-listing order and the first violation is
 * returned; later sessions are not evaluated once one fails.
 */
export function assertCommitsScopedToEachSession(): EvalCheckpoint {
  return {
    name: 'each session commit is scoped to its own edits.jsonl',
    assert(ctx: EvalContext) {
      const { startSha } = ctx
      if (!startSha) {
        return fail('No startSha set — this assertion requires the scenario-start SHA')
      }
      const shortStart = startSha.slice(0, 8)

      // Scenario-start committer timestamp (ISO)
      const startIso = ctx.run(`git show -s --format=%cI ${startSha}`)?.trim()
      if (!startIso) {
        return fail(`Could not resolve committer timestamp for startSha ${shortStart}`)
      }
      const startMs = Date.parse(startIso)
      if (Number.isNaN(startMs)) {
        return fail(`Could not parse scenario-start timestamp: '${startIso}'`)
      }

      // Enumerate session directories
      const sessionsRoot = join(ctx.projectDir, '.kata', 'sessions')
      if (!existsSync(sessionsRoot)) {
        return fail(`No .kata/sessions directory at ${sessionsRoot}`)
      }
      let sessionEntries: string[]
      try {
        sessionEntries = readdirSync(sessionsRoot)
      } catch (err) {
        return fail(`Cannot read ${sessionsRoot}: ${err instanceof Error ? err.message : String(err)}`)
      }

      const surviving = collectSurvivingSessions(sessionsRoot, sessionEntries, startMs)
      if (surviving.length === 0) {
        return fail('No active sessions survived scenario-start timestamp filter')
      }

      // Candidate commit SHAs (non-merges since startSha)
      const candidates = splitNonEmptyLines(
        ctx.run(`git rev-list --no-merges ${startSha}..HEAD`) ?? '',
      )
      if (candidates.length === 0) {
        return fail(
          `No candidate commits since ${shortStart} — ` +
            `expected at least one per surviving session (${surviving.length})`,
        )
      }

      const filesBySha = changedFilesBySha(ctx, candidates)

      // Match each session to exactly one commit, then check that commit's scope.
      for (const session of surviving) {
        const matched = candidates.filter((sha) =>
          (filesBySha.get(sha) ?? []).some((file) => session.edits.has(file)),
        )
        const [sha] = matched
        if (matched.length !== 1 || sha === undefined) {
          const candidateLines = candidates
            .map((c) => `  ${c.slice(0, 8)}: [${(filesBySha.get(c) ?? []).join(', ')}]`)
            .join('\n')
          return fail(
            `Session ${session.id} matched ${matched.length} commit(s); expected 1.\n` +
              `Session edits: [${[...session.edits].join(', ')}]\n` +
              `Candidate commits:\n${candidateLines}`,
          )
        }

        const foreign = (filesBySha.get(sha) ?? []).filter(
          (file) => !session.edits.has(file) && !matchesAllowlist(file, COMMIT_SCOPE_ALLOWLIST),
        )
        if (foreign.length > 0) {
          return fail(
            `Session ${session.id} commit ${sha.slice(0, 8)}: foreign path(s): ${foreign.join(', ')}`,
          )
        }
      }

      return pass()
    },
  }
}
// Temp project directories handed out by makeTmpProject(); removed after the suite.
const tmpDirs: string[] = []

afterAll(() => {
  tmpDirs.forEach((dir) => {
    try {
      rmSync(dir, { recursive: true, force: true })
    } catch {
      // ignore
    }
  })
})

/** Create a fresh temp directory to act as a project root and register it for cleanup. */
function makeTmpProject(): string {
  const projectRoot = mkdtempSync(join(tmpdir(), 'kata-harness-'))
  tmpDirs.push(projectRoot)
  return projectRoot
}

// Integration test for the full two-agent query() dispatch is deferred
// to the live eval run (VP1 in spec 64). Unit-testing SDK dispatch
// requires invasive stubbing; VP1 validates this end-to-end.
describe('runScenario invariants', () => {
  // Message thrown whenever prompt/agents are both set or both missing.
  const EXCLUSIVITY_ERROR = 'EvalScenario must define exactly one of prompt or agents'

  it('throws when both prompt and agents are set', async () => {
    const bothSet: EvalScenario = {
      id: 'invalid-both',
      name: 'invalid: both prompt and agents',
      prompt: 'do a thing',
      agents: [{ prompt: 'agent prompt' }],
      checkpoints: [],
    }
    await expect(runScenario(bothSet)).rejects.toThrow(EXCLUSIVITY_ERROR)
  })

  it('throws when neither prompt nor agents is set', async () => {
    const neitherSet: EvalScenario = {
      id: 'invalid-neither',
      name: 'invalid: neither prompt nor agents',
      checkpoints: [],
    }
    await expect(runScenario(neitherSet)).rejects.toThrow(EXCLUSIVITY_ERROR)
  })

  it('throws when agents is set to an empty array and no prompt', async () => {
    const emptyAgents: EvalScenario = {
      id: 'invalid-empty-agents',
      name: 'invalid: empty agents',
      agents: [],
      checkpoints: [],
    }
    await expect(runScenario(emptyAgents)).rejects.toThrow(EXCLUSIVITY_ERROR)
  })
})

describe('buildContext.getSessionState', () => {
  /** Write <sessionsDir>/<id>/state.json; the session directory must already exist. */
  const writeState = (
    sessionsDir: string,
    id: string,
    currentMode: string,
    updatedAt: string,
  ): void => {
    writeFileSync(
      join(sessionsDir, id, 'state.json'),
      JSON.stringify({ sessionId: id, currentMode, updatedAt }),
    )
  }

  /** Project with session-a (task, Jan 1) and session-b (planning, Jan 2). */
  const projectWithTwoSessions = (): string => {
    const projectDir = makeTmpProject()
    const sessionsDir = join(projectDir, '.kata', 'sessions')
    mkdirSync(join(sessionsDir, 'session-a'), { recursive: true })
    mkdirSync(join(sessionsDir, 'session-b'), { recursive: true })
    writeState(sessionsDir, 'session-a', 'task', '2026-01-01T00:00:00Z')
    writeState(sessionsDir, 'session-b', 'planning', '2026-01-02T00:00:00Z')
    return projectDir
  }

  it('default scan returns the most recently updated session', () => {
    const ctx = buildContext(projectWithTwoSessions())
    const latest = ctx.getSessionState()
    expect(latest).not.toBeNull()
    expect(latest?.sessionId).toBe('session-b')
    expect(latest?.currentMode).toBe('planning')
  })

  it('returns the specific session when sessionId is provided', () => {
    const ctx = buildContext(projectWithTwoSessions())
    // An explicit id wins even though session-b carries the later updatedAt.
    const requested = ctx.getSessionState('session-a')
    expect(requested).not.toBeNull()
    expect(requested?.sessionId).toBe('session-a')
    expect(requested?.currentMode).toBe('task')
  })

  it('returns null for unknown sessionId', () => {
    const projectDir = makeTmpProject()
    const sessionsDir = join(projectDir, '.kata', 'sessions')
    mkdirSync(join(sessionsDir, 'session-a'), { recursive: true })
    writeState(sessionsDir, 'session-a', 'task', '2026-01-01T00:00:00Z')

    const ctx = buildContext(projectDir)
    expect(ctx.getSessionState('does-not-exist')).toBeNull()
  })

  it('startSha is exposed on the context', () => {
    const ctx = buildContext(makeTmpProject(), null, null, null, 'abc123')
    expect(ctx.startSha).toBe('abc123')
  })
})
/**
 * One agent of a multi-agent scenario (an element of `EvalScenario.agents`).
 */
export interface AgentSpec {
  /** Prompt passed to this agent's own `query()` call. */
  prompt: string
  /** Per-agent turn cap; copied onto the query options only when defined. */
  maxTurns?: number
  // NOTE(review): the multi-agent branch visible in this diff names transcripts
  // `agent-<index>.jsonl` and does not read this field — confirm where it is consumed.
  sessionIdHint?: string // optional label used only for transcript filenames
}
+ let scenarioStartSha: string | null = null + try { + scenarioStartSha = execSync('git rev-parse HEAD', { + cwd: projectDir, + encoding: 'utf-8', + stdio: ['pipe', 'pipe', 'pipe'], + }).trim() + } catch { + scenarioStartSha = null + } + const result: EvalResult = { scenarioId: scenario.id, scenarioName: scenario.name, @@ -321,10 +357,126 @@ export async function runScenario( queryOptions.maxTurns = scenario.maxTurns } + // scenario.prompt is guaranteed non-empty on the single-agent path by the + // hasPrompt/hasAgents invariant enforced at the top of runScenario. const prompt = isResume ? (options.resumeAnswer ?? 'Continue.') - : scenario.prompt + : (scenario.prompt as string) + + // Multi-agent branch: spawn one query() per AgentSpec in parallel. + if (hasAgents && scenario.agents) { + const transcriptDir = options.transcriptPath ? dirname(options.transcriptPath) : null + const agentSummaries: Array<{ + index: number + status: 'fulfilled' | 'rejected' + reason?: string + turns: number + sessionId?: string + }> = [] + + // Multi-agent scenarios do NOT support pause/resume (spec v1). Use a + // distinct canUseTool that has no closure over the outer abortController, + // sessionId, or pendingQuestion — otherwise a concurrent AskUserQuestion + // would clobber shared state and abort the wrong agent. + const multiAgentCanUseTool = async () => ({ behavior: 'allow' as const }) + + const agentPromises = scenario.agents.map((spec, idx) => { + return (async () => { + const agentAbort = new AbortController() + const perAgentTranscript = transcriptDir + ? 
join(transcriptDir, `agent-${idx}.jsonl`) + : null + if (perAgentTranscript) { + mkdirSync(dirname(perAgentTranscript), { recursive: true }) + } + + const agentQueryOptions: Record = { + abortController: agentAbort, + cwd: projectDir, + allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep', 'Task', 'AskUserQuestion'], + permissionMode: 'bypassPermissions', + allowDangerouslySkipPermissions: true, + settingSources: ['project'], + canUseTool: multiAgentCanUseTool, + env: cleanEnv, + } + if (spec.maxTurns !== undefined) { + agentQueryOptions.maxTurns = spec.maxTurns + } + let turns = 0 + let agentSessionId: string | undefined + + for await (const message of query({ prompt: spec.prompt, options: agentQueryOptions })) { + if ( + (message as { type: string; subtype?: string; session_id?: string }).type === 'system' && + (message as { subtype?: string }).subtype === 'init' + ) { + agentSessionId = (message as { session_id: string }).session_id + } + + if (perAgentTranscript) { + appendFileSync( + perAgentTranscript, + JSON.stringify({ ts: new Date().toISOString(), ...message }) + '\n', + ) + } + + if (message.type === 'assistant') { + turns++ + if (options.verbose) { + emitAssistantMessage(turns, message) + } + } else if (message.type === 'user') { + if (options.verbose) { + emitToolResults(message) + } + } + } + + return { turns, sessionId: agentSessionId } + })() + }) + + const settled = await Promise.allSettled(agentPromises) + for (let i = 0; i < settled.length; i++) { + const s = settled[i] + if (s.status === 'fulfilled') { + agentSummaries.push({ + index: i, + status: 'fulfilled', + turns: s.value.turns, + sessionId: s.value.sessionId, + }) + if (options.verbose) { + process.stdout.write( + `[agent-${i}] fulfilled · ${s.value.turns} turns · session=${s.value.sessionId ?? 'unknown'}\n`, + ) + } + } else { + const reason = s.reason instanceof Error ? 
s.reason.message : String(s.reason) + agentSummaries.push({ + index: i, + status: 'rejected', + reason, + turns: 0, + }) + if (options.verbose) { + process.stdout.write(`[agent-${i}] rejected · ${reason}\n`) + } + } + } + + // Set result.sessionId to the first agent's sessionId (or leave undefined if both failed). + const firstFulfilled = agentSummaries.find((a) => a.status === 'fulfilled' && a.sessionId) + if (firstFulfilled?.sessionId) { + sessionId = firstFulfilled.sessionId + result.sessionId = sessionId + } + result.turns = agentSummaries.reduce((s, a) => s + a.turns, 0) + + // Fall through to checkpoint evaluation below + } else { // The for-await loop may end normally (agent finished) or via abort (AskUserQuestion // triggered the safety-net abort controller). Catch abort errors gracefully. try { @@ -381,6 +533,7 @@ export async function runScenario( process.stdout.write('[abort] Query aborted (AskUserQuestion safety net)\n') } } + } // end single-agent else branch // If session was paused for a question, finalize and attach it to the result. // Patch sessionId if it wasn't available when canUseTool fired (race with init). @@ -404,7 +557,7 @@ export async function runScenario( } // Always run checkpoints — even when paused, state may already be written - const ctx: EvalContext = buildContext(projectDir, baselineRef, sessionId ?? null, options.transcriptPath ?? null) + const ctx: EvalContext = buildContext(projectDir, baselineRef, sessionId ?? null, options.transcriptPath ?? 
null, scenarioStartSha) for (const checkpoint of scenario.checkpoints) { const error = await checkpoint.assert(ctx) result.assertions.push({ @@ -514,20 +667,34 @@ function formatToolInput(name: string, input: unknown): string { // ─── Context builder ────────────────────────────────────────────────────────── -function buildContext( +export function buildContext( projectDir: string, baselineRef: string | null = null, sessionId: string | null = null, transcriptPath: string | null = null, + startSha: string | null = null, ): EvalContext { return { projectDir, baselineRef, sessionId, transcriptPath, - getSessionState(): SessionState | null { + startSha, + getSessionState(explicitSessionId?: string): SessionState | null { const sessionsDir = join(projectDir, '.kata', 'sessions') if (!existsSync(sessionsDir)) return null + + // If a specific sessionId is provided, read that session's state directly. + if (explicitSessionId) { + const statePath = join(sessionsDir, explicitSessionId, 'state.json') + if (!existsSync(statePath)) return null + try { + return JSON.parse(readFileSync(statePath, 'utf-8')) as SessionState + } catch { + return null + } + } + const candidates: Array<{ id: string; path: string }> = [] try { for (const id of readdirSync(sessionsDir)) { diff --git a/eval/run.ts b/eval/run.ts index e8d607d..4652f9a 100644 --- a/eval/run.ts +++ b/eval/run.ts @@ -33,6 +33,7 @@ import { liveHookVerifyScenario } from './scenarios/live-hook-verify.js' import { liveTaskScenario } from './scenarios/live-task.js' import { liveResearchScenario } from './scenarios/live-research.js' import { taskDisciplineScenario } from './scenarios/task-discipline.js' +import { twoAgentTrackerScenario } from './scenarios/two-agent-tracker.js' import { liveTaskDisciplineScenario } from './scenarios/live-task-discipline.js' import { stopHookEnforcementScenario } from './scenarios/stop-hook-enforcement.js' import { stopHookTestScenario } from './scenarios/stop-hook-test.js' @@ -51,7 +52,7 @@ const 
TRANSCRIPT_DIR = resolve(__dirname, '../eval-transcripts') // ─── Registry ───────────────────────────────────────────────────────────────── -const scenarios = [askUserPauseScenario, modeEntryScenario, taskModeScenario, taskDisciplineScenario, stopHookEnforcementScenario, stopHookTestScenario, hookLifecycleScenario, planningModeScenario, planningInterviewScenario, planningAuthScenario, planningReviewAgentsScenario, implAuthScenario, implTaskGenDefaultScenario, implTaskGenCustomScenario, implReviewAgentsScenario, impl3StepVerifyScenario, implE2eVerifyScenario, verifySubagentScenario, researchModeScenario, liveHookVerifyScenario, liveTaskScenario, liveResearchScenario, liveTaskDisciplineScenario] +const scenarios = [askUserPauseScenario, modeEntryScenario, taskModeScenario, twoAgentTrackerScenario, taskDisciplineScenario, stopHookEnforcementScenario, stopHookTestScenario, hookLifecycleScenario, planningModeScenario, planningInterviewScenario, planningAuthScenario, planningReviewAgentsScenario, implAuthScenario, implTaskGenDefaultScenario, implTaskGenCustomScenario, implReviewAgentsScenario, impl3StepVerifyScenario, implE2eVerifyScenario, verifySubagentScenario, researchModeScenario, liveHookVerifyScenario, liveTaskScenario, liveResearchScenario, liveTaskDisciplineScenario] /** Scenarios that require --project (no built-in fixture) */ const LIVE_SCENARIO_IDS = new Set(['live-hook-verify', 'live-task', 'live-research', 'live-task-discipline', 'stop-hook-test']) diff --git a/eval/scenarios/two-agent-tracker.ts b/eval/scenarios/two-agent-tracker.ts new file mode 100644 index 0000000..dfe05ea --- /dev/null +++ b/eval/scenarios/two-agent-tracker.ts @@ -0,0 +1,32 @@ +/** + * Two-Agent File-Edit Tracker Eval + * + * Runs two concurrent Claude agents in task mode against the same project, + * each editing disjoint files. Proves the per-session file-edit tracker + * (src/tracking/edits-log.ts + can-exit scoping) works end-to-end under + * real concurrency. 
+ */ + +import type { EvalScenario } from '../harness.js' +import { + assertTwoCommitsSinceStart, + assertCommitsScopedToEachSession, +} from '../assertions.js' + +export const twoAgentTrackerScenario: EvalScenario = { + id: 'two-agent-tracker', + name: 'Two-agent file-edit tracker', + fixture: 'tanstack-start', + templatePath: '.kata/templates/task.md', + // Pre-install deps so neither agent triggers a lockfile-modifying install at runtime. + fixtureSetup: ['bun install'], + agents: [ + { prompt: "Add a utility function to src/utils/foo.ts that returns 42." }, + { prompt: "Add a utility function to src/utils/bar.ts that returns 'hello'." }, + ], + timeoutMs: 10 * 60 * 1000, + checkpoints: [ + assertTwoCommitsSinceStart(), + assertCommitsScopedToEachSession(), + ], +} diff --git a/planning/specs/64-two-agent-file-edit-tracker-eval.md b/planning/specs/64-two-agent-file-edit-tracker-eval.md new file mode 100644 index 0000000..54c8a24 --- /dev/null +++ b/planning/specs/64-two-agent-file-edit-tracker-eval.md @@ -0,0 +1,358 @@ +--- +initiative: two-agent-file-edit-tracker-eval +type: project +issue_type: feature +status: approved +priority: medium +github_issue: 64 +created: 2026-04-17 +updated: 2026-04-17 +phases: + - id: p1 + name: "Harness — agents[] support + session-ID-aware state lookup" + tasks: + - "Add AgentSpec type and optional agents?: AgentSpec[] to EvalScenario (discriminated with prompt?: string). fixtureSetup already exists on EvalScenario (eval/harness.ts:83) and is reused unchanged." 
+ - "Enforce runtime invariant in runScenario: throw if both prompt and agents are set or if neither is set" + - "Guard existing scenario.prompt callers in runScenario with `if (scenario.prompt)` to satisfy the now-optional field" + - "Capture scenarioStartSha = `git rev-parse HEAD` after fixtureSetup completes and before any agent is spawned; attach it to the EvalContext so checkpoint closures can read ctx.startSha" + - "Implement Promise.allSettled branch in runScenario to spawn one query() per AgentSpec and capture per-agent outcomes" + - "Add optional sessionId parameter to getSessionState; default remains latest-by-updatedAt" + - "Write per-agent transcripts as agent-<idx>.jsonl under the scenario transcript dir to avoid append collisions" + test_cases: + - id: tc1 + description: "EvalScenario with agents[] spawns two concurrent query() calls and writes two distinct transcripts" + type: "integration" + - id: tc2 + description: "getSessionState(projectDir, sessionId) returns the requested session; getSessionState(projectDir) returns latest-by-updatedAt" + type: "unit" + - id: p2 + name: "Assertions — two-commits + scoped-commit assertions" + tasks: + - "Add assertTwoCommitsSinceStart() to eval/assertions.ts — uses `git rev-list --count --no-merges <startSha>..HEAD` and asserts count == 2" + - "Add assertCommitsScopedToEachSession() (no session argument) — scans .kata/sessions/ for sessions started within the scenario window, resolves each session's commit by file-set intersection, and asserts subset containment against each session's edits.jsonl plus a framework-file allowlist" + - "Unit-test both assertions following the existing mock EvalContext pattern in eval/assertions.test.ts, backed by a real temp git repo (fs.mkdtempSync + git init + controlled commits) that simulates two sessions with disjoint edits.jsonl sets" + test_cases: + - id: tc1 + description: "assertCommitsScopedToEachSession passes when each session's matched commit files ⊆ edits.jsonl ∪ allowlist and fails 
with a readable diagnostic when a foreign file is present" + type: "unit" + - id: tc2 + description: "assertTwoCommitsSinceStart passes when exactly two non-merge commits exist after scenario-start SHA and fails on 0/1/3+" + type: "unit" + - id: tc3 + description: "assertCommitsScopedToEachSession fails with a diagnostic listing candidate commits when a session's edits.jsonl intersects zero or multiple commits" + type: "unit" + - id: tc4 + description: "assertCommitsScopedToEachSession treats ALLOWLIST entries as globs (e.g., *.tsbuildinfo matches src/api.tsbuildinfo) rather than literal strings" + type: "unit" + - id: tc5 + description: "assertTwoCommitsSinceStart ignores merge commits (two feature commits + one merge => still passes)" + type: "unit" + - id: tc6 + description: "assertCommitsScopedToEachSession drops stale session dirs whose enteredAt is older than ctx.startSha timestamp" + type: "unit" + - id: tc7 + description: "assertCommitsScopedToEachSession fails with a readable diagnostic naming the foreign path when a commit contains a file not in its session's edits.jsonl" + type: "unit" + - id: p3 + name: "Scenario wiring" + tasks: + - "Create eval/scenarios/two-agent-tracker.ts with two simple natural-language prompts touching disjoint files" + - "Add scenario-level fixtureSetup that runs `bun install` before either agent starts so neither agent triggers a lockfile-modifying install" + - "Register scenario in the scenario index so --list and --scenario discover it" + - "Compose checkpoints from workflowPresets('task') (adapted per agent) plus assertTwoCommitsSinceStart() and assertCommitsScopedToEachSession()" + test_cases: + - id: tc1 + description: "npm run eval -- --scenario=two-agent-tracker completes both agents and all checkpoints pass" + type: "smoke" +--- + +# Two-Agent File-Edit Tracker Eval + +> GitHub Issue: [#64](https://github.com/codevibesmatter/kata-wm/issues/64) + +## Overview + +The file-edit tracker (commit `b5d2c95`) scopes the 
`committed` stop-condition per-session so that one kata session does not see another session's in-flight dirty files as "uncommitted changes." Unit tests cover the primitives, but no eval exercises the multi-agent path end-to-end. This spec adds an eval scenario that drives **two real concurrent Claude agents** through `task` mode on disjoint files and asserts each commit contains only its own session's edits — the real proof that the tracker works under concurrency. The spec also extends the eval harness with a small `agents[]` affordance and a session-ID-aware state lookup so future concurrent scenarios can reuse the plumbing. + +## Feature Behaviors + +### B1: Harness supports agents[] for concurrent scenarios + +**Core:** +- **ID:** harness-agents-array +- **Trigger:** A scenario definition has a non-empty `agents` array (as opposed to the existing top-level `prompt`) +- **Expected:** The harness spawns one SDK `query()` call per entry in `agents` via `Promise.allSettled`. All agents share the same `cwd` (the scenario project dir) and the same `.claude/settings.json` (so kata hooks fire for each), but each agent acquires its own kata session via its independent `kata enter task` invocation. The harness waits for all agents to settle (resolved or rejected) before evaluating checkpoints. Single-agent scenarios using the existing top-level `prompt` field are unchanged. `prompt` and `agents` are mutually exclusive; `runScenario` enforces a runtime invariant — it throws `Error("EvalScenario must define exactly one of prompt or agents")` before fixture setup if both are set or neither is set. +- **Verify:** Run the two-agent-tracker scenario with `--verbose`; logs show two concurrent `query()` invocations starting within the same second and two session directories appearing under `.kata/sessions/`. A unit test confirms the Promise.allSettled path is taken when `agents` is populated. 
If either agent rejects, the harness reports the rejection in the per-agent transcript summary and marks the scenario failed, but still runs checkpoint assertions against whatever session data exists so diagnostics survive a single-agent crash. A second unit test confirms the mutual-exclusivity invariant throws when both `prompt` and `agents` are set and when neither is set. +- **Source:** `eval/harness.ts:52-84` (EvalScenario), `eval/harness.ts:331` (query call), `eval/harness.ts:159-226` (fixture copy/isolation) + +#### UI Layer +N/A — harness-level change. + +#### API Layer +`EvalScenario` interface (in `eval/harness.ts`) gains: +```ts +interface AgentSpec { + prompt: string; + maxTurns?: number; + sessionIdHint?: string; // optional label used only for transcript filenames +} +interface EvalScenario { + // ...existing fields... + prompt?: string; // single-agent mode (existing scenarios) + agents?: AgentSpec[]; // multi-agent mode — harness runs all agents concurrently via Promise.allSettled + // runtime invariant: exactly one of `prompt` or `agents` must be set; runScenario throws otherwise +} +``` + +#### Data Layer +Transcript writer produces `eval-transcripts/-/agent-.jsonl` per agent instead of a single transcript file. No changes to `.kata/sessions/` layout. + +--- + +### B2: Session state lookup accepts explicit session ID + +**Core:** +- **ID:** session-state-by-id +- **Trigger:** An assertion or harness helper needs the `SessionState` for a *specific* session rather than the most-recently-updated one +- **Expected:** `getSessionState(projectDir, sessionId?)` accepts an optional `sessionId` parameter. When provided, it reads `.kata/sessions//state.json` directly. When omitted, it preserves existing behavior (latest-by-updatedAt scan of `.kata/sessions/`). All existing callers keep working without modification because the new parameter is optional. 
+- **Verify:** Unit test in `eval/harness.test.ts` asserts both code paths: default returns latest, explicit ID returns that specific session's state even when it is not the latest. +- **Source:** `eval/harness.ts:528-555` (existing `getSessionState`) + +#### UI Layer +N/A. + +#### API Layer +```ts +function getSessionState(projectDir: string, sessionId?: string): SessionState | null; +``` +Callers in new assertions pass the explicit ID obtained by scanning `.kata/sessions/` timestamps after both agents complete. Existing callers pass no second argument — no behavior change. + +#### Data Layer +Reads `.kata/sessions//state.json` (same file already produced by the state writer). + +--- + +### B3: Two-agent file-edit tracker scenario + assertions + +**Core:** +- **ID:** two-agent-tracker-scenario +- **Trigger:** `npm run eval -- --scenario=two-agent-tracker` (or inclusion in a full run) +- **Expected:** A scenario at `eval/scenarios/two-agent-tracker.ts` defines two agents with disjoint-file prompts (e.g., "Add a utility function to src/utils/foo.ts that returns 42" and "Add a utility function to src/utils/bar.ts that returns 'hello'"), fixture `tanstack-start`, with scenario-level `fixtureSetup: ['bun install']` to pre-populate dependencies so neither agent triggers a lockfile-modifying install at runtime. Both agents enter `task` mode via the planning/user-prompt hook nudges, edit their respective files, commit, and exit. 
Checkpoints include task-mode workflow basics for each agent plus two new assertions: `assertTwoCommitsSinceStart()` confirms exactly two non-merge commits exist relative to the scenario-start SHA (using `git rev-list --count --no-merges ..HEAD`; >2 commits from `--amend`-induced SHA churn or auto-formatter commits fails loudly — this is intentional, each agent should produce exactly one commit), and `assertCommitsScopedToEachSession()` (no caller-supplied session ID) resolves each session's commit internally and confirms the commit's changed file-set is a subset of that session's `edits.jsonl` ∪ a small framework-file allowlist. Intentionally removing the session-scoping in `src/commands/can-exit.ts` causes `assertCommitsScopedToEachSession` to fail with a diagnostic listing the foreign file(s). +- **Matching algorithm:** For each session S with edits set E_S, find the set of commits C_S where `git show --name-only ` ∩ E_S is non-empty. Assert |C_S| == 1. Call that commit the session's commit. Then assert `files(commit) ⊆ E_S ∪ ALLOWLIST` where `ALLOWLIST = ['bun.lockb', 'bun.lock', 'package-lock.json', '*.tsbuildinfo']`. ALLOWLIST entries are matched as **globs** via minimatch-style pattern matching (so `*.tsbuildinfo` matches `src/api.tsbuildinfo`), not strict string equality. If |C_S| ≠ 1 for any session, the assertion fails with a diagnostic listing the candidate commits. +- **Verify:** Scenario passes locally with all checkpoints green. Fault injection (temporarily removing the session filter in `checkGlobalConditions`) causes at least one agent's can-exit to be blocked, and/or `assertCommitsScopedToEachSession` fails with a readable diagnostic. +- **Source:** `eval/scenarios/two-agent-tracker.ts` (new), `eval/assertions.ts` (new assertions), `src/commands/can-exit.ts:47-116` (code under test), `src/tracking/edits-log.ts` (edits.jsonl format) + +#### UI Layer +N/A. 
+ +#### API Layer +Two new exports from `eval/assertions.ts`: +```ts +assertTwoCommitsSinceStart(): EvalCheckpoint; +assertCommitsScopedToEachSession(): EvalCheckpoint; +``` +Neither is added to a preset — they are specific to this scenario and are spread inline in the scenario's `checkpoints`. `assertCommitsScopedToEachSession` takes no arguments: it discovers sessions at checkpoint-evaluation time by scanning `ctx.projectDir/.kata/sessions/` for state.json files whose `enteredAt` is ≥ the scenario-start timestamp (dropping any stale sessions from prior runs of the same `--project` dir), and matches each session to its commit via the algorithm above. + +#### Data Layer +Reads `.kata/sessions//edits.jsonl` (written by `PostToolUse` hook via `appendEdit` in `src/tracking/edits-log.ts`). Reads git history via `git log ..HEAD --format=%H` and `git show --name-only ` to compare against the edits set. + +--- + +## Non-Goals + +Explicitly out of scope for this feature: +- Modifying the file-edit tracker implementation in any way — the tracker landed in commit `b5d2c95` and is complete; if this eval surfaces a bug, file a separate issue rather than patching in this spec +- Testing `baseline.json` capture/honoring behavior — already covered by unit tests in `src/tracking/edits-log.test.ts` and `src/commands/can-exit.test.ts` +- Testing Bash-derived edit tracking (`sed`, `cp`, shell redirection) — covered by tracker unit tests +- Testing overlap on shared files between two sessions — merge/conflict concerns are orthogonal to the tracker scoping mechanism under test here +- Testing cross-mode interaction (e.g., one agent in `task`, another in `debug`) — single-mode with two agents is sufficient to prove the scoping mechanism +- Adding per-session `hooks.log.jsonl` parsing in assertions — append-only writes from two processes may interleave; acceptable for v1 since no assertion depends on per-session hook lines +- Changing the LLM-as-judge pipeline — existing 
`--judge[=provider]` flag applies if the user opts in; no special handling needed + +## Resolved Questions + +- [x] **Per-agent `fixtureSetup` commands?** No. v1 uses scenario-level `fixtureSetup` only. If a future scenario needs per-agent setup, add `AgentSpec.fixtureSetup?: string[]` in a follow-up. +- [x] **Lockfile / dependency collision between agents?** Pre-install dependencies in scenario-level `fixtureSetup` (`bun install`) before either agent runs, so neither agent triggers a lockfile-modifying install at runtime. The framework-file allowlist in `assertCommitsScopedToEachSession` (`bun.lockb`, `bun.lock`, `package-lock.json`, `*.tsbuildinfo`) provides a safety net if a stray install still occurs. + +## Implementation Phases + +See YAML frontmatter `phases:` above. Each phase is 1-3 hours of focused work. + +### Phase 1: Harness — agents[] support + session-ID-aware state lookup + +Tasks: +- Add `AgentSpec` type `{ prompt: string; maxTurns?: number; sessionIdHint?: string }` to `eval/harness.ts`. +- Extend `EvalScenario` with optional `prompt?: string` and optional `agents?: AgentSpec[]`. Document in the interface docstring that they are mutually exclusive; populate exactly one. +- At the top of `runScenario`, enforce the invariant: if both `scenario.prompt` and `scenario.agents` are set, or if neither is set, throw `Error("EvalScenario must define exactly one of prompt or agents")` before any fixture setup. +- Guard existing callers that read `scenario.prompt` with `if (scenario.prompt)` since the field is now optional at the type level. +- In `runScenario`, branch on `scenario.agents`: + - If present and non-empty: build `AgentSpec[]` into concurrent `query()` calls via `Promise.allSettled`. Each agent gets its own transcript file `agent-.jsonl` inside the scenario transcript dir. After `allSettled` resolves, record per-agent outcome (fulfilled/rejected with reason) into the scenario transcript summary. 
Proceed to checkpoint evaluation even if one agent rejected, so assertions can run against whatever session data exists. + - Else: existing single-agent code path is unchanged. +- Add optional `sessionId?: string` parameter to `getSessionState(projectDir, sessionId?)`. When provided, read `.kata/sessions//state.json` directly. When absent, use existing latest-by-updatedAt logic. +- Ensure multi-agent transcript writing does not interleave between the two streams. Per-agent files solve this cleanly. + +Verification: +- `bun run typecheck` passes. +- A unit test (or harness integration test) instantiates an `EvalScenario` with two trivial agents and confirms two transcript files appear and both agents' `query()` calls execute concurrently. +- Existing scenarios (using top-level `prompt`) continue to pass unchanged. + +### Phase 2: Assertions — two-commits + scoped-commit assertions + +Tasks: +- Add `assertTwoCommitsSinceStart(): EvalCheckpoint` to `eval/assertions.ts`. Implementation: + - Read `ctx.startSha` — captured in Phase 1 by the harness immediately after `fixtureSetup` completes and before any agent spawns. Existing assertions like `assertDiffContains` use the root commit instead; the scenario-start SHA is new and must be added to the checkpoint context in Phase 1. + - `git rev-list --count --no-merges ..HEAD` → parse integer; fail if not exactly 2. Merge commits are excluded. Contract: agents in this scenario must produce exactly one commit each; `--amend`-induced SHA churn or auto-formatter commits that push the count above 2 fail loudly by design. +- Add `assertCommitsScopedToEachSession(): EvalCheckpoint` (no arguments): + - Scan `ctx.projectDir/.kata/sessions/` for session directories; read each `state.json`. + - Filter to sessions whose `enteredAt` is ≥ the scenario-start timestamp (drop stale sessions from prior runs of the same `--project` dir). 
+ - For each surviving session S, read its `edits.jsonl` via `readEditsSet` from `src/tracking/edits-log.ts` → set E_S. + - Resolve S's commit: enumerate candidate SHAs from `git rev-list --no-merges <startSha>..HEAD`; for each SHA, `git show --name-only <sha>` gives its file-set F. Collect C_S = { sha : F ∩ E_S ≠ ∅ }. Assert |C_S| == 1; if 0 or ≥ 2, fail with a diagnostic listing the candidate SHAs and their file-sets. + - For the matched commit, assert `files(commit) ⊆ E_S ∪ ALLOWLIST` where `ALLOWLIST = ['bun.lockb', 'bun.lock', 'package-lock.json', '*.tsbuildinfo']`. On failure, print the foreign path(s) and the session ID. + - Assert every filtered session has exactly one matched commit (so the set of matched commits across sessions equals the set of commits from `assertTwoCommitsSinceStart`). +- Add unit tests to `eval/assertions.test.ts` following the existing mock `EvalContext` pattern in that file. Back the git-facing tests with a real temporary git repo (`fs.mkdtempSync` + `git init` + controlled commits via `git commit --allow-empty` and small file writes) rather than process-level mocks — this mirrors the existing pattern and avoids reinventing fixture infrastructure: + - Subset case (two sessions, two commits, disjoint files) → passes. + - Foreign file in one commit → fails with readable diagnostic naming the foreign path. + - Session's edits.jsonl intersects zero commits → fails with a candidate-list diagnostic. + - Session's edits.jsonl intersects multiple commits → fails with a candidate-list diagnostic. + - Exactly two non-merge commits since start → `assertTwoCommitsSinceStart` passes. + - Three commits → `assertTwoCommitsSinceStart` fails. + - Two commits plus a merge commit → `assertTwoCommitsSinceStart` still passes (merges excluded). + +Verification: +- `bun test eval/assertions.test.ts` — all existing tests plus the seven new cases listed above pass. +- `bun run typecheck` passes. 
+ +### Phase 3: Scenario wiring + +Tasks: +- Create `eval/scenarios/two-agent-tracker.ts`: + ```ts + import type { EvalScenario } from '../harness'; + import { + assertTwoCommitsSinceStart, + assertCommitsScopedToEachSession, + } from '../assertions'; + + export const twoAgentTracker: EvalScenario = { + id: 'two-agent-tracker', + name: 'Two-agent file-edit tracker', + fixture: 'tanstack-start', + // Pre-install deps so neither agent triggers a lockfile-modifying install at runtime. + fixtureSetup: ['bun install'], + agents: [ + { prompt: "Add a utility function to src/utils/foo.ts that returns 42." }, + { prompt: "Add a utility function to src/utils/bar.ts that returns 'hello'." }, + ], + checkpoints: [ + assertTwoCommitsSinceStart(), + assertCommitsScopedToEachSession(), + ], + }; + ``` + Both new assertions discover sessions and commits at checkpoint-evaluation time — no per-agent session ID needs to be threaded through the scenario definition. +- Register the scenario in the scenario index (same file that currently exports the scenario map used by `--list`). +- Run `npm run eval -- --scenario=two-agent-tracker --verbose` once and confirm all checkpoints pass. + +Verification: +- Scenario runs end-to-end; both transcripts written; all checkpoints pass. +- `--list` shows `two-agent-tracker` in the scenario list. +- Fault-injection run (temporarily removing session filter in `src/commands/can-exit.ts` `checkGlobalConditions`) causes `assertCommitsScopedToEachSession` to fail — proving the eval actually measures the tracker behavior. + +## Verification Strategy + +### Test Infrastructure +- Unit tests: Bun's test runner discovers `.test.ts` files alongside source. Existing files extend: `eval/assertions.test.ts` (mock-based assertion tests) and `eval/harness.test.ts` (or new file for `getSessionState` coverage). 
+- End-to-end: `npm run eval -- --scenario=two-agent-tracker` runs the scenario against a fresh `tanstack-start` fixture copy under `eval-projects/two-agent-tracker-<run-id>/`. + +### Build Verification +Use `bun run typecheck` to confirm types compile. No build step is required for this project (see CLAUDE.md: "The `kata` shell script at the repo root is the CLI entry point. It runs `bun src/index.ts` directly"). + +## Verification Plan + +Concrete, executable steps to verify the feature works against the REAL running system. + +### VP1: Scenario runs and passes + +Steps: +1. `cd /data/projects/kata-wm && npm run eval -- --scenario=two-agent-tracker --verbose` + Expected: exit code 0; stdout shows both agents starting concurrently; stdout includes `PASSED` for each checkpoint; a directory `eval-transcripts/two-agent-tracker-<run-id>/` contains `agent-0.jsonl` and `agent-1.jsonl`. +2. `ls eval-projects/two-agent-tracker-*/\.kata/sessions/ | wc -l` + Expected: 2 (one session dir per agent). +3. `cd eval-projects/two-agent-tracker-<run-id> && git rev-list --count --no-merges <startSha>..HEAD` + Expected: prints exactly `2`. `git log --oneline <startSha>..HEAD` shows one commit touching `src/utils/foo.ts` only and one touching `src/utils/bar.ts` only (modulo files in the `ALLOWLIST`). + +### VP2: Tracker scoping is actually what's being proven (fault injection) + +Steps: +1. Temporarily edit `src/commands/can-exit.ts` `checkGlobalConditions` (lines 47-116) so that the `committed` check uses the full working tree status instead of filtering to session edits — simulating pre-tracker behavior. Save. +2. `cd /data/projects/kata-wm && bun run eval -- --scenario=two-agent-tracker` + Expected: the run fails — either at least one agent's `kata can-exit` is blocked by the other agent's in-flight dirty files (scenario times out or exits with a can-exit failure), or `assertCommitsScopedToEachSession` fails with a diagnostic showing which foreign path leaked into the commit. +3. 
Revert the edit to `src/commands/can-exit.ts`. +4. `cd /data/projects/kata-wm && bun run eval -- --scenario=two-agent-tracker` + Expected: exit code 0; all checkpoints pass again. + +### VP3: getSessionState by ID + +Steps: +1. `cd /data/projects/kata-wm && bun test eval/harness.test.ts -t "getSessionState accepts sessionId"` + Expected: test passes; output confirms both default (latest-by-updatedAt) and explicit-id paths return the correct `SessionState` object. +2. `cd /data/projects/kata-wm && bun run typecheck` + Expected: exit code 0. + +## Implementation Hints + +### AgentSpec shape + +Start minimal: +```ts +interface AgentSpec { + prompt: string; + maxTurns?: number; + sessionIdHint?: string; // for transcript filename disambiguation only +} +``` +The kata session ID is produced by each agent's own `kata enter task` invocation at runtime; callers do not supply it. + +### Transcript isolation + +Per-agent transcripts as `eval-transcripts/-/agent-.jsonl` avoid append collisions that would occur if both agents streamed to the same file. The scenario transcript dir continues to hold any aggregate summary. + +### Session and commit discovery for assertions + +`assertCommitsScopedToEachSession` does not accept a session ID. At checkpoint time it scans `ctx.projectDir/.kata/sessions/`, reads each `state.json`, and filters to sessions whose `enteredAt` ≥ the scenario-start timestamp (captured by the harness at fixture-setup time). This drops any stale sessions left behind by a prior run against the same `--project` dir. + +For each surviving session S with edits set E_S, resolve its commit via the intersection algorithm: walk `git rev-list --no-merges ..HEAD`, compute F = `git show --name-only ` for each, and collect candidate SHAs where F ∩ E_S ≠ ∅. Assert exactly one candidate; otherwise fail with a diagnostic listing all candidates and their file-sets. 
Finally assert `F ⊆ E_S ∪ ALLOWLIST` where `ALLOWLIST = ['bun.lockb', 'bun.lock', 'package-lock.json', '*.tsbuildinfo']`. + +This approach removes the need to thread per-agent session IDs through the scenario definition or to correlate agent transcripts with session directories. + +### Unit-test git fixtures + +Follow the existing mock `EvalContext` pattern in `eval/assertions.test.ts`. For the new assertions, the git-facing cases should drive a real temporary git repo rather than stub out `child_process` — create a dir with `fs.mkdtempSync`, run `git init`, make a starter commit, then stage controlled file writes and commit them to produce a deterministic history. Synthesize `.kata/sessions//edits.jsonl` files inside that temp dir. This mirrors the style already used in the file and avoids inventing a new mocking layer. + +### Git index.lock contention between concurrent agents + +Both agents eventually run `git add` + `git commit` in the same working tree. Git's own per-repo `.git/index.lock` serializes concurrent commits at the filesystem level: the second writer sees `fatal: Unable to create '.../.git/index.lock': File exists` and exits non-zero. In practice, agent completion times differ by seconds (different prompts, different tool-use sequences), so collisions are rare. Do **not** add retry logic in the harness for v1. If flakiness appears in CI, the first mitigation is to have each agent's task-mode commit ceremony retry once after a short jitter on `index.lock` detection (tracked as a follow-up, not this spec). + +### Shared hooks.log.jsonl + +Both agents' hooks will append to the same `.kata/hooks.log.jsonl`. Writes from two Node processes may interleave within a single line in pathological cases. This is acceptable for v1 because **no assertion in this spec parses per-session lines from `hooks.log.jsonl`** — everything needed is in per-session `.kata/sessions//edits.jsonl`, which is session-scoped by path and therefore collision-free. 
Note this limitation in a code comment next to the harness multi-agent branch. + +### Prompts must be simple + +Per project memory and spec 8 ("Simple natural-language prompts"): prompts describe the task in plain English. Do not include `kata enter task` or answer AskUserQuestion prompts. Let the `user-prompt` and `SessionStart` hooks nudge each agent into task mode. If the task-mode skill asks the agent a question, note that the multi-agent harness branch does not support the pause/resume mechanism (`--resume=<sessionId> --answer=...`) in v1 and simply allows the tool call — but for prompts this small, no pause is expected. + +### Do not change the tracker + +The tracker implementation (`src/tracking/edits-log.ts`, `handlePostToolUse` in `src/commands/hook.ts:1001-1057`, `checkGlobalConditions` in `src/commands/can-exit.ts:47-116`, `checkFeatureTestsAdded` in `src/commands/can-exit.ts:181-221`, `captureBaseline` in `src/commands/enter.ts`) is out of scope. If the eval surfaces a real bug, file a separate issue. + +### Reference Docs + +- `eval/harness.ts` — scenario runner, fixture copy, query integration +- `eval/assertions.ts` — preset arrays, `assertNewCommit`, `assertDiffContains`, `assertChangesPushed` +- `src/tracking/edits-log.ts` — `appendEdit`, `readEditsSet`, `parseGitStatusPaths`, `toGitRelative` +- `src/commands/can-exit.ts:47-116` — the scoping logic under test +- `planning/specs/8-eval-harness-redesign.md` — the eval design principles this spec builds on + +--- + +