diff --git a/package.json b/package.json index 3900afe..7ca7d0b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "bashkit", - "version": "0.5.2", + "version": "0.5.3", "description": "Agentic coding tools for the Vercel AI SDK", "type": "module", "main": "./dist/index.js", diff --git a/skills/bashkit-debug/SKILL.md b/skills/bashkit-debug/SKILL.md new file mode 100644 index 0000000..f50e4e9 --- /dev/null +++ b/skills/bashkit-debug/SKILL.md @@ -0,0 +1,163 @@ +--- +name: bashkit-debug +description: Analyze BashKit agent debug traces. Use when debugging a BashKit-powered agent run — reading JSONL trace files, correlating tool calls, identifying errors, slow operations, and execution patterns. Triggers on "debug trace", "analyze trace", "bashkit debug", "what went wrong", or when reading .jsonl trace files from BASHKIT_DEBUG. +--- + +# BashKit Debug Trace Analyzer + +You are analyzing execution traces from a BashKit-powered AI agent. These traces capture every tool call the agent made, with inputs, outputs, timing, errors, and parent/child relationships. + +## Trace Format + +Traces are JSONL files (one JSON object per line) generated by `BASHKIT_DEBUG=file:/path/to/trace.jsonl`. + +Each line is a `DebugEvent`: + +```typescript +interface DebugEvent { + id: string; // e.g. "bash-1", "grep-3" — correlates start/end pairs + ts: number; // Unix timestamp in milliseconds + tool: string; // Tool name: bash, read, write, edit, glob, grep, task, web-search, web-fetch, todo-write, skill, ask-user, enter-plan-mode, exit-plan-mode + event: "start" | "end" | "error"; + input?: unknown; // Tool input parameters (start events only, truncated) + output?: unknown; // Tool output data (end events only, truncated) + summary?: Record; // Key metrics (end events only) + duration_ms?: number; // Execution time (end events only) + parent?: string; // Parent event ID for nested calls (task sub-agent tools) + error?: string; // Error message (error events only) +} +``` + +### Event Lifecycle + +Every tool call produces 2-3 events: +1. **start** — Has `input` with summarized parameters +2. **end** — Has `output`, `summary`, and `duration_ms` (success path) +3. **error** — Has `error` message (failure path, replaces end) + +Correlate by `id`: a `start` event with `id: "bash-3"` pairs with the `end` or `error` event with the same `id`. + +### Parent/Child Relationships + +The `parent` field links tool calls to their spawning Task tool. Example: +```jsonl +{"id":"task-1","tool":"task","event":"start","input":{"description":"Fix the tests"}} +{"id":"bash-5","tool":"bash","event":"start","input":{"command":"npm test"},"parent":"task-1"} +{"id":"bash-5","tool":"bash","event":"end","summary":{"exitCode":1},"duration_ms":3200,"parent":"task-1"} +{"id":"task-1","tool":"task","event":"end","summary":{"subagent":"sub-1"},"duration_ms":15000} +``` + +Events with the same `parent` ran inside the same sub-agent. + +## Tool-Specific Summary Fields + +Each tool's `end` event includes a `summary` with tool-specific metrics: + +| Tool | Summary Fields | +|------|---------------| +| **bash** | `exitCode`, `stdoutLen`, `stderrLen`, `interrupted` | +| **read** | `type` ("text"/"directory"), `totalLines`, `returnedLines`, `offset` (text) or `count` (directory) | +| **write** | `bytes_written` | +| **edit** | `replacements` | +| **glob** | `count` (number of matched files) | +| **grep** | Varies by mode: `fileCount`/`total`/`matchCount`, plus `exitCode` | +| **task** | `subagent`, `tokens: { input, output }` | +| **web-search** | `resultCount` | +| **web-fetch** | `contentLength`, `responseLength` | +| **todo-write** | `total`, `in_progress`, `completed`, `pending` (counts per status) | +| **skill** | `skillName`, `instructionLength` | + +## Tool-Specific Output Fields + +The `output` field on `end` events contains the actual data the agent received: + +| Tool | Output Content | +|------|---------------| +| **bash** | `{ stdout, stderr }` — command output (truncated to ~4000 chars) | +| **read** | File content as string (text files) or directory entries | +| **grep** | Matched files, count entries, or content matches depending on mode | +| **task** | Sub-agent's final result text | +| **glob** | First 10 matched file paths | +| **web-search** | First 5 search results with titles and URLs | + +Output is truncated: strings at 4000 chars, arrays at 20 items, objects at depth 5. + +## Analysis Playbook + +When asked to analyze a trace, follow this process: + +### Step 1: Read the Trace File +```bash +cat /path/to/trace.jsonl +``` +Or if large, start with a summary: +```bash +wc -l /path/to/trace.jsonl # total events +grep '"event":"error"' /path/to/trace.jsonl # errors only +grep '"event":"end"' /path/to/trace.jsonl # completions with timing +``` + +### Step 2: Build the Timeline +Correlate start/end pairs by `id`. For each tool call, extract: +- **What**: tool name + key input (command, file path, pattern) +- **Result**: success/error + key summary metric +- **Time**: duration_ms +- **Context**: parent (if inside a sub-agent) + +### Step 3: Identify Issues + +**Errors** — Look for `"event":"error"` entries: +- `bash` errors: check exit codes, stderr content, timeout +- `read` errors: file not found, permission denied +- `edit` errors: string not found, multiple occurrences +- `grep` errors: invalid regex, ripgrep not installed +- `task` errors: sub-agent failures, budget exceeded + +**Slow Operations** — Flag tool calls over these thresholds: +- `bash`: >10s (may indicate hanging command or large output) +- `read`/`write`/`edit`: >1s (filesystem issue) +- `grep`/`glob`: >5s (overly broad pattern) +- `task`: depends on complexity, but >60s warrants investigation +- `web-search`/`web-fetch`: >10s (API latency) + +**Patterns to Watch For**: +- Repeated identical tool calls → agent is looping (check if caching is enabled) +- `bash` with exit code != 0 followed by same command → agent retrying without fixing +- `edit` errors "string not found" → agent has stale file content, needs to re-read +- `grep` with 0 matches followed by broader grep → agent searching iteratively (normal) +- `task` with high token usage → sub-agent doing too much work +- Orphaned `start` events with no matching `end`/`error` → crash or timeout +- Many `read` calls to the same file → missing cache, or agent re-reading after edits (normal) + +### Step 4: Report Findings + +Structure your report as: + +1. **Overview**: Total calls, duration, error count +2. **Errors**: Each error with context (what was attempted, why it failed) +3. **Performance**: Slowest calls, total time breakdown by tool +4. **Patterns**: Any concerning patterns (loops, retries, excessive calls) +5. **Recommendations**: Specific fixes (enable caching, adjust timeouts, fix tool inputs) + +## Helper Script + +The `summarize-trace.sh` script in this skill directory can pre-process large trace files: + +```bash +# Generate a compact summary of a trace file +./skills/bashkit-debug/summarize-trace.sh /path/to/trace.jsonl +``` + +This outputs a condensed timeline that's more token-efficient to analyze than the raw JSONL. + +## Enabling Debug Traces + +Tell users to run their BashKit agent with: +```bash +BASHKIT_DEBUG=file:./trace.jsonl node my-agent.js +``` + +Other modes (less useful for post-mortem analysis): +- `BASHKIT_DEBUG=json` — JSON lines to stderr (pipe to file: `2>trace.jsonl`) +- `BASHKIT_DEBUG=stderr` or `BASHKIT_DEBUG=1` — Human-readable to stderr +- `BASHKIT_DEBUG=memory` — In-memory only (programmatic access via `getDebugLogs()`) diff --git a/skills/bashkit-debug/summarize-trace.sh b/skills/bashkit-debug/summarize-trace.sh new file mode 100755 index 0000000..000a4c8 --- /dev/null +++ b/skills/bashkit-debug/summarize-trace.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# Summarize a BashKit debug trace file into a compact, agent-readable format. +# Usage: ./summarize-trace.sh /path/to/trace.jsonl +# +# Requires: jq + +set -euo pipefail + +TRACE_FILE="${1:?Usage: summarize-trace.sh }" + +if ! command -v jq &>/dev/null; then + echo "Error: jq is required. Install with: brew install jq" >&2 + exit 1 +fi + +if [[ ! -f "$TRACE_FILE" ]]; then + echo "Error: File not found: $TRACE_FILE" >&2 + exit 1 +fi + +TOTAL_EVENTS=$(wc -l < "$TRACE_FILE" | tr -d ' ') +ERROR_COUNT=$(grep -c '"event":"error"' "$TRACE_FILE" || true) +END_COUNT=$(grep -c '"event":"end"' "$TRACE_FILE" || true) +START_COUNT=$(grep -c '"event":"start"' "$TRACE_FILE" || true) + +# Extract first and last timestamps for total duration +FIRST_TS=$(head -1 "$TRACE_FILE" | jq -r '.ts') +LAST_TS=$(tail -1 "$TRACE_FILE" | jq -r '.ts') +TOTAL_DURATION_MS=$((LAST_TS - FIRST_TS)) +TOTAL_DURATION_S=$(echo "scale=1; $TOTAL_DURATION_MS / 1000" | bc) + +echo "== BashKit Debug Trace Summary ==" +echo "File: $TRACE_FILE" +echo "Events: $TOTAL_EVENTS ($START_COUNT calls, $ERROR_COUNT errors)" +echo "Duration: ${TOTAL_DURATION_S}s" +echo "" + +# Per-tool stats +echo "== Tool Stats ==" +echo "Tool | Calls | Errors | Total ms | Avg ms" +echo "---------------|-------|--------|----------|-------" +jq -r 'select(.event == "end") | [.tool, .duration_ms] | @tsv' "$TRACE_FILE" 2>/dev/null | \ + awk -F'\t' '{ + tool=$1; ms=$2; + calls[tool]++; + total[tool]+=ms; + } + END { + for (t in calls) { + avg = total[t] / calls[t]; + printf "%-15s| %5d | %6s | %8d | %6d\n", t, calls[t], "", total[t], avg; + } + }' | sort + +# Add error counts +if [[ "$ERROR_COUNT" -gt 0 ]]; then + echo "" + jq -r 'select(.event == "error") | .tool' "$TRACE_FILE" 2>/dev/null | \ + sort | uniq -c | sort -rn | while read -r count tool; do + echo " $tool: $count error(s)" + done +fi + +echo "" + +# Timeline: correlated start/end pairs +echo "== Timeline ==" +jq -r ' + if .event == "start" then + "→ \(.id) \(.tool) " + ( + if .tool == "bash" then (.input.command // "?" | tostring | .[0:120]) + elif .tool == "read" then (.input.file_path // "?") + elif .tool == "write" then (.input.file_path // "?") + elif .tool == "edit" then (.input.file_path // "?") + elif .tool == "grep" then "pattern=" + (.input.pattern // "?") + " " + (.input.path // ".") + elif .tool == "glob" then "pattern=" + (.input.pattern // "?") + elif .tool == "task" then (.input.description // "?" | .[0:100]) + elif .tool == "web-search" then (.input.query // "?") + elif .tool == "web-fetch" then (.input.url // "?") + else (.input | tostring | .[0:80]) + end + ) + (if .parent then " [parent=\(.parent)]" else "" end) + elif .event == "end" then + "← \(.id) \(.duration_ms)ms " + ( + if .summary then (.summary | to_entries | map("\(.key)=\(.value)") | join(" ")) + else "" + end + ) + elif .event == "error" then + "✗ \(.id) \(.tool): \(.error)" + else empty + end +' "$TRACE_FILE" 2>/dev/null + +# Slow calls (>5s) +echo "" +SLOW=$(jq -r 'select(.event == "end" and .duration_ms > 5000) | "\(.id) \(.tool) \(.duration_ms)ms"' "$TRACE_FILE" 2>/dev/null) +if [[ -n "$SLOW" ]]; then + echo "== Slow Calls (>5s) ==" + echo "$SLOW" +else + echo "== No slow calls (>5s) ==" +fi + +# Errors with context +if [[ "$ERROR_COUNT" -gt 0 ]]; then + echo "" + echo "== Errors ==" + jq -r 'select(.event == "error") | "\(.id) \(.tool): \(.error)"' "$TRACE_FILE" 2>/dev/null +fi diff --git a/src/tools/bash.ts b/src/tools/bash.ts index 6f15976..4366211 100644 --- a/src/tools/bash.ts +++ b/src/tools/bash.ts @@ -134,6 +134,7 @@ export function createBashTool(sandbox: Sandbox, config?: ToolConfig) { if (debugId) { debugEnd(debugId, "bash", { + output: { stdout, stderr: stderr || undefined }, summary: { exitCode: result.exitCode, stdoutLen: result.stdout.length, diff --git a/src/tools/grep.ts b/src/tools/grep.ts index 0acceda..5f2ccbc 100644 --- a/src/tools/grep.ts +++ b/src/tools/grep.ts @@ -263,6 +263,7 @@ export function createGrepTool(sandbox: Sandbox, config?: GrepToolConfig) { output = parseFilesOutput(result.stdout); if (debugId) { debugEnd(debugId, "grep", { + output: (output as GrepFilesOutput).files, summary: { fileCount: (output as GrepFilesOutput).count, exitCode: result.exitCode, @@ -274,6 +275,7 @@ export function createGrepTool(sandbox: Sandbox, config?: GrepToolConfig) { output = parseCountOutput(result.stdout); if (debugId) { debugEnd(debugId, "grep", { + output: (output as GrepCountOutput).counts, summary: { total: (output as GrepCountOutput).total, exitCode: result.exitCode, @@ -285,6 +287,7 @@ export function createGrepTool(sandbox: Sandbox, config?: GrepToolConfig) { output = parseContentOutput(result.stdout, head_limit, offset); if (debugId) { debugEnd(debugId, "grep", { + output: (output as GrepContentOutput).matches, summary: { matchCount: (output as GrepContentOutput).total_matches, exitCode: result.exitCode, diff --git a/src/tools/read.ts b/src/tools/read.ts index 17b1fc3..96432a8 100644 --- a/src/tools/read.ts +++ b/src/tools/read.ts @@ -101,6 +101,7 @@ export function createReadTool(sandbox: Sandbox, config?: ToolConfig) { const durationMs = Math.round(performance.now() - startTime); if (debugId) { debugEnd(debugId, "read", { + output: entries, summary: { type: "directory", count: entries.length }, duration_ms: durationMs, }); @@ -161,9 +162,11 @@ export function createReadTool(sandbox: Sandbox, config?: ToolConfig) { content: line, })); + const contentStr = selectedLines.join("\n"); const durationMs = Math.round(performance.now() - startTime); if (debugId) { debugEnd(debugId, "read", { + output: contentStr, summary: { type: "text", totalLines, @@ -176,7 +179,7 @@ export function createReadTool(sandbox: Sandbox, config?: ToolConfig) { return { type: "text", - content: selectedLines.join("\n"), + content: contentStr, lines, total_lines: totalLines, }; diff --git a/src/tools/task.ts b/src/tools/task.ts index 8d714c5..5b593f7 100644 --- a/src/tools/task.ts +++ b/src/tools/task.ts @@ -389,7 +389,9 @@ export function createTaskTool( debugError(debugId, "task", result.error); } else { debugEnd(debugId, "task", { + output: result.result, summary: { + subagent: result.subagent, tokens: result.usage ? { input: result.usage.input_tokens, diff --git a/src/utils/AGENTS.md b/src/utils/AGENTS.md index 93a4e9d..0b43068 100644 --- a/src/utils/AGENTS.md +++ b/src/utils/AGENTS.md @@ -143,8 +143,8 @@ Three complementary approaches: - Events include `parent` field to reconstruct call hierarchy **Data Truncation**: -- Strings truncated to 1000 chars -- Arrays truncated to 10 items +- Strings truncated to 4000 chars +- Arrays truncated to 20 items - Recursive summarization for nested objects (max depth: 5) ## Design Patterns diff --git a/src/utils/debug.ts b/src/utils/debug.ts index 276871f..10c5a03 100644 --- a/src/utils/debug.ts +++ b/src/utils/debug.ts @@ -60,8 +60,8 @@ interface DebugContext { const debugContext = new AsyncLocalStorage(); // Truncation limits -const MAX_STRING_LENGTH = 1000; -const MAX_ARRAY_ITEMS = 10; +const MAX_STRING_LENGTH = 4000; +const MAX_ARRAY_ITEMS = 20; /** * Initialize debug mode from environment variable. @@ -119,8 +119,8 @@ function truncateString(str: string): string { /** * Summarize data for debug output. - * - Truncates strings to 1000 chars - * - Limits arrays to 10 items + * - Truncates strings to MAX_STRING_LENGTH chars + * - Limits arrays to MAX_ARRAY_ITEMS items * - Recursively summarizes nested objects */ export function summarize(data: unknown, depth = 0): unknown { diff --git a/tests/utils/debug.test.ts b/tests/utils/debug.test.ts index 9618c99..037d4d4 100644 --- a/tests/utils/debug.test.ts +++ b/tests/utils/debug.test.ts @@ -140,7 +140,7 @@ describe("debug", () => { }); it("summarizes input data", () => { - const longString = "x".repeat(2000); + const longString = "x".repeat(5000); debugStart("bash", { command: longString }); const logs = getDebugLogs(); const input = logs[0].input as Record; @@ -407,7 +407,7 @@ describe("debug", () => { }); it("truncates long strings", () => { - const long = "x".repeat(2000); + const long = "x".repeat(5000); const result = summarize(long) as string; expect(result.length).toBeLessThan(long.length); expect(result).toContain("[truncated"); @@ -428,10 +428,10 @@ describe("debug", () => { }); it("truncates long arrays", () => { - const arr = Array.from({ length: 20 }, (_, i) => i); + const arr = Array.from({ length: 30 }, (_, i) => i); const result = summarize(arr) as unknown[]; - expect(result.length).toBe(11); // 10 items + truncation message - expect(result[10]).toContain("10 more items"); + expect(result.length).toBe(21); // 20 items + truncation message + expect(result[20]).toContain("10 more items"); }); it("keeps short arrays intact", () => { @@ -440,7 +440,7 @@ describe("debug", () => { }); it("recursively summarizes objects", () => { - const obj = { name: "test", value: "x".repeat(2000) }; + const obj = { name: "test", value: "x".repeat(5000) }; const result = summarize(obj) as Record; expect(result.name).toBe("test"); expect(result.value).toContain("[truncated");