From 81b16a0e53c8cabf314f2e267048776c08e94473 Mon Sep 17 00:00:00 2001 From: jbreite Date: Sat, 7 Mar 2026 10:23:41 -0500 Subject: [PATCH 1/4] Enrich debug traces with tool outputs and add bashkit-debug skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add output data to debugEnd calls for bash, grep, read, and task tools so trace files capture what each tool actually returned - Bump truncation limits (strings: 1000→4000 chars, arrays: 10→20 items) to preserve more useful data in traces - Add bashkit-debug skill with SKILL.md and summarize-trace.sh for analyzing debug trace files with Claude Code or other agents Co-Authored-By: Claude Opus 4.6 --- skills/bashkit-debug/SKILL.md | 163 ++++++++++++++++++++++++ skills/bashkit-debug/summarize-trace.sh | 110 ++++++++++++++++ src/tools/bash.ts | 1 + src/tools/grep.ts | 3 + src/tools/read.ts | 1 + src/tools/task.ts | 2 + src/utils/debug.ts | 4 +- 7 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 skills/bashkit-debug/SKILL.md create mode 100755 skills/bashkit-debug/summarize-trace.sh diff --git a/skills/bashkit-debug/SKILL.md b/skills/bashkit-debug/SKILL.md new file mode 100644 index 0000000..f50e4e9 --- /dev/null +++ b/skills/bashkit-debug/SKILL.md @@ -0,0 +1,163 @@ +--- +name: bashkit-debug +description: Analyze BashKit agent debug traces. Use when debugging a BashKit-powered agent run — reading JSONL trace files, correlating tool calls, identifying errors, slow operations, and execution patterns. Triggers on "debug trace", "analyze trace", "bashkit debug", "what went wrong", or when reading .jsonl trace files from BASHKIT_DEBUG. +--- + +# BashKit Debug Trace Analyzer + +You are analyzing execution traces from a BashKit-powered AI agent. These traces capture every tool call the agent made, with inputs, outputs, timing, errors, and parent/child relationships. + +## Trace Format + +Traces are JSONL files (one JSON object per line) generated by `BASHKIT_DEBUG=file:/path/to/trace.jsonl`. + +Each line is a `DebugEvent`: + +```typescript +interface DebugEvent { + id: string; // e.g. "bash-1", "grep-3" — correlates start/end pairs + ts: number; // Unix timestamp in milliseconds + tool: string; // Tool name: bash, read, write, edit, glob, grep, task, web-search, web-fetch, todo-write, skill, ask-user, enter-plan-mode, exit-plan-mode + event: "start" | "end" | "error"; + input?: unknown; // Tool input parameters (start events only, truncated) + output?: unknown; // Tool output data (end events only, truncated) + summary?: Record; // Key metrics (end events only) + duration_ms?: number; // Execution time (end events only) + parent?: string; // Parent event ID for nested calls (task sub-agent tools) + error?: string; // Error message (error events only) +} +``` + +### Event Lifecycle + +Every tool call produces 2-3 events: +1. **start** — Has `input` with summarized parameters +2. **end** — Has `output`, `summary`, and `duration_ms` (success path) +3. **error** — Has `error` message (failure path, replaces end) + +Correlate by `id`: a `start` event with `id: "bash-3"` pairs with the `end` or `error` event with the same `id`. + +### Parent/Child Relationships + +The `parent` field links tool calls to their spawning Task tool. Example: +```jsonl +{"id":"task-1","tool":"task","event":"start","input":{"description":"Fix the tests"}} +{"id":"bash-5","tool":"bash","event":"start","input":{"command":"npm test"},"parent":"task-1"} +{"id":"bash-5","tool":"bash","event":"end","summary":{"exitCode":1},"duration_ms":3200,"parent":"task-1"} +{"id":"task-1","tool":"task","event":"end","summary":{"subagent":"sub-1"},"duration_ms":15000} +``` + +Events with the same `parent` ran inside the same sub-agent. + +## Tool-Specific Summary Fields + +Each tool's `end` event includes a `summary` with tool-specific metrics: + +| Tool | Summary Fields | +|------|---------------| +| **bash** | `exitCode`, `stdoutLen`, `stderrLen`, `interrupted` | +| **read** | `type` ("text"/"directory"), `totalLines`, `returnedLines`, `offset` (text) or `count` (directory) | +| **write** | `bytes_written` | +| **edit** | `replacements` | +| **glob** | `count` (number of matched files) | +| **grep** | Varies by mode: `fileCount`/`total`/`matchCount`, plus `exitCode` | +| **task** | `subagent`, `tokens: { input, output }` | +| **web-search** | `resultCount` | +| **web-fetch** | `contentLength`, `responseLength` | +| **todo-write** | `total`, `in_progress`, `completed`, `pending` (counts per status) | +| **skill** | `skillName`, `instructionLength` | + +## Tool-Specific Output Fields + +The `output` field on `end` events contains the actual data the agent received: + +| Tool | Output Content | +|------|---------------| +| **bash** | `{ stdout, stderr }` — command output (truncated to ~4000 chars) | +| **read** | File content as string (text files) or directory entries | +| **grep** | Matched files, count entries, or content matches depending on mode | +| **task** | Sub-agent's final result text | +| **glob** | First 10 matched file paths | +| **web-search** | First 5 search results with titles and URLs | + +Output is truncated: strings at 4000 chars, arrays at 20 items, objects at depth 5. + +## Analysis Playbook + +When asked to analyze a trace, follow this process: + +### Step 1: Read the Trace File +```bash +cat /path/to/trace.jsonl +``` +Or if large, start with a summary: +```bash +wc -l /path/to/trace.jsonl # total events +grep '"event":"error"' /path/to/trace.jsonl # errors only +grep '"event":"end"' /path/to/trace.jsonl # completions with timing +``` + +### Step 2: Build the Timeline +Correlate start/end pairs by `id`. For each tool call, extract: +- **What**: tool name + key input (command, file path, pattern) +- **Result**: success/error + key summary metric +- **Time**: duration_ms +- **Context**: parent (if inside a sub-agent) + +### Step 3: Identify Issues + +**Errors** — Look for `"event":"error"` entries: +- `bash` errors: check exit codes, stderr content, timeout +- `read` errors: file not found, permission denied +- `edit` errors: string not found, multiple occurrences +- `grep` errors: invalid regex, ripgrep not installed +- `task` errors: sub-agent failures, budget exceeded + +**Slow Operations** — Flag tool calls over these thresholds: +- `bash`: >10s (may indicate hanging command or large output) +- `read`/`write`/`edit`: >1s (filesystem issue) +- `grep`/`glob`: >5s (overly broad pattern) +- `task`: depends on complexity, but >60s warrants investigation +- `web-search`/`web-fetch`: >10s (API latency) + +**Patterns to Watch For**: +- Repeated identical tool calls → agent is looping (check if caching is enabled) +- `bash` with exit code != 0 followed by same command → agent retrying without fixing +- `edit` errors "string not found" → agent has stale file content, needs to re-read +- `grep` with 0 matches followed by broader grep → agent searching iteratively (normal) +- `task` with high token usage → sub-agent doing too much work +- Orphaned `start` events with no matching `end`/`error` → crash or timeout +- Many `read` calls to the same file → missing cache, or agent re-reading after edits (normal) + +### Step 4: Report Findings + +Structure your report as: + +1. **Overview**: Total calls, duration, error count +2. **Errors**: Each error with context (what was attempted, why it failed) +3. **Performance**: Slowest calls, total time breakdown by tool +4. **Patterns**: Any concerning patterns (loops, retries, excessive calls) +5. **Recommendations**: Specific fixes (enable caching, adjust timeouts, fix tool inputs) + +## Helper Script + +The `summarize-trace.sh` script in this skill directory can pre-process large trace files: + +```bash +# Generate a compact summary of a trace file +./skills/bashkit-debug/summarize-trace.sh /path/to/trace.jsonl +``` + +This outputs a condensed timeline that's more token-efficient to analyze than the raw JSONL. + +## Enabling Debug Traces + +Tell users to run their BashKit agent with: +```bash +BASHKIT_DEBUG=file:./trace.jsonl node my-agent.js +``` + +Other modes (less useful for post-mortem analysis): +- `BASHKIT_DEBUG=json` — JSON lines to stderr (pipe to file: `2>trace.jsonl`) +- `BASHKIT_DEBUG=stderr` or `BASHKIT_DEBUG=1` — Human-readable to stderr +- `BASHKIT_DEBUG=memory` — In-memory only (programmatic access via `getDebugLogs()`) diff --git a/skills/bashkit-debug/summarize-trace.sh b/skills/bashkit-debug/summarize-trace.sh new file mode 100755 index 0000000..000a4c8 --- /dev/null +++ b/skills/bashkit-debug/summarize-trace.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# Summarize a BashKit debug trace file into a compact, agent-readable format. +# Usage: ./summarize-trace.sh /path/to/trace.jsonl +# +# Requires: jq + +set -euo pipefail + +TRACE_FILE="${1:?Usage: summarize-trace.sh }" + +if ! command -v jq &>/dev/null; then + echo "Error: jq is required. Install with: brew install jq" >&2 + exit 1 +fi + +if [[ ! -f "$TRACE_FILE" ]]; then + echo "Error: File not found: $TRACE_FILE" >&2 + exit 1 +fi + +TOTAL_EVENTS=$(wc -l < "$TRACE_FILE" | tr -d ' ') +ERROR_COUNT=$(grep -c '"event":"error"' "$TRACE_FILE" || true) +END_COUNT=$(grep -c '"event":"end"' "$TRACE_FILE" || true) +START_COUNT=$(grep -c '"event":"start"' "$TRACE_FILE" || true) + +# Extract first and last timestamps for total duration +FIRST_TS=$(head -1 "$TRACE_FILE" | jq -r '.ts') +LAST_TS=$(tail -1 "$TRACE_FILE" | jq -r '.ts') +TOTAL_DURATION_MS=$((LAST_TS - FIRST_TS)) +TOTAL_DURATION_S=$(echo "scale=1; $TOTAL_DURATION_MS / 1000" | bc) + +echo "== BashKit Debug Trace Summary ==" +echo "File: $TRACE_FILE" +echo "Events: $TOTAL_EVENTS ($START_COUNT calls, $ERROR_COUNT errors)" +echo "Duration: ${TOTAL_DURATION_S}s" +echo "" + +# Per-tool stats +echo "== Tool Stats ==" +echo "Tool | Calls | Errors | Total ms | Avg ms" +echo "---------------|-------|--------|----------|-------" +jq -r 'select(.event == "end") | [.tool, .duration_ms] | @tsv' "$TRACE_FILE" 2>/dev/null | \ + awk -F'\t' '{ + tool=$1; ms=$2; + calls[tool]++; + total[tool]+=ms; + } + END { + for (t in calls) { + avg = total[t] / calls[t]; + printf "%-15s| %5d | %6s | %8d | %6d\n", t, calls[t], "", total[t], avg; + } + }' | sort + +# Add error counts +if [[ "$ERROR_COUNT" -gt 0 ]]; then + echo "" + jq -r 'select(.event == "error") | .tool' "$TRACE_FILE" 2>/dev/null | \ + sort | uniq -c | sort -rn | while read -r count tool; do + echo " $tool: $count error(s)" + done +fi + +echo "" + +# Timeline: correlated start/end pairs +echo "== Timeline ==" +jq -r ' + if .event == "start" then + "→ \(.id) \(.tool) " + ( + if .tool == "bash" then (.input.command // "?" | tostring | .[0:120]) + elif .tool == "read" then (.input.file_path // "?") + elif .tool == "write" then (.input.file_path // "?") + elif .tool == "edit" then (.input.file_path // "?") + elif .tool == "grep" then "pattern=" + (.input.pattern // "?") + " " + (.input.path // ".") + elif .tool == "glob" then "pattern=" + (.input.pattern // "?") + elif .tool == "task" then (.input.description // "?" | .[0:100]) + elif .tool == "web-search" then (.input.query // "?") + elif .tool == "web-fetch" then (.input.url // "?") + else (.input | tostring | .[0:80]) + end + ) + (if .parent then " [parent=\(.parent)]" else "" end) + elif .event == "end" then + "← \(.id) \(.duration_ms)ms " + ( + if .summary then (.summary | to_entries | map("\(.key)=\(.value)") | join(" ")) + else "" + end + ) + elif .event == "error" then + "✗ \(.id) \(.tool): \(.error)" + else empty + end +' "$TRACE_FILE" 2>/dev/null + +# Slow calls (>5s) +echo "" +SLOW=$(jq -r 'select(.event == "end" and .duration_ms > 5000) | "\(.id) \(.tool) \(.duration_ms)ms"' "$TRACE_FILE" 2>/dev/null) +if [[ -n "$SLOW" ]]; then + echo "== Slow Calls (>5s) ==" + echo "$SLOW" +else + echo "== No slow calls (>5s) ==" +fi + +# Errors with context +if [[ "$ERROR_COUNT" -gt 0 ]]; then + echo "" + echo "== Errors ==" + jq -r 'select(.event == "error") | "\(.id) \(.tool): \(.error)"' "$TRACE_FILE" 2>/dev/null +fi diff --git a/src/tools/bash.ts b/src/tools/bash.ts index 6f15976..4366211 100644 --- a/src/tools/bash.ts +++ b/src/tools/bash.ts @@ -134,6 +134,7 @@ export function createBashTool(sandbox: Sandbox, config?: ToolConfig) { if (debugId) { debugEnd(debugId, "bash", { + output: { stdout, stderr: stderr || undefined }, summary: { exitCode: result.exitCode, stdoutLen: result.stdout.length, diff --git a/src/tools/grep.ts b/src/tools/grep.ts index 0acceda..5f2ccbc 100644 --- a/src/tools/grep.ts +++ b/src/tools/grep.ts @@ -263,6 +263,7 @@ export function createGrepTool(sandbox: Sandbox, config?: GrepToolConfig) { output = parseFilesOutput(result.stdout); if (debugId) { debugEnd(debugId, "grep", { + output: (output as GrepFilesOutput).files, summary: { fileCount: (output as GrepFilesOutput).count, exitCode: result.exitCode, @@ -274,6 +275,7 @@ export function createGrepTool(sandbox: Sandbox, config?: GrepToolConfig) { output = parseCountOutput(result.stdout); if (debugId) { debugEnd(debugId, "grep", { + output: (output as GrepCountOutput).counts, summary: { total: (output as GrepCountOutput).total, exitCode: result.exitCode, @@ -285,6 +287,7 @@ export function createGrepTool(sandbox: Sandbox, config?: GrepToolConfig) { output = parseContentOutput(result.stdout, head_limit, offset); if (debugId) { debugEnd(debugId, "grep", { + output: (output as GrepContentOutput).matches, summary: { matchCount: (output as GrepContentOutput).total_matches, exitCode: result.exitCode, diff --git a/src/tools/read.ts b/src/tools/read.ts index 17b1fc3..83fded5 100644 --- a/src/tools/read.ts +++ b/src/tools/read.ts @@ -164,6 +164,7 @@ export function createReadTool(sandbox: Sandbox, config?: ToolConfig) { const durationMs = Math.round(performance.now() - startTime); if (debugId) { debugEnd(debugId, "read", { + output: selectedLines.join("\n"), summary: { type: "text", totalLines, diff --git a/src/tools/task.ts b/src/tools/task.ts index 8d714c5..5b593f7 100644 --- a/src/tools/task.ts +++ b/src/tools/task.ts @@ -389,7 +389,9 @@ export function createTaskTool( debugError(debugId, "task", result.error); } else { debugEnd(debugId, "task", { + output: result.result, summary: { + subagent: result.subagent, tokens: result.usage ? { input: result.usage.input_tokens, diff --git a/src/utils/debug.ts b/src/utils/debug.ts index 276871f..32fffe3 100644 --- a/src/utils/debug.ts +++ b/src/utils/debug.ts @@ -60,8 +60,8 @@ interface DebugContext { const debugContext = new AsyncLocalStorage(); // Truncation limits -const MAX_STRING_LENGTH = 1000; -const MAX_ARRAY_ITEMS = 10; +const MAX_STRING_LENGTH = 4000; +const MAX_ARRAY_ITEMS = 20; /** * Initialize debug mode from environment variable. From e3eb3e9877a46a040efd6fa7e8643c793ff781b0 Mon Sep 17 00:00:00 2001 From: jbreite Date: Sat, 7 Mar 2026 10:27:18 -0500 Subject: [PATCH 2/4] fix: update debug tests to match current truncation limits Co-Authored-By: Claude Opus 4.6 --- tests/utils/debug.test.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/utils/debug.test.ts b/tests/utils/debug.test.ts index 9618c99..037d4d4 100644 --- a/tests/utils/debug.test.ts +++ b/tests/utils/debug.test.ts @@ -140,7 +140,7 @@ describe("debug", () => { }); it("summarizes input data", () => { - const longString = "x".repeat(2000); + const longString = "x".repeat(5000); debugStart("bash", { command: longString }); const logs = getDebugLogs(); const input = logs[0].input as Record; @@ -407,7 +407,7 @@ describe("debug", () => { }); it("truncates long strings", () => { - const long = "x".repeat(2000); + const long = "x".repeat(5000); const result = summarize(long) as string; expect(result.length).toBeLessThan(long.length); expect(result).toContain("[truncated"); @@ -428,10 +428,10 @@ describe("debug", () => { }); it("truncates long arrays", () => { - const arr = Array.from({ length: 20 }, (_, i) => i); + const arr = Array.from({ length: 30 }, (_, i) => i); const result = summarize(arr) as unknown[]; - expect(result.length).toBe(11); // 10 items + truncation message - expect(result[10]).toContain("10 more items"); + expect(result.length).toBe(21); // 20 items + truncation message + expect(result[20]).toContain("10 more items"); }); it("keeps short arrays intact", () => { @@ -440,7 +440,7 @@ describe("debug", () => { }); it("recursively summarizes objects", () => { - const obj = { name: "test", value: "x".repeat(2000) }; + const obj = { name: "test", value: "x".repeat(5000) }; const result = summarize(obj) as Record; expect(result.name).toBe("test"); expect(result.value).toContain("[truncated"); From d8a64fc7fc581a5cec53569eb86b5dca6d45edf3 Mon Sep 17 00:00:00 2001 From: jbreite Date: Sat, 7 Mar 2026 10:39:41 -0500 Subject: [PATCH 3/4] fix: update stale docs, add directory output to read debug, deduplicate join - Update summarize() JSDoc and AGENTS.md to reflect current truncation limits (4000 chars, 20 items) - Add output field to read tool's directory-listing debug trace - Extract selectedLines.join() to avoid computing it twice Co-Authored-By: Claude Opus 4.6 --- src/tools/read.ts | 6 ++++-- src/utils/AGENTS.md | 4 ++-- src/utils/debug.ts | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/tools/read.ts b/src/tools/read.ts index 83fded5..96432a8 100644 --- a/src/tools/read.ts +++ b/src/tools/read.ts @@ -101,6 +101,7 @@ export function createReadTool(sandbox: Sandbox, config?: ToolConfig) { const durationMs = Math.round(performance.now() - startTime); if (debugId) { debugEnd(debugId, "read", { + output: entries, summary: { type: "directory", count: entries.length }, duration_ms: durationMs, }); @@ -161,10 +162,11 @@ export function createReadTool(sandbox: Sandbox, config?: ToolConfig) { content: line, })); + const contentStr = selectedLines.join("\n"); const durationMs = Math.round(performance.now() - startTime); if (debugId) { debugEnd(debugId, "read", { - output: selectedLines.join("\n"), + output: contentStr, summary: { type: "text", totalLines, @@ -177,7 +179,7 @@ export function createReadTool(sandbox: Sandbox, config?: ToolConfig) { return { type: "text", - content: selectedLines.join("\n"), + content: contentStr, lines, total_lines: totalLines, }; diff --git a/src/utils/AGENTS.md b/src/utils/AGENTS.md index 93a4e9d..0b43068 100644 --- a/src/utils/AGENTS.md +++ b/src/utils/AGENTS.md @@ -143,8 +143,8 @@ Three complementary approaches: - Events include `parent` field to reconstruct call hierarchy **Data Truncation**: -- Strings truncated to 1000 chars -- Arrays truncated to 10 items +- Strings truncated to 4000 chars +- Arrays truncated to 20 items - Recursive summarization for nested objects (max depth: 5) ## Design Patterns diff --git a/src/utils/debug.ts b/src/utils/debug.ts index 32fffe3..10c5a03 100644 --- a/src/utils/debug.ts +++ b/src/utils/debug.ts @@ -119,8 +119,8 @@ function truncateString(str: string): string { /** * Summarize data for debug output. - * - Truncates strings to 1000 chars - * - Limits arrays to 10 items + * - Truncates strings to MAX_STRING_LENGTH chars + * - Limits arrays to MAX_ARRAY_ITEMS items * - Recursively summarizes nested objects */ export function summarize(data: unknown, depth = 0): unknown { From 339e39d688b993c905475a8384ccee5d467496a5 Mon Sep 17 00:00:00 2001 From: jbreite Date: Sat, 7 Mar 2026 11:21:59 -0500 Subject: [PATCH 4/4] bump package version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 3900afe..7ca7d0b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "bashkit", - "version": "0.5.2", + "version": "0.5.3", "description": "Agentic coding tools for the Vercel AI SDK", "type": "module", "main": "./dist/index.js",