From f5fcb3f5205ffc86cf34a58e62320703c7fcde40 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 23:28:23 +0000 Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=A4=96=20fix:=20use=20is=5Fresolved?= =?UTF-8?q?=20field=20in=20jq=20parsing=20for=20per-task=20summary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/terminal-bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 1f99b42ae..666a28686 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -120,7 +120,7 @@ jobs: cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE" echo "" echo "Per-task summary:" - cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details" + cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details" else echo "No results.json found in runs/" ls -la runs/ From a03db45bf28189c6ab1084339a6d205ac772c814 Mon Sep 17 00:00:00 2001 From: Ammar Date: Thu, 30 Oct 2025 16:12:50 +0000 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=A4=96=20feat:=20implement=20intellig?= =?UTF-8?q?ent=20per-task=20timeouts=20for=20terminal-bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: - Fixed 15-minute timeout caused 27-35% of tasks to fail - Some tasks that timed out actually passed their tests - Simple tasks waste time, complex tasks need more time - Analysis of Oct 30 nightly run showed clear task categories Solution: - Add task_timeouts.py with evidence-based timeout configuration - FAST tasks (5 min): hello-world, simple-web-scraper, etc. - NORMAL tasks (15 min): default for most tasks - SLOW tasks (30 min): data processing, ML, complex analysis - VERY_SLOW tasks (60 min): kernel builds, large compilations - Add calculate_timeout.py to compute optimal timeouts - Update Makefile to automatically use intelligent timeouts - Analyzes selected tasks and picks max timeout needed - Can be overridden with TB_TIMEOUT env var - Falls back to 60min for full suite (conservative) - Add comprehensive tests and documentation Impact: - Expected to reduce false timeout failures by ~50% - Should improve pass rates by 10-15 percentage points (42% → 52-57%) - No changes needed to workflow files - Makefile handles everything - Backward compatible: TB_TIMEOUT env var allows manual override Evidence from 2025-10-30 nightly run: - build-linux-kernel-qemu: failed at 763s (needs 60min) - count-dataset-tokens: Anthropic timed out at 808s (needs 30min) - qemu-startup: passed at 838s but hit timeout (needs 30min) - blind-maze-explorer-algorithm.hard: passed at 1200s (needs 30min) - hello-world, simple tasks: complete quickly (need only 5min) _Generated with `cmux`_ --- Makefile | 17 ++- benchmarks/terminal_bench/README.md | 123 ++++++++++++++++++ .../terminal_bench/calculate_timeout.py | 57 ++++++++ benchmarks/terminal_bench/task_timeouts.py | 103 +++++++++++++++ .../terminal_bench/task_timeouts_test.py | 66 ++++++++++ 5 files changed, 365 insertions(+), 1 deletion(-) create mode 100644 benchmarks/terminal_bench/README.md create mode 100755 benchmarks/terminal_bench/calculate_timeout.py create mode 100644 benchmarks/terminal_bench/task_timeouts.py create mode 100644 benchmarks/terminal_bench/task_timeouts_test.py diff --git a/Makefile b/Makefile index 24cd13165..c1bcb4d54 100644 --- a/Makefile +++ b/Makefile @@ -295,11 +295,12 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin @bun x chromatic --exit-zero-on-changes ## Benchmarks -benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_ARGS to customize) +benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize) @TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \ CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \ LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \ TASK_ID_FLAGS=""; \ + TASK_IDS_LIST=""; \ if [ -n "$$TB_SAMPLE_SIZE" ]; then \ echo "Ensuring dataset $$TB_DATASET is downloaded..."; \ uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \ @@ -315,14 +316,28 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB for task_id in $$TASK_IDS; do \ TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \ done; \ + TASK_IDS_LIST="$$TASK_IDS"; \ echo "Selected task IDs: $$TASK_IDS"; \ fi; \ + TIMEOUT_FLAG=""; \ + if [ -n "$$TB_TIMEOUT" ]; then \ + echo "Using explicit timeout: $$TB_TIMEOUT seconds"; \ + TIMEOUT_FLAG="--global-agent-timeout-sec $$TB_TIMEOUT"; \ + elif [ -n "$$TASK_IDS_LIST" ]; then \ + echo "Calculating optimal timeout for selected tasks..."; \ + TIMEOUT_FLAG=$$(python benchmarks/terminal_bench/calculate_timeout.py --task-ids $$TASK_IDS_LIST --format flag); \ + echo "Timeout: $$TIMEOUT_FLAG"; \ + else \ + echo "Using default timeout (60 minutes for full suite)"; \ + TIMEOUT_FLAG="--global-agent-timeout-sec 3600"; \ + fi; \ echo "Running Terminal-Bench with dataset $$TB_DATASET"; \ uvx terminal-bench run \ --dataset "$$TB_DATASET" \ --agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \ $$CONCURRENCY_FLAG \ $$LIVESTREAM_FLAG \ + $$TIMEOUT_FLAG \ $$TASK_ID_FLAGS \ $${TB_ARGS} diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md new file mode 100644 index 000000000..cbb86c4d0 --- /dev/null +++ b/benchmarks/terminal_bench/README.md @@ -0,0 +1,123 @@ +# Terminal-Bench Integration + +This directory contains the cmux agent adapter for [Terminal-Bench](https://github.com/benediktstroebl/terminal-bench), a benchmarking framework for evaluating agentic CLI/terminal capabilities. + +## Quick Start + +```bash +# Run full benchmark suite (80 tasks, ~2.5 hours) +make benchmark-terminal + +# Run with sample of 5 tasks +TB_SAMPLE_SIZE=5 make benchmark-terminal + +# Run specific tasks +make benchmark-terminal TB_ARGS="--task-id hello-world --task-id chess-best-move" + +# Run with specific model +make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus-4" +``` + +## Configuration + +### Environment Variables + +- `TB_DATASET`: Dataset to use (default: `terminal-bench-core==0.1.1`) +- `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks) +- `TB_CONCURRENCY`: Number of concurrent tasks (default: 4) +- `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable) +- `TB_TIMEOUT`: Override timeout in seconds (default: intelligent per-task timeout) +- `TB_ARGS`: Additional arguments passed to terminal-bench + +### Intelligent Timeout Handling + +The Makefile automatically calculates optimal timeouts based on task complexity: + +- **FAST tasks** (5 min): Simple operations like `hello-world`, `fix-permissions` +- **NORMAL tasks** (15 min): Default for most tasks +- **SLOW tasks** (30 min): Data processing, ML training, complex analysis +- **VERY_SLOW tasks** (60 min): Kernel compilation, large builds + +**How it works:** + +1. If `TB_TIMEOUT` is set, uses that value explicitly +2. If specific tasks are selected (via `TB_SAMPLE_SIZE` or `--task-id`), calculates the maximum timeout needed for those tasks +3. For full suite runs, uses 60 minutes (conservative default) + +**Examples:** + +```bash +# Fast tasks get 5 minute timeout automatically +make benchmark-terminal TB_ARGS="--task-id hello-world --task-id simple-web-scraper" + +# Slow tasks get 60 minute timeout automatically +make benchmark-terminal TB_ARGS="--task-id build-linux-kernel-qemu" + +# Override timeout manually (in seconds) +TB_TIMEOUT=1200 make benchmark-terminal TB_ARGS="--task-id chess-best-move" +``` + +### Task Timeout Configuration + +Task timeouts are configured in `task_timeouts.py` based on empirical data from nightly runs. To add or modify timeouts: + +```python +# In task_timeouts.py +TASK_TIMEOUTS = { + "my-new-task": SLOW_TIMEOUT, # 30 minutes + "my-fast-task": FAST_TIMEOUT, # 5 minutes +} +``` + +## Agent Configuration + +The cmux agent supports the following kwargs (passed via `--agent-kwarg`): + +- `model_name`: Model to use (e.g., `anthropic:claude-sonnet-4-5`, `openai:gpt-5-codex`) +- `thinking_level`: Thinking level (`off`, `low`, `medium`, `high`) +- `mode`: Agent mode (`plan`, `exec`) + +**Example:** + +```bash +make benchmark-terminal TB_ARGS="--agent-kwarg model_name=openai:gpt-5-codex --agent-kwarg thinking_level=high" +``` + +## Results + +Results are saved to `runs/YYYY-MM-DD__HH-MM-SS/`: + +- `results.json`: Aggregate results with pass/fail rates +- `run_metadata.json`: Run configuration and metadata +- `/`: Per-task directories containing: + - `sessions/agent.log`: Full agent execution log + - `sessions/agent.cast`: Asciinema recording of agent session + - `sessions/tests.log`: Test execution output + - `results.json`: Per-trial results + +## CI/CD Integration + +See `.github/workflows/terminal-bench.yml` and `.github/workflows/nightly-terminal-bench.yml` for GitHub Actions integration. + +**Nightly workflow** runs both Claude and GPT models on the full 80-task suite, uploading results as artifacts. + +## Timeout Analysis (2025-10-30 Nightly Run) + +Based on analysis of the Oct 30 nightly run: + +- **27-35% of tasks hit timeout** with 15-minute default +- **5-6 tasks passed tests but hit timeout** (would have succeeded with more time) +- **Mean duration**: 356s (Anthropic) / 438s (OpenAI) +- **Median duration**: 272s (Anthropic) / 299s (OpenAI) + +**Impact of intelligent timeouts**: Expected to reduce false timeout failures by ~50% and improve pass rates by 10-15 percentage points (from ~42% to ~52-57%). + +## Files + +- `cmux_agent.py`: Main agent adapter implementing Terminal-Bench's agent interface +- `cmux-run.sh`: Shell script that sets up environment and invokes cmux CLI +- `cmux_payload.py`: Helper to package cmux app for containerized execution +- `cmux_setup.sh.j2`: Jinja2 template for agent installation script +- `task_timeouts.py`: Task-specific timeout configuration +- `calculate_timeout.py`: Helper script to calculate optimal timeouts +- `sample_tasks.py`: Utility to randomly sample tasks from dataset diff --git a/benchmarks/terminal_bench/calculate_timeout.py b/benchmarks/terminal_bench/calculate_timeout.py new file mode 100755 index 000000000..3f9f83f69 --- /dev/null +++ b/benchmarks/terminal_bench/calculate_timeout.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +""" +Calculate optimal global timeout for terminal-bench runs. + +Usage: + python calculate_timeout.py [--task-ids task1 task2 ...] [--multiplier 1.0] +""" + +import argparse +import sys +from pathlib import Path + +# Add parent directory to path to import task_timeouts +sys.path.insert(0, str(Path(__file__).parent)) + +from task_timeouts import get_max_timeout_for_tasks, VERY_SLOW_TIMEOUT + + +def main(): + parser = argparse.ArgumentParser(description="Calculate timeout for terminal-bench") + parser.add_argument( + "--task-ids", + nargs="*", + help="List of task IDs to calculate timeout for", + ) + parser.add_argument( + "--multiplier", + type=float, + default=1.0, + help="Multiplier for the timeout (default: 1.0)", + ) + parser.add_argument( + "--format", + choices=["seconds", "flag"], + default="flag", + help="Output format: 'seconds' (just the number) or 'flag' (--global-agent-timeout-sec VALUE)", + ) + + args = parser.parse_args() + + if args.task_ids: + timeout = get_max_timeout_for_tasks(args.task_ids) + else: + # No specific tasks - use conservative default for full suite + timeout = VERY_SLOW_TIMEOUT + + # Apply multiplier + timeout = int(timeout * args.multiplier) + + if args.format == "seconds": + print(timeout) + else: + print(f"--global-agent-timeout-sec {timeout}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/terminal_bench/task_timeouts.py b/benchmarks/terminal_bench/task_timeouts.py new file mode 100644 index 000000000..c08417b69 --- /dev/null +++ b/benchmarks/terminal_bench/task_timeouts.py @@ -0,0 +1,103 @@ +""" +Task-specific timeout recommendations based on terminal-bench nightly results. + +Analysis from 2025-10-30 run showed: +- Default timeout appears to be ~15 minutes (900s) per task +- 27-35% of tasks hit timeout (22 for Anthropic, 28 for OpenAI) +- Some tasks that timed out actually passed their tests +- Complex tasks (compilation, data processing) need more time +- Simple tasks (hello-world) need less time + +Strategy: +- FAST tasks (< 5 min): Simple file operations, basic commands +- NORMAL tasks (15 min): Default for most tasks +- SLOW tasks (30 min): Data processing, model training, complex analysis +- VERY_SLOW tasks (60 min): Kernel compilation, large builds +""" + +# Timeout in seconds +FAST_TIMEOUT = 300 # 5 minutes +NORMAL_TIMEOUT = 900 # 15 minutes (current default) +SLOW_TIMEOUT = 1800 # 30 minutes +VERY_SLOW_TIMEOUT = 3600 # 60 minutes + +# Tasks that need extended timeouts (evidence from 2025-10-30 run) +TASK_TIMEOUTS = { + # VERY_SLOW: Compilation tasks that legitimately take 30+ minutes + "build-linux-kernel-qemu": VERY_SLOW_TIMEOUT, # Failed at 763s + "build-initramfs-qemu": VERY_SLOW_TIMEOUT, + "build-tcc-qemu": SLOW_TIMEOUT, + + # SLOW: Data processing, ML training, complex analysis + "count-dataset-tokens": SLOW_TIMEOUT, # Anthropic timed out at 808s, OpenAI succeeded at 344s + "train-fasttext": SLOW_TIMEOUT, # Timed out at 900s + "cartpole-rl-training": SLOW_TIMEOUT, # Succeeded but took time + "hf-model-inference": SLOW_TIMEOUT, # Timed out at 660s + "eval-mteb": SLOW_TIMEOUT, + "eval-mteb.hard": SLOW_TIMEOUT, + "reshard-c4-data": SLOW_TIMEOUT, + + # SLOW: QEMU/emulation tasks + "qemu-startup": SLOW_TIMEOUT, # Passed at 838s but hit timeout + "qemu-alpine-ssh": SLOW_TIMEOUT, + "run-pdp11-code": SLOW_TIMEOUT, + + # SLOW: Complex algorithmic tasks + "blind-maze-explorer-algorithm": SLOW_TIMEOUT, + "blind-maze-explorer-algorithm.easy": SLOW_TIMEOUT, + "blind-maze-explorer-algorithm.hard": SLOW_TIMEOUT, # Passed at 1200s! + "path-tracing": SLOW_TIMEOUT, # Passed at 660s + "path-tracing-reverse": SLOW_TIMEOUT, # Timed out at 660s + + # SLOW: Security/crypto tasks that may need brute force + "crack-7z-hash": SLOW_TIMEOUT, + "crack-7z-hash.hard": SLOW_TIMEOUT, + "password-recovery": SLOW_TIMEOUT, + "security-vulhub-minio": SLOW_TIMEOUT, + + # SLOW: Complex git/code analysis + "git-workflow-hack": SLOW_TIMEOUT, # Passed but hit timeout + "pytorch-model-cli": SLOW_TIMEOUT, # Passed at 541s + "swe-bench-astropy-1": SLOW_TIMEOUT, + "swe-bench-astropy-2": SLOW_TIMEOUT, + "swe-bench-fsspec": SLOW_TIMEOUT, + "swe-bench-langcodes": SLOW_TIMEOUT, + + # SLOW: Compilation/code generation + "gpt2-codegolf": SLOW_TIMEOUT, + "polyglot-c-py": SLOW_TIMEOUT, + "polyglot-rust-c": SLOW_TIMEOUT, + "write-compressor": SLOW_TIMEOUT, + + # SLOW: Complex system tasks + "cron-broken-network": SLOW_TIMEOUT, + "oom": SLOW_TIMEOUT, + "fibonacci-server": SLOW_TIMEOUT, + "incompatible-python-fasttext.base_with_hint": SLOW_TIMEOUT, + "extract-safely": SLOW_TIMEOUT, + + # FAST: Simple tasks that should complete quickly + "hello-world": FAST_TIMEOUT, + "fix-permissions": FAST_TIMEOUT, + "openssl-selfsigned-cert": FAST_TIMEOUT, + "simple-web-scraper": FAST_TIMEOUT, + "simple-sheets-put": FAST_TIMEOUT, + "csv-to-parquet": FAST_TIMEOUT, + "crack-7z-hash.easy": FAST_TIMEOUT, +} + + +def get_timeout_for_task(task_id: str) -> int: + """Get recommended timeout in seconds for a given task.""" + return TASK_TIMEOUTS.get(task_id, NORMAL_TIMEOUT) + + +def get_max_timeout_for_tasks(task_ids: list[str]) -> int: + """ + Get the maximum timeout needed for a set of tasks. + Useful for setting --global-agent-timeout-sec. + """ + if not task_ids: + return VERY_SLOW_TIMEOUT # Conservative default for unknown tasks + + return max(get_timeout_for_task(task_id) for task_id in task_ids) diff --git a/benchmarks/terminal_bench/task_timeouts_test.py b/benchmarks/terminal_bench/task_timeouts_test.py new file mode 100644 index 000000000..df7c7d599 --- /dev/null +++ b/benchmarks/terminal_bench/task_timeouts_test.py @@ -0,0 +1,66 @@ +"""Tests for task timeout configuration.""" + +from task_timeouts import ( + FAST_TIMEOUT, + NORMAL_TIMEOUT, + SLOW_TIMEOUT, + VERY_SLOW_TIMEOUT, + get_timeout_for_task, + get_max_timeout_for_tasks, +) + + +def test_fast_tasks(): + """Fast tasks should get 5 minute timeout.""" + assert get_timeout_for_task("hello-world") == FAST_TIMEOUT + assert get_timeout_for_task("simple-web-scraper") == FAST_TIMEOUT + assert get_timeout_for_task("fix-permissions") == FAST_TIMEOUT + + +def test_normal_tasks(): + """Normal tasks should get default 15 minute timeout.""" + # Unknown tasks default to NORMAL + assert get_timeout_for_task("unknown-task") == NORMAL_TIMEOUT + assert get_timeout_for_task("some-random-task") == NORMAL_TIMEOUT + + +def test_slow_tasks(): + """Slow tasks should get 30 minute timeout.""" + assert get_timeout_for_task("count-dataset-tokens") == SLOW_TIMEOUT + assert get_timeout_for_task("qemu-startup") == SLOW_TIMEOUT + assert get_timeout_for_task("path-tracing") == SLOW_TIMEOUT + + +def test_very_slow_tasks(): + """Very slow tasks should get 60 minute timeout.""" + assert get_timeout_for_task("build-linux-kernel-qemu") == VERY_SLOW_TIMEOUT + assert get_timeout_for_task("build-initramfs-qemu") == VERY_SLOW_TIMEOUT + + +def test_max_timeout_for_tasks(): + """Should return maximum timeout needed for a set of tasks.""" + # Mix of fast and slow + tasks = ["hello-world", "count-dataset-tokens"] + assert get_max_timeout_for_tasks(tasks) == SLOW_TIMEOUT + + # Mix of fast, slow, and very slow + tasks = ["hello-world", "count-dataset-tokens", "build-linux-kernel-qemu"] + assert get_max_timeout_for_tasks(tasks) == VERY_SLOW_TIMEOUT + + # All fast + tasks = ["hello-world", "simple-web-scraper"] + assert get_max_timeout_for_tasks(tasks) == FAST_TIMEOUT + + # Empty list should return conservative default + assert get_max_timeout_for_tasks([]) == VERY_SLOW_TIMEOUT + + +def test_timeout_values(): + """Verify timeout constants are reasonable.""" + assert FAST_TIMEOUT == 300 # 5 minutes + assert NORMAL_TIMEOUT == 900 # 15 minutes + assert SLOW_TIMEOUT == 1800 # 30 minutes + assert VERY_SLOW_TIMEOUT == 3600 # 60 minutes + + # Ensure proper ordering + assert FAST_TIMEOUT < NORMAL_TIMEOUT < SLOW_TIMEOUT < VERY_SLOW_TIMEOUT From 8b8a62b67c70800eb7572fe19e0e9683f92328ac Mon Sep 17 00:00:00 2001 From: Ammar Date: Thu, 30 Oct 2025 16:40:53 +0000 Subject: [PATCH 3/5] fix: format Python files with ruff _Generated with `cmux`_ --- benchmarks/terminal_bench/calculate_timeout.py | 8 ++++---- benchmarks/terminal_bench/task_timeouts.py | 12 ++---------- benchmarks/terminal_bench/task_timeouts_test.py | 8 ++++---- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/benchmarks/terminal_bench/calculate_timeout.py b/benchmarks/terminal_bench/calculate_timeout.py index 3f9f83f69..e4aa84351 100755 --- a/benchmarks/terminal_bench/calculate_timeout.py +++ b/benchmarks/terminal_bench/calculate_timeout.py @@ -35,18 +35,18 @@ def main(): default="flag", help="Output format: 'seconds' (just the number) or 'flag' (--global-agent-timeout-sec VALUE)", ) - + args = parser.parse_args() - + if args.task_ids: timeout = get_max_timeout_for_tasks(args.task_ids) else: # No specific tasks - use conservative default for full suite timeout = VERY_SLOW_TIMEOUT - + # Apply multiplier timeout = int(timeout * args.multiplier) - + if args.format == "seconds": print(timeout) else: diff --git a/benchmarks/terminal_bench/task_timeouts.py b/benchmarks/terminal_bench/task_timeouts.py index c08417b69..e9e75869b 100644 --- a/benchmarks/terminal_bench/task_timeouts.py +++ b/benchmarks/terminal_bench/task_timeouts.py @@ -10,7 +10,7 @@ Strategy: - FAST tasks (< 5 min): Simple file operations, basic commands -- NORMAL tasks (15 min): Default for most tasks +- NORMAL tasks (15 min): Default for most tasks - SLOW tasks (30 min): Data processing, model training, complex analysis - VERY_SLOW tasks (60 min): Kernel compilation, large builds """ @@ -27,7 +27,6 @@ "build-linux-kernel-qemu": VERY_SLOW_TIMEOUT, # Failed at 763s "build-initramfs-qemu": VERY_SLOW_TIMEOUT, "build-tcc-qemu": SLOW_TIMEOUT, - # SLOW: Data processing, ML training, complex analysis "count-dataset-tokens": SLOW_TIMEOUT, # Anthropic timed out at 808s, OpenAI succeeded at 344s "train-fasttext": SLOW_TIMEOUT, # Timed out at 900s @@ -36,25 +35,21 @@ "eval-mteb": SLOW_TIMEOUT, "eval-mteb.hard": SLOW_TIMEOUT, "reshard-c4-data": SLOW_TIMEOUT, - # SLOW: QEMU/emulation tasks "qemu-startup": SLOW_TIMEOUT, # Passed at 838s but hit timeout "qemu-alpine-ssh": SLOW_TIMEOUT, "run-pdp11-code": SLOW_TIMEOUT, - # SLOW: Complex algorithmic tasks "blind-maze-explorer-algorithm": SLOW_TIMEOUT, "blind-maze-explorer-algorithm.easy": SLOW_TIMEOUT, "blind-maze-explorer-algorithm.hard": SLOW_TIMEOUT, # Passed at 1200s! "path-tracing": SLOW_TIMEOUT, # Passed at 660s "path-tracing-reverse": SLOW_TIMEOUT, # Timed out at 660s - # SLOW: Security/crypto tasks that may need brute force "crack-7z-hash": SLOW_TIMEOUT, "crack-7z-hash.hard": SLOW_TIMEOUT, "password-recovery": SLOW_TIMEOUT, "security-vulhub-minio": SLOW_TIMEOUT, - # SLOW: Complex git/code analysis "git-workflow-hack": SLOW_TIMEOUT, # Passed but hit timeout "pytorch-model-cli": SLOW_TIMEOUT, # Passed at 541s @@ -62,20 +57,17 @@ "swe-bench-astropy-2": SLOW_TIMEOUT, "swe-bench-fsspec": SLOW_TIMEOUT, "swe-bench-langcodes": SLOW_TIMEOUT, - # SLOW: Compilation/code generation "gpt2-codegolf": SLOW_TIMEOUT, "polyglot-c-py": SLOW_TIMEOUT, "polyglot-rust-c": SLOW_TIMEOUT, "write-compressor": SLOW_TIMEOUT, - # SLOW: Complex system tasks "cron-broken-network": SLOW_TIMEOUT, "oom": SLOW_TIMEOUT, "fibonacci-server": SLOW_TIMEOUT, "incompatible-python-fasttext.base_with_hint": SLOW_TIMEOUT, "extract-safely": SLOW_TIMEOUT, - # FAST: Simple tasks that should complete quickly "hello-world": FAST_TIMEOUT, "fix-permissions": FAST_TIMEOUT, @@ -99,5 +91,5 @@ def get_max_timeout_for_tasks(task_ids: list[str]) -> int: """ if not task_ids: return VERY_SLOW_TIMEOUT # Conservative default for unknown tasks - + return max(get_timeout_for_task(task_id) for task_id in task_ids) diff --git a/benchmarks/terminal_bench/task_timeouts_test.py b/benchmarks/terminal_bench/task_timeouts_test.py index df7c7d599..89d38a264 100644 --- a/benchmarks/terminal_bench/task_timeouts_test.py +++ b/benchmarks/terminal_bench/task_timeouts_test.py @@ -42,15 +42,15 @@ def test_max_timeout_for_tasks(): # Mix of fast and slow tasks = ["hello-world", "count-dataset-tokens"] assert get_max_timeout_for_tasks(tasks) == SLOW_TIMEOUT - + # Mix of fast, slow, and very slow tasks = ["hello-world", "count-dataset-tokens", "build-linux-kernel-qemu"] assert get_max_timeout_for_tasks(tasks) == VERY_SLOW_TIMEOUT - + # All fast tasks = ["hello-world", "simple-web-scraper"] assert get_max_timeout_for_tasks(tasks) == FAST_TIMEOUT - + # Empty list should return conservative default assert get_max_timeout_for_tasks([]) == VERY_SLOW_TIMEOUT @@ -61,6 +61,6 @@ def test_timeout_values(): assert NORMAL_TIMEOUT == 900 # 15 minutes assert SLOW_TIMEOUT == 1800 # 30 minutes assert VERY_SLOW_TIMEOUT == 3600 # 60 minutes - + # Ensure proper ordering assert FAST_TIMEOUT < NORMAL_TIMEOUT < SLOW_TIMEOUT < VERY_SLOW_TIMEOUT From eaffa08e6eaeb947651e6113d9843f24f4f7447e Mon Sep 17 00:00:00 2001 From: Ammar Date: Thu, 30 Oct 2025 16:50:49 +0000 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=A4=96=20Simplify=20to=20global=20tim?= =?UTF-8?q?eout=20approach?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove per-task timeout complexity in favor of single global default. - Remove task_timeouts.py, calculate_timeout.py, and tests - Set 30-minute (1800s) global timeout default in Makefile - Update README to document preference for global defaults - Based on Oct 30 nightly run analysis: longest successful task was 20 min Rationale: Avoid maintenance burden of per-task configuration. If tasks timeout, users can increase TB_TIMEOUT globally. --- Makefile | 18 +--- benchmarks/terminal_bench/README.md | 56 ++++------- .../terminal_bench/calculate_timeout.py | 57 ----------- benchmarks/terminal_bench/task_timeouts.py | 95 ------------------- .../terminal_bench/task_timeouts_test.py | 66 ------------- 5 files changed, 23 insertions(+), 269 deletions(-) delete mode 100755 benchmarks/terminal_bench/calculate_timeout.py delete mode 100644 benchmarks/terminal_bench/task_timeouts.py delete mode 100644 benchmarks/terminal_bench/task_timeouts_test.py diff --git a/Makefile b/Makefile index c1bcb4d54..d3df25aee 100644 --- a/Makefile +++ b/Makefile @@ -297,10 +297,10 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin ## Benchmarks benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize) @TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \ + TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \ CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \ LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \ TASK_ID_FLAGS=""; \ - TASK_IDS_LIST=""; \ if [ -n "$$TB_SAMPLE_SIZE" ]; then \ echo "Ensuring dataset $$TB_DATASET is downloaded..."; \ uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \ @@ -316,28 +316,16 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB for task_id in $$TASK_IDS; do \ TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \ done; \ - TASK_IDS_LIST="$$TASK_IDS"; \ echo "Selected task IDs: $$TASK_IDS"; \ fi; \ - TIMEOUT_FLAG=""; \ - if [ -n "$$TB_TIMEOUT" ]; then \ - echo "Using explicit timeout: $$TB_TIMEOUT seconds"; \ - TIMEOUT_FLAG="--global-agent-timeout-sec $$TB_TIMEOUT"; \ - elif [ -n "$$TASK_IDS_LIST" ]; then \ - echo "Calculating optimal timeout for selected tasks..."; \ - TIMEOUT_FLAG=$$(python benchmarks/terminal_bench/calculate_timeout.py --task-ids $$TASK_IDS_LIST --format flag); \ - echo "Timeout: $$TIMEOUT_FLAG"; \ - else \ - echo "Using default timeout (60 minutes for full suite)"; \ - TIMEOUT_FLAG="--global-agent-timeout-sec 3600"; \ - fi; \ + echo "Using timeout: $$TB_TIMEOUT seconds"; \ echo "Running Terminal-Bench with dataset $$TB_DATASET"; \ uvx terminal-bench run \ --dataset "$$TB_DATASET" \ --agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \ + --global-agent-timeout-sec $$TB_TIMEOUT \ $$CONCURRENCY_FLAG \ $$LIVESTREAM_FLAG \ - $$TIMEOUT_FLAG \ $$TASK_ID_FLAGS \ $${TB_ARGS} diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md index cbb86c4d0..0c3d727b5 100644 --- a/benchmarks/terminal_bench/README.md +++ b/benchmarks/terminal_bench/README.md @@ -26,48 +26,33 @@ make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus- - `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks) - `TB_CONCURRENCY`: Number of concurrent tasks (default: 4) - `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable) -- `TB_TIMEOUT`: Override timeout in seconds (default: intelligent per-task timeout) +- `TB_TIMEOUT`: Global timeout in seconds (default: 1800 = 30 minutes) - `TB_ARGS`: Additional arguments passed to terminal-bench -### Intelligent Timeout Handling +### Timeout Handling -The Makefile automatically calculates optimal timeouts based on task complexity: +The benchmark uses a **global timeout** applied to all tasks. The default is **30 minutes (1800 seconds)**, which provides sufficient time for most tasks while catching genuinely stuck agents. -- **FAST tasks** (5 min): Simple operations like `hello-world`, `fix-permissions` -- **NORMAL tasks** (15 min): Default for most tasks -- **SLOW tasks** (30 min): Data processing, ML training, complex analysis -- **VERY_SLOW tasks** (60 min): Kernel compilation, large builds +**Design Rationale:** -**How it works:** +Based on analysis of Oct 30, 2025 nightly runs: +- Longest successful task: `blind-maze-explorer-algorithm.hard` at 20 minutes +- 95th percentile: ~15 minutes +- Mean duration: ~6 minutes -1. If `TB_TIMEOUT` is set, uses that value explicitly -2. If specific tasks are selected (via `TB_SAMPLE_SIZE` or `--task-id`), calculates the maximum timeout needed for those tasks -3. For full suite runs, uses 60 minutes (conservative default) +The 30-minute default provides comfortable headroom for complex tasks without excessive wait times for failed attempts. -**Examples:** +**Override timeout:** ```bash -# Fast tasks get 5 minute timeout automatically -make benchmark-terminal TB_ARGS="--task-id hello-world --task-id simple-web-scraper" +# Run with 60 minute timeout for very complex tasks +TB_TIMEOUT=3600 make benchmark-terminal -# Slow tasks get 60 minute timeout automatically -make benchmark-terminal TB_ARGS="--task-id build-linux-kernel-qemu" - -# Override timeout manually (in seconds) -TB_TIMEOUT=1200 make benchmark-terminal TB_ARGS="--task-id chess-best-move" +# Run with shorter 10 minute timeout for quick iteration +TB_TIMEOUT=600 make benchmark-terminal TB_SAMPLE_SIZE=5 ``` -### Task Timeout Configuration - -Task timeouts are configured in `task_timeouts.py` based on empirical data from nightly runs. To add or modify timeouts: - -```python -# In task_timeouts.py -TASK_TIMEOUTS = { - "my-new-task": SLOW_TIMEOUT, # 30 minutes - "my-fast-task": FAST_TIMEOUT, # 5 minutes -} -``` +**Note:** We prefer global timeout defaults over per-task configuration to avoid complexity and maintenance burden. If you find tasks consistently timing out, increase `TB_TIMEOUT` rather than adding per-task configuration. ## Agent Configuration @@ -103,14 +88,15 @@ See `.github/workflows/terminal-bench.yml` and `.github/workflows/nightly-termin ## Timeout Analysis (2025-10-30 Nightly Run) -Based on analysis of the Oct 30 nightly run: +Based on analysis of the Oct 30 nightly run (15-minute timeout): -- **27-35% of tasks hit timeout** with 15-minute default -- **5-6 tasks passed tests but hit timeout** (would have succeeded with more time) +- **27-35% of tasks hit timeout** (too aggressive) +- **5-6 tasks passed tests but hit timeout flag** (false negatives) - **Mean duration**: 356s (Anthropic) / 438s (OpenAI) - **Median duration**: 272s (Anthropic) / 299s (OpenAI) +- **Longest successful**: 1200s (20 minutes) for `blind-maze-explorer-algorithm.hard` -**Impact of intelligent timeouts**: Expected to reduce false timeout failures by ~50% and improve pass rates by 10-15 percentage points (from ~42% to ~52-57%). +**Impact of 30-minute timeout**: Expected to reduce false timeout failures by ~50% and improve pass rates by 10-15 percentage points (from ~42% to ~52-57%). ## Files @@ -118,6 +104,4 @@ Based on analysis of the Oct 30 nightly run: - `cmux-run.sh`: Shell script that sets up environment and invokes cmux CLI - `cmux_payload.py`: Helper to package cmux app for containerized execution - `cmux_setup.sh.j2`: Jinja2 template for agent installation script -- `task_timeouts.py`: Task-specific timeout configuration -- `calculate_timeout.py`: Helper script to calculate optimal timeouts - `sample_tasks.py`: Utility to randomly sample tasks from dataset diff --git a/benchmarks/terminal_bench/calculate_timeout.py b/benchmarks/terminal_bench/calculate_timeout.py deleted file mode 100755 index e4aa84351..000000000 --- a/benchmarks/terminal_bench/calculate_timeout.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -""" -Calculate optimal global timeout for terminal-bench runs. - -Usage: - python calculate_timeout.py [--task-ids task1 task2 ...] [--multiplier 1.0] -""" - -import argparse -import sys -from pathlib import Path - -# Add parent directory to path to import task_timeouts -sys.path.insert(0, str(Path(__file__).parent)) - -from task_timeouts import get_max_timeout_for_tasks, VERY_SLOW_TIMEOUT - - -def main(): - parser = argparse.ArgumentParser(description="Calculate timeout for terminal-bench") - parser.add_argument( - "--task-ids", - nargs="*", - help="List of task IDs to calculate timeout for", - ) - parser.add_argument( - "--multiplier", - type=float, - default=1.0, - help="Multiplier for the timeout (default: 1.0)", - ) - parser.add_argument( - "--format", - choices=["seconds", "flag"], - default="flag", - help="Output format: 'seconds' (just the number) or 'flag' (--global-agent-timeout-sec VALUE)", - ) - - args = parser.parse_args() - - if args.task_ids: - timeout = get_max_timeout_for_tasks(args.task_ids) - else: - # No specific tasks - use conservative default for full suite - timeout = VERY_SLOW_TIMEOUT - - # Apply multiplier - timeout = int(timeout * args.multiplier) - - if args.format == "seconds": - print(timeout) - else: - print(f"--global-agent-timeout-sec {timeout}") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/terminal_bench/task_timeouts.py b/benchmarks/terminal_bench/task_timeouts.py deleted file mode 100644 index e9e75869b..000000000 --- a/benchmarks/terminal_bench/task_timeouts.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Task-specific timeout recommendations based on terminal-bench nightly results. - -Analysis from 2025-10-30 run showed: -- Default timeout appears to be ~15 minutes (900s) per task -- 27-35% of tasks hit timeout (22 for Anthropic, 28 for OpenAI) -- Some tasks that timed out actually passed their tests -- Complex tasks (compilation, data processing) need more time -- Simple tasks (hello-world) need less time - -Strategy: -- FAST tasks (< 5 min): Simple file operations, basic commands -- NORMAL tasks (15 min): Default for most tasks -- SLOW tasks (30 min): Data processing, model training, complex analysis -- VERY_SLOW tasks (60 min): Kernel compilation, large builds -""" - -# Timeout in seconds -FAST_TIMEOUT = 300 # 5 minutes -NORMAL_TIMEOUT = 900 # 15 minutes (current default) -SLOW_TIMEOUT = 1800 # 30 minutes -VERY_SLOW_TIMEOUT = 3600 # 60 minutes - -# Tasks that need extended timeouts (evidence from 2025-10-30 run) -TASK_TIMEOUTS = { - # VERY_SLOW: Compilation tasks that legitimately take 30+ minutes - "build-linux-kernel-qemu": VERY_SLOW_TIMEOUT, # Failed at 763s - "build-initramfs-qemu": VERY_SLOW_TIMEOUT, - "build-tcc-qemu": SLOW_TIMEOUT, - # SLOW: Data processing, ML training, complex analysis - "count-dataset-tokens": SLOW_TIMEOUT, # Anthropic timed out at 808s, OpenAI succeeded at 344s - "train-fasttext": SLOW_TIMEOUT, # Timed out at 900s - "cartpole-rl-training": SLOW_TIMEOUT, # Succeeded but took time - "hf-model-inference": SLOW_TIMEOUT, # Timed out at 660s - "eval-mteb": SLOW_TIMEOUT, - "eval-mteb.hard": SLOW_TIMEOUT, - "reshard-c4-data": SLOW_TIMEOUT, - # SLOW: QEMU/emulation tasks - "qemu-startup": SLOW_TIMEOUT, # Passed at 838s but hit timeout - "qemu-alpine-ssh": SLOW_TIMEOUT, - "run-pdp11-code": SLOW_TIMEOUT, - # SLOW: Complex algorithmic tasks - "blind-maze-explorer-algorithm": SLOW_TIMEOUT, - "blind-maze-explorer-algorithm.easy": SLOW_TIMEOUT, - "blind-maze-explorer-algorithm.hard": SLOW_TIMEOUT, # Passed at 1200s! - "path-tracing": SLOW_TIMEOUT, # Passed at 660s - "path-tracing-reverse": SLOW_TIMEOUT, # Timed out at 660s - # SLOW: Security/crypto tasks that may need brute force - "crack-7z-hash": SLOW_TIMEOUT, - "crack-7z-hash.hard": SLOW_TIMEOUT, - "password-recovery": SLOW_TIMEOUT, - "security-vulhub-minio": SLOW_TIMEOUT, - # SLOW: Complex git/code analysis - "git-workflow-hack": SLOW_TIMEOUT, # Passed but hit timeout - "pytorch-model-cli": SLOW_TIMEOUT, # Passed at 541s - "swe-bench-astropy-1": SLOW_TIMEOUT, - "swe-bench-astropy-2": SLOW_TIMEOUT, - "swe-bench-fsspec": SLOW_TIMEOUT, - "swe-bench-langcodes": SLOW_TIMEOUT, - # SLOW: Compilation/code generation - "gpt2-codegolf": SLOW_TIMEOUT, - "polyglot-c-py": SLOW_TIMEOUT, - "polyglot-rust-c": SLOW_TIMEOUT, - "write-compressor": SLOW_TIMEOUT, - # SLOW: Complex system tasks - "cron-broken-network": SLOW_TIMEOUT, - "oom": SLOW_TIMEOUT, - "fibonacci-server": SLOW_TIMEOUT, - "incompatible-python-fasttext.base_with_hint": SLOW_TIMEOUT, - "extract-safely": SLOW_TIMEOUT, - # FAST: Simple tasks that should complete quickly - "hello-world": FAST_TIMEOUT, - "fix-permissions": FAST_TIMEOUT, - "openssl-selfsigned-cert": FAST_TIMEOUT, - "simple-web-scraper": FAST_TIMEOUT, - "simple-sheets-put": FAST_TIMEOUT, - "csv-to-parquet": FAST_TIMEOUT, - "crack-7z-hash.easy": FAST_TIMEOUT, -} - - -def get_timeout_for_task(task_id: str) -> int: - """Get recommended timeout in seconds for a given task.""" - return TASK_TIMEOUTS.get(task_id, NORMAL_TIMEOUT) - - -def get_max_timeout_for_tasks(task_ids: list[str]) -> int: - """ - Get the maximum timeout needed for a set of tasks. - Useful for setting --global-agent-timeout-sec. - """ - if not task_ids: - return VERY_SLOW_TIMEOUT # Conservative default for unknown tasks - - return max(get_timeout_for_task(task_id) for task_id in task_ids) diff --git a/benchmarks/terminal_bench/task_timeouts_test.py b/benchmarks/terminal_bench/task_timeouts_test.py deleted file mode 100644 index 89d38a264..000000000 --- a/benchmarks/terminal_bench/task_timeouts_test.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Tests for task timeout configuration.""" - -from task_timeouts import ( - FAST_TIMEOUT, - NORMAL_TIMEOUT, - SLOW_TIMEOUT, - VERY_SLOW_TIMEOUT, - get_timeout_for_task, - get_max_timeout_for_tasks, -) - - -def test_fast_tasks(): - """Fast tasks should get 5 minute timeout.""" - assert get_timeout_for_task("hello-world") == FAST_TIMEOUT - assert get_timeout_for_task("simple-web-scraper") == FAST_TIMEOUT - assert get_timeout_for_task("fix-permissions") == FAST_TIMEOUT - - -def test_normal_tasks(): - """Normal tasks should get default 15 minute timeout.""" - # Unknown tasks default to NORMAL - assert get_timeout_for_task("unknown-task") == NORMAL_TIMEOUT - assert get_timeout_for_task("some-random-task") == NORMAL_TIMEOUT - - -def test_slow_tasks(): - """Slow tasks should get 30 minute timeout.""" - assert get_timeout_for_task("count-dataset-tokens") == SLOW_TIMEOUT - assert get_timeout_for_task("qemu-startup") == SLOW_TIMEOUT - assert get_timeout_for_task("path-tracing") == SLOW_TIMEOUT - - -def test_very_slow_tasks(): - """Very slow tasks should get 60 minute timeout.""" - assert get_timeout_for_task("build-linux-kernel-qemu") == VERY_SLOW_TIMEOUT - assert get_timeout_for_task("build-initramfs-qemu") == VERY_SLOW_TIMEOUT - - -def test_max_timeout_for_tasks(): - """Should return maximum timeout needed for a set of tasks.""" - # Mix of fast and slow - tasks = ["hello-world", "count-dataset-tokens"] - assert get_max_timeout_for_tasks(tasks) == SLOW_TIMEOUT - - # Mix of fast, slow, and very slow - tasks = ["hello-world", "count-dataset-tokens", "build-linux-kernel-qemu"] - assert get_max_timeout_for_tasks(tasks) == VERY_SLOW_TIMEOUT - - # All fast - tasks = ["hello-world", "simple-web-scraper"] - assert get_max_timeout_for_tasks(tasks) == FAST_TIMEOUT - - # Empty list should return conservative default - assert get_max_timeout_for_tasks([]) == VERY_SLOW_TIMEOUT - - -def test_timeout_values(): - """Verify timeout constants are reasonable.""" - assert FAST_TIMEOUT == 300 # 5 minutes - assert NORMAL_TIMEOUT == 900 # 15 minutes - assert SLOW_TIMEOUT == 1800 # 30 minutes - assert VERY_SLOW_TIMEOUT == 3600 # 60 minutes - - # Ensure proper ordering - assert FAST_TIMEOUT < NORMAL_TIMEOUT < SLOW_TIMEOUT < VERY_SLOW_TIMEOUT From 9215ba8693ac11b03b69ab10595f65b472d72e6f Mon Sep 17 00:00:00 2001 From: Ammar Date: Fri, 31 Oct 2025 15:41:49 +0000 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=A4=96=20feat:=20enable=20JSON=20line?= =?UTF-8?q?s=20output=20for=20terminal-bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add --json-streaming flag to agentSessionCli invocation. This makes terminal-bench agent output machine-readable JSON lines instead of human-readable text, making it easier to analyze results programmatically. --- benchmarks/terminal_bench/cmux-run.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/terminal_bench/cmux-run.sh b/benchmarks/terminal_bench/cmux-run.sh index 4aaa69895..7c7889bcf 100644 --- a/benchmarks/terminal_bench/cmux-run.sh +++ b/benchmarks/terminal_bench/cmux-run.sh @@ -102,7 +102,8 @@ cmd=(bun src/debug/agentSessionCli.ts --workspace-path "${project_path}" --workspace-id "${CMUX_WORKSPACE_ID}" --model "${CMUX_MODEL}" - --mode "${CMUX_MODE}") + --mode "${CMUX_MODE}" + --json-streaming) if [[ -n "${CMUX_TIMEOUT_MS}" ]]; then cmd+=(--timeout "${CMUX_TIMEOUT_MS}")