|
| 1 | +""" |
| 2 | +Task-specific timeout recommendations based on terminal-bench nightly results. |
| 3 | +
|
| 4 | +Analysis from 2025-10-30 run showed: |
| 5 | +- Default timeout appears to be ~15 minutes (900s) per task |
| 6 | +- 27-35% of tasks hit timeout (22 for Anthropic, 28 for OpenAI) |
| 7 | +- Some tasks that timed out actually passed their tests |
| 8 | +- Complex tasks (compilation, data processing) need more time |
| 9 | +- Simple tasks (hello-world) need less time |
| 10 | +
|
| 11 | +Strategy: |
| 12 | +- FAST tasks (< 5 min): Simple file operations, basic commands |
| 13 | +- NORMAL tasks (15 min): Default for most tasks |
| 14 | +- SLOW tasks (30 min): Data processing, model training, complex analysis |
| 15 | +- VERY_SLOW tasks (60 min): Kernel compilation, large builds |
| 16 | +""" |
| 17 | + |
# Timeout tiers, in seconds (written as minutes * 60 for readability).
FAST_TIMEOUT = 5 * 60  # 5 minutes
NORMAL_TIMEOUT = 15 * 60  # 15 minutes (current default)
SLOW_TIMEOUT = 30 * 60  # 30 minutes
VERY_SLOW_TIMEOUT = 60 * 60  # 60 minutes
| 23 | + |
# Per-task timeout overrides, grouped by tier (evidence from 2025-10-30 run).
# Each tier is expanded with dict.fromkeys so the tier constant is stated once;
# the resulting key order is: VERY_SLOW tasks, then SLOW tasks, then FAST tasks.
TASK_TIMEOUTS = {
    # ---- VERY_SLOW tier ----
    **dict.fromkeys(
        (
            # Compilation tasks that legitimately take 30+ minutes
            "build-linux-kernel-qemu",  # Failed at 763s
            "build-initramfs-qemu",
        ),
        VERY_SLOW_TIMEOUT,
    ),
    # ---- SLOW tier ----
    **dict.fromkeys(
        (
            # Compilation task kept in the SLOW tier
            "build-tcc-qemu",
            # Data processing, ML training, complex analysis
            "count-dataset-tokens",  # Anthropic timed out at 808s, OpenAI succeeded at 344s
            "train-fasttext",  # Timed out at 900s
            "cartpole-rl-training",  # Succeeded but took time
            "hf-model-inference",  # Timed out at 660s
            "eval-mteb",
            "eval-mteb.hard",
            "reshard-c4-data",
            # QEMU/emulation tasks
            "qemu-startup",  # Passed at 838s but hit timeout
            "qemu-alpine-ssh",
            "run-pdp11-code",
            # Complex algorithmic tasks
            "blind-maze-explorer-algorithm",
            "blind-maze-explorer-algorithm.easy",
            "blind-maze-explorer-algorithm.hard",  # Passed at 1200s!
            "path-tracing",  # Passed at 660s
            "path-tracing-reverse",  # Timed out at 660s
            # Security/crypto tasks that may need brute force
            "crack-7z-hash",
            "crack-7z-hash.hard",
            "password-recovery",
            "security-vulhub-minio",
            # Complex git/code analysis
            "git-workflow-hack",  # Passed but hit timeout
            "pytorch-model-cli",  # Passed at 541s
            "swe-bench-astropy-1",
            "swe-bench-astropy-2",
            "swe-bench-fsspec",
            "swe-bench-langcodes",
            # Compilation/code generation
            "gpt2-codegolf",
            "polyglot-c-py",
            "polyglot-rust-c",
            "write-compressor",
            # Complex system tasks
            "cron-broken-network",
            "oom",
            "fibonacci-server",
            "incompatible-python-fasttext.base_with_hint",
            "extract-safely",
        ),
        SLOW_TIMEOUT,
    ),
    # ---- FAST tier ----
    **dict.fromkeys(
        (
            # Simple tasks that should complete quickly
            "hello-world",
            "fix-permissions",
            "openssl-selfsigned-cert",
            "simple-web-scraper",
            "simple-sheets-put",
            "csv-to-parquet",
            "crack-7z-hash.easy",
        ),
        FAST_TIMEOUT,
    ),
}
| 88 | + |
| 89 | + |
def get_timeout_for_task(task_id: str) -> int:
    """Return the recommended timeout, in seconds, for ``task_id``.

    Tasks without an entry in ``TASK_TIMEOUTS`` receive ``NORMAL_TIMEOUT``.
    """
    try:
        return TASK_TIMEOUTS[task_id]
    except KeyError:
        # No task-specific override recorded: use the default tier.
        return NORMAL_TIMEOUT
| 93 | + |
| 94 | + |
def get_max_timeout_for_tasks(task_ids: list[str]) -> int:
    """Return the largest per-task timeout, in seconds, across ``task_ids``.

    Useful for setting --global-agent-timeout-sec.

    Args:
        task_ids: Task identifiers to consider. Any iterable of strings is
            accepted, including lazily evaluated ones such as generators.

    Returns:
        The maximum of ``get_timeout_for_task`` over ``task_ids``. When no
        task ids are supplied (``None``, an empty collection, or an iterable
        that yields nothing), ``VERY_SLOW_TIMEOUT`` is returned as a
        conservative default for unknown tasks.
    """
    # ``task_ids or ()`` keeps the historical handling of None / empty input.
    # ``default=`` additionally covers truthy-but-empty iterables (for example
    # an exhausted generator), which would otherwise make max() raise
    # ValueError.
    return max(
        (get_timeout_for_task(task_id) for task_id in task_ids or ()),
        default=VERY_SLOW_TIMEOUT,
    )
0 commit comments