From 5a5d5b8842b9f607480223d13f86d2176ec43b57 Mon Sep 17 00:00:00 2001 From: DarkSkyXD Date: Sun, 22 Mar 2026 23:13:17 -0500 Subject: [PATCH] fix: prevent workers from getting stuck in timeout-retry loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workers spawning large LLM requests (e.g. writing a full PRD) were getting stuck for 30+ minutes because: 1. The global reqwest client timeout of 120s was too short for large Anthropic completions with extended thinking — requests timed out before the API could finish generating. 2. The retry cascade (3 model-level retries × 5 worker-level retries) meant each timeout failure took ~30 minutes before the worker finally gave up, only to be respawned with the same result. Changes: - Add 10-minute per-request timeout on Anthropic API calls, overriding the 120s global client timeout. This is the same order of magnitude as the streaming path's 30-minute timeout. - Reduce MAX_TRANSIENT_RETRIES from 5 to 3 (still 9 total attempts with model-level retries) to fail faster on sustained API issues. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agent/worker.rs | 5 +++-- src/llm/anthropic/params.rs | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/agent/worker.rs b/src/agent/worker.rs index 58c21d8e3..815838789 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -35,8 +35,9 @@ const MAX_OVERFLOW_RETRIES: usize = 2; /// Max consecutive transient provider error retries before giving up. /// Transient errors (upstream 500s, timeouts, rate limits that survived /// model-level retries) get a backoff-and-retry at the worker level so -/// the worker survives temporary provider outages. -const MAX_TRANSIENT_RETRIES: usize = 5; +/// the worker survives temporary provider outages. Each retry triggers +/// up to 3 model-level attempts, so 3 worker retries = 9 total attempts. 
+const MAX_TRANSIENT_RETRIES: usize = 3; /// Base delay for worker-level transient error backoff (doubles each retry). const TRANSIENT_RETRY_BASE_DELAY: std::time::Duration = std::time::Duration::from_secs(5); diff --git a/src/llm/anthropic/params.rs b/src/llm/anthropic/params.rs index 11dcda8ef..6ecaf8f4b 100644 --- a/src/llm/anthropic/params.rs +++ b/src/llm/anthropic/params.rs @@ -97,8 +97,11 @@ pub fn build_anthropic_request( body["output_config"] = serde_json::json!({ "effort": effort }); } + // Override the global 120s client timeout — large completions with + // extended thinking can easily take 5–10 minutes to generate. let builder = http_client .post(&url) + .timeout(std::time::Duration::from_secs(10 * 60)) .header("anthropic-version", "2023-06-01") .header("content-type", "application/json");