From 5a5d5b8842b9f607480223d13f86d2176ec43b57 Mon Sep 17 00:00:00 2001
From: DarkSkyXD <leveluptogetherbiz@gmail.com>
Date: Sun, 22 Mar 2026 23:13:17 -0500
Subject: [PATCH] fix: prevent workers from getting stuck in timeout-retry
 loops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Workers spawning large LLM requests (e.g. writing a full PRD) were getting
stuck for 30+ minutes because:

1. The global reqwest client timeout of 120s was too short for large Anthropic
   completions with extended thinking — requests timed out before the API
   could finish generating.

2. The retry cascade (3 model-level attempts × 5 worker-level retries = 15
   requests, each running into the 120s timeout) meant each timeout failure
   took ~30 minutes before the worker finally gave up, only to be respawned
   with the same result.

Changes:
- Add a 10-minute per-request timeout on Anthropic API calls, overriding the
  120s global client timeout. This is the same order of magnitude as the
  timeout on the streaming path (30min).
- Reduce MAX_TRANSIENT_RETRIES from 5 to 3 (3 worker-level retries × 3
  model-level attempts = still 9 total attempts) to fail faster on sustained
  API issues.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/agent/worker.rs         | 5 +++--
 src/llm/anthropic/params.rs | 3 +++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/agent/worker.rs b/src/agent/worker.rs
index 58c21d8e3..815838789 100644
--- a/src/agent/worker.rs
+++ b/src/agent/worker.rs
@@ -35,8 +35,9 @@ const MAX_OVERFLOW_RETRIES: usize = 2;
 /// Max consecutive transient provider error retries before giving up.
 /// Transient errors (upstream 500s, timeouts, rate limits that survived
 /// model-level retries) get a backoff-and-retry at the worker level so
-/// the worker survives temporary provider outages.
-const MAX_TRANSIENT_RETRIES: usize = 5;
+/// the worker survives temporary provider outages. Each retry triggers
+/// up to 3 model-level attempts, so 3 worker retries = 9 total attempts.
+const MAX_TRANSIENT_RETRIES: usize = 3;
 
 /// Base delay for worker-level transient error backoff (doubles each retry).
 const TRANSIENT_RETRY_BASE_DELAY: std::time::Duration = std::time::Duration::from_secs(5);
diff --git a/src/llm/anthropic/params.rs b/src/llm/anthropic/params.rs
index 11dcda8ef..6ecaf8f4b 100644
--- a/src/llm/anthropic/params.rs
+++ b/src/llm/anthropic/params.rs
@@ -97,8 +97,11 @@ pub fn build_anthropic_request(
         body["output_config"] = serde_json::json!({ "effort": effort });
     }
 
+    // Override the global 120s client timeout — large completions with
+    // extended thinking can easily take 5–10 minutes to generate.
     let builder = http_client
         .post(&url)
+        .timeout(std::time::Duration::from_secs(10 * 60))
         .header("anthropic-version", "2023-06-01")
         .header("content-type", "application/json");