From 91cf35bc0088b4020e28780965ff18b3140c9ead Mon Sep 17 00:00:00 2001 From: Catalin Lupuleti Date: Sun, 15 Mar 2026 21:35:39 +0000 Subject: [PATCH] feat: add --prompt-retries flag for automatic retry on transient errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When model APIs return transient errors (HTTP 400/500 surfacing as ACP internal errors), acpx now supports automatic retry with exponential backoff via the --prompt-retries flag. - Add isRetryablePromptError() to classify transient vs permanent errors - Retry only on ACP -32603 (internal) and -32700 (parse) errors - Never retry auth, permission, timeout, or session-not-found errors - Exponential backoff: 1s, 2s, 4s, 8s, capped at 10s - Skip retry if agent process crashed during prompt - Flow --prompt-retries through CLI → queue owner → prompt execution - Default: 0 (no retries, preserving existing behavior) Closes #137 Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cli-core.ts | 2 + src/cli/flags.ts | 15 ++ src/error-normalization.ts | 43 ++++++ src/queue-owner-env.ts | 4 + src/session-runtime.ts | 153 ++++++++++++++------- src/session-runtime/queue-owner-process.ts | 3 + test/prompt-retry.test.ts | 74 ++++++++++ 7 files changed, 241 insertions(+), 53 deletions(-) create mode 100644 test/prompt-retry.test.ts diff --git a/src/cli-core.ts b/src/cli-core.ts index 244cba4..b21044b 100644 --- a/src/cli-core.ts +++ b/src/cli-core.ts @@ -292,6 +292,7 @@ async function handlePrompt( timeoutMs: globalFlags.timeout, ttlMs: globalFlags.ttl, maxQueueDepth: config.queueMaxDepth, + promptRetries: globalFlags.promptRetries, verbose: globalFlags.verbose, waitForCompletion: flags.wait !== false, }); @@ -364,6 +365,7 @@ async function handleExec( suppressSdkConsoleErrors: outputPolicy.suppressSdkConsoleErrors, timeoutMs: globalFlags.timeout, verbose: globalFlags.verbose, + promptRetries: globalFlags.promptRetries, sessionOptions: { model: globalFlags.model, allowedTools: 
globalFlags.allowedTools, diff --git a/src/cli/flags.ts b/src/cli/flags.ts index 6c7771b..fc94dcc 100644 --- a/src/cli/flags.ts +++ b/src/cli/flags.ts @@ -37,6 +37,7 @@ export type GlobalFlags = PermissionFlags & { model?: string; allowedTools?: string[]; maxTurns?: number; + promptRetries?: number; }; export type PromptFlags = { @@ -153,6 +154,14 @@ export function parseMaxTurns(value: string): number { return parsed; } +export function parsePromptRetries(value: string): number { + const parsed = Number(value); + if (!Number.isInteger(parsed) || parsed < 0) { + throw new InvalidArgumentError("Prompt retries must be a non-negative integer"); + } + return parsed; +} + export function resolvePermissionMode( flags: PermissionFlags, defaultMode: PermissionMode, @@ -200,6 +209,11 @@ export function addGlobalFlags(command: Command): Command { parseAllowedTools, ) .option("--max-turns ", "Maximum turns for the session", parseMaxTurns) + .option( + "--prompt-retries ", + "Retry failed prompt turns on transient errors (default: 0)", + parsePromptRetries, + ) .option( "--json-strict", "Strict JSON mode: requires --format json and suppresses non-JSON stderr output", @@ -285,6 +299,7 @@ export function resolveGlobalFlags(command: Command, config: ResolvedAcpxConfig) model: typeof opts.model === "string" ? parseNonEmptyValue("Model", opts.model) : undefined, allowedTools: Array.isArray(opts.allowedTools) ? opts.allowedTools : undefined, maxTurns: typeof opts.maxTurns === "number" ? opts.maxTurns : undefined, + promptRetries: typeof opts.promptRetries === "number" ? opts.promptRetries : undefined, approveAll: opts.approveAll ? true : undefined, approveReads: opts.approveReads ? true : undefined, denyAll: opts.denyAll ? 
true : undefined, diff --git a/src/error-normalization.ts b/src/error-normalization.ts index 035617b..02fb86b 100644 --- a/src/error-normalization.ts +++ b/src/error-normalization.ts @@ -226,6 +226,49 @@ export function normalizeOutputError( }; } +/** + * Returns true when an error from `client.prompt()` looks transient and + * can reasonably be retried (e.g. model-API 400/500, network hiccups that + * surface as ACP internal errors). + * + * Errors that are definitively non-recoverable (auth, missing session, + * invalid params, timeout, permission) return false. + */ +export function isRetryablePromptError(error: unknown): boolean { + if (error instanceof PermissionDeniedError || error instanceof PermissionPromptUnavailableError) { + return false; + } + if (isTimeoutLike(error) || isNoSessionLike(error) || isUsageLike(error)) { + return false; + } + + // Extract ACP payload once and reuse for all subsequent checks. + const acp = extractAcpError(error); + if (!acp) { + // Non-ACP errors (e.g. process crash) are not retried at the prompt level. + return false; + } + + // Resource-not-found (session gone) — check using the already-extracted payload. + if (acp.code === -32001 || acp.code === -32002) { + return false; + } + + // Auth-required errors are never retryable. Use the same thorough check as normalizeOutputError. + if (isAcpAuthRequiredPayload(acp)) { + return false; + } + + // Method-not-found or invalid-params are permanent protocol errors. + if (acp.code === -32601 || acp.code === -32602) { + return false; + } + + // ACP internal errors (-32603) typically wrap model-API failures → retryable. + // Parse errors (-32700) can also be transient. 
+ return acp.code === -32603 || acp.code === -32700; +} + export function exitCodeForOutputErrorCode(code: OutputErrorCode): ExitCode { switch (code) { case "USAGE": diff --git a/src/queue-owner-env.ts b/src/queue-owner-env.ts index d0b532b..b013f8d 100644 --- a/src/queue-owner-env.ts +++ b/src/queue-owner-env.ts @@ -72,6 +72,10 @@ export function parseQueueOwnerPayload(raw: string): QueueOwnerRuntimeOptions { options.maxQueueDepth = Math.max(1, Math.round(record.maxQueueDepth)); } + if (typeof record.promptRetries === "number" && Number.isFinite(record.promptRetries)) { + options.promptRetries = Math.max(0, Math.round(record.promptRetries)); + } + return options; } diff --git a/src/session-runtime.ts b/src/session-runtime.ts index 63cf455..8068f64 100644 --- a/src/session-runtime.ts +++ b/src/session-runtime.ts @@ -1,7 +1,11 @@ import fs from "node:fs/promises"; import path from "node:path"; import { AcpClient } from "./client.js"; -import { formatErrorMessage, normalizeOutputError } from "./error-normalization.js"; +import { + formatErrorMessage, + isRetryablePromptError, + normalizeOutputError, +} from "./error-normalization.js"; import { checkpointPerfMetricsCapture } from "./perf-metrics-capture.js"; import { formatPerfMetric, measurePerf, setPerfGauge, startPerfTimer } from "./perf-metrics.js"; import { refreshQueueOwnerLease } from "./queue-lease-store.js"; @@ -116,6 +120,7 @@ export type RunOnceOptions = { suppressSdkConsoleErrors?: boolean; verbose?: boolean; sessionOptions?: SessionAgentOptions; + promptRetries?: number; } & TimedRunOptions; export type SessionCreateOptions = { @@ -147,6 +152,7 @@ export type SessionSendOptions = { waitForCompletion?: boolean; ttlMs?: number; maxQueueDepth?: number; + promptRetries?: number; } & TimedRunOptions; export type SessionEnsureOptions = { @@ -219,6 +225,7 @@ type RunSessionPromptOptions = { timeoutMs?: number; suppressSdkConsoleErrors?: boolean; verbose?: boolean; + promptRetries?: number; onClientAvailable?: 
(controller: ActiveSessionController) => void; onClientClosed?: () => void; onPromptActive?: () => Promise | void; @@ -399,6 +406,7 @@ async function runQueuedTask( authCredentials?: Record; authPolicy?: AuthPolicy; suppressSdkConsoleErrors?: boolean; + promptRetries?: number; onClientAvailable?: (controller: ActiveSessionController) => void; onClientClosed?: () => void; onPromptActive?: () => Promise | void; @@ -422,6 +430,7 @@ async function runQueuedTask( timeoutMs: task.timeoutMs, suppressSdkConsoleErrors: task.suppressSdkConsoleErrors ?? options.suppressSdkConsoleErrors, verbose: options.verbose, + promptRetries: options.promptRetries, onClientAvailable: options.onClientAvailable, onClientClosed: options.onClientClosed, onPromptActive: options.onPromptActive, @@ -616,63 +625,82 @@ async function runSessionPrompt(options: RunSessionPromptOptions): Promise { - return await withTimeout(promptPromise, options.timeoutMs); - }); - if (options.verbose) { - process.stderr.write( - `[acpx] ${formatPerfMetric("prompt.agent_turn", Date.now() - promptStartedAt)}\n`, - ); - } - } catch (error) { - const snapshot = client.getAgentLifecycleSnapshot(); - applyLifecycleSnapshotToRecord(record, snapshot); - if (snapshot.lastExit?.unexpectedDuringPrompt && options.verbose) { - process.stderr.write( - "[acpx] agent disconnected during prompt (" + - snapshot.lastExit.reason + - ", exit=" + - snapshot.lastExit.exitCode + - ", signal=" + - (snapshot.lastExit.signal ?? 
"none") + - ")\n", - ); - } + response = await measurePerf("runtime.prompt.agent_turn", async () => { + return await withTimeout(promptPromise, options.timeoutMs); + }); + if (options.verbose) { + process.stderr.write( + `[acpx] ${formatPerfMetric("prompt.agent_turn", Date.now() - promptStartedAt)}\n`, + ); + } + break; + } catch (error) { + const snapshot = client.getAgentLifecycleSnapshot(); + const agentCrashed = snapshot.lastExit?.unexpectedDuringPrompt === true; + + // Retry if: retries remain, agent is still alive, error is transient. + if (attempt < maxRetries && !agentCrashed && isRetryablePromptError(error)) { + const delayMs = Math.min(1_000 * 2 ** attempt, 10_000); + process.stderr.write( + `[acpx] prompt failed (${formatErrorMessage(error)}), retrying in ${delayMs}ms ` + + `(attempt ${attempt + 1}/${maxRetries})\n`, + ); + await waitMs(delayMs); + continue; + } - const normalizedError = normalizeOutputError(error, { - origin: "runtime", - }); + applyLifecycleSnapshotToRecord(record, snapshot); + const lastExit = snapshot.lastExit; + if (lastExit?.unexpectedDuringPrompt && options.verbose) { + process.stderr.write( + "[acpx] agent disconnected during prompt (" + + lastExit.reason + + ", exit=" + + lastExit.exitCode + + ", signal=" + + (lastExit.signal ?? "none") + + ")\n", + ); + } - await flushPendingMessages(false).catch(() => { - // best effort while bubbling prompt failure - }); + const normalizedError = normalizeOutputError(error, { + origin: "runtime", + }); + + await flushPendingMessages(false).catch(() => { + // best effort while bubbling prompt failure + }); - output.flush(); + output.flush(); - record.lastUsedAt = isoNow(); - applyConversation(record, conversation); - record.acpx = acpxState; + record.lastUsedAt = isoNow(); + applyConversation(record, conversation); + record.acpx = acpxState; - const propagated = error instanceof Error ? 
error : new Error(formatErrorMessage(error)); - (propagated as { outputAlreadyEmitted?: boolean }).outputAlreadyEmitted = sawAcpMessage; - (propagated as { normalizedOutputError?: unknown }).normalizedOutputError = - normalizedError; - throw propagated; + const propagated = + error instanceof Error ? error : new Error(formatErrorMessage(error)); + (propagated as { outputAlreadyEmitted?: boolean }).outputAlreadyEmitted = sawAcpMessage; + (propagated as { normalizedOutputError?: unknown }).normalizedOutputError = + normalizedError; + throw propagated; + } } await flushPendingMessages(false); @@ -769,9 +797,27 @@ export async function runOnce(options: RunOnceOptions): Promise sessionId, }); - const response = await measurePerf("runtime.exec.prompt", async () => { - return await withTimeout(client.prompt(sessionId, options.prompt), options.timeoutMs); - }); + const maxRetries = options.promptRetries ?? 0; + let response; + for (let attempt = 0; ; attempt++) { + try { + response = await measurePerf("runtime.exec.prompt", async () => { + return await withTimeout(client.prompt(sessionId, options.prompt), options.timeoutMs); + }); + break; + } catch (error) { + if (attempt < maxRetries && isRetryablePromptError(error)) { + const delayMs = Math.min(1_000 * 2 ** attempt, 10_000); + process.stderr.write( + `[acpx] prompt failed (${formatErrorMessage(error)}), retrying in ${delayMs}ms ` + + `(attempt ${attempt + 1}/${maxRetries})\n`, + ); + await waitMs(delayMs); + continue; + } + throw error; + } + } output.flush(); return toPromptResult(response.stopReason, sessionId, client); }, @@ -1083,6 +1129,7 @@ export async function runSessionQueueOwner(options: QueueOwnerRuntimeOptions): P authCredentials: options.authCredentials, authPolicy: options.authPolicy, suppressSdkConsoleErrors: options.suppressSdkConsoleErrors, + promptRetries: options.promptRetries, onClientAvailable: setActiveController, onClientClosed: clearActiveController, onPromptActive: async () => { diff --git 
a/src/session-runtime/queue-owner-process.ts b/src/session-runtime/queue-owner-process.ts index 0e67693..16c179d 100644 --- a/src/session-runtime/queue-owner-process.ts +++ b/src/session-runtime/queue-owner-process.ts @@ -18,6 +18,7 @@ export type QueueOwnerRuntimeOptions = { verbose?: boolean; ttlMs?: number; maxQueueDepth?: number; + promptRetries?: number; }; type SessionSendLike = { @@ -31,6 +32,7 @@ type SessionSendLike = { verbose?: boolean; ttlMs?: number; maxQueueDepth?: number; + promptRetries?: number; }; export function resolveQueueOwnerSpawnArgs(argv: readonly string[] = process.argv): string[] { @@ -56,6 +58,7 @@ export function queueOwnerRuntimeOptionsFromSend( verbose: options.verbose, ttlMs: options.ttlMs, maxQueueDepth: options.maxQueueDepth, + promptRetries: options.promptRetries, }; } diff --git a/test/prompt-retry.test.ts b/test/prompt-retry.test.ts new file mode 100644 index 0000000..88fe634 --- /dev/null +++ b/test/prompt-retry.test.ts @@ -0,0 +1,74 @@ +import assert from "node:assert/strict"; +import test from "node:test"; +import { isRetryablePromptError } from "../src/error-normalization.js"; +import { PermissionDeniedError, PermissionPromptUnavailableError } from "../src/errors.js"; + +// --- isRetryablePromptError --- + +test("isRetryablePromptError returns true for ACP internal error (-32603)", () => { + const error = { code: -32603, message: "Internal error" }; + assert.equal(isRetryablePromptError(error), true); +}); + +test("isRetryablePromptError returns true for ACP parse error (-32700)", () => { + const error = { code: -32700, message: "Parse error" }; + assert.equal(isRetryablePromptError(error), true); +}); + +test("isRetryablePromptError returns true for wrapped ACP internal error", () => { + const error = new Error("prompt failed"); + (error as Error & { error?: unknown }).error = { + code: -32603, + message: "Internal error", + data: { details: "model returned HTTP 400" }, + }; + assert.equal(isRetryablePromptError(error), 
true); +}); + +test("isRetryablePromptError returns false for auth-required error (-32000)", () => { + const error = { code: -32000, message: "Authentication required" }; + assert.equal(isRetryablePromptError(error), false); +}); + +test("isRetryablePromptError returns false for method-not-found error (-32601)", () => { + const error = { code: -32601, message: "Method not found: session/prompt" }; + assert.equal(isRetryablePromptError(error), false); +}); + +test("isRetryablePromptError returns false for invalid-params error (-32602)", () => { + const error = { code: -32602, message: "Invalid params" }; + assert.equal(isRetryablePromptError(error), false); +}); + +test("isRetryablePromptError returns false for resource-not-found error (-32002)", () => { + const error = { code: -32002, message: "Resource not found: session" }; + assert.equal(isRetryablePromptError(error), false); +}); + +test("isRetryablePromptError returns false for PermissionDeniedError", () => { + assert.equal(isRetryablePromptError(new PermissionDeniedError("denied")), false); +}); + +test("isRetryablePromptError returns false for PermissionPromptUnavailableError", () => { + assert.equal(isRetryablePromptError(new PermissionPromptUnavailableError()), false); +}); + +test("isRetryablePromptError returns false for TimeoutError", () => { + const error = new Error("timeout"); + error.name = "TimeoutError"; + assert.equal(isRetryablePromptError(error), false); +}); + +test("isRetryablePromptError returns false for non-ACP errors", () => { + assert.equal(isRetryablePromptError(new Error("random failure")), false); +}); + +test("isRetryablePromptError returns false for null/undefined", () => { + assert.equal(isRetryablePromptError(null), false); + assert.equal(isRetryablePromptError(undefined), false); +}); + +test("isRetryablePromptError returns false for auth message in -32000 error", () => { + const error = { code: -32000, message: "auth required" }; + assert.equal(isRetryablePromptError(error), 
false); +});