diff --git a/.github/workflows/codex-code-review.yml b/.github/workflows/codex-code-review.yml new file mode 100644 index 0000000..56a0e3e --- /dev/null +++ b/.github/workflows/codex-code-review.yml @@ -0,0 +1,24 @@ +name: Codex Code Review + +# Thin caller. The workflow body — gpt-5.5 prompt, codex subscription +# auth, incremental review, sticky-comment lifecycle — lives in +# happycatlabs/codex-review-workflow. Update there to update every +# consumer at once. + +on: + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +# Required: the reusable workflow needs pull-requests + issues write +# to post the sticky review comment. GitHub bounds the called +# workflow's job-level permissions by the caller's workflow-level +# permissions, so the caller must declare at least these. +permissions: + contents: read + pull-requests: write + issues: write + +jobs: + review: + uses: happycatlabs/codex-review-workflow/.github/workflows/codex-code-review.yml@main + secrets: inherit diff --git a/README.md b/README.md index 23419ef..0aa06b4 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ If a run hits `waiting_for_answer`, it's blocked until you respond: ```bash orca status --last # read the question -orca answer "yes, use migration A" # unblock it +orca answer "yes, use migration A" # answer and resume the live run ``` ### Spec / plan files @@ -104,6 +104,8 @@ Orca loads config in this order (later overrides earlier): `.ts` is preferred over `.js` when both exist. +Stale executor values from older configs are ignored and coerced to `codex`. Orca no longer supports alternate executors. 
+ ```ts // orca.config.ts import { defineOrcaConfig } from "orcastrator"; @@ -117,7 +119,13 @@ export default defineOrcaConfig({ codex: { model: "gpt-5.3-codex", - effort: "medium", // "low" | "medium" | "high" — applies to all Codex turns + effort: "medium", // fallback for all Codex turns unless overridden below + thinkingLevel: { + decision: "low", // planning gate / quick routing decisions + planning: "xhigh", // task graph generation + review: "high", // task graph consultation + post-execution review prompts + execution: "medium", // task execution turns + }, timeoutMs: 300000, multiAgent: false, // see Multi-agent section perCwdExtraUserRoots: [ @@ -168,10 +176,20 @@ After planning, Orca runs a pre-execution review that can edit the task graph (a After execution, Orca runs validation commands and asks Codex to review findings. With `onFindings: "auto_fix"`, it applies fixes and retries up to `maxCycles` times, then reports. Set `ORCA_SKIP_VALIDATORS=1` to skip validator auto-detection at runtime. +Use `codex.thinkingLevel` when you want different reasoning levels for different stages instead of a single global `codex.effort`. + ### Multi-agent mode Set `codex.multiAgent: true` to spawn parallel Codex agents per task. Faster for large refactors with independent subtasks; higher token cost. **Note:** this writes `multi_agent = true` to your global `~/.codex/config.toml`. +If `~/.codex/config.toml` already enables `[features].multi_agent = true`, Orca also treats the run as multi-agent-aware for planning, review, consultation, and execution prompts even when `codex.multiAgent` is not set in Orca config. + +### Codex binary and MCP diagnostics + +When `ORCA_CODEX_PATH` is unset, Orca auto-selects the newest installed Codex CLI/app-server it can find instead of blindly trusting the first `codex` binary on `PATH`. This avoids talking to an older global install when a newer desktop build is present. 
+ +If configured Codex MCP servers are enabled but not logged in, Orca now summarizes that once and continues without them instead of streaming raw app-server auth noise throughout the run. + ### Skills Orca auto-loads skills in this precedence order (first name wins): @@ -229,7 +247,7 @@ orca setup Interactive setup wizard ### Hooks -Available hook names: `onMilestone`, `onTaskComplete`, `onTaskFail`, `onInvalidPlan`, `onFindings`, `onComplete`, `onError`. +Available hook names: `onMilestone`, `onQuestion`, `onTaskComplete`, `onTaskFail`, `onInvalidPlan`, `onFindings`, `onComplete`, `onError`. - Function hooks (`config.hooks`): receive `(event, context)` where `context = { cwd, pid, invokedAt }` - Command hooks (`config.hookCommands` / `--on-*` flags): receive full event JSON over stdin diff --git a/docs/codex-app-server.md b/docs/codex-app-server.md new file mode 100644 index 0000000..1b646d6 --- /dev/null +++ b/docs/codex-app-server.md @@ -0,0 +1,1438 @@ +# Codex App Server + +Codex app-server is the interface Codex uses to power rich clients (for example, the Codex VS Code extension). Use it when you want a deep integration inside your own product: authentication, conversation history, approvals, and streamed agent events. The app-server implementation is open source in the Codex GitHub repository ([openai/codex/codex-rs/app-server](https://github.com/openai/codex/tree/main/codex-rs/app-server)). See the [Open Source](https://developers.openai.com/codex/open-source) page for the full list of open-source Codex components. + +If you are automating jobs or running Codex in CI, use the + Codex SDK instead. + +## Protocol + +Like [MCP](https://modelcontextprotocol.io/), `codex app-server` supports bidirectional communication using JSON-RPC 2.0 messages (with the `"jsonrpc":"2.0"` header omitted on the wire). + +Supported transports: + +- `stdio` (`--listen stdio://`, default): newline-delimited JSON (JSONL). 
+- `websocket` (`--listen ws://IP:PORT`, experimental): one JSON-RPC message per WebSocket text frame. + +In WebSocket mode, app-server uses bounded queues. When request ingress is full, the server rejects new requests with JSON-RPC error code `-32001` and message `"Server overloaded; retry later."` Clients should retry with an exponentially increasing delay and jitter. + +## Message schema + +Requests include `method`, `params`, and `id`: + +```json +{ "method": "thread/start", "id": 10, "params": { "model": "gpt-5.1-codex" } } +``` + +Responses echo the `id` with either `result` or `error`: + +```json +{ "id": 10, "result": { "thread": { "id": "thr_123" } } } +``` + +```json +{ "id": 10, "error": { "code": 123, "message": "Something went wrong" } } +``` + +Notifications omit `id` and use only `method` and `params`: + +```json +{ "method": "turn/started", "params": { "turn": { "id": "turn_456" } } } +``` + +You can generate a TypeScript schema or a JSON Schema bundle from the CLI. Each output is specific to the Codex version you ran, so the generated artifacts match that version exactly: + +```bash +codex app-server generate-ts --out ./schemas +codex app-server generate-json-schema --out ./schemas +``` + +## Getting started + +1. Start the server with `codex app-server` (default stdio transport) or `codex app-server --listen ws://127.0.0.1:4500` (experimental WebSocket transport). +2. Connect a client over the selected transport, then send `initialize` followed by the `initialized` notification. +3. Start a thread and a turn, then keep reading notifications from the active transport stream. 
+ +Example (Node.js / TypeScript): + +```ts +import { spawn } from "node:child_process"; +import readline from "node:readline"; + +const proc = spawn("codex", ["app-server"], { + stdio: ["pipe", "pipe", "inherit"], +}); +const rl = readline.createInterface({ input: proc.stdout }); + +const send = (message: unknown) => { + proc.stdin.write(`${JSON.stringify(message)}\n`); +}; + +let threadId: string | null = null; + +rl.on("line", (line) => { + const msg = JSON.parse(line) as any; + console.log("server:", msg); + + if (msg.id === 1 && msg.result?.thread?.id && !threadId) { + threadId = msg.result.thread.id; + send({ + method: "turn/start", + id: 2, + params: { + threadId, + input: [{ type: "text", text: "Summarize this repo." }], + }, + }); + } +}); + +send({ + method: "initialize", + id: 0, + params: { + clientInfo: { + name: "my_product", + title: "My Product", + version: "0.1.0", + }, + }, +}); +send({ method: "initialized", params: {} }); +send({ method: "thread/start", id: 1, params: { model: "gpt-5.1-codex" } }); +``` + +## Core primitives + +- **Thread**: A conversation between a user and the Codex agent. Threads contain turns. +- **Turn**: A single user request and the agent work that follows. Turns contain items and stream incremental updates. +- **Item**: A unit of input or output (user message, agent message, command runs, file change, tool call, and more). + +Use the thread APIs to create, list, or archive conversations. Drive a conversation with turn APIs and stream progress via turn notifications. + +## Lifecycle overview + +- **Initialize once per connection**: Immediately after opening a transport connection, send an `initialize` request with your client metadata, then emit `initialized`. The server rejects any request on that connection before this handshake. +- **Start (or resume) a thread**: Call `thread/start` for a new conversation, `thread/resume` to continue an existing one, or `thread/fork` to branch history into a new thread id. +- **Begin a turn**: Call `turn/start` with the target `threadId` and user input. 
Optional fields override model, personality, `cwd`, sandbox policy, and more. +- **Steer an active turn**: Call `turn/steer` to append user input to the currently in-flight turn without creating a new turn. +- **Stream events**: After `turn/start`, keep reading notifications on stdout: `thread/archived`, `thread/unarchived`, `item/started`, `item/completed`, `item/agentMessage/delta`, tool progress, and other updates. +- **Finish the turn**: The server emits `turn/completed` with final status when the model finishes or after a `turn/interrupt` cancellation. + +## Initialization + +Clients must send a single `initialize` request per transport connection before invoking any other method on that connection, then acknowledge with an `initialized` notification. Requests sent before initialization receive a `Not initialized` error, and repeated `initialize` calls on the same connection return `Already initialized`. + +The server returns the user agent string it will present to upstream services. Set `clientInfo` to identify your integration. + +`initialize.params.capabilities` also supports per-connection notification opt-out via `optOutNotificationMethods`, which is a list of exact method names to suppress for that connection. Matching is exact (no wildcards/prefixes). Unknown method names are accepted and ignored. + +**Important**: Use `clientInfo.name` to identify your client for the OpenAI Compliance Logs Platform. If you are developing a new Codex integration intended for enterprise use, please contact OpenAI to get it added to a known clients list. For more context, see the [Codex logs reference](https://chatgpt.com/admin/api-reference#tag/Logs:-Codex). 
+ +Example (from the Codex VS Code extension): + +```json +{ + "method": "initialize", + "id": 0, + "params": { + "clientInfo": { + "name": "codex_vscode", + "title": "Codex VS Code Extension", + "version": "0.1.0" + } + } +} +``` + +Example with notification opt-out: + +```json +{ + "method": "initialize", + "id": 1, + "params": { + "clientInfo": { + "name": "my_client", + "title": "My Client", + "version": "0.1.0" + }, + "capabilities": { + "experimentalApi": true, + "optOutNotificationMethods": [ + "codex/event/session_configured", + "item/agentMessage/delta" + ] + } + } +} +``` + +## Experimental API opt-in + +Some app-server methods and fields are intentionally gated behind `experimentalApi` capability. + +- Omit `capabilities` (or set `experimentalApi` to `false`) to stay on the stable API surface, and the server rejects experimental methods/fields. +- Set `capabilities.experimentalApi` to `true` to enable experimental methods and fields. + +```json +{ + "method": "initialize", + "id": 1, + "params": { + "clientInfo": { + "name": "my_client", + "title": "My Client", + "version": "0.1.0" + }, + "capabilities": { + "experimentalApi": true + } + } +} +``` + +If a client sends an experimental method or field without opting in, app-server rejects it with: + +`<descriptor> requires experimentalApi capability` + +## API overview + +- `thread/start` - create a new thread; emits `thread/started` and automatically subscribes you to turn/item events for that thread. +- `thread/resume` - reopen an existing thread by id so later `turn/start` calls append to it. +- `thread/fork` - fork a thread into a new thread id by copying stored history; emits `thread/started` for the new thread. +- `thread/read` - read a stored thread by id without resuming it; set `includeTurns` to return full turn history. Returned `thread` objects include runtime `status`. 
+- `thread/list` - page through stored thread logs; supports cursor-based pagination plus `modelProviders`, `sourceKinds`, `archived`, and `cwd` filters. Returned `thread` objects include runtime `status`. +- `thread/loaded/list` - list the thread ids currently loaded in memory. +- `thread/archive` - move a thread's log file into the archived directory; returns `{}` on success and emits `thread/archived`. +- `thread/unsubscribe` - unsubscribe this connection from thread turn/item events. If this was the last subscriber, the server unloads the thread and emits `thread/closed`. +- `thread/unarchive` - restore an archived thread rollout back into the active sessions directory; returns the restored `thread` and emits `thread/unarchived`. +- `thread/status/changed` - notification emitted when a loaded thread's runtime `status` changes. +- `thread/compact/start` - trigger conversation history compaction for a thread; returns `{}` immediately while progress streams via `turn/*` and `item/*` notifications. +- `thread/rollback` - drop the last N turns from the in-memory context and persist a rollback marker; returns the updated `thread`. +- `turn/start` - add user input to a thread and begin Codex generation; responds with the initial `turn` and streams events. For `collaborationMode`, `settings.developer_instructions: null` means "use built-in instructions for the selected mode." +- `turn/steer` - append user input to the active in-flight turn for a thread; returns the accepted `turnId`. +- `turn/interrupt` - request cancellation of an in-flight turn; success is `{}` and the turn ends with `status: "interrupted"`. +- `review/start` - kick off the Codex reviewer for a thread; emits `enteredReviewMode` and `exitedReviewMode` items. +- `command/exec` - run a single command under the server sandbox without starting a thread/turn. 
+- `model/list` - list available models (set `includeHidden: true` to include entries with `hidden: true`) with effort options, optional `upgrade`, and `inputModalities`. +- `experimentalFeature/list` - list feature flags with lifecycle stage metadata and cursor pagination. +- `collaborationMode/list` - list collaboration mode presets (experimental, no pagination). +- `skills/list` - list skills for one or more `cwd` values (supports `forceReload` and optional `perCwdExtraUserRoots`). +- `app/list` - list available apps (connectors) with pagination plus accessibility/enabled metadata. +- `skills/config/write` - enable or disable skills by path. +- `mcpServer/oauth/login` - start an OAuth login for a configured MCP server; returns an authorization URL and emits `mcpServer/oauthLogin/completed` on completion. +- `tool/requestUserInput` - prompt the user with 1-3 short questions for a tool call (experimental); questions can set `isOther` for a free-form option. +- `config/mcpServer/reload` - reload MCP server configuration from disk and queue a refresh for loaded threads. +- `mcpServerStatus/list` - list MCP servers, tools, resources, and auth status (cursor + limit pagination). +- `windowsSandbox/setupStart` - start Windows sandbox setup for `elevated` or `unelevated` mode; returns quickly and later emits `windowsSandbox/setupCompleted`. +- `feedback/upload` - submit a feedback report (classification + optional reason/logs + conversation id, plus optional `extraLogFiles` attachments). +- `config/read` - fetch the effective configuration on disk after resolving configuration layering. +- `externalAgentConfig/detect` - detect migratable external-agent artifacts with `includeHome` and optional `cwds`; each detected item includes `cwd` (`null` for home). +- `externalAgentConfig/import` - apply selected external-agent migration items by passing explicit `migrationItems` with `cwd` (`null` for home). 
+- `config/value/write` - write a single configuration key/value to the user's `config.toml` on disk. +- `config/batchWrite` - apply configuration edits atomically to the user's `config.toml` on disk. +- `configRequirements/read` - fetch requirements from `requirements.toml` and/or MDM, including allow-lists, pinned `featureRequirements`, and residency/network requirements (or `null` if you haven't set any up). + +## Models + +### List models (`model/list`) + +Call `model/list` to discover available models and their capabilities before rendering model or personality selectors. + +```json +{ "method": "model/list", "id": 6, "params": { "limit": 20, "includeHidden": false } } +{ "id": 6, "result": { + "data": [{ + "id": "gpt-5.4", + "model": "gpt-5.4", + "displayName": "GPT-5.4", + "hidden": false, + "defaultReasoningEffort": "medium", + "supportedReasoningEfforts": [{ + "reasoningEffort": "low", + "description": "Lower latency" + }], + "inputModalities": ["text", "image"], + "supportsPersonality": true, + "isDefault": true + }], + "nextCursor": null +} } +``` + +Each model entry can include: + +- `supportedReasoningEfforts` - supported effort options for the model. +- `defaultReasoningEffort` - suggested default effort for clients. +- `upgrade` - optional recommended upgrade model id for migration prompts in clients. +- `upgradeInfo` - optional upgrade metadata for migration prompts in clients. +- `hidden` - whether the model is hidden from the default picker list. +- `inputModalities` - supported input types for the model (for example `text`, `image`). +- `supportsPersonality` - whether the model supports personality-specific instructions such as `/personality`. +- `isDefault` - whether the model is the recommended default. + +By default, `model/list` returns picker-visible models only. Set `includeHidden: true` if you need the full list and want to filter on the client side using `hidden`. 
+ +When `inputModalities` is missing (older model catalogs), treat it as `["text", "image"]` for backward compatibility. + +### List experimental features (`experimentalFeature/list`) + +Use this endpoint to discover feature flags with metadata and lifecycle stage: + +```json +{ "method": "experimentalFeature/list", "id": 7, "params": { "limit": 20 } } +{ "id": 7, "result": { + "data": [{ + "name": "unified_exec", + "stage": "beta", + "displayName": "Unified exec", + "description": "Use the unified PTY-backed execution tool.", + "announcement": "Beta rollout for improved command execution reliability.", + "enabled": false, + "defaultEnabled": false + }], + "nextCursor": null +} } +``` + +`stage` can be `beta`, `underDevelopment`, `stable`, `deprecated`, or `removed`. For non-beta flags, `displayName`, `description`, and `announcement` may be `null`. + +## Threads + +- `thread/read` reads a stored thread without subscribing to it; set `includeTurns` to include turns. +- `thread/list` supports cursor pagination plus `modelProviders`, `sourceKinds`, `archived`, and `cwd` filtering. +- `thread/loaded/list` returns the thread IDs currently in memory. +- `thread/archive` moves the thread's persisted JSONL log into the archived directory. +- `thread/unsubscribe` unsubscribes the current connection from a loaded thread and can trigger `thread/closed`. +- `thread/unarchive` restores an archived thread rollout back into the active sessions directory. +- `thread/compact/start` triggers compaction and returns `{}` immediately. +- `thread/rollback` drops the last N turns from the in-memory context and records a rollback marker in the thread's persisted JSONL log. + +### Start or resume a thread + +Start a fresh thread when you need a new Codex conversation. 
+ +```json +{ "method": "thread/start", "id": 10, "params": { + "model": "gpt-5.1-codex", + "cwd": "/Users/me/project", + "approvalPolicy": "never", + "sandbox": "workspaceWrite", + "personality": "friendly", + "serviceName": "my_app_server_client" +} } +{ "id": 10, "result": { + "thread": { + "id": "thr_123", + "preview": "", + "ephemeral": false, + "modelProvider": "openai", + "createdAt": 1730910000 + } +} } +{ "method": "thread/started", "params": { "thread": { "id": "thr_123" } } } +``` + +`serviceName` is optional. Set it when you want app-server to tag thread-level metrics with your integration's service name. + +To continue a stored session, call `thread/resume` with the `thread.id` you recorded earlier. The response shape matches `thread/start`. You can also pass the same configuration overrides supported by `thread/start`, such as `personality`: + +```json +{ "method": "thread/resume", "id": 11, "params": { + "threadId": "thr_123", + "personality": "friendly" +} } +{ "id": 11, "result": { "thread": { "id": "thr_123", "name": "Bug bash notes", "ephemeral": false } } } +``` + +Resuming a thread doesn't update `thread.updatedAt` (or the rollout file's modified time) by itself. The timestamp updates when you start a turn. + +If you mark an enabled MCP server as `required` in config and that server fails to initialize, `thread/start` and `thread/resume` fail instead of continuing without it. + +`dynamicTools` on `thread/start` is an experimental field (requires `capabilities.experimentalApi = true`). Codex persists these dynamic tools in the thread rollout metadata and restores them on `thread/resume` when you don't supply new dynamic tools. + +If you resume with a different model than the one recorded in the rollout, Codex emits a warning and applies a one-time model-switch instruction on the next turn. + +To branch from a stored session, call `thread/fork` with the `thread.id`. 
This creates a new thread id and emits a `thread/started` notification for it: + +```json +{ "method": "thread/fork", "id": 12, "params": { "threadId": "thr_123" } } +{ "id": 12, "result": { "thread": { "id": "thr_456" } } } +{ "method": "thread/started", "params": { "thread": { "id": "thr_456" } } } +``` + +When a user-facing thread title has been set, app-server hydrates `thread.name` on `thread/list`, `thread/read`, `thread/resume`, `thread/unarchive`, and `thread/rollback` responses. `thread/start` and `thread/fork` may omit `name` (or return `null`) until a title is set later. + +### Read a stored thread (without resuming) + +Use `thread/read` when you want stored thread data but don't want to resume the thread or subscribe to its events. + +- `includeTurns` - when `true`, the response includes the thread's turns; when `false` or omitted, you get the thread summary only. +- Returned `thread` objects include runtime `status` (`notLoaded`, `idle`, `systemError`, or `active` with `activeFlags`). + +```json +{ "method": "thread/read", "id": 19, "params": { "threadId": "thr_123", "includeTurns": true } } +{ "id": 19, "result": { "thread": { "id": "thr_123", "name": "Bug bash notes", "ephemeral": false, "status": { "type": "notLoaded" }, "turns": [] } } } +``` + +Unlike `thread/resume`, `thread/read` doesn't load the thread into memory or emit `thread/started`. + +### List threads (with pagination & filters) + +`thread/list` lets you render a history UI. Results default to newest-first by `createdAt`. Filters apply before pagination. Pass any combination of: + +- `cursor` - opaque string from a prior response; omit for the first page. +- `limit` - server defaults to a reasonable page size if unset. +- `sortKey` - `created_at` (default) or `updated_at`. +- `modelProviders` - restrict results to specific providers; unset, null, or an empty array includes all providers. +- `sourceKinds` - restrict results to specific thread sources. 
When omitted or `[]`, the server defaults to interactive sources only: `cli` and `vscode`. +- `archived` - when `true`, list archived threads only. When `false` or omitted, list non-archived threads (default). +- `cwd` - restrict results to threads whose session current working directory exactly matches this path. + +`sourceKinds` accepts the following values: + +- `cli` +- `vscode` +- `exec` +- `appServer` +- `subAgent` +- `subAgentReview` +- `subAgentCompact` +- `subAgentThreadSpawn` +- `subAgentOther` +- `unknown` + +Example: + +```json +{ "method": "thread/list", "id": 20, "params": { + "cursor": null, + "limit": 25, + "sortKey": "created_at" +} } +{ "id": 20, "result": { + "data": [ + { "id": "thr_a", "preview": "Create a TUI", "ephemeral": false, "modelProvider": "openai", "createdAt": 1730831111, "updatedAt": 1730831111, "name": "TUI prototype", "status": { "type": "notLoaded" } }, + { "id": "thr_b", "preview": "Fix tests", "ephemeral": true, "modelProvider": "openai", "createdAt": 1730750000, "updatedAt": 1730750000, "status": { "type": "notLoaded" } } + ], + "nextCursor": "opaque-token-or-null" +} } +``` + +When `nextCursor` is `null`, you have reached the final page. + +### Track thread status changes + +`thread/status/changed` is emitted whenever a loaded thread's runtime status changes. The payload includes `threadId` and the new `status`. + +```json +{ + "method": "thread/status/changed", + "params": { + "threadId": "thr_123", + "status": { "type": "active", "activeFlags": ["waitingOnApproval"] } + } +} +``` + +### List loaded threads + +`thread/loaded/list` returns thread IDs currently loaded in memory. + +```json +{ "method": "thread/loaded/list", "id": 21 } +{ "id": 21, "result": { "data": ["thr_123", "thr_456"] } } +``` + +### Unsubscribe from a loaded thread + +`thread/unsubscribe` removes the current connection's subscription to a thread. The response status is one of: + +- `unsubscribed` when the connection was subscribed and is now removed. 
+- `notSubscribed` when the connection was not subscribed to that thread. +- `notLoaded` when the thread is not loaded. + +If this was the last subscriber, the server unloads the thread and emits a `thread/status/changed` transition to `notLoaded` plus `thread/closed`. + +```json +{ "method": "thread/unsubscribe", "id": 22, "params": { "threadId": "thr_123" } } +{ "id": 22, "result": { "status": "unsubscribed" } } +{ "method": "thread/status/changed", "params": { + "threadId": "thr_123", + "status": { "type": "notLoaded" } +} } +{ "method": "thread/closed", "params": { "threadId": "thr_123" } } +``` + +### Archive a thread + +Use `thread/archive` to move the persisted thread log (stored as a JSONL file on disk) into the archived sessions directory. + +```json +{ "method": "thread/archive", "id": 22, "params": { "threadId": "thr_b" } } +{ "id": 22, "result": {} } +{ "method": "thread/archived", "params": { "threadId": "thr_b" } } +``` + +Archived threads won't appear in future calls to `thread/list` unless you pass `archived: true`. + +### Unarchive a thread + +Use `thread/unarchive` to move an archived thread rollout back into the active sessions directory. + +```json +{ "method": "thread/unarchive", "id": 24, "params": { "threadId": "thr_b" } } +{ "id": 24, "result": { "thread": { "id": "thr_b", "name": "Bug bash notes" } } } +{ "method": "thread/unarchived", "params": { "threadId": "thr_b" } } +``` + +### Trigger thread compaction + +Use `thread/compact/start` to trigger manual history compaction for a thread. The request returns immediately with `{}`. + +App-server emits progress as standard `turn/*` and `item/*` notifications on the same `threadId`, including a `contextCompaction` item lifecycle (`item/started` then `item/completed`). 
+ +```json +{ "method": "thread/compact/start", "id": 25, "params": { "threadId": "thr_b" } } +{ "id": 25, "result": {} } +``` + +### Roll back recent turns + +Use `thread/rollback` to remove the last `numTurns` entries from the in-memory context and persist a rollback marker in the rollout log. The returned `thread` includes `turns` populated after the rollback. + +```json +{ "method": "thread/rollback", "id": 26, "params": { "threadId": "thr_b", "numTurns": 1 } } +{ "id": 26, "result": { "thread": { "id": "thr_b", "name": "Bug bash notes", "ephemeral": false } } } +``` + +## Turns + +The `input` field accepts a list of items: + +- `{ "type": "text", "text": "Explain this diff" }` +- `{ "type": "image", "url": "https://.../design.png" }` +- `{ "type": "localImage", "path": "/tmp/screenshot.png" }` + +You can override configuration settings per turn (model, effort, personality, `cwd`, sandbox policy, summary). When specified, these settings become the defaults for later turns on the same thread. `outputSchema` applies only to the current turn. For `sandboxPolicy.type = "externalSandbox"`, set `networkAccess` to `restricted` or `enabled`; for `workspaceWrite`, `networkAccess` remains a boolean. + +For `turn/start.collaborationMode`, `settings.developer_instructions: null` means "use built-in instructions for the selected mode" rather than clearing mode instructions. + +### Sandbox read access (`ReadOnlyAccess`) + +`sandboxPolicy` supports explicit read-access controls: + +- `readOnly`: optional `access` (`{ "type": "fullAccess" }` by default, or restricted roots). +- `workspaceWrite`: optional `readOnlyAccess` (`{ "type": "fullAccess" }` by default, or restricted roots). + +Restricted read access shape: + +```json +{ + "type": "restricted", + "includePlatformDefaults": true, + "readableRoots": ["/Users/me/shared-read-only"] +} +``` + +On macOS, `includePlatformDefaults: true` appends a curated platform-default Seatbelt policy for restricted-read sessions. 
This improves tool compatibility without broadly allowing all of `/System`. + +Examples: + +```json +{ "type": "readOnly", "access": { "type": "fullAccess" } } +``` + +```json +{ + "type": "workspaceWrite", + "writableRoots": ["/Users/me/project"], + "readOnlyAccess": { + "type": "restricted", + "includePlatformDefaults": true, + "readableRoots": ["/Users/me/shared-read-only"] + }, + "networkAccess": false +} +``` + +### Start a turn + +```json +{ "method": "turn/start", "id": 30, "params": { + "threadId": "thr_123", + "input": [ { "type": "text", "text": "Run tests" } ], + "cwd": "/Users/me/project", + "approvalPolicy": "unlessTrusted", + "sandboxPolicy": { + "type": "workspaceWrite", + "writableRoots": ["/Users/me/project"], + "networkAccess": true + }, + "model": "gpt-5.1-codex", + "effort": "medium", + "summary": "concise", + "personality": "friendly", + "outputSchema": { + "type": "object", + "properties": { "answer": { "type": "string" } }, + "required": ["answer"], + "additionalProperties": false + } +} } +{ "id": 30, "result": { "turn": { "id": "turn_456", "status": "inProgress", "items": [], "error": null } } } +``` + +### Steer an active turn + +Use `turn/steer` to append more user input to the active in-flight turn. + +- Include `expectedTurnId`; it must match the active turn id. +- The request fails if there is no active turn on the thread. +- `turn/steer` doesn't emit a new `turn/started` notification. +- `turn/steer` doesn't accept turn-level overrides (`model`, `cwd`, `sandboxPolicy`, or `outputSchema`). + +```json +{ "method": "turn/steer", "id": 32, "params": { + "threadId": "thr_123", + "input": [ { "type": "text", "text": "Actually focus on failing tests first." } ], + "expectedTurnId": "turn_456" +} } +{ "id": 32, "result": { "turnId": "turn_456" } } +``` + +### Start a turn (invoke a skill) + +Invoke a skill explicitly by including `$<skill-name>` in the text input and adding a `skill` input item alongside it. 
+ +```json +{ "method": "turn/start", "id": 33, "params": { + "threadId": "thr_123", + "input": [ + { "type": "text", "text": "$skill-creator Add a new skill for triaging flaky CI and include step-by-step usage." }, + { "type": "skill", "name": "skill-creator", "path": "/Users/me/.codex/skills/skill-creator/SKILL.md" } + ] +} } +{ "id": 33, "result": { "turn": { "id": "turn_457", "status": "inProgress", "items": [], "error": null } } } +``` + +### Interrupt a turn + +```json +{ "method": "turn/interrupt", "id": 31, "params": { "threadId": "thr_123", "turnId": "turn_456" } } +{ "id": 31, "result": {} } +``` + +On success, the turn finishes with `status: "interrupted"`. + +## Review + +`review/start` runs the Codex reviewer for a thread and streams review items. Targets include: + +- `uncommittedChanges` +- `baseBranch` (diff against a branch) +- `commit` (review a specific commit) +- `custom` (free-form instructions) + +Use `delivery: "inline"` (default) to run the review on the existing thread, or `delivery: "detached"` to fork a new review thread. + +Example request/response: + +```json +{ "method": "review/start", "id": 40, "params": { + "threadId": "thr_123", + "delivery": "inline", + "target": { "type": "commit", "sha": "1234567deadbeef", "title": "Polish tui colors" } +} } +{ "id": 40, "result": { + "turn": { + "id": "turn_900", + "status": "inProgress", + "items": [ + { "type": "userMessage", "id": "turn_900", "content": [ { "type": "text", "text": "Review commit 1234567: Polish tui colors" } ] } + ], + "error": null + }, + "reviewThreadId": "thr_123" +} } +``` + +For a detached review, use `"delivery": "detached"`. The response is the same shape, but `reviewThreadId` will be the id of the new review thread (different from the original `threadId`). The server also emits a `thread/started` notification for that new thread before streaming the review turn. 
+ +Codex streams the usual `turn/started` notification followed by an `item/started` with an `enteredReviewMode` item: + +```json +{ + "method": "item/started", + "params": { + "item": { + "type": "enteredReviewMode", + "id": "turn_900", + "review": "current changes" + } + } +} +``` + +When the reviewer finishes, the server emits `item/started` and `item/completed` containing an `exitedReviewMode` item with the final review text: + +```json +{ + "method": "item/completed", + "params": { + "item": { + "type": "exitedReviewMode", + "id": "turn_900", + "review": "Looks solid overall..." + } + } +} +``` + +Use this notification to render the reviewer output in your client. + +## Command execution + +`command/exec` runs a single command (`argv` array) under the server sandbox without creating a thread. + +```json +{ "method": "command/exec", "id": 50, "params": { + "command": ["ls", "-la"], + "cwd": "/Users/me/project", + "sandboxPolicy": { "type": "workspaceWrite" }, + "timeoutMs": 10000 +} } +{ "id": 50, "result": { "exitCode": 0, "stdout": "...", "stderr": "" } } +``` + +Use `sandboxPolicy.type = "externalSandbox"` if you already sandbox the server process and want Codex to skip its own sandbox enforcement. For external sandbox mode, set `networkAccess` to `restricted` (default) or `enabled`. For `readOnly` and `workspaceWrite`, use the same optional `access` / `readOnlyAccess` structure shown above. + +Notes: + +- The server rejects empty `command` arrays. +- `sandboxPolicy` accepts the same shape used by `turn/start` (for example, `dangerFullAccess`, `readOnly`, `workspaceWrite`, `externalSandbox`). +- When omitted, `timeoutMs` falls back to the server default. + +### Read admin requirements (`configRequirements/read`) + +Use `configRequirements/read` to inspect the effective admin requirements loaded from `requirements.toml` and/or MDM. 
+ +```json +{ "method": "configRequirements/read", "id": 52, "params": {} } +{ "id": 52, "result": { + "requirements": { + "allowedApprovalPolicies": ["onRequest", "unlessTrusted"], + "allowedSandboxModes": ["readOnly", "workspaceWrite"], + "featureRequirements": { + "personality": true, + "unified_exec": false + }, + "network": { + "enabled": true, + "allowedDomains": ["api.openai.com"], + "allowUnixSockets": ["/tmp/example.sock"], + "dangerouslyAllowAllUnixSockets": false + } + } +} } +``` + +`result.requirements` is `null` when no requirements are configured. See the docs on [`requirements.toml`](https://developers.openai.com/codex/config-reference#requirementstoml) for details on supported keys and values. + +### Windows sandbox setup (`windowsSandbox/setupStart`) + +Custom Windows clients can trigger sandbox setup asynchronously instead of blocking on startup checks. + +```json +{ "method": "windowsSandbox/setupStart", "id": 53, "params": { "mode": "elevated" } } +{ "id": 53, "result": { "started": true } } +``` + +App-server starts setup in the background and later emits a completion notification: + +```json +{ + "method": "windowsSandbox/setupCompleted", + "params": { "mode": "elevated", "success": true, "error": null } +} +``` + +Modes: + +- `elevated` - run the elevated Windows sandbox setup path. +- `unelevated` - run the legacy setup/preflight path. + +## Events + +Event notifications are the server-initiated stream for thread lifecycles, turn lifecycles, and the items within them. After you start or resume a thread, keep reading the active transport stream for `thread/started`, `thread/archived`, `thread/unarchived`, `thread/closed`, `thread/status/changed`, `turn/*`, `item/*`, and `serverRequest/resolved` notifications. + +### Notification opt-out + +Clients can suppress specific notifications per connection by sending exact method names in `initialize.params.capabilities.optOutNotificationMethods`. 
+ +- Exact-match only: `item/agentMessage/delta` suppresses only that method. +- Unknown method names are ignored. +- Applies to both legacy (`codex/event/*`) and v2 (`thread/*`, `turn/*`, `item/*`, etc.) notifications. +- Doesn't apply to requests, responses, or errors. + +### Fuzzy file search events (experimental) + +The fuzzy file search session API emits per-query notifications: + +- `fuzzyFileSearch/sessionUpdated` - `{ sessionId, query, files }` with the current matches for the active query. +- `fuzzyFileSearch/sessionCompleted` - `{ sessionId }` once indexing and matching for that query completes. + +### Windows sandbox setup events + +- `windowsSandbox/setupCompleted` - `{ mode, success, error }` emitted after a `windowsSandbox/setupStart` request finishes. + +### Turn events + +- `turn/started` - `{ turn }` with the turn id, empty `items`, and `status: "inProgress"`. +- `turn/completed` - `{ turn }` where `turn.status` is `completed`, `interrupted`, or `failed`; failures carry `{ error: { message, codexErrorInfo?, additionalDetails? } }`. +- `turn/diff/updated` - `{ threadId, turnId, diff }` with the latest aggregated unified diff across every file change in the turn. +- `turn/plan/updated` - `{ turnId, explanation?, plan }` whenever the agent shares or changes its plan; each `plan` entry is `{ step, status }` with `status` in `pending`, `inProgress`, or `completed`. +- `thread/tokenUsage/updated` - usage updates for the active thread. + +`turn/diff/updated` and `turn/plan/updated` currently include empty `items` arrays even when item events stream. Use `item/*` notifications as the source of truth for turn items. + +### Items + +`ThreadItem` is the tagged union carried in turn responses and `item/*` notifications. Common item types include: + +- `userMessage` - `{id, content}` where `content` is a list of user inputs (`text`, `image`, or `localImage`). +- `agentMessage` - `{id, text, phase?}` containing the accumulated agent reply. 
When present, `phase` uses Responses API wire values (`commentary`, `final_answer`). +- `plan` - `{id, text}` containing proposed plan text in plan mode. Treat the final `plan` item from `item/completed` as authoritative. +- `reasoning` - `{id, summary, content}` where `summary` holds streamed reasoning summaries and `content` holds raw reasoning blocks. +- `commandExecution` - `{id, command, cwd, status, commandActions, aggregatedOutput?, exitCode?, durationMs?}`. +- `fileChange` - `{id, changes, status}` describing proposed edits; `changes` list `{path, kind, diff}`. +- `mcpToolCall` - `{id, server, tool, status, arguments, result?, error?}`. +- `dynamicToolCall` - `{id, tool, arguments, status, contentItems?, success?, durationMs?}` for client-executed dynamic tool invocations. +- `collabToolCall` - `{id, tool, status, senderThreadId, receiverThreadId?, newThreadId?, prompt?, agentStatus?}`. +- `webSearch` - `{id, query, action?}` for web search requests issued by the agent. +- `imageView` - `{id, path}` emitted when the agent invokes the image viewer tool. +- `enteredReviewMode` - `{id, review}` sent when the reviewer starts. +- `exitedReviewMode` - `{id, review}` emitted when the reviewer finishes. +- `contextCompaction` - `{id}` emitted when Codex compacts the conversation history. + +For `webSearch.action`, the action `type` can be `search` (`query?`, `queries?`), `openPage` (`url?`), or `findInPage` (`url?`, `pattern?`). + +The app server deprecates the legacy `thread/compacted` notification; use the `contextCompaction` item instead. + +All items emit two shared lifecycle events: + +- `item/started` - emits the full `item` when a new unit of work begins; the `item.id` matches the `itemId` used by deltas. +- `item/completed` - sends the final `item` once work finishes; treat this as the authoritative state. + +### Item deltas + +- `item/agentMessage/delta` - appends streamed text for the agent message. +- `item/plan/delta` - streams proposed plan text. 
The final `plan` item may not exactly equal the concatenated deltas. +- `item/reasoning/summaryTextDelta` - streams readable reasoning summaries; `summaryIndex` increments when a new summary section opens. +- `item/reasoning/summaryPartAdded` - marks a boundary between reasoning summary sections. +- `item/reasoning/textDelta` - streams raw reasoning text (when supported by the model). +- `item/commandExecution/outputDelta` - streams stdout/stderr for a command; append deltas in order. +- `item/fileChange/outputDelta` - contains the tool call response of the underlying `apply_patch` tool call. + +## Errors + +If a turn fails, the server emits an `error` event with `{ error: { message, codexErrorInfo?, additionalDetails? } }` and then finishes the turn with `status: "failed"`. When an upstream HTTP status is available, it appears in `codexErrorInfo.httpStatusCode`. + +Common `codexErrorInfo` values include: + +- `ContextWindowExceeded` +- `UsageLimitExceeded` +- `HttpConnectionFailed` (4xx/5xx upstream errors) +- `ResponseStreamConnectionFailed` +- `ResponseStreamDisconnected` +- `ResponseTooManyFailedAttempts` +- `BadRequest`, `Unauthorized`, `SandboxError`, `InternalServerError`, `Other` + +When an upstream HTTP status is available, the server forwards it in `httpStatusCode` on the relevant `codexErrorInfo` variant. + +## Approvals + +Depending on a user's Codex settings, command execution and file changes may require approval. The app-server sends a server-initiated JSON-RPC request to the client, and the client responds with a decision payload. + +- Command execution decisions: `accept`, `acceptForSession`, `decline`, `cancel`, or `{ "acceptWithExecpolicyAmendment": { "execpolicy_amendment": ["cmd", "..."] } }`. +- File change decisions: `accept`, `acceptForSession`, `decline`, `cancel`. + +- Requests include `threadId` and `turnId` - use them to scope UI state to the active conversation. 
+- The server resumes or declines the work and ends the item with `item/completed`. + +### Command execution approvals + +Order of messages: + +1. `item/started` shows the pending `commandExecution` item with `command`, `cwd`, and other fields. +2. `item/commandExecution/requestApproval` includes `itemId`, `threadId`, `turnId`, optional `reason`, optional `command`, optional `cwd`, optional `commandActions`, optional `proposedExecpolicyAmendment`, optional `networkApprovalContext`, and optional `availableDecisions`. When `initialize.params.capabilities.experimentalApi = true`, the payload can also include experimental `additionalPermissions` describing requested per-command sandbox access. Any filesystem paths inside `additionalPermissions` are absolute on the wire. +3. Client responds with one of the command execution approval decisions above. +4. `serverRequest/resolved` confirms that the pending request has been answered or cleared. +5. `item/completed` returns the final `commandExecution` item with `status: completed | failed | declined`. + +When `networkApprovalContext` is present, the prompt is for managed network access (not a general shell-command approval). The current v2 schema exposes the target `host` and `protocol`; clients should render a network-specific prompt and not rely on `command` being a user-meaningful shell command preview. + +Codex groups concurrent network approval prompts by destination (`host`, protocol, and port). The app-server may therefore send one prompt that unblocks multiple queued requests to the same destination, while different ports on the same host are treated separately. + +### File change approvals + +Order of messages: + +1. `item/started` emits a `fileChange` item with proposed `changes` and `status: "inProgress"`. +2. `item/fileChange/requestApproval` includes `itemId`, `threadId`, `turnId`, optional `reason`, and optional `grantRoot`. +3. Client responds with one of the file change approval decisions above. +4. 
`serverRequest/resolved` confirms that the pending request has been answered or cleared. +5. `item/completed` returns the final `fileChange` item with `status: completed | failed | declined`. + +### `tool/requestUserInput` + +When the client responds to `item/tool/requestUserInput`, app-server emits `serverRequest/resolved` with `{ threadId, requestId }`. If the pending request is cleared by turn start, turn completion, or turn interruption before the client answers, the server emits the same notification for that cleanup. + +### Dynamic tool calls (experimental) + +`dynamicTools` on `thread/start` and the corresponding `item/tool/call` request or response flow are experimental APIs. + +When a dynamic tool is invoked during a turn, app-server emits: + +1. `item/started` with `item.type = "dynamicToolCall"`, `status = "inProgress"`, plus `tool` and `arguments`. +2. `item/tool/call` as a server request to the client. +3. The client response payload with returned content items. +4. `item/completed` with `item.type = "dynamicToolCall"`, the final `status`, and any returned `contentItems` or `success` value. + +### MCP tool-call approvals (apps) + +App (connector) tool calls can also require approval. When an app tool call has side effects, the server may elicit approval with `tool/requestUserInput` and options such as **Accept**, **Decline**, and **Cancel**. Destructive tool annotations always trigger approval even when the tool also advertises less-privileged hints. If the user declines or cancels, the related `mcpToolCall` item completes with an error instead of running the tool. + +## Skills + +Invoke a skill by including `$<skill-name>` in the user text input. Add a `skill` input item (recommended) so the server injects full skill instructions instead of relying on the model to resolve the name.
+ +```json +{ + "method": "turn/start", + "id": 101, + "params": { + "threadId": "thread-1", + "input": [ + { + "type": "text", + "text": "$skill-creator Add a new skill for triaging flaky CI." + }, + { + "type": "skill", + "name": "skill-creator", + "path": "/Users/me/.codex/skills/skill-creator/SKILL.md" + } + ] + } +} +``` + +If you omit the `skill` item, the model will still parse the `$<skill-name>` marker and try to locate the skill, which can add latency. + +Example: + +``` +$skill-creator Add a new skill for triaging flaky CI and include step-by-step usage. +``` + +Use `skills/list` to fetch available skills (optionally scoped by `cwds`, with `forceReload`). You can also include `perCwdExtraUserRoots` to scan extra absolute paths as `user` scope for specific `cwd` values. App-server ignores entries whose `cwd` isn't present in `cwds`. `skills/list` may reuse a cached result per `cwd`; set `forceReload: true` to refresh from disk. When present, the server reads `interface` and `dependencies` from `SKILL.json`.
+ +```json +{ "method": "skills/list", "id": 25, "params": { + "cwds": ["/Users/me/project", "/Users/me/other-project"], + "forceReload": true, + "perCwdExtraUserRoots": [ + { + "cwd": "/Users/me/project", + "extraUserRoots": ["/Users/me/shared-skills"] + } + ] +} } +{ "id": 25, "result": { + "data": [{ + "cwd": "/Users/me/project", + "skills": [ + { + "name": "skill-creator", + "description": "Create or update a Codex skill", + "enabled": true, + "interface": { + "displayName": "Skill Creator", + "shortDescription": "Create or update a Codex skill" + }, + "dependencies": { + "tools": [ + { + "type": "env_var", + "value": "GITHUB_TOKEN", + "description": "GitHub API token" + }, + { + "type": "mcp", + "value": "github", + "transport": "streamable_http", + "url": "https://example.com/mcp" + } + ] + } + } + ], + "errors": [] + }] +} } +``` + +To enable or disable a skill by path: + +```json +{ + "method": "skills/config/write", + "id": 26, + "params": { + "path": "/Users/me/.codex/skills/skill-creator/SKILL.md", + "enabled": false + } +} +``` + +## Apps (connectors) + +Use `app/list` to fetch available apps. In the CLI/TUI, `/apps` is the user-facing picker; in custom clients, call `app/list` directly. Each entry includes both `isAccessible` (available to the user) and `isEnabled` (enabled in `config.toml`) so clients can distinguish install/access from local enabled state. App entries can also include optional `branding`, `appMetadata`, and `labels` fields. 
+ +```json +{ "method": "app/list", "id": 50, "params": { + "cursor": null, + "limit": 50, + "threadId": "thread-1", + "forceRefetch": false +} } +{ "id": 50, "result": { + "data": [ + { + "id": "demo-app", + "name": "Demo App", + "description": "Example connector for documentation.", + "logoUrl": "https://example.com/demo-app.png", + "logoUrlDark": null, + "distributionChannel": null, + "branding": null, + "appMetadata": null, + "labels": null, + "installUrl": "https://chatgpt.com/apps/demo-app/demo-app", + "isAccessible": true, + "isEnabled": true + } + ], + "nextCursor": null +} } +``` + +If you provide `threadId`, app feature gating (`features.apps`) uses that thread's config snapshot. When omitted, app-server uses the latest global config. + +`app/list` returns after both accessible apps and directory apps load. Set `forceRefetch: true` to bypass app caches and fetch fresh data. Cache entries are only replaced when refreshes succeed. + +The server also emits `app/list/updated` notifications whenever either source (accessible apps or directory apps) finishes loading. Each notification includes the latest merged app list. + +```json +{ + "method": "app/list/updated", + "params": { + "data": [ + { + "id": "demo-app", + "name": "Demo App", + "description": "Example connector for documentation.", + "logoUrl": "https://example.com/demo-app.png", + "logoUrlDark": null, + "distributionChannel": null, + "branding": null, + "appMetadata": null, + "labels": null, + "installUrl": "https://chatgpt.com/apps/demo-app/demo-app", + "isAccessible": true, + "isEnabled": true + } + ] + } +} +``` + +Invoke an app by inserting `$<app-slug>` in the text input and adding a `mention` input item with the `app://<id>` path (recommended). + +```json +{ + "method": "turn/start", + "id": 51, + "params": { + "threadId": "thread-1", + "input": [ + { + "type": "text", + "text": "$demo-app Pull the latest updates from the team."
+ }, + { + "type": "mention", + "name": "Demo App", + "path": "app://demo-app" + } + ] + } +} +``` + +### Config RPC examples for app settings + +Use `config/read`, `config/value/write`, and `config/batchWrite` to inspect or update app controls in `config.toml`. + +Read the effective app config shape (including `_default` and per-tool overrides): + +```json +{ "method": "config/read", "id": 60, "params": { "includeLayers": false } } +{ "id": 60, "result": { + "config": { + "apps": { + "_default": { + "enabled": true, + "destructive_enabled": true, + "open_world_enabled": true + }, + "google_drive": { + "enabled": true, + "destructive_enabled": false, + "default_tools_approval_mode": "prompt", + "tools": { + "files/delete": { "enabled": false, "approval_mode": "approve" } + } + } + } + } +} } +``` + +Update a single app setting: + +```json +{ + "method": "config/value/write", + "id": 61, + "params": { + "keyPath": "apps.google_drive.default_tools_approval_mode", + "value": "prompt", + "mergeStrategy": "replace" + } +} +``` + +Apply multiple app edits atomically: + +```json +{ + "method": "config/batchWrite", + "id": 62, + "params": { + "edits": [ + { + "keyPath": "apps._default.destructive_enabled", + "value": false, + "mergeStrategy": "upsert" + }, + { + "keyPath": "apps.google_drive.tools.files/delete.approval_mode", + "value": "approve", + "mergeStrategy": "upsert" + } + ] + } +} +``` + +### Detect and import external agent config + +Use `externalAgentConfig/detect` to discover migratable external-agent artifacts, then pass the selected entries to `externalAgentConfig/import`. 
+ +Detection example: + +```json +{ "method": "externalAgentConfig/detect", "id": 63, "params": { + "includeHome": true, + "cwds": ["/Users/me/project"] +} } +{ "id": 63, "result": { + "items": [ + { + "itemType": "AGENTS_MD", + "description": "Import /Users/me/project/CLAUDE.md to /Users/me/project/AGENTS.md.", + "cwd": "/Users/me/project" + }, + { + "itemType": "SKILLS", + "description": "Copy skill folders from /Users/me/.claude/skills to /Users/me/.agents/skills.", + "cwd": null + } + ] +} } +``` + +Import example: + +```json +{ "method": "externalAgentConfig/import", "id": 64, "params": { + "migrationItems": [ + { + "itemType": "AGENTS_MD", + "description": "Import /Users/me/project/CLAUDE.md to /Users/me/project/AGENTS.md.", + "cwd": "/Users/me/project" + } + ] +} } +{ "id": 64, "result": {} } +``` + +Supported `itemType` values are `AGENTS_MD`, `CONFIG`, `SKILLS`, and `MCP_SERVER_CONFIG`. Detection returns only items that still have work to do. For example, AGENTS migration is skipped when `AGENTS.md` already exists and is non-empty, and skill imports do not overwrite existing skill directories. + +## Auth endpoints + +The JSON-RPC auth/account surface exposes request/response methods plus server-initiated notifications (no `id`). Use these to determine auth state, start or cancel logins, logout, and inspect ChatGPT rate limits. + +### Authentication modes + +Codex supports three authentication modes. `account/updated.authMode` shows the active mode, and `account/read` also reports it. + +- **API key (`apikey`)** - the caller supplies an OpenAI API key and Codex stores it for API requests. +- **ChatGPT managed (`chatgpt`)** - Codex owns the ChatGPT OAuth flow, persists tokens, and refreshes them automatically. +- **ChatGPT external tokens (`chatgptAuthTokens`)** - a host app supplies `idToken` and `accessToken` directly. Codex stores these tokens in memory, and the host app must refresh them when asked. 
+ +### API overview + +- `account/read` - fetch current account info; optionally refresh tokens. +- `account/login/start` - begin login (`apiKey`, `chatgpt`, or `chatgptAuthTokens`). +- `account/login/completed` (notify) - emitted when a login attempt finishes (success or error). +- `account/login/cancel` - cancel a pending ChatGPT login by `loginId`. +- `account/logout` - sign out; triggers `account/updated`. +- `account/updated` (notify) - emitted whenever auth mode changes (`authMode`: `apikey`, `chatgpt`, `chatgptAuthTokens`, or `null`). +- `account/chatgptAuthTokens/refresh` (server request) - request fresh externally managed ChatGPT tokens after an authorization error. +- `account/rateLimits/read` - fetch ChatGPT rate limits. +- `account/rateLimits/updated` (notify) - emitted whenever a user's ChatGPT rate limits change. +- `mcpServer/oauthLogin/completed` (notify) - emitted after a `mcpServer/oauth/login` flow finishes; payload includes `{ name, success, error? }`. + +### 1) Check auth state + +Request: + +```json +{ "method": "account/read", "id": 1, "params": { "refreshToken": false } } +``` + +Response examples: + +```json +{ "id": 1, "result": { "account": null, "requiresOpenaiAuth": false } } +``` + +```json +{ "id": 1, "result": { "account": null, "requiresOpenaiAuth": true } } +``` + +```json +{ + "id": 1, + "result": { "account": { "type": "apiKey" }, "requiresOpenaiAuth": true } +} +``` + +```json +{ + "id": 1, + "result": { + "account": { + "type": "chatgpt", + "email": "user@example.com", + "planType": "pro" + }, + "requiresOpenaiAuth": true + } +} +``` + +Field notes: + +- `refreshToken` (boolean): set `true` to force a token refresh in managed ChatGPT mode. In external token mode (`chatgptAuthTokens`), app-server ignores this flag. +- `requiresOpenaiAuth` reflects the active provider; when `false`, Codex can run without OpenAI credentials. + +### 2) Log in with an API key + +1. 
Send: + + ```json + { + "method": "account/login/start", + "id": 2, + "params": { "type": "apiKey", "apiKey": "sk-..." } + } + ``` + +2. Expect: + + ```json + { "id": 2, "result": { "type": "apiKey" } } + ``` + +3. Notifications: + + ```json + { + "method": "account/login/completed", + "params": { "loginId": null, "success": true, "error": null } + } + ``` + + ```json + { "method": "account/updated", "params": { "authMode": "apikey" } } + ``` + +### 3) Log in with ChatGPT (browser flow) + +1. Start: + + ```json + { "method": "account/login/start", "id": 3, "params": { "type": "chatgpt" } } + ``` + + ```json + { + "id": 3, + "result": { + "type": "chatgpt", + "loginId": "", + "authUrl": "https://chatgpt.com/...&redirect_uri=http%3A%2F%2Flocalhost%3A%2Fauth%2Fcallback" + } + } + ``` + +2. Open `authUrl` in a browser; the app-server hosts the local callback. +3. Wait for notifications: + + ```json + { + "method": "account/login/completed", + "params": { "loginId": "", "success": true, "error": null } + } + ``` + + ```json + { "method": "account/updated", "params": { "authMode": "chatgpt" } } + ``` + +### 3b) Log in with externally managed ChatGPT tokens (`chatgptAuthTokens`) + +Use this mode when a host application owns the user's ChatGPT auth lifecycle and supplies tokens directly. + +1. Send: + + ```json + { + "method": "account/login/start", + "id": 7, + "params": { + "type": "chatgptAuthTokens", + "idToken": "", + "accessToken": "" + } + } + ``` + +2. Expect: + + ```json + { "id": 7, "result": { "type": "chatgptAuthTokens" } } + ``` + +3. 
Notifications: + + ```json + { + "method": "account/login/completed", + "params": { "loginId": null, "success": true, "error": null } + } + ``` + + ```json + { + "method": "account/updated", + "params": { "authMode": "chatgptAuthTokens" } + } + ``` + +When the server receives a `401 Unauthorized`, it may request refreshed tokens from the host app: + +```json +{ + "method": "account/chatgptAuthTokens/refresh", + "id": 8, + "params": { "reason": "unauthorized", "previousAccountId": "org-123" } +} +{ "id": 8, "result": { "idToken": "", "accessToken": "" } } +``` + +The server retries the original request after a successful refresh response. Requests time out after about 10 seconds. + +### 4) Cancel a ChatGPT login + +```json +{ "method": "account/login/cancel", "id": 4, "params": { "loginId": "" } } +{ "method": "account/login/completed", "params": { "loginId": "", "success": false, "error": "..." } } +``` + +### 5) Logout + +```json +{ "method": "account/logout", "id": 5 } +{ "id": 5, "result": {} } +{ "method": "account/updated", "params": { "authMode": null } } +``` + +### 6) Rate limits (ChatGPT) + +```json +{ "method": "account/rateLimits/read", "id": 6 } +{ "id": 6, "result": { + "rateLimits": { + "limitId": "codex", + "limitName": null, + "primary": { "usedPercent": 25, "windowDurationMins": 15, "resetsAt": 1730947200 }, + "secondary": null + }, + "rateLimitsByLimitId": { + "codex": { + "limitId": "codex", + "limitName": null, + "primary": { "usedPercent": 25, "windowDurationMins": 15, "resetsAt": 1730947200 }, + "secondary": null + }, + "codex_other": { + "limitId": "codex_other", + "limitName": "codex_other", + "primary": { "usedPercent": 42, "windowDurationMins": 60, "resetsAt": 1730950800 }, + "secondary": null + } + } +} } +{ "method": "account/rateLimits/updated", "params": { + "rateLimits": { + "limitId": "codex", + "primary": { "usedPercent": 31, "windowDurationMins": 15, "resetsAt": 1730948100 } + } +} } +``` + +Field notes: + +- `rateLimits` is the 
backward-compatible single-bucket view. +- `rateLimitsByLimitId` (when present) is the multi-bucket view keyed by metered `limit_id` (for example `codex`). +- `limitId` is the metered bucket identifier. +- `limitName` is an optional user-facing label for the bucket. +- `usedPercent` is current usage within the quota window. +- `windowDurationMins` is the quota window length. +- `resetsAt` is a Unix timestamp (seconds) for the next reset. \ No newline at end of file diff --git a/docs/codex-cli-reference.md b/docs/codex-cli-reference.md new file mode 100644 index 0000000..aae41c7 --- /dev/null +++ b/docs/codex-cli-reference.md @@ -0,0 +1,859 @@ +# Command line options + +export const globalFlagOptions = [ + { + key: "PROMPT", + type: "string", + description: + "Optional text instruction to start the session. Omit to launch the TUI without a pre-filled message.", + }, + { + key: "--image, -i", + type: "path[,path...]", + description: + "Attach one or more image files to the initial prompt. Separate multiple paths with commas or repeat the flag.", + }, + { + key: "--model, -m", + type: "string", + description: + "Override the model set in configuration (for example `gpt-5-codex`).", + }, + { + key: "--oss", + type: "boolean", + defaultValue: "false", + description: + 'Use the local open source model provider (equivalent to `-c model_provider="oss"`). Validates that Ollama is running.', + }, + { + key: "--profile, -p", + type: "string", + description: + "Configuration profile name to load from `~/.codex/config.toml`.", + }, + { + key: "--sandbox, -s", + type: "read-only | workspace-write | danger-full-access", + description: + "Select the sandbox policy for model-generated shell commands.", + }, + { + key: "--ask-for-approval, -a", + type: "untrusted | on-request | never", + description: + "Control when Codex pauses for human approval before running a command. 
`on-failure` is deprecated; prefer `on-request` for interactive runs or `never` for non-interactive runs.", + }, + { + key: "--full-auto", + type: "boolean", + defaultValue: "false", + description: + "Shortcut for low-friction local work: sets `--ask-for-approval on-request` and `--sandbox workspace-write`.", + }, + { + key: "--dangerously-bypass-approvals-and-sandbox, --yolo", + type: "boolean", + defaultValue: "false", + description: + "Run every command without approvals or sandboxing. Only use inside an externally hardened environment.", + }, + { + key: "--cd, -C", + type: "path", + description: + "Set the working directory for the agent before it starts processing your request.", + }, + { + key: "--search", + type: "boolean", + defaultValue: "false", + description: + 'Enable live web search (sets `web_search = "live"` instead of the default `"cached"`).', + }, + { + key: "--add-dir", + type: "path", + description: + "Grant additional directories write access alongside the main workspace. Repeat for multiple paths.", + }, + { + key: "--no-alt-screen", + type: "boolean", + defaultValue: "false", + description: + "Disable alternate screen mode for the TUI (overrides `tui.alternate_screen` for this run).", + }, + { + key: "--enable", + type: "feature", + description: + "Force-enable a feature flag (translates to `-c features.=true`). Repeatable.", + }, + { + key: "--disable", + type: "feature", + description: + "Force-disable a feature flag (translates to `-c features.=false`). Repeatable.", + }, + { + key: "--config, -c", + type: "key=value", + description: + "Override configuration values. Values parse as JSON if possible; otherwise the literal string is used.", + }, +]; + +export const commandOverview = [ + { + key: "codex", + href: "/codex/cli/reference#codex-interactive", + type: "stable", + description: + "Launch the terminal UI. 
Accepts the global flags above plus an optional prompt or image attachments.", + }, + { + key: "codex app-server", + href: "/codex/cli/reference#codex-app-server", + type: "experimental", + description: + "Launch the Codex app server for local development or debugging.", + }, + { + key: "codex app", + href: "/codex/cli/reference#codex-app", + type: "stable", + description: + "Launch the Codex desktop app on macOS, optionally opening a specific workspace path.", + }, + { + key: "codex debug app-server send-message-v2", + href: "/codex/cli/reference#codex-debug-app-server-send-message-v2", + type: "experimental", + description: + "Debug app-server by sending a single V2 message through the built-in test client.", + }, + { + key: "codex apply", + href: "/codex/cli/reference#codex-apply", + type: "stable", + description: + "Apply the latest diff generated by a Codex Cloud task to your local working tree. Alias: `codex a`.", + }, + { + key: "codex cloud", + href: "/codex/cli/reference#codex-cloud", + type: "experimental", + description: + "Browse or execute Codex Cloud tasks from the terminal without opening the TUI. Alias: `codex cloud-tasks`.", + }, + { + key: "codex completion", + href: "/codex/cli/reference#codex-completion", + type: "stable", + description: + "Generate shell completion scripts for Bash, Zsh, Fish, or PowerShell.", + }, + { + key: "codex features", + href: "/codex/cli/reference#codex-features", + type: "stable", + description: + "List feature flags and persistently enable or disable them in `config.toml`.", + }, + { + key: "codex exec", + href: "/codex/cli/reference#codex-exec", + type: "stable", + description: + "Run Codex non-interactively. Alias: `codex e`. 
Stream results to stdout or JSONL and optionally resume previous sessions.", + }, + { + key: "codex execpolicy", + href: "/codex/cli/reference#codex-execpolicy", + type: "experimental", + description: + "Evaluate execpolicy rule files and see whether a command would be allowed, prompted, or blocked.", + }, + { + key: "codex login", + href: "/codex/cli/reference#codex-login", + type: "stable", + description: + "Authenticate Codex using ChatGPT OAuth, device auth, or an API key piped over stdin.", + }, + { + key: "codex logout", + href: "/codex/cli/reference#codex-logout", + type: "stable", + description: "Remove stored authentication credentials.", + }, + { + key: "codex mcp", + href: "/codex/cli/reference#codex-mcp", + type: "experimental", + description: + "Manage Model Context Protocol servers (list, add, remove, authenticate).", + }, + { + key: "codex mcp-server", + href: "/codex/cli/reference#codex-mcp-server", + type: "experimental", + description: + "Run Codex itself as an MCP server over stdio. Useful when another agent consumes Codex.", + }, + { + key: "codex resume", + href: "/codex/cli/reference#codex-resume", + type: "stable", + description: + "Continue a previous interactive session by ID or resume the most recent conversation.", + }, + { + key: "codex fork", + href: "/codex/cli/reference#codex-fork", + type: "stable", + description: + "Fork a previous interactive session into a new thread, preserving the original transcript.", + }, + { + key: "codex sandbox", + href: "/codex/cli/reference#codex-sandbox", + type: "experimental", + description: + "Run arbitrary commands inside Codex-provided macOS seatbelt or Linux sandboxes (Landlock by default, optional bubblewrap pipeline).", + }, +]; + +export const execOptions = [ + { + key: "PROMPT", + type: "string | - (read stdin)", + description: + "Initial instruction for the task. 
Use `-` to pipe the prompt from stdin.", + }, + { + key: "--image, -i", + type: "path[,path...]", + description: + "Attach images to the first message. Repeatable; supports comma-separated lists.", + }, + { + key: "--model, -m", + type: "string", + description: "Override the configured model for this run.", + }, + { + key: "--oss", + type: "boolean", + defaultValue: "false", + description: + "Use the local open source provider (requires a running Ollama instance).", + }, + { + key: "--sandbox, -s", + type: "read-only | workspace-write | danger-full-access", + description: + "Sandbox policy for model-generated commands. Defaults to configuration.", + }, + { + key: "--profile, -p", + type: "string", + description: "Select a configuration profile defined in config.toml.", + }, + { + key: "--full-auto", + type: "boolean", + defaultValue: "false", + description: + "Apply the low-friction automation preset (`workspace-write` sandbox and `on-request` approvals).", + }, + { + key: "--dangerously-bypass-approvals-and-sandbox, --yolo", + type: "boolean", + defaultValue: "false", + description: + "Bypass approval prompts and sandboxing. Dangerous—only use inside an isolated runner.", + }, + { + key: "--cd, -C", + type: "path", + description: "Set the workspace root before executing the task.", + }, + { + key: "--skip-git-repo-check", + type: "boolean", + defaultValue: "false", + description: + "Allow running outside a Git repository (useful for one-off directories).", + }, + { + key: "--ephemeral", + type: "boolean", + defaultValue: "false", + description: "Run without persisting session rollout files to disk.", + }, + { + key: "--output-schema", + type: "path", + description: + "JSON Schema file describing the expected final response shape. 
export const appServerOptions = [
  // Option rows describing the flags of the `codex app-server` subcommand.
  // Each row carries the same fields as the other option tables in this page:
  // `key` (flag name), `type` (accepted values), optional `defaultValue`,
  // and a human-readable `description`.
  {
    // `--listen` picks the transport; the documented default is `stdio://`.
    key: "--listen",
    type: "stdio:// | ws://IP:PORT",
    defaultValue: "stdio://",
    description:
      "Transport listener URL. `ws://` is experimental and intended for development/testing.",
  },
];
export const featuresOptions = [
  // Rows for the `codex features` subcommands. `type` holds the full
  // invocation, so the enable/disable rows must name the positional
  // `<feature>` argument; without it the usage string ends in a dangling
  // space and reads as if the command took no argument.
  {
    key: "List subcommand",
    type: "codex features list",
    description:
      "Show known feature flags, their maturity stage, and their effective state.",
  },
  {
    key: "Enable subcommand",
    type: "codex features enable <feature>",
    description:
      "Persistently enable a feature flag in `config.toml`. Respects the active `--profile` when provided.",
  },
  {
    key: "Disable subcommand",
    type: "codex features disable <feature>",
    description:
      "Persistently disable a feature flag in `config.toml`. Respects the active `--profile` when provided.",
  },
];
export const loginOptions = [
  // Option rows for `codex login`. The first two rows are boolean flags that
  // select an authentication method; the last row documents the
  // `codex login status` subcommand rather than a flag.
  {
    // Reads the key from stdin, so it can be piped in instead of typed.
    key: "--with-api-key",
    type: "boolean",
    description:
      "Read an API key from stdin (for example `printenv OPENAI_API_KEY | codex login --with-api-key`).",
  },
  {
    key: "--device-auth",
    type: "boolean",
    description:
      "Use OAuth device code flow instead of launching a browser window.",
  },
  {
    // Here `type` carries the full subcommand invocation, not a value type.
    key: "status subcommand",
    type: "codex login status",
    description:
      "Print the active authentication mode and exit with 0 when logged in.",
  },
];
export const completionOptions = [
  // Single-row table for `codex completion`: the positional SHELL argument.
  {
    key: "SHELL",
    // Pipe-separated list of the accepted shell names.
    // NOTE(review): the command overview lists only Bash, Zsh, Fish, and
    // PowerShell — confirm the shell names here against the CLI's `--help`.
    type: "bash | zsh | fish | power-shell | elvish",
    defaultValue: "bash",
    description: "Shell to generate completions for. Output prints to stdout.",
  },
];
export const mcpCommands = [
  // Rows for the `codex mcp` subcommands. `key` is the subcommand plus its
  // positional argument and `type` lists the notable flags. Every subcommand
  // except `list` operates on one named server, so the `<name>` placeholder
  // must be present; without it the keys end in a dangling space and the
  // `add` usage (`-- <command...> | --url <value>`) is unreadable.
  {
    key: "list",
    type: "--json",
    description:
      "List configured MCP servers. Add `--json` for machine-readable output.",
  },
  {
    key: "get <name>",
    type: "--json",
    description:
      "Show a specific server configuration. `--json` prints the raw config entry.",
  },
  {
    key: "add <name>",
    type: "-- <command...> | --url <value>",
    description:
      "Register a server using a stdio launcher command or a streamable HTTP URL. Supports `--env KEY=VALUE` for stdio transports.",
  },
  {
    key: "remove <name>",
    description: "Delete a stored MCP server definition.",
  },
  {
    key: "login <name>",
    type: "--scopes scope1,scope2",
    description:
      "Start an OAuth login for a streamable HTTP server (servers that support OAuth only).",
  },
  {
    key: "logout <name>",
    description:
      "Remove stored OAuth credentials for a streamable HTTP server.",
  },
];
Provide after `--`.", + }, + { + key: "--env KEY=VALUE", + type: "repeatable", + description: + "Environment variable assignments applied when launching a stdio server.", + }, + { + key: "--url", + type: "https://…", + description: + "Register a streamable HTTP server instead of stdio. Mutually exclusive with `COMMAND...`.", + }, + { + key: "--bearer-token-env-var", + type: "ENV_VAR", + description: + "Environment variable whose value is sent as a bearer token when connecting to a streamable HTTP server.", + }, +]; + +## How to read this reference + +This page catalogs every documented Codex CLI command and flag. Use the interactive tables to search by key or description. Each section indicates whether the option is stable or experimental and calls out risky combinations. + +The CLI inherits most defaults from `~/.codex/config.toml`. Any + `-c key=value` overrides you pass at the command line take + precedence for that invocation. See [Config + basics](https://developers.openai.com/codex/config-basic#configuration-precedence) for more information. + +## Global flags + + + +These options apply to the base `codex` command and propagate to each subcommand unless a section below specifies otherwise. +When you run a subcommand, place global flags after it (for example, `codex exec --oss ...`) so Codex applies them as intended. + +## Command overview + +The Maturity column uses feature maturity labels such as Experimental, Beta, + and Stable. See [Feature Maturity](https://developers.openai.com/codex/feature-maturity) for how to + interpret these labels. + + + +## Command details + +### `codex` (interactive) + +Running `codex` with no subcommand launches the interactive terminal UI (TUI). The agent accepts the global flags above plus image attachments. Web search defaults to cached mode; use `--search` to switch to live browsing and `--full-auto` to let Codex run most commands without prompts. + +### `codex app-server` + +Launch the Codex app server locally.
This is primarily for development and debugging and may change without notice. + + + +`codex app-server --listen stdio://` keeps the default JSONL-over-stdio behavior. `--listen ws://IP:PORT` enables WebSocket transport (experimental). If you generate schemas for client bindings, add `--experimental` to include gated fields and methods. + +### `codex app` + +Launch Codex Desktop from the terminal on macOS and optionally open a specific workspace path. + + + +`codex app` installs/opens the desktop app on macOS, then opens the provided workspace path. This subcommand is macOS-only. + +### `codex debug app-server send-message-v2` + +Send one message through app-server's V2 thread/turn flow using the built-in app-server test client. + + + +This debug flow initializes with `experimentalApi: true`, starts a thread, sends a turn, and streams server notifications. Use it to reproduce and inspect app-server protocol behavior locally. + +### `codex apply` + +Apply the most recent diff from a Codex cloud task to your local repository. You must authenticate and have access to the task. + + + +Codex prints the patched files and exits non-zero if `git apply` fails (for example, due to conflicts). + +### `codex cloud` + +Interact with Codex cloud tasks from the terminal. The default command opens an interactive picker; `codex cloud exec` submits a task directly, and `codex cloud list` returns recent tasks for scripting or quick inspection. + + + +Authentication follows the same credentials as the main CLI. Codex exits non-zero if the task submission fails. + +#### `codex cloud list` + +List recent cloud tasks with optional filtering and pagination. + + + +Plain-text output prints a task URL followed by status details. Use `--json` for automation. The JSON payload contains a `tasks` array plus an optional `cursor` value. Each task includes `id`, `url`, `title`, `status`, `updated_at`, `environment_id`, `environment_label`, `summary`, `is_review`, and `attempt_total`. 
+ +### `codex completion` + +Generate shell completion scripts and redirect the output to the appropriate location, for example `codex completion zsh > "${fpath[1]}/_codex"`. + + + +### `codex features` + +Manage feature flags stored in `~/.codex/config.toml`. The `enable` and `disable` commands persist changes so they apply to future sessions. When you launch with `--profile`, Codex writes to that profile instead of the root configuration. + + + +### `codex exec` + +Use `codex exec` (or the short form `codex e`) for scripted or CI-style runs that should finish without human interaction. + + + +Codex writes formatted output by default. Add `--json` to receive newline-delimited JSON events (one per state change). The optional `resume` subcommand lets you continue non-interactive tasks. Use `--last` to pick the most recent session from the current working directory, or add `--all` to search across all sessions: + + + +### `codex execpolicy` + +Check `execpolicy` rule files before you save them. `codex execpolicy check` accepts one or more `--rules` flags (for example, files under `~/.codex/rules`) and emits JSON showing the strictest decision and any matching rules. Add `--pretty` to format the output. The `execpolicy` command is currently in preview. + + + +### `codex login` + +Authenticate the CLI with a ChatGPT account or API key. With no flags, Codex opens a browser for the ChatGPT OAuth flow. + + + +`codex login status` exits with `0` when credentials are present, which is helpful in automation scripts. + +### `codex logout` + +Remove saved credentials for both API key and ChatGPT authentication. This command has no flags. + +### `codex mcp` + +Manage Model Context Protocol server entries stored in `~/.codex/config.toml`. + + + +The `add` subcommand supports both stdio and streamable HTTP transports: + + + +OAuth actions (`login`, `logout`) only work with streamable HTTP servers (and only when the server supports OAuth). 
+ +### `codex mcp-server` + +Run Codex as an MCP server over stdio so that other tools can connect. This command inherits global configuration overrides and exits when the downstream client closes the connection. + +### `codex resume` + +Continue an interactive session by ID or resume the most recent conversation. `codex resume` scopes `--last` to the current working directory unless you pass `--all`. It accepts the same global flags as `codex`, including model and sandbox overrides. + + + +### `codex fork` + +Fork a previous interactive session into a new thread. By default, `codex fork` opens the session picker; add `--last` to fork your most recent session instead. + + + +### `codex sandbox` + +Use the sandbox helper to run a command under the same policies Codex uses internally. + +#### macOS seatbelt + + + +#### Linux Landlock + + + +## Flag combinations and safety tips + +- Set `--full-auto` for unattended local work, but avoid combining it with `--dangerously-bypass-approvals-and-sandbox` unless you are inside a dedicated sandbox VM. +- When you need to grant Codex write access to more directories, prefer `--add-dir` rather than forcing `--sandbox danger-full-access`. +- Pair `--json` with `--output-last-message` in CI to capture machine-readable progress and a final natural-language summary. + +## Related resources + +- [Codex CLI overview](https://developers.openai.com/codex/cli): installation, upgrades, and quick tips. +- [Config basics](https://developers.openai.com/codex/config-basic): persist defaults like the model and provider. +- [Advanced Config](https://developers.openai.com/codex/config-advanced): profiles, providers, sandbox tuning, and integrations. +- [AGENTS.md](https://developers.openai.com/codex/guides/agents-md): conceptual overview of Codex agent capabilities and best practices. 
\ No newline at end of file diff --git a/session-logs/LATEST.md b/session-logs/LATEST.md new file mode 100644 index 0000000..2c102bd --- /dev/null +++ b/session-logs/LATEST.md @@ -0,0 +1,10 @@ +# Session Log + +- Timestamp: 2026-04-30T19:45:00Z +- Scope: Adopt the centralized `happycatlabs/codex-review-workflow` reusable workflow for PR reviews. The local copy of the codex-code-review workflow is replaced with a thin caller that delegates to the central repo, so prompt and review-logic updates land in one place across `happycatlabs/*`. +- Verification: + - The PR's `Codex Code Review` check uses the central workflow at `happycatlabs/codex-review-workflow/.github/workflows/codex-code-review.yml@main`. + - `CODEX_AUTH_JSON` is available to the run via `secrets: inherit` (set at both repo and org levels). +- Notes: + - The caller declares `permissions: { contents: read, pull-requests: write, issues: write }` because GitHub bounds a called workflow's job-level permissions by the caller's workflow-level permissions. Without this, a reusable workflow that needs to post sticky review comments fails with a bare `startup_failure`. + - This entry also covers a CI-retrigger commit on the same branch so the husky `session-logs/LATEST.md` freshness check passes. diff --git a/specs/smoke/html-game-planning.md b/specs/smoke/html-game-planning.md new file mode 100644 index 0000000..cdd89b8 --- /dev/null +++ b/specs/smoke/html-game-planning.md @@ -0,0 +1,102 @@ +# Bun HTML Game Planning Smoke + +Manual smoke scenario for exercising: + +- multi-step planning +- pre-execution task-graph review +- consultation before execution +- execution against a local git repo in `tmp/` +- optional multi-agent prompt guidance + +The generated project stays local and gitignored under `tmp/smoke/html-game/`. + +## Workspace setup + +```bash +mkdir -p tmp/smoke/html-game +cd tmp/smoke/html-game +bun init -y +git init +git add . 
+git commit -m "baseline" +``` + +If you want to exercise the multi-agent prompt path, enable one of these before running Orca: + +- set `codex.multiAgent: true` in the Orca config you use for the smoke run +- or ensure `~/.codex/config.toml` contains: + +```toml +[features] +multi_agent = true +``` + +## Local spec file + +Copy the spec below into `tmp/smoke/html-game/SMOKE_SPEC.md`, then run Orca from inside `tmp/smoke/html-game/`. + +```md +# Build a Tiny HTML Arcade Game + +Create a small single-page browser game in this Bun project using plain HTML, CSS, and JavaScript. + +## Goal + +Build a simple arcade-style game that is real enough to require planning and task coordination, but still small enough to finish in one run. + +## Requirements + +- Use vanilla HTML, CSS, and JavaScript only. No framework. +- The game must be playable in the browser from local files or a tiny local static server. +- Include a visible start state, active gameplay state, and game-over state. +- Include a restart flow so the player can immediately play again after losing. +- Include score tracking that visibly updates during play. +- Include keyboard input for movement. +- Include moving obstacles, enemies, or hazards with collision detection. +- Include lightweight styling so the game feels intentional, not raw browser defaults. +- Include a short on-screen explanation of controls and objective. +- Keep code organized enough that multiple implementation tasks could reasonably be split across files or concerns. + +## Suggested game shape + +Aim for a tiny dodge-or-collect game such as: + +- move a player square or ship +- avoid falling hazards or collect targets +- increase score over time or on pickups +- end the run on collision + +You do not need sound, assets, backend code, or external libraries. + +## Verification + +- Add a lightweight local verification step and run it before finishing. 
+- The verification can be a small Bun test, a script, or another local check that proves the required files and core game states exist. +- Keep verification simple and local. + +## Acceptance criteria + +- The project contains the files needed to run the game locally. +- A human can open the game and play it with the keyboard. +- Score, collision handling, game over, and restart all work. +- The local verification step passes. +- Keep the implementation simple and behavior-preserving relative to the spec. +``` + +## Manual run flow + +From `tmp/smoke/html-game/`: + +```bash +orca plan --spec ./SMOKE_SPEC.md +orca run --spec ./SMOKE_SPEC.md +``` + +## Manual acceptance + +- `orca plan` should produce a multi-task graph, not the single fallback execution task. +- The planned graph should show clear ownership boundaries and only necessary dependencies. +- If multi-agent is enabled, the graph should favor safe parallelizable task breakdown instead of bundled do-everything tasks. +- The review/consultation steps should complete without a hard blocking failure. +- `orca run` should finish with a playable local HTML game. +- The verification step created by the run should pass. 
diff --git a/src/agents/codex/codex-path.test.ts b/src/agents/codex/codex-path.test.ts new file mode 100644 index 0000000..9ff7a96 --- /dev/null +++ b/src/agents/codex/codex-path.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, test } from "bun:test"; + +import { + clearResolvedCodexPathCacheForTests, + compareCodexCliVersions, + parseCodexCliVersion, + selectPreferredCodexBinary, +} from "./codex-path.js"; + +describe("codex-path", () => { + test("parseCodexCliVersion handles stable releases", () => { + expect(parseCodexCliVersion("codex-cli 0.77.0")).toEqual({ + major: 0, + minor: 77, + patch: 0, + prerelease: [], + raw: "codex-cli 0.77.0", + }); + }); + + test("parseCodexCliVersion handles prereleases", () => { + expect(parseCodexCliVersion("codex-cli 0.115.0-alpha.4")).toEqual({ + major: 0, + minor: 115, + patch: 0, + prerelease: ["alpha", 4], + raw: "codex-cli 0.115.0-alpha.4", + }); + }); + + test("compareCodexCliVersions prefers newer minors", () => { + const older = parseCodexCliVersion("codex-cli 0.77.0"); + const newer = parseCodexCliVersion("codex-cli 0.115.0-alpha.4"); + + expect(older).not.toBeNull(); + expect(newer).not.toBeNull(); + expect(compareCodexCliVersions(newer!, older!)).toBeGreaterThan(0); + }); + + test("compareCodexCliVersions prefers stable over prerelease for same numeric version", () => { + const prerelease = parseCodexCliVersion("codex-cli 0.115.0-alpha.4"); + const stable = parseCodexCliVersion("codex-cli 0.115.0"); + + expect(prerelease).not.toBeNull(); + expect(stable).not.toBeNull(); + expect(compareCodexCliVersions(stable!, prerelease!)).toBeGreaterThan(0); + }); + + test("selectPreferredCodexBinary prefers the newest parsed version", () => { + expect( + selectPreferredCodexBinary([ + { path: "/usr/local/bin/codex", versionOutput: "codex-cli 0.77.0" }, + { + path: "/Applications/Codex.app/Contents/Resources/codex", + versionOutput: "codex-cli 0.115.0-alpha.4", + }, + ]), + 
/**
 * Structured form of a `codex-cli X.Y.Z[-prerelease]` version string.
 */
export interface ParsedCodexCliVersion {
  /** First numeric component of the version. */
  major: number;
  /** Second numeric component of the version. */
  minor: number;
  /** Third numeric component of the version. */
  patch: number;
  /**
   * Dot-separated prerelease identifiers in order. All-digit identifiers are
   * stored as numbers and everything else as strings (e.g. `["alpha", 4]`).
   * Empty for a stable release.
   */
  prerelease: Array<string | number>;
  /** The exact text the version was parsed from. */
  raw: string;
}
/**
 * Orders two parsed Codex CLI versions following SemVer precedence rules.
 *
 * Precedence is decided by `major`, then `minor`, then `patch`. When those are
 * equal, a stable release (no prerelease identifiers) outranks any prerelease,
 * and two prereleases are compared identifier by identifier.
 *
 * @param a - Left-hand version.
 * @param b - Right-hand version.
 * @returns A negative number when `a` is older than `b`, a positive number
 *   when `a` is newer, and `0` when both have the same precedence.
 */
export function compareCodexCliVersions(a: ParsedCodexCliVersion, b: ParsedCodexCliVersion): number {
  for (const key of ["major", "minor", "patch"] as const) {
    if (a[key] !== b[key]) {
      return a[key] - b[key];
    }
  }

  const aIsStable = a.prerelease.length === 0;
  const bIsStable = b.prerelease.length === 0;
  if (aIsStable || bIsStable) {
    // Same numeric version: stable beats prerelease, stable ties with stable.
    if (aIsStable && bIsStable) {
      return 0;
    }
    return aIsStable ? 1 : -1;
  }

  const maxLength = Math.max(a.prerelease.length, b.prerelease.length);
  for (let index = 0; index < maxLength; index += 1) {
    const left = a.prerelease[index];
    const right = b.prerelease[index];

    // With an equal prefix, the version with fewer identifiers is the older one.
    if (left === undefined) {
      return -1;
    }
    if (right === undefined) {
      return 1;
    }

    if (left === right) {
      continue;
    }

    if (typeof left === "number" && typeof right === "number") {
      return left - right;
    }

    // Numeric identifiers always rank below alphanumeric ones.
    if (typeof left === "number") {
      return -1;
    }
    if (typeof right === "number") {
      return 1;
    }

    // Alphanumeric identifiers compare in ASCII order. `localeCompare` is
    // deliberately avoided: its result depends on the host locale and it
    // orders case differently from ASCII (e.g. it puts "a" before "Z").
    return left < right ? -1 : 1;
  }

  return 0;
}
""); + if (!bestVersion) { + if (candidateVersion) { + best = candidate; + bestVersion = candidateVersion; + } + continue; + } + + if (!candidateVersion) { + continue; + } + + if (compareCodexCliVersions(candidateVersion, bestVersion) > 0) { + best = candidate; + bestVersion = candidateVersion; + } + } + + return best.path; +} + +function resolveCodexPathOnPath(): string | null { + const pathValue = process.env.PATH?.trim(); + if (!pathValue) { + return null; + } + + for (const entry of pathValue.split(path.delimiter)) { + const trimmed = entry.trim(); + if (trimmed.length === 0) { + continue; + } + + const candidatePath = path.join(trimmed, "codex"); + try { + accessSync(candidatePath, fsConstants.X_OK); + return candidatePath; + } catch { + continue; + } + } + + return null; +} + +function getCandidatePaths(): string[] { + return Array.from( + new Set( + [resolveCodexPathOnPath(), ...KNOWN_CODEX_BINARY_CANDIDATES].filter( + (value): value is string => typeof value === "string" && value.trim().length > 0, + ), + ), + ); +} + +async function isExecutable(filePath: string): Promise { + try { + await access(filePath, fsConstants.X_OK); + return true; + } catch { + return false; + } +} + +async function readCodexCliVersion(filePath: string): Promise { + try { + const { stdout, stderr } = await execFile(filePath, ["--version"], { + timeout: 1_500, + }); + const output = `${stdout ?? ""}\n${stderr ?? ""}`.trim(); + return output.length > 0 ? 
output : null; + } catch { + return null; + } +} + +async function autoResolveCodexPath(): Promise { + const candidates = getCandidatePaths(); + const available = await Promise.all( + candidates.map(async (candidatePath) => { + if (!(await isExecutable(candidatePath))) { + return null; + } + + return { + path: candidatePath, + versionOutput: await readCodexCliVersion(candidatePath), + } satisfies CodexBinaryProbe; + }), + ); + + const preferred = selectPreferredCodexBinary( + available.filter((probe): probe is CodexBinaryProbe => probe !== null), + ); + + return preferred ?? FALLBACK_CODEX_PATH; +} + +export async function resolveCodexPath(): Promise { + const explicitPath = process.env.ORCA_CODEX_PATH?.trim(); + if (explicitPath && explicitPath.length > 0) { + return explicitPath; + } + + cachedResolvedCodexPath ??= autoResolveCodexPath(); + return cachedResolvedCodexPath; +} diff --git a/src/agents/codex/session.test.ts b/src/agents/codex/session.test.ts index 78fa17f..52b3a45 100644 --- a/src/agents/codex/session.test.ts +++ b/src/agents/codex/session.test.ts @@ -6,8 +6,8 @@ import { createCodexSession } from "./session.js"; // Try common locations for the codex binary const CODEX_PATHS = [ + "/Applications/Codex.app/Contents/Resources/codex", Bun.which("codex"), - `${process.env.HOME}/.nvm/versions/node/v22.22.0/bin/codex`, "/opt/homebrew/bin/codex", "/usr/local/bin/codex", ].filter(Boolean) as string[]; diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index 107bdad..c907919 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -1,19 +1,36 @@ +import { readFile, unlink } from "node:fs/promises"; import path from "node:path"; import { CodexClient } from "@ratley/codex-client"; -import type { CompletedTurn } from "@ratley/codex-client"; +import type { + CompletedTurn, + RequestId, + ToolRequestUserInputParams, + ToolRequestUserInputResponse, +} from "@ratley/codex-client"; import type { + HookEvent, OrcaConfig, 
PlanResult, + RunId, Task, TaskExecutionResult, TaskGraphReviewOperation, TaskGraphReviewResult } from "../../types/index.js"; +import { isCodexMultiAgentActive } from "../../core/codex-config.js"; +import { + buildQuestionHookMessage, + createPendingQuestion, + parseQuestionAnswerInput, +} from "../../core/question-flow.js"; import { TaskGraphReviewPayloadSchema } from "../../core/task-graph-review.js"; +import { RunStore } from "../../state/store.js"; import type { CodexEffort } from "../../types/effort.js"; import { loadSkills, type LoadedSkill } from "../../utils/skill-loader.js"; +import { logger } from "../../utils/logger.js"; +import { resolveCodexPath } from "./codex-path.js"; export type { PlanResult, TaskExecutionResult }; @@ -28,12 +45,26 @@ function getCodeSimplifierGuidance(): string[] { ]; } -function buildPlanningPrompt(spec: string, systemContext: string): string { +function getMultiAgentPlanningGuidance(multiAgentActive: boolean): string[] { + if (!multiAgentActive) { + return []; + } + + return [ + "Codex multi-agent mode is enabled for this run. 
Shape the task graph so safe subagent parallelization is obvious.", + "Assign clear file or subsystem ownership per task so subagents do not step on each other.", + "Only add dependencies that are truly required for correctness.", + "Do not bundle unrelated work into a single do-everything task when it can be safely split.", + ]; +} + +function buildPlanningPrompt(spec: string, systemContext: string, multiAgentActive: boolean): string { return [ systemContext, "You are decomposing a spec into an ordered task graph.", "Prefer task decomposition that maximizes safe parallelism for independent workstreams.", "Isolate task ownership (files/subsystems) to avoid cross-task collisions.", + ...getMultiAgentPlanningGuidance(multiAgentActive), ...getCodeSimplifierGuidance(), "Return a JSON array of tasks.", "Each task must include fields: id, name, description, dependencies, acceptance_criteria, status, retries, maxRetries.", @@ -51,11 +82,20 @@ function buildTaskExecutionPrompt( runId: string, cwd: string, systemContext?: string, + multiAgentActive = false, ): string { return [ ...(systemContext ? [systemContext] : []), "You are Orca's task execution assistant.", ...getCodeSimplifierGuidance(), + ...(multiAgentActive + ? 
[ + "Codex multi-agent mode is enabled for this run.", + "If this task contains clearly independent subtasks with disjoint ownership, use subagents to parallelize them.", + "Do not use subagents for tightly coupled, blocking, or highly stateful work.", + "Integrate subagent results yourself before final completion.", + ] + : []), `Run ID: ${runId}`, `Repository CWD: ${cwd}`, `Task ID: ${task.id}`, @@ -88,11 +128,20 @@ function buildPlanDecisionPrompt(spec: string, systemContext: string): string { ].join("\n\n"); } -function buildTaskGraphReviewPrompt(tasks: Task[], systemContext: string): string { +function buildTaskGraphReviewPrompt(tasks: Task[], systemContext: string, multiAgentActive: boolean): string { return [ systemContext, "You are Orca's pre-execution task-graph reviewer.", ...getCodeSimplifierGuidance(), + ...(multiAgentActive + ? [ + "Codex multi-agent mode is enabled for this run. Review the graph for safe subagent parallelization.", + "Split independent work into separate tasks when subagents could execute it in parallel.", + "Remove fake dependencies that unnecessarily serialize independent work.", + "Flag ownership collisions where multiple tasks would touch the same files or subsystem without coordination.", + "Add coordination tasks when parallel work needs a final integration step.", + ] + : []), "Return JSON matching this shape exactly: {\"changes\":[...operations...]}", "Allowed operation shapes:", "- {\"op\":\"update_task\",\"taskId\":\"...\",\"fields\":{\"name\"?:string,\"description\"?:string,\"acceptance_criteria\"?:string[]}}", @@ -106,6 +155,32 @@ function buildTaskGraphReviewPrompt(tasks: Task[], systemContext: string): strin ].join("\n\n"); } +function buildTaskGraphConsultationPrompt(tasks: Task[], multiAgentActive: boolean): string { + const taskGraphJson = JSON.stringify(tasks, null, 2); + + return [ + "Review this Orca task graph before execution.", + "Flag any: missing steps, wrong dependency order, tasks that are underdefined, or 
potential blockers.", + ...(multiAgentActive + ? [ + "", + "Codex multi-agent mode is enabled for this run.", + "Treat missed safe parallelism, fake dependencies, overlapping ownership, or missing integration tasks as review concerns.", + "Flag tasks that should be split for safe subagent execution, or tasks that would cause subagents to step on each other.", + ] + : []), + "", + "Set ok: false ONLY if there is a hard blocking issue — dependency cycle, circular reference, a task that cannot possibly run as defined, or a critical missing step that would cause the run to fail.", + "For minor issues (ambiguous wording, style preferences, nice-to-haves): list them in issues but set ok: true.", + "If the graph looks generally reasonable and executable, set ok: true even if you have minor suggestions.", + "", + "Be brief. Output JSON on the last line: { \"issues\": [...], \"ok\": boolean }", + "", + "Task graph:", + taskGraphJson, + ].join("\n"); +} + function parseTaskGraphReview(raw: string): TaskGraphReviewResult { const parsed = JSON.parse(extractJson(raw)) as unknown; const result = TaskGraphReviewPayloadSchema.safeParse(parsed); @@ -289,21 +364,17 @@ function getModel(config?: OrcaConfig): string { return config?.codex?.model ?? process.env.ORCA_CODEX_MODEL ?? "gpt-5.3-codex"; } -function getCodexPath(): string { - return ( - process.env.ORCA_CODEX_PATH ?? 
- `${process.env.HOME}/.nvm/versions/node/v22.22.0/bin/codex` - ); -} - -type ThinkingStep = "decision" | "planning" | "execution"; +type ThinkingStep = "decision" | "planning" | "review" | "execution"; const DEFAULT_THINKING_BY_STEP: Record = { decision: "low", planning: "high", + review: "high", execution: "medium", }; +const ANSWER_FILE_POLL_MS = 500; + function getEffort(config: OrcaConfig | undefined, step: ThinkingStep): CodexEffort { const explicitThinkingLevel = config?.codex?.thinkingLevel?.[step]; if (explicitThinkingLevel !== undefined) { @@ -317,15 +388,26 @@ function getEffort(config: OrcaConfig | undefined, step: ThinkingStep): CodexEff return DEFAULT_THINKING_BY_STEP[step]; } -function buildTurnInput(text: string, skills: LoadedSkill[]): Array<{ type: "text"; text: string } | { type: "skill"; name: string; path: string }> { - return [ - { type: "text", text }, - ...skills.map((skill) => ({ - type: "skill" as const, - name: skill.name, - path: skill.dirPath, - })), - ]; +function buildTurnInput(text: string, skills: LoadedSkill[]): Array<{ type: "text"; text: string }> { + const usableSkills = skills.filter((skill) => skill.body.trim().length > 0); + if (usableSkills.length === 0) { + return [{ type: "text", text }]; + } + + const skillContext = usableSkills.map((skill) => [ + `Skill: ${skill.name}`, + `Source: ${skill.filePath}`, + skill.body.trim(), + ].join("\n")).join("\n\n"); + + return [{ + type: "text", + text: [ + text, + "Referenced Orca skills:", + skillContext, + ].join("\n\n"), + }]; } interface RawSkill { @@ -367,22 +449,36 @@ function getPerCwdExtraUserRootsForCwd(config: OrcaConfig | undefined, cwd: stri } async function loadCodexListedSkills(client: CodexClient, cwd: string, config?: OrcaConfig): Promise { - const maybeRequest = Reflect.get(client as object, "request"); - if (typeof maybeRequest !== "function") { - return []; - } - - const request = maybeRequest as (this: unknown, method: string, params?: unknown, timeoutMs?: number) 
=> Promise; - const perCwdExtraUserRoots = getPerCwdExtraUserRootsForCwd(config, cwd); let response: unknown; try { - response = await request.call(client, "skills/list", { - cwds: [cwd], - forceReload: true, - ...(perCwdExtraUserRoots.length > 0 ? { perCwdExtraUserRoots } : {}), - }); + const maybeListSkills = Reflect.get(client as object, "listSkills"); + if (typeof maybeListSkills === "function") { + response = await maybeListSkills.call(client, { + cwds: [cwd], + forceReload: true, + ...(perCwdExtraUserRoots.length > 0 ? { perCwdExtraUserRoots } : {}), + }); + } else { + const maybeRequest = Reflect.get(client as object, "request"); + if (typeof maybeRequest !== "function") { + return []; + } + + const request = maybeRequest as ( + this: unknown, + method: string, + params?: unknown, + timeoutMs?: number + ) => Promise; + + response = await request.call(client, "skills/list", { + cwds: [cwd], + forceReload: true, + ...(perCwdExtraUserRoots.length > 0 ? { perCwdExtraUserRoots } : {}), + }); + } } catch { return []; } @@ -408,10 +504,17 @@ async function loadCodexListedSkills(client: CodexClient, cwd: string, config?: continue; } + let skillBody = ""; + try { + skillBody = await readFile(normalizedSkillPath, "utf8"); + } catch { + skillBody = ""; + } + discovered.push({ name: skill.name, description: "", - body: "", + body: skillBody, dirPath: path.dirname(normalizedSkillPath), filePath: normalizedSkillPath, }); @@ -459,6 +562,132 @@ async function resolveTurnSkills(client: CodexClient, config: OrcaConfig | undef return [...mergedByName.values()]; } +function extractUnknownFeatureKey(line: string): string | null { + const match = line.match(/unknown feature key in config:\s*([A-Za-z0-9_.-]+)/i); + return match?.[1] ?? 
null; +} + +function isIgnorableMcpStderrLine(line: string): boolean { + return ( + line.includes("codex_rmcp_client::oauth: failed to read OAuth tokens from keyring") || + line.includes("rmcp::transport::worker: worker quit with fatal: Transport channel closed, when AuthRequired(") || + line.includes("codex_core::mcp_connection_manager: Failed to list resources for MCP server") || + line.includes("codex_core::mcp_connection_manager: Failed to list resource templates for MCP server") || + line.includes("codex_core::shell_snapshot: Failed to delete shell snapshot") || + line.includes("codex_rmcp_client::rmcp_client: Failed to kill MCP process group") || + line.includes("codex_protocol::openai_models: Model personality requested but model_messages is missing") + ); +} + +function attachCodexStderrDiagnostics(client: CodexClient, codexPath: string): void { + const on = Reflect.get(client as object, "on"); + if (typeof on !== "function") { + return; + } + + const reportedLines = new Set(); + const reportedUnsupportedFeatures = new Set(); + + on.call(client, "stderr", (payload: unknown) => { + const line = String(payload).trim(); + if (line.length === 0) { + return; + } + + const unsupportedFeature = extractUnknownFeatureKey(line); + if (unsupportedFeature) { + if (!reportedUnsupportedFeatures.has(unsupportedFeature)) { + reportedUnsupportedFeatures.add(unsupportedFeature); + logger.warn( + `Codex binary ${codexPath} does not support feature '${unsupportedFeature}'. 
Orca will continue, but you should update Codex or point ORCA_CODEX_PATH at a newer binary.`, + ); + } + return; + } + + if (isIgnorableMcpStderrLine(line)) { + return; + } + + if (reportedLines.has(line)) { + return; + } + + reportedLines.add(line); + logger.warn(`Codex app-server: ${line}`); + }); +} + +async function warnAboutUnavailableMcpServers(client: CodexClient): Promise { + const request = Reflect.get(client as object, "request"); + if (typeof request !== "function") { + return; + } + + let response: unknown; + try { + response = await request.call(client, "mcpServerStatus/list", { limit: 50 }, 10_000); + } catch { + return; + } + + if (!response || typeof response !== "object" || !("data" in response) || !Array.isArray(response.data)) { + return; + } + + const unavailableServers = response.data + .filter((entry): entry is { name: string; authStatus: string } => + !!entry && + typeof entry === "object" && + typeof (entry as { name?: unknown }).name === "string" && + typeof (entry as { authStatus?: unknown }).authStatus === "string", + ) + .filter((entry) => entry.authStatus === "notLoggedIn") + .map((entry) => entry.name); + + if (unavailableServers.length === 0) { + return; + } + + const loginCommands = unavailableServers.map((name) => `codex mcp login ${name}`).join(" ; "); + logger.warn( + `Configured Codex MCP servers need login and will be unavailable for this Orca run: ${unavailableServers.join(", ")}. Orca will continue without them. Run ${loginCommands} or disable them in ~/.codex/config.toml if you do not need them.`, + ); +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + +async function appendRunError( + store: RunStore, + runId: RunId, + message: string, + taskId?: string, +): Promise { + const run = await store.getRun(runId); + if (!run) { + return; + } + + await store.updateRun(runId, { + errors: [...run.errors, { at: new Date().toISOString(), message, ...(taskId ? 
{ taskId } : {}) }], + }); +} + +async function clearAnswerFile(store: RunStore, runId: RunId): Promise { + const answerPath = path.join(store.getRunDir(runId), "answer.txt"); + await unlink(answerPath).catch(() => undefined); +} + +export interface SessionInteractionContext { + runId: RunId; + store: RunStore; + emitHook?: (event: HookEvent) => Promise; +} + /** * Create a persistent Codex session. The thread persists across calls — * planSpec and executeTask share context within the same session. @@ -471,6 +700,7 @@ export interface ConsultationResult { export async function createCodexSession( cwd: string, config?: OrcaConfig, + interactionContext?: SessionInteractionContext, ): Promise<{ decidePlanningNeed: (spec: string, systemContext: string) => Promise; planSpec: (spec: string, systemContext: string) => Promise; @@ -478,19 +708,170 @@ export async function createCodexSession( executeTask: (task: Task, runId: string, systemContext?: string) => Promise; consultTaskGraph: (tasks: Task[]) => Promise; reviewChanges: (threadId?: string) => Promise; - runPrompt: (prompt: string) => Promise; + runPrompt: (prompt: string, step?: ThinkingStep) => Promise; disconnect: () => Promise; threadId: string; }> { + const multiAgentActive = await isCodexMultiAgentActive(config); + const codexPath = await resolveCodexPath(); + const client = new CodexClient({ - codexPath: getCodexPath(), + codexPath, model: getModel(config), cwd, approvalPolicy: "never", sandbox: "workspace-write", }); + attachCodexStderrDiagnostics(client, codexPath); await client.connect(); + await warnAboutUnavailableMcpServers(client); + + let activeTaskContext: { taskId: string; taskName: string } | undefined; + + const respondToUserInputRequest = (requestId: RequestId, response: ToolRequestUserInputResponse): void => { + const specificResponder = Reflect.get(client as object, "respondToUserInputRequest"); + if (typeof specificResponder === "function") { + specificResponder.call(client, requestId, 
response); + return; + } + + const genericResponder = Reflect.get(client as object, "respondToServerRequest"); + if (typeof genericResponder === "function") { + genericResponder.call(client, requestId, response); + return; + } + + throw new Error("Codex client does not support responding to server requests"); + }; + + const rejectUserInputRequest = (requestId: RequestId, message: string): void => { + const rejector = Reflect.get(client as object, "rejectServerRequest"); + if (typeof rejector === "function") { + rejector.call(client, requestId, { code: -32603, message }); + return; + } + + throw new Error("Codex client does not support rejecting server requests"); + }; + + const clearPendingQuestion = async (requestId: RequestId, overallStatus: "running" | "waiting_for_answer"): Promise => { + if (!interactionContext) { + return; + } + + const currentRun = await interactionContext.store.getRun(interactionContext.runId); + if (!currentRun || currentRun.pendingQuestion?.requestId !== requestId) { + return; + } + + await interactionContext.store.updateRun(interactionContext.runId, { + overallStatus, + pendingQuestion: undefined, + }); + }; + + const on = Reflect.get(client as object, "on"); + if (typeof on === "function") { + on.call( + client, + "request:userInput", + (request: { requestId: RequestId } & ToolRequestUserInputParams) => { + void (async () => { + if (!interactionContext) { + rejectUserInputRequest( + request.requestId, + "Orca cannot answer Codex requestUserInput prompts without an interactive run context.", + ); + return; + } + + const pendingQuestion = createPendingQuestion(request.requestId, request); + await clearAnswerFile(interactionContext.store, interactionContext.runId); + await interactionContext.store.updateRun(interactionContext.runId, { + overallStatus: "waiting_for_answer", + pendingQuestion, + }); + + if (interactionContext.emitHook) { + await interactionContext.emitHook({ + runId: interactionContext.runId, + hook: "onQuestion", + message: 
buildQuestionHookMessage(pendingQuestion), + timestamp: pendingQuestion.receivedAt, + requestId: pendingQuestion.requestId, + threadId: pendingQuestion.threadId, + turnId: pendingQuestion.turnId, + itemId: pendingQuestion.itemId, + questions: pendingQuestion.questions, + ...(activeTaskContext + ? { taskId: activeTaskContext.taskId, taskName: activeTaskContext.taskName } + : {}), + metadata: { + questionCount: pendingQuestion.questions.length, + }, + }); + } + + const answerPath = path.join(interactionContext.store.getRunDir(interactionContext.runId), "answer.txt"); + + while (true) { + const currentRun = await interactionContext.store.getRun(interactionContext.runId); + if (!currentRun) { + rejectUserInputRequest(request.requestId, `Run not found while waiting for answer: ${interactionContext.runId}`); + return; + } + + if (currentRun.overallStatus === "cancelled") { + rejectUserInputRequest(request.requestId, `Run ${interactionContext.runId} was cancelled while waiting for input.`); + await clearPendingQuestion(request.requestId, "waiting_for_answer"); + return; + } + + let rawAnswer: string; + try { + rawAnswer = await readFile(answerPath, "utf8"); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + await sleep(ANSWER_FILE_POLL_MS); + continue; + } + + throw error; + } + + try { + const parsedAnswer = parseQuestionAnswerInput(rawAnswer, pendingQuestion); + respondToUserInputRequest(request.requestId, parsedAnswer); + await clearAnswerFile(interactionContext.store, interactionContext.runId); + await clearPendingQuestion(request.requestId, "running"); + return; + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + logger.warn(`Invalid answer for run ${interactionContext.runId}; waiting for another response (${message})`); + await appendRunError( + interactionContext.store, + interactionContext.runId, + `invalid-answer: ${message}`, + activeTaskContext?.taskId, + ); + await clearAnswerFile(interactionContext.store, interactionContext.runId); + } + } + })().catch(async (error) => { + const message = error instanceof Error ? error.message : String(error); + logger.warn(`Failed while handling Codex requestUserInput: ${message}`); + if (interactionContext) { + await appendRunError(interactionContext.store, interactionContext.runId, `request-user-input-failed: ${message}`, activeTaskContext?.taskId); + } + }); + }, + ); + + on.call(client, "serverRequest:resolved", (notification: { requestId: RequestId }) => { + void clearPendingQuestion(notification.requestId, "running"); + }); + } let skills: LoadedSkill[]; let threadId: string; @@ -524,7 +905,7 @@ export async function createCodexSession( const result = await client.runTurn({ threadId, effort: getEffort(config, "planning"), - input: buildTurnInput(buildPlanningPrompt(spec, systemContext), skills), + input: buildTurnInput(buildPlanningPrompt(spec, systemContext, multiAgentActive), skills), }); const rawResponse = extractAgentText(result); @@ -538,8 +919,8 @@ export async function createCodexSession( async reviewTaskGraph(tasks: Task[], systemContext: string): Promise { const result = await client.runTurn({ threadId, - effort: getEffort(config, "planning"), - input: buildTurnInput(buildTaskGraphReviewPrompt(tasks, systemContext), skills), + effort: getEffort(config, "review"), + input: buildTurnInput(buildTaskGraphReviewPrompt(tasks, systemContext, multiAgentActive), skills), }); const rawResponse = extractAgentText(result); @@ -551,11 +932,17 @@ export async function createCodexSession( runId: string, systemContext?: string, ): Promise { - const result = await client.runTurn({ - threadId, - 
effort: getEffort(config, "execution"), - input: buildTurnInput(buildTaskExecutionPrompt(task, runId, cwd, systemContext), skills), - }); + activeTaskContext = { taskId: task.id, taskName: task.name }; + let result: CompletedTurn; + try { + result = await client.runTurn({ + threadId, + effort: getEffort(config, "execution"), + input: buildTurnInput(buildTaskExecutionPrompt(task, runId, cwd, systemContext, multiAgentActive), skills), + }); + } finally { + activeTaskContext = undefined; + } const rawResponse = extractAgentText(result); @@ -580,24 +967,11 @@ export async function createCodexSession( }, async consultTaskGraph(tasks: Task[]): Promise { - const taskGraphJson = JSON.stringify(tasks, null, 2); - const prompt = [ - "Review this Orca task graph before execution.", - "Flag any: missing steps, wrong dependency order, tasks that are underdefined, or potential blockers.", - "", - "Set ok: false ONLY if there is a hard blocking issue — dependency cycle, circular reference, a task that cannot possibly run as defined, or a critical missing step that would cause the run to fail.", - "For minor issues (ambiguous wording, style preferences, nice-to-haves): list them in issues but set ok: true.", - "If the graph looks generally reasonable and executable, set ok: true even if you have minor suggestions.", - "", - "Be brief. 
Output JSON on the last line: { \"issues\": [...], \"ok\": boolean }", - "", - "Task graph:", - taskGraphJson, - ].join("\n"); + const prompt = buildTaskGraphConsultationPrompt(tasks, multiAgentActive); const result = await client.runTurn({ threadId, - effort: getEffort(config, "decision"), + effort: getEffort(config, "review"), input: buildTurnInput(prompt, skills), }); @@ -628,10 +1002,10 @@ export async function createCodexSession( return result.reviewText; }, - async runPrompt(prompt: string): Promise { + async runPrompt(prompt: string, step: ThinkingStep = "execution"): Promise { const result = await client.runTurn({ threadId, - effort: getEffort(config, "execution"), + effort: getEffort(config, step), input: buildTurnInput(prompt, skills), }); @@ -653,8 +1027,9 @@ export async function decidePlanningNeed( spec: string, systemContext: string, config?: OrcaConfig, + interactionContext?: SessionInteractionContext, ): Promise { - const session = await createCodexSession(process.cwd(), config); + const session = await createCodexSession(process.cwd(), config, interactionContext); try { return await session.decidePlanningNeed(spec, systemContext); @@ -667,8 +1042,9 @@ export async function planSpec( spec: string, systemContext: string, config?: OrcaConfig, + interactionContext?: SessionInteractionContext, ): Promise { - const session = await createCodexSession(process.cwd(), config); + const session = await createCodexSession(process.cwd(), config, interactionContext); try { return await session.planSpec(spec, systemContext); @@ -681,8 +1057,9 @@ export async function reviewTaskGraph( tasks: Task[], systemContext: string, config?: OrcaConfig, + interactionContext?: SessionInteractionContext, ): Promise { - const session = await createCodexSession(process.cwd(), config); + const session = await createCodexSession(process.cwd(), config, interactionContext); try { return await session.reviewTaskGraph(tasks, systemContext); @@ -696,8 +1073,9 @@ export async function 
executeTask( runId: string, config?: OrcaConfig, systemContext?: string, + interactionContext?: SessionInteractionContext, ): Promise { - const session = await createCodexSession(process.cwd(), config); + const session = await createCodexSession(process.cwd(), config, interactionContext); try { return await session.executeTask(task, runId, systemContext); diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index 4f74513..b876be2 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -1,10 +1,84 @@ +import { mkdtemp, mkdir, readFile, rm, writeFile } from "node:fs/promises"; +import { EventEmitter } from "node:events"; +import os from "node:os"; +import path from "node:path"; import { afterEach, describe, expect, mock, test } from "bun:test"; +import { RunStore } from "../../state/store.js"; + afterEach(() => { mock.restore(); }); +async function waitFor(load: () => Promise, timeoutMs = 2_000): Promise { + const start = Date.now(); + + while (Date.now() - start < timeoutMs) { + const value = await load(); + if (value !== null) { + return value; + } + + await new Promise((resolve) => setTimeout(resolve, 20)); + } + + throw new Error(`Timed out after ${timeoutMs}ms`); +} + +function mockMultiAgentDetection(active = false): void { + mock.module("../../core/codex-config.js", () => ({ + isCodexMultiAgentActive: async () => active, + })); +} + describe("codex session effort wiring", () => { + test("uses ORCA_CODEX_PATH override and otherwise resolves a default Codex binary", async () => { + const constructedOptions: Array<{ codexPath?: string }> = []; + const originalCodexPath = process.env.ORCA_CODEX_PATH; + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + constructor(options: { codexPath?: string }) { + constructedOptions.push(options); + } + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ 
id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + + try { + delete process.env.ORCA_CODEX_PATH; + const defaultSession = await createCodexSession(process.cwd()); + await defaultSession.disconnect(); + + process.env.ORCA_CODEX_PATH = "/tmp/custom-codex"; + const overriddenSession = await createCodexSession(process.cwd()); + await overriddenSession.disconnect(); + + expect(constructedOptions[0]?.codexPath).toBeTruthy(); + expect(constructedOptions[1]?.codexPath).toBe("/tmp/custom-codex"); + } finally { + if (originalCodexPath === undefined) { + delete process.env.ORCA_CODEX_PATH; + } else { + process.env.ORCA_CODEX_PATH = originalCodexPath; + } + } + }); + test("passes configured effort into Codex runTurn", async () => { const runTurnMock = mock(async () => ({ agentMessage: "[]", @@ -12,6 +86,7 @@ describe("codex session effort wiring", () => { items: [], })); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -42,7 +117,7 @@ describe("codex session effort wiring", () => { } }); - test("smoke: uses per-step thinkingLevel values for decision/planning/execution turns", async () => { + test("smoke: uses per-step thinkingLevel values for decision/planning/review/execution turns", async () => { const efforts: string[] = []; const runTurnMock = mock(async (params: { effort?: string; input?: Array<{ text?: string }> }) => { efforts.push(params.effort ?? 
""); @@ -55,6 +130,22 @@ describe("codex session effort wiring", () => { }; } + if (prompt.includes("pre-execution task-graph reviewer")) { + return { + agentMessage: '{"changes":[]}', + turn: { status: "completed" }, + items: [], + }; + } + + if (prompt.includes("Review this Orca task graph before execution.")) { + return { + agentMessage: '{"issues":[],"ok":true}', + turn: { status: "completed" }, + items: [], + }; + } + return { agentMessage: "[]", turn: { status: "completed" }, @@ -62,6 +153,7 @@ describe("codex session effort wiring", () => { }; }); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -86,6 +178,7 @@ describe("codex session effort wiring", () => { thinkingLevel: { decision: "low", planning: "xhigh", + review: "high", execution: "medium", }, }, @@ -94,6 +187,7 @@ describe("codex session effort wiring", () => { try { await session.decidePlanningNeed("spec", "context"); await session.planSpec("spec", "context"); + await session.reviewTaskGraph([], "context"); await session.executeTask( { id: "t1", @@ -108,8 +202,10 @@ describe("codex session effort wiring", () => { "run-1", "context", ); + await session.consultTaskGraph([]); + await session.runPrompt("review prompt", "review"); - expect(efforts).toEqual(["low", "xhigh", "medium"]); + expect(efforts).toEqual(["low", "xhigh", "high", "medium", "high", "high"]); } finally { await session.disconnect(); } @@ -139,6 +235,7 @@ describe("codex session code-simplifier guidance", () => { }; }); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -190,10 +287,185 @@ describe("codex session code-simplifier guidance", () => { }); }); +describe("codex session multi-agent prompt guidance", () => { + test("includes multi-agent guidance in planning, review, consultation, and execution prompts when active", async () => { + const prompts: string[] = []; + const 
runTurnMock = mock(async (params: { input?: Array<{ text?: string }> }) => { + const prompt = params.input?.[0]?.text ?? ""; + prompts.push(prompt); + + if (prompt.includes("pre-execution task-graph reviewer")) { + return { + agentMessage: '{"changes":[]}', + turn: { status: "completed" }, + items: [], + }; + } + + if (prompt.includes("Review this Orca task graph before execution.")) { + return { + agentMessage: '{"issues":[],"ok":true}', + turn: { status: "completed" }, + items: [], + }; + } + + return { + agentMessage: "[]", + turn: { status: "completed" }, + items: [], + }; + }); + + mockMultiAgentDetection(true); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + await session.planSpec("spec", "context"); + await session.reviewTaskGraph([], "context"); + await session.consultTaskGraph([]); + await session.executeTask( + { + id: "t1", + name: "Task", + description: "Do thing", + dependencies: [], + acceptance_criteria: ["Done"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + const planningPrompt = prompts.find((prompt) => prompt.includes("You are decomposing a spec into an ordered task graph.")) ?? ""; + const reviewPrompt = prompts.find((prompt) => prompt.includes("You are Orca's pre-execution task-graph reviewer.")) ?? ""; + const consultationPrompt = prompts.find((prompt) => prompt.includes("Review this Orca task graph before execution.")) ?? 
""; + const executionPrompt = prompts.find((prompt) => prompt.includes("You are Orca's task execution assistant.")) ?? ""; + + expect(planningPrompt).toContain("Codex multi-agent mode is enabled for this run. Shape the task graph so safe subagent parallelization is obvious."); + expect(planningPrompt).toContain("Do not bundle unrelated work into a single do-everything task when it can be safely split."); + + expect(reviewPrompt).toContain("Codex multi-agent mode is enabled for this run. Review the graph for safe subagent parallelization."); + expect(reviewPrompt).toContain("Flag ownership collisions where multiple tasks would touch the same files or subsystem without coordination."); + + expect(consultationPrompt).toContain("Codex multi-agent mode is enabled for this run."); + expect(consultationPrompt).toContain("Treat missed safe parallelism, fake dependencies, overlapping ownership, or missing integration tasks as review concerns."); + + expect(executionPrompt).toContain("Codex multi-agent mode is enabled for this run."); + expect(executionPrompt).toContain("If this task contains clearly independent subtasks with disjoint ownership, use subagents to parallelize them."); + expect(executionPrompt).toContain("Integrate subagent results yourself before final completion."); + } finally { + await session.disconnect(); + } + }); + + test("omits multi-agent guidance from planning, review, consultation, and execution prompts when inactive", async () => { + const prompts: string[] = []; + const runTurnMock = mock(async (params: { input?: Array<{ text?: string }> }) => { + const prompt = params.input?.[0]?.text ?? 
""; + prompts.push(prompt); + + if (prompt.includes("pre-execution task-graph reviewer")) { + return { + agentMessage: '{"changes":[]}', + turn: { status: "completed" }, + items: [], + }; + } + + if (prompt.includes("Review this Orca task graph before execution.")) { + return { + agentMessage: '{"issues":[],"ok":true}', + turn: { status: "completed" }, + items: [], + }; + } + + return { + agentMessage: "[]", + turn: { status: "completed" }, + items: [], + }; + }); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + await session.planSpec("spec", "context"); + await session.reviewTaskGraph([], "context"); + await session.consultTaskGraph([]); + await session.executeTask( + { + id: "t1", + name: "Task", + description: "Do thing", + dependencies: [], + acceptance_criteria: ["Done"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + for (const prompt of prompts) { + expect(prompt).not.toContain("Codex multi-agent mode is enabled for this run."); + expect(prompt).not.toContain("use subagents to parallelize them"); + expect(prompt).not.toContain("safe subagent parallelization"); + } + } finally { + await session.disconnect(); + } + }); +}); + describe("codex session skill discovery", () => { test("calls skills/list with forceReload and perCwdExtraUserRoots", async () => { const requestMock = mock(async () => ({ data: [] })); + mockMultiAgentDetection(false); 
mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -235,10 +507,22 @@ describe("codex session skill discovery", () => { }); test("merges app-server listed skills after Orca-loaded skills without overriding deterministic precedence", async () => { - type TurnInputItem = { type: "text"; text: string } | { type: "skill"; name: string; path: string }; + type TurnInputItem = { type: "text"; text: string }; let capturedInput: TurnInputItem[] = []; - + const listedSkillsRoot = await mkdtemp(path.join(os.tmpdir(), "orca-listed-skills-")); + const alphaSkillPath = path.join(listedSkillsRoot, "alpha-skill", "SKILL.md"); + const codeSimplifierPath = path.join(listedSkillsRoot, "code-simplifier", "SKILL.md"); + const zetaSkillPath = path.join(listedSkillsRoot, "zeta-skill", "SKILL.md"); + + await mkdir(path.dirname(alphaSkillPath), { recursive: true }); + await mkdir(path.dirname(codeSimplifierPath), { recursive: true }); + await mkdir(path.dirname(zetaSkillPath), { recursive: true }); + await writeFile(alphaSkillPath, "alpha body", "utf8"); + await writeFile(codeSimplifierPath, "server code simplifier body", "utf8"); + await writeFile(zetaSkillPath, "zeta body", "utf8"); + + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -255,9 +539,9 @@ describe("codex session skill discovery", () => { data: [ { skills: [ - { name: "zeta-skill", path: "/srv/zeta/zeta-skill/SKILL.md" }, - { name: "code-simplifier", path: "/srv/override/code-simplifier/SKILL.md" }, - { name: "alpha-skill", path: "/srv/alpha/alpha-skill/SKILL.md" }, + { name: "zeta-skill", path: zetaSkillPath }, + { name: "code-simplifier", path: codeSimplifierPath }, + { name: "alpha-skill", path: alphaSkillPath }, ], }, ], @@ -287,22 +571,26 @@ describe("codex session skill discovery", () => { try { await session.planSpec("spec", "context"); - const skills = capturedInput.filter((item): item is { 
type: "skill"; name: string; path: string } => item.type === "skill"); - expect(skills).toEqual([ - { type: "skill", name: "code-simplifier", path: "/tmp/skills/code-simplifier" }, - { type: "skill", name: "alpha-skill", path: "/srv/alpha/alpha-skill" }, - { type: "skill", name: "zeta-skill", path: "/srv/zeta/zeta-skill" }, - ]); + const prompt = capturedInput[0]?.text ?? ""; + expect(prompt).toContain("Referenced Orca skills:"); + expect(prompt).toContain("Skill: code-simplifier"); + expect(prompt).toContain("Skill: alpha-skill"); + expect(prompt).toContain("Skill: zeta-skill"); + expect(prompt).toContain("body"); + expect(prompt).toContain("alpha body"); + expect(prompt).toContain("zeta body"); + expect(prompt).not.toContain("server code simplifier body"); } finally { await session.disconnect(); } }); }); -describe("codex session explicit skill input", () => { +describe("codex session inline skill context", () => { test("disconnects Codex client if skill loading fails during session creation", async () => { const disconnectMock = mock(async () => {}); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -331,8 +619,8 @@ describe("codex session explicit skill input", () => { expect(disconnectMock).toHaveBeenCalledTimes(1); }); - test("includes skill items with valid name/path alongside text input for every runTurn", async () => { - type TurnInputItem = { type: "text"; text: string } | { type: "skill"; name: string; path: string }; + test("includes inline skill context inside the text input for every runTurn", async () => { + type TurnInputItem = { type: "text"; text: string }; const runTurnCalls: Array<{ input?: TurnInputItem[] }> = []; const runTurnMock = mock(async (params: { input?: TurnInputItem[] }) => { @@ -362,6 +650,7 @@ describe("codex session explicit skill input", () => { }; }); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { 
async connect(): Promise {} @@ -414,20 +703,176 @@ describe("codex session explicit skill input", () => { expect(runTurnCalls.length).toBe(5); for (const call of runTurnCalls) { - const textItem = call.input?.find((item) => item.type === "text"); - expect(textItem?.type).toBe("text"); - expect((textItem as { text?: string } | undefined)?.text).toBeTruthy(); - - const skillItems = call.input?.filter((item) => item.type === "skill") ?? []; - expect(skillItems).toHaveLength(1); - expect(skillItems[0]).toEqual({ - type: "skill", - name: "code-simplifier", - path: "/tmp/skills/code-simplifier", - }); + expect(call.input).toHaveLength(1); + + const text = call.input?.[0]?.text ?? ""; + expect(text).toBeTruthy(); + expect(text).toContain("Referenced Orca skills:"); + expect(text).toContain("Skill: code-simplifier"); + expect(text).toContain("Source: /tmp/skills/code-simplifier/SKILL.md"); + expect(text).toContain("body"); } } finally { await session.disconnect(); } }); }); + +describe("codex session question flow", () => { + test("persists pending questions, emits onQuestion, and resumes the same run after an answer", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-question-flow-")); + const store = new RunStore(path.join(tempDir, "runs")); + const runId = "run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { mode: "run", overallStatus: "running" }); + + const hookEvents: Array<{ hook: string; message: string; taskId?: string; questions?: Array<{ id: string }> }> = []; + const responses: Array<{ requestId: string | number; response: unknown }> = []; + let resolveAnswerResponse: (() => void) | undefined; + const answerResponse = new Promise((resolve) => { + resolveAnswerResponse = resolve; + }); + let clientInstance: EventEmitter | null = null; + + try { + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class extends EventEmitter { + constructor() { + 
super(); + clientInstance = this; + } + + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + respondToUserInputRequest(requestId: string | number, response: unknown): void { + responses.push({ requestId, response }); + resolveAnswerResponse?.(); + } + rejectServerRequest(): void {} + async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Game Type", + id: "game_type", + question: "Which game type should I build?", + isOther: true, + isSecret: false, + options: [ + { label: "Arcade", description: "Arcade style" }, + { label: "Puzzle", description: "Puzzle style" }, + ], + }, + ], + }); + }); + + await answerResponse; + clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + + return { + agentMessage: '{"outcome":"done"}', + turn: { status: "completed" }, + items: [], + }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + emitHook: async (event) => { + hookEvents.push({ + hook: event.hook, + message: event.message, + ...(event.taskId ? { taskId: event.taskId } : {}), + ...("questions" in event ? 
{ questions: event.questions.map((question) => ({ id: question.id })) } : {}), + }); + }, + }); + + try { + const executionPromise = session.executeTask( + { + id: "task-1", + name: "Build the game", + description: "Implement the requested game.", + dependencies: [], + acceptance_criteria: ["Game is implemented"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + const waitingRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run?.pendingQuestion ? run : null; + }); + + expect(waitingRun.overallStatus).toBe("waiting_for_answer"); + expect(waitingRun.pendingQuestion?.requestId).toBe("req-1"); + expect(waitingRun.pendingQuestion?.questions[0]?.id).toBe("game_type"); + expect(hookEvents).toContainEqual({ + hook: "onQuestion", + message: "Which game type should I build?", + taskId: "task-1", + questions: [{ id: "game_type" }], + }); + + const answerPath = path.join(store.getRunDir(runId), "answer.txt"); + await writeFile( + answerPath, + `${JSON.stringify({ answers: { game_type: { answers: ["Arcade"] } } })}\n`, + "utf8", + ); + + const result = await executionPromise; + expect(result.outcome).toBe("done"); + expect(responses).toEqual([ + { + requestId: "req-1", + response: { + answers: { + game_type: { + answers: ["Arcade"], + }, + }, + }, + }, + ]); + + const resumedRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run && run.pendingQuestion === undefined ? 
run : null; + }); + expect(resumedRun.overallStatus).toBe("running"); + await expect(readFile(answerPath, "utf8")).rejects.toThrow(); + } finally { + await session.disconnect(); + } + } finally { + await rm(tempDir, { recursive: true, force: true }); + } + }); +}); diff --git a/src/cli/commands/answer.test.ts b/src/cli/commands/answer.test.ts index 6d8a28d..d361ac0 100644 --- a/src/cli/commands/answer.test.ts +++ b/src/cli/commands/answer.test.ts @@ -85,21 +85,41 @@ afterEach(async () => { }); describe("answer command", () => { - test("submits positional answer and resumes waiting run", async () => { + test("submits positional answer for a single pending question", async () => { const runId = "answer-positional-1000-abcd"; const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); - await store.updateRun(runId, { overallStatus: "waiting_for_answer" }); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Migration", + id: "migration", + question: "Which migration should I use?", + isOther: true, + isSecret: false, + options: [{ label: "A", description: "Use migration A" }], + }, + ], + }, + }); const { answerModule } = await loadAnswerModule(); await answerModule.answerCommandHandler(runId, "yes", {}); const run = await store.getRun(runId); - expect(run?.overallStatus).toBe("running"); + expect(run?.overallStatus).toBe("waiting_for_answer"); const answerPath = path.join(runsDir, runId, "answer.txt"); const payload = await readFile(answerPath, "utf8"); - expect(payload).toBe("yes\n"); + expect(payload).toContain('"migration"'); + expect(payload).toContain('"yes"'); expect(logs.join("\n")).toContain(`Answer submitted. 
Run ${runId} will resume shortly.`); }); @@ -120,7 +140,26 @@ describe("answer command", () => { const runId = "answer-no-input-1000-abcd"; const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); - await store.updateRun(runId, { overallStatus: "waiting_for_answer" }); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Migration", + id: "migration", + question: "Which migration should I use?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }, + }); const { answerModule } = await loadAnswerModule(); @@ -133,7 +172,26 @@ describe("answer command", () => { const runId = "answer-prompt-1000-abcd"; const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); - await store.updateRun(runId, { overallStatus: "waiting_for_answer" }); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Migration", + id: "migration", + question: "Which migration should I use?", + isOther: true, + isSecret: false, + options: [{ label: "A", description: "Use migration A" }], + }, + ], + }, + }); setStdoutTty(true); const { answerModule, inputMock } = await loadAnswerModule({ @@ -145,14 +203,34 @@ describe("answer command", () => { expect(inputMock).toHaveBeenCalled(); const answerPath = path.join(runsDir, runId, "answer.txt"); const payload = await readFile(answerPath, "utf8"); - expect(payload).toBe("from prompt\n"); + expect(payload).toContain('"migration"'); + expect(payload).toContain('"from prompt"'); }); test("uses interactive run selection when no run id is provided in tty mode", async () => { const runId = "selected-run-1000-abcd"; 
const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); - await store.updateRun(runId, { overallStatus: "waiting_for_answer" }); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Migration", + id: "migration", + question: "Which migration should I use?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }, + }); setStdoutTty(true); const { answerModule, selectMock } = await loadAnswerModule({ @@ -164,7 +242,46 @@ describe("answer command", () => { expect(selectMock).toHaveBeenCalled(); const answerPath = path.join(runsDir, runId, "answer.txt"); const payload = await readFile(answerPath, "utf8"); - expect(payload).toBe("selected answer\n"); + expect(payload).toContain('"selected answer"'); + }); + + test("requires JSON mapping when multiple pending questions are answered non-interactively", async () => { + const runId = "answer-multi-1000-abcd"; + const store = new RunStore(runsDir); + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Backend", + id: "backend", + question: "Which backend?", + isOther: true, + isSecret: false, + options: null, + }, + { + header: "Frontend", + id: "frontend", + question: "Which frontend?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }, + }); + + const { answerModule } = await loadAnswerModule(); + await expect(answerModule.answerCommandHandler(runId, "just one answer", {})).rejects.toThrow( + "multiple pending questions require a JSON object mapping question ids to answers", + ); }); test("fails when positional run-id and --run are both 
provided", async () => { diff --git a/src/cli/commands/answer.ts b/src/cli/commands/answer.ts index e299394..5c51718 100644 --- a/src/cli/commands/answer.ts +++ b/src/cli/commands/answer.ts @@ -4,7 +4,9 @@ import { promises as fs } from "node:fs"; import { input } from "@inquirer/prompts"; import type { Command } from "commander"; +import { parseQuestionAnswerInput, serializeQuestionAnswerResponse } from "../../core/question-flow.js"; import { RunStore } from "../../state/store.js"; +import type { PendingQuestion } from "../../types/index.js"; import { selectRun } from "../../utils/select-run.js"; export interface AnswerCommandOptions { @@ -24,7 +26,17 @@ function resolveRunId(positionalRunId: string | undefined, optionRunId: string | return positionalRunId ?? optionRunId; } -async function resolveAnswer(answerArg: string | undefined): Promise { +function formatQuestionPrompt(question: PendingQuestion["questions"][number]): string { + const options = question.options && question.options.length > 0 + ? 
` Options: ${question.options.map((option) => option.label).join(", ")}.` + : ""; + return `${question.header}: ${question.question}${options}`; +} + +async function resolveAnswerPayload( + pendingQuestion: PendingQuestion | undefined, + answerArg: string | undefined, +): Promise { if (answerArg) { return answerArg; } @@ -33,12 +45,26 @@ async function resolveAnswer(answerArg: string | undefined): Promise { throw new Error("no answer provided"); } - const value = await input({ message: "Answer:" }); - if (!value) { - throw new Error("no answer provided"); + if (!pendingQuestion || pendingQuestion.questions.length === 0) { + const value = await input({ message: "Answer:" }); + if (!value) { + throw new Error("no answer provided"); + } + + return value; + } + + const answers: Record = {}; + for (const question of pendingQuestion.questions) { + const value = await input({ message: formatQuestionPrompt(question) }); + if (!value) { + throw new Error(`no answer provided for question '${question.id}'`); + } + + answers[question.id] = { answers: [value] }; } - return value; + return JSON.stringify({ answers }); } export async function answerCommandHandler( @@ -75,12 +101,14 @@ export async function answerCommandHandler( return; } - const answer = await resolveAnswer(answerArg); + const answerPayload = await resolveAnswerPayload(run.pendingQuestion, answerArg); + const serialized = run.pendingQuestion + ? serializeQuestionAnswerResponse(parseQuestionAnswerInput(answerPayload, run.pendingQuestion)) + : `${answerPayload}\n`; const answerPath = path.join(store.getRunDir(runId), "answer.txt"); await fs.mkdir(path.dirname(answerPath), { recursive: true }); - await fs.writeFile(answerPath, `${answer}\n`, "utf8"); + await fs.writeFile(answerPath, serialized, "utf8"); - await store.updateRun(runId, { overallStatus: "running" }); console.log(`Answer submitted. 
Run ${runId} will resume shortly.`); } diff --git a/src/cli/commands/help.ts b/src/cli/commands/help.ts index 53559aa..50e490f 100644 --- a/src/cli/commands/help.ts +++ b/src/cli/commands/help.ts @@ -34,6 +34,7 @@ function printStyledHelpPage(): void { { command: "orca status", description: "list all runs" }, { command: "orca status --last", description: "show most recent run" }, { command: "orca status --run ", description: "show run details" }, + { command: "orca answer ", description: "answer a waiting question" }, { command: "orca resume --last", description: "resume most recent run" }, { command: "orca resume --run ", description: "resume incomplete run" }, { command: "orca cancel --run ", description: "cancel active run" } @@ -59,6 +60,7 @@ function printStyledHelpPage(): void { { command: "--codex-effort ", description: "override Codex thinking level for the current run" }, { command: "--full-auto", description: "skip all questions, proceed autonomously" }, { command: "--on-complete ", description: "shell hook on run complete" }, + { command: "--on-question ", description: "shell hook on question required" }, { command: "--on-error ", description: "shell hook on run error" }, { command: "-h, --help", description: "show help for any command" }, { command: "-V, --version", description: "show version" } diff --git a/src/cli/commands/run-command.test-harness.ts b/src/cli/commands/run-command.test-harness.ts index 10afc4a..6587d6a 100644 --- a/src/cli/commands/run-command.test-harness.ts +++ b/src/cli/commands/run-command.test-harness.ts @@ -2,6 +2,7 @@ import { afterEach, beforeEach, mock } from "bun:test"; import { mkdtemp, rm } from "node:fs/promises"; import os from "node:os"; import path from "node:path"; +import { pathToFileURL } from "node:url"; import { Command } from "commander"; type RunModule = typeof import("./run.js"); @@ -25,18 +26,23 @@ export function createRunCommandTestHarness(tempPrefix: string): RunCommandTestH const originalRunsDir = 
process.env.ORCA_RUNS_DIR; const originalSkipValidators = process.env.ORCA_SKIP_VALIDATORS; const originalOpenaiApiKey = process.env.OPENAI_API_KEY; + const originalHome = process.env.HOME; + const originalCwd = process.cwd(); beforeEach(async () => { tempDir = await mkdtemp(path.join(os.tmpdir(), tempPrefix)); process.env.ORCA_RUNS_DIR = path.join(tempDir, "runs"); process.env.ORCA_SKIP_VALIDATORS = "1"; process.env.OPENAI_API_KEY = "test-openai-key"; + process.env.HOME = tempDir; + process.chdir(tempDir); process.exitCode = 0; }); afterEach(async () => { mock.restore(); process.exitCode = 0; + process.chdir(originalCwd); if (originalRunsDir === undefined) { delete process.env.ORCA_RUNS_DIR; } else { @@ -52,6 +58,11 @@ export function createRunCommandTestHarness(tempPrefix: string): RunCommandTestH } else { process.env.OPENAI_API_KEY = originalOpenaiApiKey; } + if (originalHome === undefined) { + delete process.env.HOME; + } else { + process.env.HOME = originalHome; + } await rm(tempDir, { recursive: true, force: true }); }); @@ -95,8 +106,14 @@ export function createRunCommandTestHarness(tempPrefix: string): RunCommandTestH } } - const { resolveConfig: realResolveConfig } = await import(`../../core/config-loader.js?real=${Math.random()}`); - const resolveConfigMock = mock((configPath?: string) => realResolveConfig(configPath)); + const resolveConfigMock = mock(async (configPath?: string) => { + if (configPath) { + const imported = await import(`${pathToFileURL(configPath).href}?test=${Math.random()}`); + return imported.default; + } + + return { executor: "codex" as const }; + }); const ensureCodexMultiAgentMock = mock(async () => ({ action: "skipped" as const, path: path.join(tempDir, "mock-codex-config.toml") diff --git a/src/cli/commands/run.postexec-json.integration.test.ts b/src/cli/commands/run.postexec-json.integration.test.ts index 58ca1cf..1aaabac 100644 --- a/src/cli/commands/run.postexec-json.integration.test.ts +++ 
b/src/cli/commands/run.postexec-json.integration.test.ts @@ -30,7 +30,8 @@ describe("post-exec reviewer JSON hardening integration", () => { expect(runPromptMock).toHaveBeenCalledTimes(2); expect(runPromptMock).toHaveBeenNthCalledWith( 2, - expect.stringContaining("previous post-execution review response was invalid") + expect.stringContaining("previous post-execution review response was invalid"), + "review" ); }); @@ -56,7 +57,8 @@ describe("post-exec reviewer JSON hardening integration", () => { expect(runPromptMock).toHaveBeenCalledTimes(2); expect(runPromptMock).toHaveBeenNthCalledWith( 2, - expect.stringContaining("Schema validation failed") + expect.stringContaining("Schema validation failed"), + "review" ); }); diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index b533131..71b3d52 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -58,6 +58,7 @@ export interface RunCommandOptions { codexOnly?: boolean; codexEffort?: CodexEffort; onMilestone?: string; + onQuestion?: string; onTaskComplete?: string; onTaskFail?: string; onInvalidPlan?: string; @@ -68,6 +69,7 @@ export interface RunCommandOptions { const ALL_HOOKS: HookName[] = [ "onMilestone", + "onQuestion", "onTaskComplete", "onTaskFail", "onInvalidPlan", @@ -77,6 +79,7 @@ const ALL_HOOKS: HookName[] = [ ]; const VALID_HOOK_NAMES = new Set([ "onMilestone", + "onQuestion", "onTaskComplete", "onTaskFail", "onInvalidPlan", @@ -146,6 +149,7 @@ function computeFinalStatus(overallStatus: string, allTasksDone: boolean): "comp function buildCliCommandHooks(options: RunCommandOptions): Partial> { return { ...(options.onMilestone ? { onMilestone: options.onMilestone } : {}), + ...(options.onQuestion ? { onQuestion: options.onQuestion } : {}), ...(options.onTaskComplete ? { onTaskComplete: options.onTaskComplete } : {}), ...(options.onTaskFail ? { onTaskFail: options.onTaskFail } : {}), ...(options.onInvalidPlan ? 
{ onInvalidPlan: options.onInvalidPlan } : {}), @@ -440,7 +444,7 @@ export async function runCommandHandler(options: RunCommandOptions): Promise codexSession.runPrompt(prompt, "review"), cycleIndex, prompt, reviewConfig.prompt @@ -622,6 +630,7 @@ export function registerRunCommand(program: Command): void { .option("--codex-only", "Force Codex executor for this run (overrides config)") .option("--codex-effort ", "Codex thinking level override for this run", parseCodexEffortOption) .option("--on-milestone ", "Shell hook command for onMilestone") + .option("--on-question ", "Shell hook command for onQuestion") .option("--on-task-complete ", "Shell hook command for onTaskComplete") .option("--on-task-fail ", "Shell hook command for onTaskFail") .option("--on-invalid-plan ", "Shell hook command for onInvalidPlan") diff --git a/src/cli/commands/status.test.ts b/src/cli/commands/status.test.ts new file mode 100644 index 0000000..af5a3f8 --- /dev/null +++ b/src/cli/commands/status.test.ts @@ -0,0 +1,83 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import os from "node:os"; +import path from "node:path"; +import { mkdtemp, rm } from "node:fs/promises"; + +import { RunStore } from "../../state/store.js"; + +type StatusModule = typeof import("./status.js"); + +let tempDir = ""; +let runsDir = ""; +let logs: string[] = []; +const originalRunsDir = process.env.ORCA_RUNS_DIR; +const originalConsoleLog = console.log; + +async function loadStatusModule(): Promise { + return import(`./status.js?test=${Math.random()}`); +} + +beforeEach(async () => { + tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-status-test-")); + runsDir = path.join(tempDir, "runs"); + process.env.ORCA_RUNS_DIR = runsDir; + logs = []; + process.exitCode = 0; + + console.log = (...args: unknown[]) => { + logs.push(args.map(String).join(" ")); + }; +}); + +afterEach(async () => { + console.log = originalConsoleLog; + + if (originalRunsDir === undefined) { + delete 
process.env.ORCA_RUNS_DIR; + } else { + process.env.ORCA_RUNS_DIR = originalRunsDir; + } + + process.exitCode = 0; + await rm(tempDir, { recursive: true, force: true }); +}); + +describe("status command", () => { + test("prints pending question details for runs waiting for input", async () => { + const runId = "run-1000-abcd"; + const store = new RunStore(runsDir); + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { + mode: "run", + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Game Type", + id: "game_type", + question: "Which game type should I build?", + isOther: true, + isSecret: false, + options: [ + { label: "Arcade", description: "Arcade style" }, + { label: "Puzzle", description: "Puzzle style" }, + ], + }, + ], + }, + }); + + const statusModule = await loadStatusModule(); + await statusModule.statusCommandHandler({ run: runId }); + + const output = logs.join("\n"); + expect(output).toContain("Pending Question:"); + expect(output).toContain("Game Type: Which game type should I build?"); + expect(output).toContain("Options: Arcade, Puzzle."); + }); +}); diff --git a/src/cli/commands/status.ts b/src/cli/commands/status.ts index fd46565..d738aba 100644 --- a/src/cli/commands/status.ts +++ b/src/cli/commands/status.ts @@ -1,5 +1,6 @@ import type { Command } from "commander"; +import { formatPendingQuestionForStatus } from "../../core/question-flow.js"; import { RunStore } from "../../state/store.js"; import type { RunStatus, Task } from "../../types/index.js"; import { getLastRun } from "../../utils/last-run.js"; @@ -64,6 +65,12 @@ async function printDetailedRun(run: RunStatus): Promise { console.log(`Updated: ${run.updatedAt}`); console.log(`Milestones: ${run.milestones.length}`); console.log(`Errors: ${run.errors.length}`); + if (run.pendingQuestion) { + 
console.log(""); + for (const line of formatPendingQuestionForStatus(run.pendingQuestion)) { + console.log(line); + } + } console.log(""); console.log("Tasks:"); if (run.tasks.length === 0) { diff --git a/src/core/codex-config.test.ts b/src/core/codex-config.test.ts index 1cb76ab..b0dfcc9 100644 --- a/src/core/codex-config.test.ts +++ b/src/core/codex-config.test.ts @@ -3,7 +3,7 @@ import os from "node:os"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it } from "bun:test"; -import { ensureCodexMultiAgent } from "./codex-config.js"; +import { ensureCodexMultiAgent, isCodexMultiAgentActive } from "./codex-config.js"; let tmpDir: string; let tmpConfigFile: string; @@ -87,3 +87,34 @@ describe("ensureCodexMultiAgent", () => { expect(content).toContain("multi_agent = true"); }); }); + +describe("isCodexMultiAgentActive", () => { + it("returns true when Orca config enables multiAgent", async () => { + await expect(isCodexMultiAgentActive({ codex: { multiAgent: true } }, tmpConfigFile)).resolves.toBe(true); + }); + + it("returns true when root config enables multi_agent", async () => { + const fs = await import("node:fs/promises"); + await fs.writeFile(tmpConfigFile, "[features]\nmulti_agent = true\n", "utf8"); + + await expect(isCodexMultiAgentActive(undefined, tmpConfigFile)).resolves.toBe(true); + }); + + it("returns true when both Orca config and root config enable multi-agent", async () => { + const fs = await import("node:fs/promises"); + await fs.writeFile(tmpConfigFile, "[features]\nmulti_agent = true\n", "utf8"); + + await expect(isCodexMultiAgentActive({ codex: { multiAgent: true } }, tmpConfigFile)).resolves.toBe(true); + }); + + it("returns false when Orca config is false and no root config enables multi-agent", async () => { + await expect(isCodexMultiAgentActive({ codex: { multiAgent: false } }, tmpConfigFile)).resolves.toBe(false); + }); + + it("returns false when root config contains multi_agent = false", async () => { + 
const fs = await import("node:fs/promises"); + await fs.writeFile(tmpConfigFile, "[features]\nmulti_agent = false\n", "utf8"); + + await expect(isCodexMultiAgentActive(undefined, tmpConfigFile)).resolves.toBe(false); + }); +}); diff --git a/src/core/codex-config.ts b/src/core/codex-config.ts index 6b893ba..2d18fe7 100644 --- a/src/core/codex-config.ts +++ b/src/core/codex-config.ts @@ -22,6 +22,75 @@ function containsMultiAgentSetting(content: string): boolean { return /multi_agent\s*=/.test(content); } +function isRootFeaturesSection(sectionPath: string[]): boolean { + return sectionPath.length === 1 && sectionPath[0] === "features"; +} + +function parseSectionPath(line: string): string[] | null { + const match = line.match(/^\[(.+)\]$/); + if (!match?.[1]) { + return null; + } + + return match[1] + .split(".") + .map((part) => part.trim().replace(/^"(.*)"$/, "$1")) + .filter((part) => part.length > 0); +} + +function hasEnabledRootMultiAgentSetting(content: string): boolean { + let currentSection: string[] = []; + + for (const rawLine of content.split(/\r?\n/u)) { + const line = rawLine.replace(/\s+#.*$/u, "").trim(); + if (line.length === 0 || line.startsWith("#")) { + continue; + } + + const sectionPath = parseSectionPath(line); + if (sectionPath !== null) { + currentSection = sectionPath; + continue; + } + + if (!isRootFeaturesSection(currentSection)) { + continue; + } + + const match = line.match(/^multi_agent\s*=\s*(true|false)\s*$/u); + if (!match?.[1]) { + continue; + } + + return match[1] === "true"; + } + + return false; +} + +export async function isCodexMultiAgentActive( + config?: OrcaConfig, + _configFile?: string, +): Promise { + if (isMultiAgentEnabled(config)) { + return true; + } + + const configFile = _configFile ?? 
GLOBAL_CONFIG_FILE; + + let existingContent: string; + try { + existingContent = await readFile(configFile, "utf8"); + } catch (err) { + if ((err as NodeJS.ErrnoException).code === "ENOENT") { + return false; + } + throw err; + } + + return hasEnabledRootMultiAgentSetting(existingContent); +} + /** * Ensures `~/.codex/config.toml` has `multi_agent = true` set. * diff --git a/src/core/config-loader.test.ts b/src/core/config-loader.test.ts index af69e11..55f2ff2 100644 --- a/src/core/config-loader.test.ts +++ b/src/core/config-loader.test.ts @@ -3,7 +3,7 @@ import os from "node:os"; import path from "node:path"; import { promises as fs } from "node:fs"; -import { mergeConfigs, resolveConfig, resolveConfigFromPaths } from "./config-loader.js"; +import { mergeConfigs, resolveConfigFromPaths } from "./config-loader.js"; describe("config-loader", () => { let tempDir: string; @@ -26,14 +26,17 @@ describe("config-loader", () => { await fs.rm(tempDir, { recursive: true, force: true }); }); - test("resolveConfig prefers project ts config over project js config when both exist", async () => { - process.chdir(tempDir); - process.env.HOME = tempDir; - - await fs.writeFile(path.join(tempDir, "orca.config.js"), "export default { runsDir: 'from-js' };\n", "utf8"); - await fs.writeFile(path.join(tempDir, "orca.config.ts"), "export default { runsDir: 'from-ts' };\n", "utf8"); + test("resolveConfigFromPaths prefers project ts config over project js config when both exist", async () => { + const projectJsPath = path.join(tempDir, "orca.config.js"); + const projectTsPath = path.join(tempDir, "orca.config.ts"); + await fs.writeFile(projectJsPath, "export default { runsDir: 'from-js' };\n", "utf8"); + await fs.writeFile(projectTsPath, "export default { runsDir: 'from-ts' };\n", "utf8"); - const resolved = await resolveConfig(); + const resolved = await resolveConfigFromPaths( + path.join(tempDir, "missing-global.js"), + projectJsPath, + projectTsPath + ); 
expect(resolved?.runsDir).toBe("from-ts"); }); @@ -76,17 +79,17 @@ describe("config-loader", () => { expect(resolved?.sessionLogs).toBe("/tmp/orca-session-logs"); }); - test("resolveConfigFromPaths throws on invalid executor value", async () => { + test("resolveConfigFromPaths coerces stale executor values to codex", async () => { const cliPath = path.join(tempDir, "cli.config.js"); - await fs.writeFile(cliPath, "export default { executor: 'invalid-executor' };\n", "utf8"); + await fs.writeFile(cliPath, "export default { executor: 'claude' };\n", "utf8"); - await expect( - resolveConfigFromPaths( - path.join(tempDir, "missing-global.js"), - path.join(tempDir, "missing-project.js"), - cliPath - ) - ).rejects.toThrow("Config.executor must be 'codex', got invalid-executor"); + const resolved = await resolveConfigFromPaths( + path.join(tempDir, "missing-global.js"), + path.join(tempDir, "missing-project.js"), + cliPath + ); + + expect(resolved?.executor).toBe("codex"); }); test("resolveConfigFromPaths rejects unknown hookCommands keys", async () => { @@ -184,7 +187,7 @@ describe("config-loader", () => { const cliPath = path.join(tempDir, "cli.config.js"); await fs.writeFile( cliPath, - "export default { codex: { thinkingLevel: { decision: 'low', planning: 'xhigh', execution: 'medium' } } };\n", + "export default { codex: { thinkingLevel: { decision: 'low', planning: 'xhigh', review: 'high', execution: 'medium' } } };\n", "utf8" ); @@ -197,6 +200,7 @@ describe("config-loader", () => { expect(resolved?.codex?.thinkingLevel).toEqual({ decision: "low", planning: "xhigh", + review: "high", execution: "medium", }); }); @@ -367,6 +371,7 @@ describe("config-loader", () => { const projectConfig = { codex: { thinkingLevel: { + review: "high" as const, execution: "medium" as const, }, }, @@ -377,6 +382,7 @@ describe("config-loader", () => { expect(merged?.codex?.thinkingLevel).toEqual({ decision: "low", planning: "high", + review: "high", execution: "medium", }); }); diff --git 
a/src/core/config-loader.ts b/src/core/config-loader.ts index 8eb4d5f..ebcb83d 100644 --- a/src/core/config-loader.ts +++ b/src/core/config-loader.ts @@ -9,6 +9,7 @@ import type { HookName, OrcaConfig } from "../types/index.js"; const KNOWN_HOOK_NAMES: HookName[] = [ "onMilestone", + "onQuestion", "onTaskComplete", "onTaskFail", "onInvalidPlan", @@ -97,16 +98,7 @@ function coerceConfig(candidate: unknown): OrcaConfig { } if ("executor" in candidate && candidate.executor !== undefined) { - if (candidate.executor !== "codex") { - const executorDisplay = - typeof candidate.executor === "string" - ? candidate.executor - : (JSON.stringify(candidate.executor) ?? describeType(candidate.executor)); - - throw new Error( - `Config.executor must be 'codex', got ${executorDisplay}` - ); - } + candidate.executor = "codex"; } if ("codex" in candidate && candidate.codex !== undefined) { @@ -129,7 +121,7 @@ function coerceConfig(candidate: unknown): OrcaConfig { throw new Error(`Config.codex.thinkingLevel must be an object, got ${describeType(candidate.codex.thinkingLevel)}`); } - for (const key of ["decision", "planning", "execution"] as const) { + for (const key of ["decision", "planning", "review", "execution"] as const) { const value = candidate.codex.thinkingLevel[key]; if (value !== undefined) { if (typeof value !== "string") { diff --git a/src/core/planner.ts b/src/core/planner.ts index 512630e..7342f36 100644 --- a/src/core/planner.ts +++ b/src/core/planner.ts @@ -1,8 +1,13 @@ import { promises as fs } from "node:fs"; import path from "node:path"; -import { decidePlanningNeed as decidePlanningNeedWithCodex, planSpec as planSpecWithCodex, reviewTaskGraph as reviewTaskGraphWithCodex } from "../agents/codex/session.js"; -import type { OrcaConfig, Task, TaskGraphReviewResult } from "../types/index.js"; +import { + decidePlanningNeed as decidePlanningNeedWithCodex, + planSpec as planSpecWithCodex, + reviewTaskGraph as reviewTaskGraphWithCodex, + type SessionInteractionContext, 
+} from "../agents/codex/session.js"; +import type { HookEvent, OrcaConfig, Task, TaskGraphReviewResult } from "../types/index.js"; import { logger } from "../utils/logger.js"; import { loadSkills, type LoadedSkill } from "../utils/skill-loader.js"; import { RunStore } from "../state/store.js"; @@ -177,6 +182,7 @@ async function runTaskGraphReview( tasks: Task[], systemContext: string, config: OrcaConfig | undefined, + interactionContext?: SessionInteractionContext, ): Promise<{ finalTasks: Task[]; review: TaskGraphReviewResult | null }> { const planReviewConfig = getPlanReviewConfig(config); if (!planReviewConfig.enabled) { @@ -188,7 +194,7 @@ async function runTaskGraphReview( const reviewFn = resolveReviewTaskGraphImpl(config); let review: TaskGraphReviewResult; try { - review = await reviewFn(tasks, systemContext, config); + review = await reviewFn(tasks, systemContext, config, interactionContext); } catch (error) { if (planReviewConfig.onInvalid === "warn_skip") { logger.warn(`Review output invalid; skipping review changes (${error instanceof Error ? 
error.message : String(error)})`); @@ -218,6 +224,7 @@ async function runTaskGraphReview( type PlannerOptions = { allowPlanSkip?: boolean; + emitHook?: (event: HookEvent) => Promise; }; function buildSingleExecutionTask(spec: string): Task[] { @@ -236,9 +243,14 @@ function buildSingleExecutionTask(spec: string): Task[] { ]; } -async function runFullPlanning(spec: string, systemContext: string, config?: OrcaConfig): Promise { +async function runFullPlanning( + spec: string, + systemContext: string, + config?: OrcaConfig, + interactionContext?: SessionInteractionContext, +): Promise { const planSpecImpl = resolvePlanSpecImpl(config); - const result = await planSpecImpl(spec, systemContext, config); + const result = await planSpecImpl(spec, systemContext, config, interactionContext); try { validateDAG(result.tasks); @@ -249,7 +261,7 @@ async function runFullPlanning(spec: string, systemContext: string, config?: Orc const planReviewConfig = getPlanReviewConfig(config); let finalTasks = result.tasks; try { - const reviewed = await runTaskGraphReview(result.tasks, systemContext, config); + const reviewed = await runTaskGraphReview(result.tasks, systemContext, config, interactionContext); finalTasks = reviewed.finalTasks; } catch (error) { if (planReviewConfig.onInvalid === "warn_skip") { @@ -275,21 +287,26 @@ export async function runPlanner( const spec = await fs.readFile(specPath, "utf8"); const [skills, instructions] = await Promise.all([loadSkills(config), loadProjectInstructions(specPath)]); const systemContext = buildSystemContext(skills, instructions); + const interactiveContext = { + runId: runId as HookEvent["runId"], + store, + ...(options?.emitHook ? 
{ emitHook: options.emitHook } : {}), + }; let finalTasks: Task[]; if (options?.allowPlanSkip === true) { const decidePlanningNeed = resolveDecidePlanningNeedImpl(config); - const decision = await decidePlanningNeed(spec, systemContext, config); + const decision = await decidePlanningNeed(spec, systemContext, config, interactiveContext); if (!decision.needsPlan) { logger.info(`Planning skipped: ${decision.reason}`); finalTasks = buildSingleExecutionTask(spec); } else { logger.info(`Planning required: ${decision.reason}`); - finalTasks = await runFullPlanning(spec, systemContext, config); + finalTasks = await runFullPlanning(spec, systemContext, config, interactiveContext); } } else { - finalTasks = await runFullPlanning(spec, systemContext, config); + finalTasks = await runFullPlanning(spec, systemContext, config, interactiveContext); } await store.writeTasks(runId, finalTasks); diff --git a/src/core/question-flow.ts b/src/core/question-flow.ts new file mode 100644 index 0000000..062d61d --- /dev/null +++ b/src/core/question-flow.ts @@ -0,0 +1,157 @@ +import type { + ToolRequestUserInputParams, + ToolRequestUserInputResponse, +} from "@ratley/codex-client"; + +import type { PendingQuestion, PendingQuestionPrompt } from "../types/index.js"; + +function normalizeQuestionPrompt(question: ToolRequestUserInputParams["questions"][number]): PendingQuestionPrompt { + return { + header: question.header, + id: question.id, + question: question.question, + isOther: question.isOther ?? false, + isSecret: question.isSecret ?? false, + ...(question.options !== undefined ? { options: question.options } : {}), + }; +} + +function normalizeAnswerList(value: unknown): string[] | null { + if (typeof value === "string") { + const trimmed = value.trim(); + return trimmed.length > 0 ? 
[trimmed] : []; + } + + if (Array.isArray(value)) { + const answers = value + .filter((entry): entry is string => typeof entry === "string") + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0); + return answers; + } + + if ( + value && + typeof value === "object" && + "answers" in value && + Array.isArray((value as { answers?: unknown }).answers) + ) { + return normalizeAnswerList((value as { answers?: unknown[] }).answers); + } + + return null; +} + +function formatQuestionBlock(question: PendingQuestionPrompt): string { + const optionText = question.options && question.options.length > 0 + ? ` Options: ${question.options.map((option) => option.label).join(", ")}.` + : ""; + + return `${question.header}: ${question.question}${optionText}`; +} + +export function createPendingQuestion( + requestId: string | number, + params: ToolRequestUserInputParams, + receivedAt: string = new Date().toISOString(), +): PendingQuestion { + return { + requestId, + threadId: params.threadId, + turnId: params.turnId, + itemId: params.itemId, + receivedAt, + questions: params.questions.map((question) => normalizeQuestionPrompt(question)), + }; +} + +export function buildQuestionHookMessage(pendingQuestion: PendingQuestion): string { + if (pendingQuestion.questions.length === 1) { + return pendingQuestion.questions[0]?.question ?? 
"Codex requested user input."; + } + + return `Codex requested answers for ${pendingQuestion.questions.length} questions.`; +} + +export function formatPendingQuestionForStatus(pendingQuestion: PendingQuestion): string[] { + return [ + "Pending Question:", + ...pendingQuestion.questions.map((question) => `- ${formatQuestionBlock(question)}`), + ]; +} + +export function serializeQuestionAnswerResponse(response: ToolRequestUserInputResponse): string { + return `${JSON.stringify(response, null, 2)}\n`; +} + +export function parseQuestionAnswerInput( + rawInput: string, + pendingQuestion: PendingQuestion, +): ToolRequestUserInputResponse { + const trimmed = rawInput.trim(); + if (trimmed.length === 0) { + throw new Error("answer payload is empty"); + } + + if (pendingQuestion.questions.length === 1 && !trimmed.startsWith("{")) { + const onlyQuestion = pendingQuestion.questions[0]; + if (!onlyQuestion) { + throw new Error("pending question is missing its question definition"); + } + + return { + answers: { + [onlyQuestion.id]: { + answers: [trimmed], + }, + }, + }; + } + + let parsed: unknown; + try { + parsed = JSON.parse(trimmed); + } catch (error) { + throw new Error( + pendingQuestion.questions.length === 1 + ? `answer payload is not valid JSON: ${error instanceof Error ? 
error.message : String(error)}` + : "multiple pending questions require a JSON object mapping question ids to answers", + ); + } + + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + throw new Error("answer payload must be a JSON object"); + } + + const record = parsed as Record; + if ("answers" in record && record.answers && typeof record.answers === "object" && !Array.isArray(record.answers)) { + const normalizedAnswers: Record = {}; + + for (const [questionId, answerValue] of Object.entries(record.answers as Record)) { + const answers = normalizeAnswerList(answerValue); + if (answers === null) { + throw new Error(`answer payload for '${questionId}' must be a string, string array, or { answers: string[] }`); + } + + normalizedAnswers[questionId] = { answers }; + } + + return { answers: normalizedAnswers }; + } + + const normalizedAnswers: Record = {}; + for (const question of pendingQuestion.questions) { + if (!(question.id in record)) { + throw new Error(`answer payload is missing question id '${question.id}'`); + } + + const answers = normalizeAnswerList(record[question.id]); + if (answers === null) { + throw new Error(`answer payload for '${question.id}' must be a string, string array, or { answers: string[] }`); + } + + normalizedAnswers[question.id] = { answers }; + } + + return { answers: normalizedAnswers }; +} diff --git a/src/core/task-runner.ts b/src/core/task-runner.ts index f45af15..3f2f255 100644 --- a/src/core/task-runner.ts +++ b/src/core/task-runner.ts @@ -68,7 +68,7 @@ function hasPendingTasks(tasks: Task[]): boolean { } export interface TaskRunnerOptions { - runId: string; + runId: RunId; store: RunStore; config?: OrcaConfig; emitHook?: EmitHook; @@ -153,7 +153,11 @@ export async function runTaskRunner(options: TaskRunnerOptions): Promise { if (mockFn) { executeTaskFn = mockFn; } else { - codexSession = await createCodexSession(process.cwd(), config); + codexSession = await createCodexSession(process.cwd(), config, { + 
runId, + store, + emitHook, + }); executeTaskFn = (task, taskRunId, _cfg, systemContext) => codexSession!.executeTask(task, taskRunId, systemContext); } diff --git a/src/state/schema.ts b/src/state/schema.ts index 267eb98..bc4ac03 100644 --- a/src/state/schema.ts +++ b/src/state/schema.ts @@ -28,6 +28,29 @@ const ErrorEntrySchema = z.object({ taskId: z.string().optional() }); +const PendingQuestionOptionSchema = z.object({ + label: z.string(), + description: z.string() +}); + +const PendingQuestionPromptSchema = z.object({ + header: z.string(), + id: z.string(), + question: z.string(), + isOther: z.boolean(), + isSecret: z.boolean(), + options: z.array(PendingQuestionOptionSchema).nullable().optional() +}); + +const PendingQuestionSchema = z.object({ + requestId: z.union([z.string(), z.number().int()]), + threadId: z.string(), + turnId: z.string(), + itemId: z.string(), + receivedAt: z.string(), + questions: z.array(PendingQuestionPromptSchema) +}); + const PrStatusSchema = z.object({ draftTitle: z.string().optional(), draftBody: z.string().optional(), @@ -54,6 +77,7 @@ export const RunStatusSchema = z.object({ tasks: z.array(TaskSchema), milestones: z.array(z.string()), errors: z.array(ErrorEntrySchema), + pendingQuestion: PendingQuestionSchema.optional(), pr: PrStatusSchema.optional() }); diff --git a/src/state/store.ts b/src/state/store.ts index 7385d3d..f243fdf 100644 --- a/src/state/store.ts +++ b/src/state/store.ts @@ -103,7 +103,7 @@ export class RunStore { } private async writeJsonAtomic(filePath: string, data: unknown): Promise { - const tmpPath = `${filePath}.tmp`; + const tmpPath = `${filePath}.${process.pid}.${Date.now()}.${Math.random().toString(16).slice(2)}.tmp`; const payload = `${JSON.stringify(data, null, 2)}\n`; await fs.writeFile(tmpPath, payload, "utf8"); diff --git a/src/types/config-typing.typecheck.ts b/src/types/config-typing.typecheck.ts index f515a95..6450177 100644 --- a/src/types/config-typing.typecheck.ts +++ 
b/src/types/config-typing.typecheck.ts @@ -2,6 +2,10 @@ import { defineOrcaConfig } from "./index.js"; defineOrcaConfig({ hooks: { + onQuestion: async (event) => { + const questionId: string = event.questions[0]?.id ?? ""; + void questionId; + }, onTaskComplete: async (event, context) => { const taskId: string = event.taskId; const taskName: string = event.taskName; diff --git a/src/types/index.ts b/src/types/index.ts index 4de444f..4a55b02 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -48,6 +48,7 @@ export interface RunStatus { tasks: Task[]; milestones: string[]; errors: Array<{ at: string; message: string; taskId?: string }>; + pendingQuestion?: PendingQuestion | undefined; pr?: { draftTitle?: string; draftBody?: string; @@ -57,6 +58,29 @@ export interface RunStatus { }; } +export interface PendingQuestionOption { + label: string; + description: string; +} + +export interface PendingQuestionPrompt { + header: string; + id: string; + question: string; + isOther: boolean; + isSecret: boolean; + options?: PendingQuestionOption[] | null; +} + +export interface PendingQuestion { + requestId: string | number; + threadId: string; + turnId: string; + itemId: string; + receivedAt: string; + questions: PendingQuestionPrompt[]; +} + export interface BaseHookEvent { runId: RunId; message: string; @@ -69,6 +93,14 @@ export interface BaseHookEvent { export interface HookEventMap { onMilestone: BaseHookEvent & { hook: "onMilestone" }; + onQuestion: BaseHookEvent & { + hook: "onQuestion"; + requestId: string | number; + threadId: string; + turnId: string; + itemId: string; + questions: PendingQuestionPrompt[]; + }; onTaskComplete: BaseHookEvent & { hook: "onTaskComplete"; taskId: string; taskName: string }; onTaskFail: BaseHookEvent & { hook: "onTaskFail"; taskId: string; taskName: string; error: string }; onInvalidPlan: BaseHookEvent & { hook: "onInvalidPlan"; error: string }; @@ -152,6 +184,7 @@ export interface OrcaConfig { thinkingLevel?: { decision?: 
CodexEffort; planning?: CodexEffort; + review?: CodexEffort; execution?: CodexEffort; }; command?: string;