diff --git a/assets/lab/environments/AGENTS.md b/assets/lab/environments/AGENTS.md index 3b1493e9f..ebd0db161 100644 --- a/assets/lab/environments/AGENTS.md +++ b/assets/lab/environments/AGENTS.md @@ -712,8 +712,8 @@ by the framework; do not accept `None` or write `config = config or MyEnvConfig( knobs on `TasksetConfig` or `HarnessConfig`, not on `EnvConfig` itself. Environment packages should not subclass `Env`. Reusable taskset environments can type `harness` as `vf.HarnessConfig`; TOML can -then select a registered harness with names like `type = "terminus2"` or -`type = "pi"` inside the harness table. +then select a registered harness with names like `type = "codex"`, +`type = "claude-code"`, or `type = "pi"` inside the harness table. The taskset-only shape is: diff --git a/docs/byo-harness.md b/docs/byo-harness.md index e736d5058..b3cc17ce0 100644 --- a/docs/byo-harness.md +++ b/docs/byo-harness.md @@ -429,9 +429,9 @@ config surface; do not subclass `Env` just to bypass inference. Packaged CLI harnesses should use the same boundary. These implementations live under `verifiers.v1.packages` while the v1 surface stabilizes, and are -re-exported through `verifiers.v1`. `OpenCode`, `Pi`, `MiniSWEAgent`, -`Terminus2`, and `RLM` are bundled `Harness` leaf wrappers for common -command-line agents: +re-exported through `verifiers.v1`. `OpenCode`, `ClaudeCode`, `Codex`, `Pi`, +`MiniSWEAgent`, `Terminus2`, and `RLM` are bundled `Harness` leaf wrappers for +common command-line agents: ```python class HarborEnvConfig(vf.EnvConfig): @@ -543,8 +543,8 @@ and harness config types for the loader. Reusable taskset environments can keep `harness` typed as `vf.HarnessConfig`. Then TOML may select a registered harness config with `type`, for example -`type = "terminus2"` or `type = "pi"`, and pass that config's ordinary fields -beside it. 
Use `harness = "pi"` when the selected harness needs no field +`type = "codex"` or `type = "claude-code"`, and pass that config's ordinary +fields beside it. Use `harness = "pi"` when the selected harness needs no field overrides. ```python diff --git a/docs/environments.md b/docs/environments.md index 19c4845c5..fb925a726 100644 --- a/docs/environments.md +++ b/docs/environments.md @@ -705,8 +705,8 @@ by the framework; do not accept `None` or write `config = config or MyEnvConfig( knobs on `TasksetConfig` or `HarnessConfig`, not on `EnvConfig` itself. Environment packages should not subclass `Env`. Reusable taskset environments can type `harness` as `vf.HarnessConfig`; TOML can -then select a registered harness with names like `type = "terminus2"` or -`type = "pi"` inside the harness table. +then select a registered harness with names like `type = "codex"`, +`type = "claude-code"`, or `type = "pi"` inside the harness table. The taskset-only shape is: diff --git a/docs/evaluation.md b/docs/evaluation.md index f31ae5452..8feb2e10b 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -383,15 +383,15 @@ optional: | `endpoint_id` | string | Endpoint registry id (requires TOML `endpoints_path`) | Use `harness.type` to choose a registered v1 harness config for reusable taskset -environments. Bundled names include `opencode`, `mini-swe-agent`, `pi`, `rlm`, -and `terminus2`: +environments. Bundled names include `opencode`, `claude-code`, `codex`, +`mini-swe-agent`, `pi`, `rlm`, and `terminus2`: ```toml [[eval]] id = "openthoughts-tblite" [eval.harness] -type = "terminus2" +type = "claude-code" max_turns = 4 ``` diff --git a/docs/reference.md b/docs/reference.md index 9c4bf7990..f5771ef61 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1008,9 +1008,9 @@ Nested config defaults should be explicit config objects, e.g. `taskset: MyTasksetConfig = MyTasksetConfig()`. 
When `harness` is typed as `HarnessConfig`, TOML can select a registered -harness config with `type`, such as `type = "terminus2"` or `type = "pi"`, then -pass the normal fields for that config in the same table. The shorthand form -`harness = "pi"` is also accepted when no fields need to be overridden. +harness config with `type`, such as `type = "codex"` or `type = "claude-code"`, +then pass the normal fields for that config in the same table. The shorthand +form `harness = "pi"` is also accepted when no fields need to be overridden. `Config` subclasses are strict Pydantic config models. Validate raw mappings with `MyConfig.model_validate(...)` or use the typed object directly. diff --git a/environments/AGENTS.md b/environments/AGENTS.md index a0f681f85..419637d0f 100644 --- a/environments/AGENTS.md +++ b/environments/AGENTS.md @@ -711,8 +711,8 @@ by the framework; do not accept `None` or write `config = config or MyEnvConfig( knobs on `TasksetConfig` or `HarnessConfig`, not on `EnvConfig` itself. Environment packages should not subclass `Env`. Reusable taskset environments can type `harness` as `vf.HarnessConfig`; TOML can -then select a registered harness with names like `type = "terminus2"` or -`type = "pi"` inside the harness table. +then select a registered harness with names like `type = "codex"`, +`type = "claude-code"`, or `type = "pi"` inside the harness table. 
The taskset-only shape is: diff --git a/tests/test_v1_config_extension.py b/tests/test_v1_config_extension.py index d2b601327..d9fead1cd 100644 --- a/tests/test_v1_config_extension.py +++ b/tests/test_v1_config_extension.py @@ -1565,19 +1565,22 @@ class LocalEnvConfig(EnvConfig): @pytest.mark.parametrize( - ("alias", "config_cls", "harness_cls"), + ("alias", "config_cls", "harness_cls", "config_fields"), [ - ("opencode", vf.OpenCodeConfig, vf.OpenCode), - ("open-code", vf.OpenCodeConfig, vf.OpenCode), - ("mini-swe-agent", vf.MiniSWEAgentConfig, vf.MiniSWEAgent), - ("pi", vf.PiConfig, vf.Pi), - ("rlm", vf.RLMConfig, vf.RLM), - ("terminus2", vf.Terminus2Config, vf.Terminus2), - ("terminus-2", vf.Terminus2Config, vf.Terminus2), + ("opencode", vf.OpenCodeConfig, vf.OpenCode, {"max_turns": 4}), + ("open-code", vf.OpenCodeConfig, vf.OpenCode, {"max_turns": 4}), + ("claude", vf.ClaudeCodeConfig, vf.ClaudeCode, {"max_turns": 4}), + ("claude-code", vf.ClaudeCodeConfig, vf.ClaudeCode, {"max_turns": 4}), + ("codex", vf.CodexConfig, vf.Codex, {}), + ("mini-swe-agent", vf.MiniSWEAgentConfig, vf.MiniSWEAgent, {"max_turns": 4}), + ("pi", vf.PiConfig, vf.Pi, {"max_turns": 4}), + ("rlm", vf.RLMConfig, vf.RLM, {"max_turns": 4}), + ("terminus2", vf.Terminus2Config, vf.Terminus2, {"max_turns": 4}), + ("terminus-2", vf.Terminus2Config, vf.Terminus2, {"max_turns": 4}), ], ) def test_env_config_harness_type_selects_packaged_harness_config( - alias, config_cls, harness_cls + alias, config_cls, harness_cls, config_fields ) -> None: class GenericEnvConfig(EnvConfig): taskset: TasksetConfig = TasksetConfig(source=[]) @@ -1585,12 +1588,13 @@ class GenericEnvConfig(EnvConfig): config = coerce_config( GenericEnvConfig, - {"harness": {"type": alias, "max_turns": 4}}, + {"harness": {"type": alias, **config_fields}}, ) env = Env(config=config) assert isinstance(config.harness, config_cls) - assert config.harness.max_turns == 4 + if "max_turns" in config_fields: + assert config.harness.max_turns == 
4 assert isinstance(env.harness, harness_cls) diff --git a/tests/test_v1_harbor_cli.py b/tests/test_v1_harbor_cli.py index ead97349b..6a909aec4 100644 --- a/tests/test_v1_harbor_cli.py +++ b/tests/test_v1_harbor_cli.py @@ -12,6 +12,8 @@ import verifiers as root_vf import verifiers.v1 as vf +from verifiers.v1.packages.harnesses.claude_code import claude_code_mcp_json +from verifiers.v1.packages.harnesses.codex import codex_mcp_toml from verifiers.v1.packages.harnesses.configs import ( TERMINUS_2_DEFAULT_API_BASE_URL, TERMINUS_2_DEFAULT_HARBOR_PACKAGE, @@ -212,9 +214,19 @@ async def test_harbor_reward_uses_background_job_for_tests( def test_packaged_harbor_and_opencode_imports_are_reexported() -> None: - from verifiers.v1.packages.harnesses import OpenCode, OpenCodeConfig, Pi + from verifiers.v1.packages.harnesses import ( + ClaudeCode, + Codex, + OpenCode, + OpenCodeConfig, + Pi, + ) from verifiers.v1.packages.tasksets import HarborTaskset + assert vf.ClaudeCode is ClaudeCode + assert root_vf.ClaudeCode is ClaudeCode + assert vf.Codex is Codex + assert root_vf.Codex is Codex assert vf.OpenCode is OpenCode assert vf.OpenCodeConfig is OpenCodeConfig assert vf.Pi is Pi @@ -255,6 +267,8 @@ def test_opencode_config_owns_opencode_harness_fields() -> None: ("harness_cls", "config_cls"), [ (vf.OpenCode, vf.OpenCodeConfig), + (vf.ClaudeCode, vf.ClaudeCodeConfig), + (vf.Codex, vf.CodexConfig), (vf.MiniSWEAgent, vf.MiniSWEAgentConfig), (vf.Pi, vf.PiConfig), (vf.RLM, vf.RLMConfig), @@ -312,6 +326,73 @@ def test_pi_harness_writes_intercepted_model_and_mcp_config() -> None: assert mcp["mcpServers"]["verifiers-tools"]["command"] == "python3" +def test_claude_code_harness_builds_sandbox_program() -> None: + harness = vf.ClaudeCode( + config=vf.ClaudeCodeConfig( + system_prompt="extra system prompt", + agent_workdir="/workspace", + max_turns=7, + ) + ) + program = cast(dict[str, object], harness.program) + command = cast(list[object], program["command"]) + setup = cast(str, 
program["setup"]) + files = cast(dict[str, object], program["files"]) + env = cast(dict[str, object], program["env"]) + mcp = json.loads(claude_code_mcp_json()) + + assert "npm install -g @anthropic-ai/claude-code" in setup + assert "/claude-code/instruction.txt" in files + assert "/claude-code/system.txt" in files + assert program["channels"] == "mcp" + assert env["ANTHROPIC_MODEL"] == "runtime.model" + assert env["CLAUDE_CODE_ENABLE_GATEWAY_MODEL_DISCOVERY"] == "1" + assert "cat /claude-code/instruction.txt | claude -p" in cast(str, command[2]) + assert '"$(cat /claude-code/instruction.txt)"' not in cast(str, command[2]) + assert "--max-turns 7" in cast(str, command[2]) + assert "--permission-mode bypassPermissions" in cast(str, command[2]) + assert "--mcp-config /tmp/claude-code-mcp.json" in cast(str, command[2]) + assert mcp["mcpServers"]["verifiers-tools"]["command"] == "python3" + + +def test_codex_harness_builds_sandbox_program() -> None: + harness = vf.Codex( + config=vf.CodexConfig( + system_prompt="extra system prompt", + agent_workdir="/workspace", + codex_sandbox="workspace-write", + model_reasoning_effort="high", + ) + ) + program = cast(dict[str, object], harness.program) + command = cast(list[object], program["command"]) + setup = cast(str, program["setup"]) + files = cast(dict[str, object], program["files"]) + env = cast(dict[str, object], program["env"]) + mcp_toml = codex_mcp_toml() + + assert "npm install -g @openai/codex" in setup + assert "/codex/instruction.txt" in files + assert "/codex/system.txt" in files + assert program["channels"] == "mcp" + assert env["OPENAI_MODEL"] == "runtime.model" + assert callable(env["CODEX_API_KEY"]) + assert 'model_provider = "verifiers"' in cast(str, command[2]) + assert 'approval_policy = "never"' in cast(str, command[2]) + assert 'sandbox_mode = "workspace-write"' in cast(str, command[2]) + assert 'model_reasoning_effort = "high"' in cast(str, command[2]) + assert "--sandbox workspace-write" in cast(str, 
command[2]) + assert "--output-last-message /logs/agent/codex.txt.final" in cast(str, command[2]) + assert "- < /logs/agent/codex.txt.prompt" in cast(str, command[2]) + assert '"$(cat /logs/agent/codex.txt.prompt)"' not in cast(str, command[2]) + assert 'command = "python3"' in mcp_toml + + +def test_codex_config_rejects_max_turns() -> None: + with pytest.raises(ValueError, match="CodexConfig.max_turns is not supported"): + vf.CodexConfig(max_turns=7) + + def test_terminus_2_harness_builds_sandbox_program() -> None: harness = vf.Terminus2( config=vf.Terminus2Config( diff --git a/verifiers/__init__.py b/verifiers/__init__.py index b39b0e3f7..23bf30e43 100644 --- a/verifiers/__init__.py +++ b/verifiers/__init__.py @@ -88,6 +88,10 @@ "UserConfig", "HarborTaskset", "HarborTasksetConfig", + "ClaudeCode", + "ClaudeCodeConfig", + "Codex", + "CodexConfig", "MiniSWEAgent", "MiniSWEAgentConfig", "OpenCode", @@ -219,6 +223,10 @@ "UserConfig": "verifiers.v1:UserConfig", "HarborTaskset": "verifiers.v1:HarborTaskset", "HarborTasksetConfig": "verifiers.v1:HarborTasksetConfig", + "ClaudeCode": "verifiers.v1:ClaudeCode", + "ClaudeCodeConfig": "verifiers.v1:ClaudeCodeConfig", + "Codex": "verifiers.v1:Codex", + "CodexConfig": "verifiers.v1:CodexConfig", "MiniSWEAgent": "verifiers.v1:MiniSWEAgent", "MiniSWEAgentConfig": "verifiers.v1:MiniSWEAgentConfig", "OpenCode": "verifiers.v1:OpenCode", @@ -315,6 +323,10 @@ def __getattr__(name: str): HarborTasksetConfig, MCPTool, MCPToolConfig, + ClaudeCode, + ClaudeCodeConfig, + Codex, + CodexConfig, MiniSWEAgent, MiniSWEAgentConfig, MutableConfigMap, diff --git a/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md b/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md index 3c40300e7..187967bb2 100644 --- a/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +++ b/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md @@ -42,8 +42,8 @@ your loader runs. The type annotation is not cosmetic. your loader can convert or forward them. 3. 
If your environment has custom harness fields, the same rule applies to the `harness` annotation unless TOML selects a registered harness config with - `[env.harness] type = "terminus2"`, `[env.harness] type = "pi"`, or another - owner/config alias. + `[env.harness] type = "codex"`, `[env.harness] type = "claude-code"`, or + another owner/config alias. 4. The config object that reaches `load_environment` is already validated and typed. Do not reconstruct child config objects just to recover their type. 5. `vf.Env(taskset=MyTaskset(config=config.taskset), harness=MyHarness(config=config.harness))` diff --git a/verifiers/v1/README.md b/verifiers/v1/README.md index bb3ed27d7..3d7ae7433 100644 --- a/verifiers/v1/README.md +++ b/verifiers/v1/README.md @@ -574,9 +574,9 @@ signature. Reusable CLI programs should be packaged as `Harness` subclasses. Package implementations live under `verifiers.v1.packages` while the v1 API stabilizes, -and are re-exported from `verifiers.v1` for normal use. `OpenCode`, `Pi`, -`MiniSWEAgent`, `Terminus2`, and `RLM` are bundled `Harness` leaf wrappers for -common coding-agent CLIs. +and are re-exported from `verifiers.v1` for normal use. `OpenCode`, +`ClaudeCode`, `Codex`, `Pi`, `MiniSWEAgent`, `Terminus2`, and `RLM` are +bundled `Harness` leaf wrappers for common coding-agent CLIs. ```python import verifiers as vf @@ -603,6 +603,8 @@ endpoint and, when tools are enabled, installs the Pi MCP adapter and writes a project `.mcp.json`. Neither side needs to know the other's private fields. `MiniSWEAgent` owns mini-swe-agent installation, config layering, endpoint env, and log/trajectory artifacts. +`ClaudeCode` and `Codex` package the Claude Code and Codex CLI non-interactive +modes with endpoint, MCP proxy, and log artifact wiring. `Terminus2` owns Harbor Terminus agent installation, endpoint env, and log artifacts. 
`RLM` follows the same boundary for recursive LLM runs: `HarborTaskset` owns @@ -1243,8 +1245,8 @@ and harness config types for the loader. Reusable taskset environments can keep `harness` typed as `vf.HarnessConfig`. Then TOML may select a registered harness config with `type`, for example -`type = "terminus2"` or `type = "pi"`, and pass that config's ordinary fields -beside it. Use `harness = "pi"` when the selected harness needs no field +`type = "codex"` or `type = "claude-code"`, and pass that config's ordinary +fields beside it. Use `harness = "pi"` when the selected harness needs no field overrides. ```python diff --git a/verifiers/v1/__init__.py b/verifiers/v1/__init__.py index f30904018..2d24b25eb 100644 --- a/verifiers/v1/__init__.py +++ b/verifiers/v1/__init__.py @@ -37,6 +37,10 @@ from .env import Env from .harness import Harness from .packages.harnesses import ( + ClaudeCode, + ClaudeCodeConfig, + Codex, + CodexConfig, MiniSWEAgent, MiniSWEAgentConfig, OpenCode, @@ -80,6 +84,10 @@ __all__ = [ "ConfigData", "CallableConfig", + "ClaudeCode", + "ClaudeCodeConfig", + "Codex", + "CodexConfig", "Config", "ConfigMap", "Env", diff --git a/verifiers/v1/packages/harnesses/__init__.py b/verifiers/v1/packages/harnesses/__init__.py index 423e1fd45..6a8455136 100644 --- a/verifiers/v1/packages/harnesses/__init__.py +++ b/verifiers/v1/packages/harnesses/__init__.py @@ -1,10 +1,14 @@ from .configs import ( + ClaudeCodeConfig, + CodexConfig, MiniSWEAgentConfig, OpenCodeConfig, PiConfig, RLMConfig, Terminus2Config, ) +from .claude_code import ClaudeCode +from .codex import Codex from .mini_swe_agent import MiniSWEAgent from .opencode import OpenCode from .pi import Pi @@ -12,6 +16,10 @@ from .terminus_2 import Terminus2 __all__ = [ + "ClaudeCode", + "ClaudeCodeConfig", + "Codex", + "CodexConfig", "MiniSWEAgent", "MiniSWEAgentConfig", "OpenCode", diff --git a/verifiers/v1/packages/harnesses/claude_code.py b/verifiers/v1/packages/harnesses/claude_code.py new file mode 
100644 index 000000000..1ccb012a0 --- /dev/null +++ b/verifiers/v1/packages/harnesses/claude_code.py @@ -0,0 +1,133 @@ +import json +import shlex +from pathlib import PurePosixPath +from typing import ClassVar, cast + +from .command import configure_command_harness +from .configs import ClaudeCodeConfig +from ...harness import Harness +from ...types import ProgramCommand, ProgramChannels, ProgramOptionMap +from ...utils.mcp_proxy_utils import proxy_command + + +class ClaudeCode(Harness[ClaudeCodeConfig]): + _config_aliases: ClassVar[tuple[str, ...]] = ("claude", "claude-code") + + def __init__(self, config: ClaudeCodeConfig | None = None): + config = cast(ClaudeCodeConfig, self._coerce_config(config)) + super().__init__(config=config.model_copy(update={"program": None})) + self.config = config + configure_command_harness( + self, + config, + command=self.command(config), + setup=self.setup(config), + env=self.env(config), + artifacts=self.artifacts(config), + channels=self.channels(config), + ) + + def command(self, config: ClaudeCodeConfig) -> ProgramCommand: + return [ + "bash", + "-lc", + build_claude_code_run_script( + agent_workdir=config.agent_workdir, + instruction_path=config.instruction_path, + system_prompt_path=config.system_prompt_path + if config.system_prompt is not None + else None, + log_path=config.log_path, + permission_mode=config.permission_mode, + max_turns=config.max_turns, + ), + ] + + def setup(self, config: ClaudeCodeConfig) -> str: + return build_claude_code_install_script(package=config.package) + + def env(self, config: ClaudeCodeConfig) -> ProgramOptionMap: + return { + "ANTHROPIC_MODEL": "runtime.model", + "CLAUDE_CODE_ENABLE_GATEWAY_MODEL_DISCOVERY": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + "DISABLE_TELEMETRY": "1", + } + + def artifacts(self, config: ClaudeCodeConfig) -> ProgramOptionMap: + return { + "claude_code_log": { + "path": config.log_path, + "format": "text", + "optional": True, + } + } + + def 
channels(self, config: ClaudeCodeConfig) -> ProgramChannels: + return "mcp" + + +def build_claude_code_install_script(package: str) -> str: + return f"""\ +set -e +apt-get -o Acquire::Retries=3 update -qq && apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates nodejs npm > /dev/null 2>&1 +npm install -g {shlex.quote(package)} +""" + + +def build_claude_code_run_script( + *, + agent_workdir: str, + instruction_path: str, + system_prompt_path: str | None, + log_path: str, + permission_mode: str, + max_turns: int, +) -> str: + log_dir = str(PurePosixPath(log_path).parent) + final_path = f"{log_path}.final" + mcp_config_path = "/tmp/claude-code-mcp.json" + mcp_config = claude_code_mcp_json() + system_prompt_arg = ( + f'--append-system-prompt "$(cat {shlex.quote(system_prompt_path)})"' + if system_prompt_path is not None + else "" + ) + return f"""\ +set -eo pipefail + +CLAUDE_WORKDIR="${{AGENT_WORKDIR:-}}" +if [ -z "$CLAUDE_WORKDIR" ]; then + CLAUDE_WORKDIR={shlex.quote(agent_workdir)} +fi + +mkdir -p {shlex.quote(log_dir)} "$CLAUDE_WORKDIR" +cat > {shlex.quote(mcp_config_path)} <<'EOFMCP' +{mcp_config} +EOFMCP + +cd "$CLAUDE_WORKDIR" +cat {shlex.quote(instruction_path)} | claude -p \ + --model "$ANTHROPIC_MODEL" \ + --max-turns {int(max_turns)} \ + --permission-mode {shlex.quote(permission_mode)} \ + --mcp-config {shlex.quote(mcp_config_path)} \ + --output-format text \ + {system_prompt_arg} > {shlex.quote(final_path)} 2> {shlex.quote(log_path)} +cat {shlex.quote(final_path)} >> {shlex.quote(log_path)} +cat {shlex.quote(final_path)} +""" + + +def claude_code_mcp_json() -> str: + command, *args = proxy_command() + config = { + "mcpServers": { + "verifiers-tools": { + "type": "stdio", + "command": command, + "args": args, + } + } + } + return json.dumps(config, indent=2) diff --git a/verifiers/v1/packages/harnesses/codex.py b/verifiers/v1/packages/harnesses/codex.py new file mode 100644 index 000000000..2907888fe --- /dev/null +++ 
b/verifiers/v1/packages/harnesses/codex.py @@ -0,0 +1,148 @@ +import json +import shlex +from pathlib import PurePosixPath +from typing import ClassVar, cast + +from .command import configure_command_harness +from .configs import CodexConfig +from ...harness import Harness +from ...state import State +from ...types import ProgramCommand, ProgramChannels, ProgramOptionMap +from ...utils.mcp_proxy_utils import proxy_command + + +class Codex(Harness[CodexConfig]): + _config_aliases: ClassVar[tuple[str, ...]] = ("codex", "codex-cli") + + def __init__(self, config: CodexConfig | None = None): + config = cast(CodexConfig, self._coerce_config(config)) + super().__init__(config=config.model_copy(update={"program": None})) + self.config = config + configure_command_harness( + self, + config, + command=self.command(config), + setup=self.setup(config), + env=self.env(config), + artifacts=self.artifacts(config), + channels=self.channels(config), + ) + + def command(self, config: CodexConfig) -> ProgramCommand: + return [ + "bash", + "-lc", + build_codex_run_script( + agent_workdir=config.agent_workdir, + instruction_path=config.instruction_path, + system_prompt_path=config.system_prompt_path + if config.system_prompt is not None + else None, + log_path=config.log_path, + codex_sandbox=config.codex_sandbox, + model_reasoning_effort=config.model_reasoning_effort, + ), + ] + + def setup(self, config: CodexConfig) -> str: + return build_codex_install_script(package=config.package) + + def env(self, config: CodexConfig) -> ProgramOptionMap: + return { + "OPENAI_MODEL": "runtime.model", + "CODEX_API_KEY": codex_api_key, + "DISABLE_TELEMETRY": "1", + } + + def artifacts(self, config: CodexConfig) -> ProgramOptionMap: + return { + "codex_log": { + "path": config.log_path, + "format": "text", + "optional": True, + } + } + + def channels(self, config: CodexConfig) -> ProgramChannels: + return "mcp" + + +def build_codex_install_script(package: str) -> str: + return f"""\ +set -e +apt-get 
-o Acquire::Retries=3 update -qq && apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates nodejs npm > /dev/null 2>&1 +npm install -g {shlex.quote(package)} +""" + + +def build_codex_run_script( + *, + agent_workdir: str, + instruction_path: str, + system_prompt_path: str | None, + log_path: str, + codex_sandbox: str, + model_reasoning_effort: str | None, +) -> str: + log_dir = str(PurePosixPath(log_path).parent) + final_path = f"{log_path}.final" + prompt_path = f"{log_path}.prompt" + effort_config = ( + f"model_reasoning_effort = {json.dumps(model_reasoning_effort)}\n" + if model_reasoning_effort is not None + else "" + ) + mcp_toml = codex_mcp_toml() + system_prompt = ( + f"cat {shlex.quote(system_prompt_path)} > {shlex.quote(prompt_path)}\n" + f"printf '\\n\\n' >> {shlex.quote(prompt_path)}" + if system_prompt_path is not None + else f": > {shlex.quote(prompt_path)}" + ) + return f"""\ +set -eo pipefail + +CODEX_WORKDIR="${{AGENT_WORKDIR:-}}" +if [ -z "$CODEX_WORKDIR" ]; then + CODEX_WORKDIR={shlex.quote(agent_workdir)} +fi + +mkdir -p {shlex.quote(log_dir)} "$CODEX_WORKDIR" "$CODEX_WORKDIR/.codex" +cat > "$CODEX_WORKDIR/.codex/config.toml" <> {shlex.quote(prompt_path)} + +cd "$CODEX_WORKDIR" +CODEX_HOME="$CODEX_WORKDIR/.codex" codex exec \ + --skip-git-repo-check \ + --sandbox {shlex.quote(codex_sandbox)} \ + --model "$OPENAI_MODEL" \ + --output-last-message {shlex.quote(final_path)} \ + - < {shlex.quote(prompt_path)} > {shlex.quote(log_path)} 2>&1 +cat {shlex.quote(final_path)} +""" + + +def codex_mcp_toml() -> str: + command, *args = proxy_command() + return ( + "[mcp_servers.verifiers-tools]\n" + f"command = {json.dumps(command)}\n" + f"args = {json.dumps(args)}\n" + ) + + +def codex_api_key(state: State) -> str: + return state.get_endpoint_config(api="responses")["api_key"] diff --git a/verifiers/v1/packages/harnesses/configs.py b/verifiers/v1/packages/harnesses/configs.py index 3f084dbab..ce901ed56 100644 --- 
a/verifiers/v1/packages/harnesses/configs.py +++ b/verifiers/v1/packages/harnesses/configs.py @@ -1,4 +1,5 @@ from collections.abc import Mapping +from typing import ClassVar, Literal from pydantic import field_validator @@ -62,6 +63,20 @@ PI_DEFAULT_LOG_PATH = "/logs/agent/pi.txt" PI_DEFAULT_SYSTEM_PROMPT = "Complete the user's task using the available tools." +CLAUDE_CODE_DEFAULT_AGENT_WORKDIR = "/app" +CLAUDE_CODE_DEFAULT_INSTRUCTION_PATH = "/claude-code/instruction.txt" +CLAUDE_CODE_DEFAULT_SYSTEM_PROMPT_PATH = "/claude-code/system.txt" +CLAUDE_CODE_DEFAULT_LOG_PATH = "/logs/agent/claude-code.txt" +CLAUDE_CODE_DEFAULT_PACKAGE = "@anthropic-ai/claude-code" +CLAUDE_CODE_DEFAULT_PERMISSION_MODE = "bypassPermissions" + +CODEX_DEFAULT_AGENT_WORKDIR = "/app" +CODEX_DEFAULT_INSTRUCTION_PATH = "/codex/instruction.txt" +CODEX_DEFAULT_SYSTEM_PROMPT_PATH = "/codex/system.txt" +CODEX_DEFAULT_LOG_PATH = "/logs/agent/codex.txt" +CODEX_DEFAULT_PACKAGE = "@openai/codex" +CODEX_DEFAULT_SANDBOX_MODE = "danger-full-access" + RLM_DEFAULT_REPO_URL = "github.com/PrimeIntellect-ai/rlm-harness.git" RLM_DEFAULT_REPO_REF = "main" RLM_DEFAULT_MAX_TURNS = 100 @@ -129,6 +144,52 @@ class PiConfig(HarnessConfig): max_turns: int = 4 +class ClaudeCodeConfig(HarnessConfig): + _config_aliases: ClassVar[tuple[str, ...]] = ("claude", "claude-code") + + agent_workdir: str = CLAUDE_CODE_DEFAULT_AGENT_WORKDIR + instruction_path: str = CLAUDE_CODE_DEFAULT_INSTRUCTION_PATH + system_prompt_path: str = CLAUDE_CODE_DEFAULT_SYSTEM_PROMPT_PATH + log_path: str = CLAUDE_CODE_DEFAULT_LOG_PATH + system_prompt: PromptInput | None = None + package: str = CLAUDE_CODE_DEFAULT_PACKAGE + permission_mode: Literal[ + "default", + "acceptEdits", + "plan", + "auto", + "dontAsk", + "bypassPermissions", + ] = CLAUDE_CODE_DEFAULT_PERMISSION_MODE + sandbox: SandboxConfig | None = SandboxConfig() + max_turns: int = 4 + + +class CodexConfig(HarnessConfig): + _config_aliases: ClassVar[tuple[str, ...]] = ("codex", "codex-cli") + 
+ agent_workdir: str = CODEX_DEFAULT_AGENT_WORKDIR + instruction_path: str = CODEX_DEFAULT_INSTRUCTION_PATH + system_prompt_path: str = CODEX_DEFAULT_SYSTEM_PROMPT_PATH + log_path: str = CODEX_DEFAULT_LOG_PATH + system_prompt: PromptInput | None = None + package: str = CODEX_DEFAULT_PACKAGE + codex_sandbox: Literal["read-only", "workspace-write", "danger-full-access"] = ( + CODEX_DEFAULT_SANDBOX_MODE + ) + model_reasoning_effort: ( + Literal["minimal", "low", "medium", "high", "xhigh"] | None + ) = None + sandbox: SandboxConfig | None = SandboxConfig() + + @field_validator("max_turns") + @classmethod + def validate_no_max_turns(cls, value: int) -> int: + if value != HarnessConfig.model_fields["max_turns"].default: + raise ValueError("CodexConfig.max_turns is not supported by codex exec.") + return value + + class RLMConfig(HarnessConfig): workdir: str = RLM_DEFAULT_WORKDIR instruction_path: str = RLM_DEFAULT_INSTRUCTION_PATH