ai4curation · justaddcoffee · Aug 29, 2025 · Aug 29, 2025 · Aug 29, 2025 · Aug 30, 2025
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
@@ -0,0 +1,16 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(gh pr view:*)",
+      "Bash(gh pr checkout:*)",
+      "Bash(git stash:*)",
+      "Bash(uv sync:*)",
+      "Bash(make test:*)",
+      "Bash(uv run pytest:*)",
+      "Bash(uv run:*)",
+      "Bash(make:*)"
+    ],
+    "deny": [],
+    "ask": []
+  }
+}
diff --git a/.deepeval/.deepeval_telemetry.txt b/.deepeval/.deepeval_telemetry.txt
@@ -0,0 +1,4 @@
+DEEPEVAL_ID=8f34a7cd-a262-4e39-90b4-f3833cfcfd73
+DEEPEVAL_STATUS=old
+DEEPEVAL_LAST_FEATURE=evaluation
+DEEPEVAL_EVALUATION_STATUS=old
diff --git a/...4o_goose_disease_mcp-simple-pubmed_ols-mcp/gpt-4o_goose_disease/.config/goose/config.yaml b/...4o_goose_disease_mcp-simple-pubmed_ols-mcp/gpt-4o_goose_disease/.config/goose/config.yaml
@@ -0,0 +1,34 @@
+GOOSE_MODEL: claude-sonnet-4-20250514
+GOOSE_PROVIDER: anthropic
+extensions:
+  developer:
+    bundled: true
+    display_name: Developer
+    enabled: true
+    name: developer
+    timeout: 300
+    type: builtin
+  ols:
+    args:
+    - mcp-ols
+    bundled: null
+    cmd: uvx
+    enabled: true
+    env_keys: []
+    envs: {}
+    name: ols
+    timeout: 300
+    type: stdio
+  pubmed:
+    args:
+    - mcp-simple-pubmed
+    bundled: null
+    cmd: uvx
+    enabled: true
+    env_keys:
+    - PUBMED_EMAIL
+    envs:
+      PUBMED_EMAIL: [email protected]
+    name: pubmed
+    timeout: 300
+    type: stdio
diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py
@@ -173,11 +173,16 @@ def run_process(
         """
         if env is None:
             env = self.expand_env(self.env)
+
+        # Decode the child process output as UTF-8 (instead of default encoding)
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
+            stdin=subprocess.DEVNULL,  # Use DEVNULL instead of PIPE to prevent interactive prompts
             text=True,
+            encoding="utf-8",
+            errors="replace",  # avoid crashes on the occasional bad byte
             env=env,
             bufsize=1,
             universal_newlines=True,
@@ -189,6 +194,15 @@ def run_process(
         # check verbosity level
         quiet_mode = logger.getEffectiveLevel() <= logging.INFO
 
+        # Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do).
+        for s in (sys.stdout, sys.stderr):
+            try:
+                s.reconfigure(encoding="utf-8", errors="replace")  # Python 3.7+
+            except Exception as e:
+                logger.info(f"{e}")
+                pass  # OK if not available (e.g., redirected or older Python)
+
+        # lines are already str decoded as UTF-8
         def stream_output(pipe, output_lines, stream):
             for line in iter(pipe.readline, ""):
                 if not quiet_mode:
@@ -352,7 +366,8 @@ def prepare_workdir(self):
 
         if self.config_objects is None:
             self.config_objects = self.default_config_objects()
-        logger.info(f"📁 Preparing workdir: {self.workdir}")
+        logger.info(f"📁 Preparing workdir (relative): {self.workdir}")
+        logger.info(f"                     (resolved): {Path(self.workdir).resolve()}")
         with change_directory(self.workdir):
             # clear old config objects
             for path, _type in self.default_config_paths().items():
@@ -366,7 +381,10 @@ def prepare_workdir(self):
                         path.unlink()
             logger.debug(f"🔧 Writing config objects: {self.config_objects}")
             for config_object in self.config_objects:
-                path = Path(config_object.relative_path)
+                rel = Path(config_object.relative_path)
+                if rel.is_absolute():
+                    raise ValueError(f"Config object path must be relative: {rel}")
+                path = rel
                 path.parent.mkdir(parents=True, exist_ok=True)
                 logger.info(
                     f"🔧 Writing config object: {config_object.relative_path} type={config_object.file_type}"

diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
@@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                 ao.tool_uses = tool_uses
 
             end_time = time.time()
-            logger.info(f"🤖 Command took {end_time - start_time} seconds")
+            logger.info(f"🤖 Command took {end_time - start_time:.2f} seconds")
             ao.total_cost_usd = total_cost_usd
             ao.success = not is_error
             if not ao.success:
@@ -260,5 +260,8 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                         f"Claude authentication failed. Try setting ANTHROPIC_AUTH_TOKEN environment variable or run 'claude setup-token'. "
                         f"For custom endpoints, also set ANTHROPIC_BASE_URL. Original error: {ao.stderr} // {ao}"
                     )
-                raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}")
+                # Don't raise for other errors - let evaluation continue and mark test as failed
+                logger.warning(
+                    f"Claude returned error (test will be marked as failed): {ao.result_text}"
+                )
             return ao
diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py
@@ -3,104 +3,151 @@
 import time
 import logging
 import shutil
+from typing import Any
 
 from metacoder.coders.base_coder import (
     BaseCoder,
     CoderConfigObject,
     CoderOutput,
     FileType,
+    change_directory,
 )
+from metacoder.configuration import ConfigFileRole, MCPConfig, MCPType
 
 
 logger = logging.getLogger(__name__)
 
 
 class CodexCoder(BaseCoder):
     """
-    For AWS bedrock, you may need to copy ~/.aws/
+    OpenAI Codex CLI integration.
 
+    Codex-specific configuration:
+
+    You can provide the following files in your configuration directory:
+
+    - `AGENTS.md` - Primary instructions for the assistant
+    - `.codex/config.toml` - Configuration including MCP servers
+
+    MCP Support:
+
+    Codex CLI supports MCP (Model Context Protocol) servers through the
+    mcp_servers configuration in .codex/config.toml. When MCPs are configured
+    through Metacoder, they will be automatically added to the config file.
+
+    The Codex CLI expects MCP servers to be configured in TOML format:
+
+        [mcp_servers.server_name]
+        command = "uvx"
+        args = ["mcp-server-name"]
+        env = { "API_KEY" = "value" }
+
+    Note: Requires codex CLI to be installed.
     """
 
     @classmethod
     def is_available(cls) -> bool:
         """Check if codex command is available."""
         return shutil.which("codex") is not None
 
+    @classmethod
+    def supports_mcp(cls) -> bool:
+        """Return True: Codex can be configured with MCP servers."""
+        return True
+
+    @classmethod
+    def default_config_paths(cls) -> dict[Path, ConfigFileRole]:
+        """Return the Codex configuration files, keyed by path, with each file's role."""
+        roles_by_path: dict[Path, ConfigFileRole] = {}
+        # Instructions file read by the assistant.
+        roles_by_path[Path("AGENTS.md")] = ConfigFileRole.PRIMARY_INSTRUCTION
+        # Codex settings, including the MCP server tables.
+        roles_by_path[Path(".codex/config.toml")] = ConfigFileRole.CONFIG
+        return roles_by_path
+
     @property
     def instructions_path(self) -> Path:
         return Path("AGENTS.md")
 
-    def default_config_objects(self) -> list[CoderConfigObject]:
-        """
-        extensions:
-            developer:
-                bundled: true
-                display_name: Developer
-                enabled: true
-                name: developer
-                timeout: 300
-                type: builtin
-            pdfreader:
-                args:
-                - mcp-read-pdf
-                bundled: null
-                cmd: uvx
-                description: Read large and complex PDF documents
-                enabled: true
-                env_keys: []
-                envs: {}
-                name: pdfreader
-                timeout: 300
-                type: stdio
-        """
-        return [
-            CoderConfigObject(
-                file_type=FileType.YAML,
-                relative_path=".config/goose/config.yaml",
-                content={
-                    "GOOSE_MODEL": "gpt-4o",
-                    "GOOSE_PROVIDER": "openai",
-                    "extensions": {
-                        "developer": {
-                            "bundled": True,
-                            "display_name": "Developer",
-                            "enabled": True,
-                            "name": "developer",
-                            "timeout": 300,
-                            "type": "builtin",
-                        },
-                        "pdfreader": {
-                            "args": ["mcp-read-pdf"],
-                            "bundled": None,
-                            "cmd": "uvx",
-                            "description": "Read large and complex PDF documents",
-                            "enabled": True,
-                            "env_keys": [],
-                            "envs": {},
-                            "name": "pdfreader",
-                            "timeout": 300,
-                            "type": "stdio",
-                        },
-                    },
-                },
+    def mcp_config_to_codex_format(self, mcp: MCPConfig) -> dict[str, Any]:
+        """Convert MCPConfig to Codex's MCP server format.
+
+        Returns a dict holding ``command`` and, when they are set on ``mcp``,
+        ``args`` and ``env``. The dict is empty when ``mcp`` is a STDIO server
+        without a command, or when its type is neither STDIO nor HTTP.
+
+        Raises:
+            NotImplementedError: if ``mcp.type`` is ``MCPType.HTTP``.
+        """
+        server_config: dict[str, Any] = {}
+
+        # For stdio type MCPs
+        if mcp.type == MCPType.STDIO and mcp.command:
+            server_config["command"] = mcp.command
+            # args and env are optional; they are left out when empty or unset.
+            if mcp.args:
+                server_config["args"] = mcp.args
+            if mcp.env:
+                server_config["env"] = mcp.env
+
+        # For HTTP type MCPs
+        elif mcp.type == MCPType.HTTP:
+            raise NotImplementedError(
+                "HTTP MCPs are not supported for Codex wrapper yet"
             )
-        ]
+
+        # Any case not handled above falls through with an empty dict.
+        return server_config
+
+    def _generate_toml_config(self, mcp_servers: dict[str, dict[str, Any]]) -> str:
+        """Generate TOML configuration string for Codex config.toml.
+
+        Each entry of ``mcp_servers`` becomes one ``[mcp_servers.<name>]``
+        table. Only the ``command``, ``args`` and ``env`` keys of a server
+        config are written; any other key is ignored.
+
+        Strings are written as escaped TOML basic strings, so a quote or a
+        backslash (e.g. in a Windows path) cannot produce an invalid file.
+        A server name that is not a valid bare key (for instance one that
+        contains a dot or a space) is quoted so it stays a single key.
+
+        Args:
+            mcp_servers: Server name -> server config, in the shape returned
+                by ``mcp_config_to_codex_format``.
+
+        Returns:
+            The TOML text, with tables separated by a blank line; the empty
+            string when ``mcp_servers`` is empty.
+        """
+
+        def quote(value: Any) -> str:
+            # json.dumps escapes quotes, backslashes and C0 control characters
+            # in a form TOML basic strings accept; DEL is the one character
+            # TOML additionally requires to be escaped.
+            return json.dumps(str(value), ensure_ascii=False).replace(
+                "\x7f", "\\u007f"
+            )
+
+        def table_key(name: str) -> str:
+            # TOML bare keys may only contain ASCII letters, digits, "_" and "-".
+            is_bare = bool(name) and all(
+                ch.isascii() and (ch.isalnum() or ch in "_-") for ch in name
+            )
+            return name if is_bare else quote(name)
+
+        lines: list[str] = []
+        for server_name, server_config in mcp_servers.items():
+            lines.append(f"[mcp_servers.{table_key(server_name)}]")
+            for key, value in server_config.items():
+                if key == "command":
+                    lines.append(f"command = {quote(value)}")
+                elif key == "args":
+                    args_str = ", ".join(quote(arg) for arg in value)
+                    lines.append(f"args = [{args_str}]")
+                elif key == "env":
+                    env_str = ", ".join(
+                        f"{quote(env_key)} = {quote(env_val)}"
+                        for env_key, env_val in value.items()
+                    )
+                    lines.append(f"env = {{ {env_str} }}")
+            lines.append("")
+
+        return "\n".join(lines)
+
+    def default_config_objects(self) -> list[CoderConfigObject]:
+        """Build the config files Codex needs; currently only the MCP TOML.
+
+        Returns a single ``.codex/config.toml`` text object describing every
+        enabled extension in ``self.config.extensions``, or an empty list
+        when there is no config, no extensions, or none of them is enabled.
+        """
+        if not (self.config and self.config.extensions):
+            return []
+
+        # Disabled extensions are left out of the generated file entirely.
+        enabled_servers = {
+            extension.name: self.mcp_config_to_codex_format(extension)
+            for extension in self.config.extensions
+            if extension.enabled
+        }
+        if not enabled_servers:
+            return []
+
+        return [
+            CoderConfigObject(
+                file_type=FileType.TEXT,
+                relative_path=".codex/config.toml",
+                content=self._generate_toml_config(enabled_servers),
+            )
+        ]
 
     def run(self, input_text: str) -> CoderOutput:
         """
-        Run claude code with the given input text.
+        Run codex with the given input text.
         """
         env = self.expand_env(self.env)
-        # important - ensure that only local config files are used
-        # we assue chdir has been called beforehand
-        env["HOME"] = "."
-        text = self.expand_prompt(input_text)
-        command = ["claude", "-p", "--verbose", "--output-format", "stream-json", text]
-
-        print(f"🤖 Running command: {' '.join(command)}")
-        # time the command
-        start_time = time.time()
-        ao = self.run_process(command, env)
+        self.prepare_workdir()
+
+        with change_directory(self.workdir):
+            # Codex reads .codex/config.toml from current directory automatically.
+            # Do NOT set HOME=. as this breaks authentication (401 Unauthorized).
+            text = self.expand_prompt(input_text)
+            command = ["codex", "exec", "--json", "--dangerously-bypass-approvals-and-sandbox", text]
+
+            print(f"📝 Running command: {' '.join(command)}")
+            # time the command
+            start_time = time.time()
+            ao = self.run_process(command, env)
         # parse the jsonl output
         ao.structured_messages = [
             json.loads(line) for line in ao.stdout.split("\n") if line
@@ -115,9 +162,9 @@ def run(self, input_text: str) -> CoderOutput:
             if "result" in message:
                 ao.result_text = message["result"]
         end_time = time.time()
-        print(f"🤖 Command took {end_time - start_time} seconds")
+        print(f"📝 Command took {end_time - start_time:.2f} seconds")
         ao.total_cost_usd = total_cost_usd
         ao.success = not is_error
         if not ao.success:
-            raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}")
+            raise ValueError(f"Codex failed with error: {ao.stderr} // {ao}")
         return ao