Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
cd95918
Enforce UTF-8 for Goose session files.
Aug 29, 2025
a791ce5
Fixes issue #15. Prevents divide by zero errors and cleans up summari…
Aug 29, 2025
49891a3
Cleaned up output by using consistent printing methods.
Aug 29, 2025
46ad344
Fixes Issue #18 by implementing metric downgrades to Claude if OpenAP…
Aug 30, 2025
fc7ba41
Satisfied ruff's bizarre rules.
Aug 30, 2025
54dd3d3
Added extra logging and test for goose UTF-8 handling.
Aug 30, 2025
72f586c
Added metacoder configuration test cases for claude downgrade and no …
Aug 30, 2025
d7beb19
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
d88ca90
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
e7bba40
Added assertion to confirm that ClaudeJudge completed scoring the met…
Aug 30, 2025
d27277b
Added assertion to force test to fail on Exception. Increased logging…
Aug 30, 2025
3f22fc6
Fixed runtime issues related to metric downgrade from CorrectnessMetr…
Aug 30, 2025
d6e1e44
Added test coverage of new evaluation judge functionality. Added test…
Aug 30, 2025
882a3d9
Reduced logging verbosity. Added Anthropic quota check. Added automat…
Sep 2, 2025
c98c9d7
Fixed issue #23. Forced processes to be launched with UTF-8 encoding …
Sep 2, 2025
4761d19
Addressed ruff formatting issue.
Sep 2, 2025
6b64a79
Added output file check to fail if the output file already exists. Ot…
Sep 2, 2025
c436e7f
Modified save_results to append to existing output file rather than o…
Sep 2, 2025
b0b1c8b
Updated ClaudeJudge model to claude-sonnet-4-20250514.
Sep 3, 2025
a7e71e3
Revert "Modified save_results to append to existing output file rathe…
Sep 3, 2025
7e143da
Added UTF-8 encoding to prevent character mangling during YAML export…
Sep 4, 2025
37cbb2f
Added support for grouping test case eval results with 'group' key in…
Sep 4, 2025
bdec2e3
Updated test_runner.py to include Default case_group in EvalResults t…
Sep 4, 2025
9386097
Updated Anthropic fallback mode from claude-3-5-sonnet-20240620 to cl…
Sep 4, 2025
0d855bc
Corrected test cases to match the expected Anthropic model.
Sep 4, 2025
9d9bca0
Removed unnecessary duplicate path element in work directory. Readabi…
Sep 5, 2025
bd474c9
Fix Issue #30. Goose supports an environment variable to disable usin…
Sep 5, 2025
142b8b8
Partially addresses Issue #29 Windows compatibility. Uses os.cwd() in…
Sep 5, 2025
b5faef3
Uses safer XDG_CONFIG_HOME instead of changing HOME environment varia…
Sep 5, 2025
6d6ba8d
Changed informational log message to make it clear that a directory p…
Sep 5, 2025
80772c2
The Goose executable is now detected in a cross-platform way, and the…
Sep 5, 2025
ef6337c
Moved hard-coded values into variables in preparation for cross-platf…
Sep 5, 2025
87d556d
Added OS-specific Goose config folder structures. Replaced hard-coded…
Sep 5, 2025
a911381
Refactored OS environment detection to create relative paths for code…
Sep 18, 2025
a7429aa
Add custom GEval support with criteria, rubric, and evaluation_steps
justaddcoffee Nov 24, 2025
3e90e9b
Fix ruff linting error: remove unused field_validator import
justaddcoffee Nov 24, 2025
3bf2816
Fix GeminiCoder CLI invocation to use positional arguments
justaddcoffee Nov 26, 2025
808e0ae
Copy OAuth credentials to workdir for gemini CLI
justaddcoffee Dec 1, 2025
b2bcfff
Remove HOME="." override - gemini works fine without it
justaddcoffee Dec 1, 2025
997e9a9
Fix: Prevent process hangs by closing stdin in run_process()
justaddcoffee Dec 1, 2025
009b4c6
Fix ruff formatting issues
justaddcoffee Dec 1, 2025
45fb77c
Remove debug test file that was causing import errors
justaddcoffee Dec 1, 2025
f46e1ed
Add model selection support to GeminiCoder
justaddcoffee Dec 2, 2025
55093c8
Fix gemini CLI invocation with MCP servers using -p flag
justaddcoffee Dec 2, 2025
7140ed2
Add --include-directories flag for MCP workspace access
justaddcoffee Dec 3, 2025
873b7fc
Fix --include-directories to use absolute path - resolve() needed
justaddcoffee Dec 3, 2025
74cf121
Fix path duplication - use Path.cwd() instead of Path(workdir).resolve()
justaddcoffee Dec 3, 2025
67b6ab0
Use stdin=DEVNULL instead of PIPE+close() to prevent stdin issues
justaddcoffee Dec 4, 2025
28d16fa
Fix GeminiCoder hanging by adding --output-format text flag
justaddcoffee Dec 4, 2025
fbcff9d
Fix gemini CLI hanging by using positional arguments
justaddcoffee Dec 4, 2025
d12acec
Revert positional argument change - use -p flag with --output-format …
justaddcoffee Dec 5, 2025
369b972
Fix CodexCoder to use codex CLI instead of claude
justaddcoffee Dec 5, 2025
794b32f
Fix codex.py: add prepare_workdir and change_directory for proper MCP…
justaddcoffee Dec 5, 2025
1ac0504
Add MCP support to CodexCoder
justaddcoffee Dec 5, 2025
8d45b4d
Fix CodexCoder 401 Unauthorized error
justaddcoffee Dec 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .claude/settings.local.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"permissions": {
"allow": [
"Bash(gh pr view:*)",
"Bash(gh pr checkout:*)",
"Bash(git stash:*)",
"Bash(uv sync:*)",
"Bash(make test:*)",
"Bash(uv run pytest:*)",
"Bash(uv run:*)",
"Bash(make:*)"
],
"deny": [],
"ask": []
}
}
4 changes: 4 additions & 0 deletions .deepeval/.deepeval_telemetry.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
DEEPEVAL_ID=8f34a7cd-a262-4e39-90b4-f3833cfcfd73
DEEPEVAL_STATUS=old
DEEPEVAL_LAST_FEATURE=evaluation
DEEPEVAL_EVALUATION_STATUS=old
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
GOOSE_MODEL: claude-sonnet-4-20250514
GOOSE_PROVIDER: anthropic
extensions:
developer:
bundled: true
display_name: Developer
enabled: true
name: developer
timeout: 300
type: builtin
ols:
args:
- mcp-ols
bundled: null
cmd: uvx
enabled: true
env_keys: []
envs: {}
name: ols
timeout: 300
type: stdio
pubmed:
args:
- mcp-simple-pubmed
bundled: null
cmd: uvx
enabled: true
env_keys:
- PUBMED_EMAIL
envs:
PUBMED_EMAIL: [email protected]
name: pubmed
timeout: 300
type: stdio
22 changes: 20 additions & 2 deletions src/metacoder/coders/base_coder.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,11 +173,16 @@ def run_process(
"""
if env is None:
env = self.expand_env(self.env)

# Decode the child process output as UTF-8 (instead of default encoding)
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdin=subprocess.DEVNULL, # Use DEVNULL instead of PIPE to prevent interactive prompts
text=True,
encoding="utf-8",
errors="replace", # avoid crashes on the occasional bad byte
env=env,
bufsize=1,
universal_newlines=True,
Expand All @@ -189,6 +194,15 @@ def run_process(
# check verbosity level
quiet_mode = logger.getEffectiveLevel() <= logging.INFO

# Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do).
for s in (sys.stdout, sys.stderr):
try:
s.reconfigure(encoding="utf-8", errors="replace") # Python 3.7+
except Exception as e:
logger.info(f"{e}")
pass # OK if not available (e.g., redirected or older Python)

# lines are already str decoded as UTF-8
def stream_output(pipe, output_lines, stream):
for line in iter(pipe.readline, ""):
if not quiet_mode:
Expand Down Expand Up @@ -352,7 +366,8 @@ def prepare_workdir(self):

if self.config_objects is None:
self.config_objects = self.default_config_objects()
logger.info(f"📁 Preparing workdir: {self.workdir}")
logger.info(f"📁 Preparing workdir (relative): {self.workdir}")
logger.info(f" (resolved): {Path(self.workdir).resolve()}")
with change_directory(self.workdir):
# clear old config objects
for path, _type in self.default_config_paths().items():
Expand All @@ -366,7 +381,10 @@ def prepare_workdir(self):
path.unlink()
logger.debug(f"🔧 Writing config objects: {self.config_objects}")
for config_object in self.config_objects:
path = Path(config_object.relative_path)
rel = Path(config_object.relative_path)
if rel.is_absolute():
raise ValueError(f"Config object path must be relative: {rel}")
path = rel
path.parent.mkdir(parents=True, exist_ok=True)
logger.info(
f"🔧 Writing config object: {config_object.relative_path} type={config_object.file_type}"
Expand Down
7 changes: 5 additions & 2 deletions src/metacoder/coders/claude.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
ao.tool_uses = tool_uses

end_time = time.time()
logger.info(f"🤖 Command took {end_time - start_time} seconds")
logger.info(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
Expand All @@ -260,5 +260,8 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
f"Claude authentication failed. Try setting ANTHROPIC_AUTH_TOKEN environment variable or run 'claude setup-token'. "
f"For custom endpoints, also set ANTHROPIC_BASE_URL. Original error: {ao.stderr} // {ao}"
)
raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}")
# Don't raise for other errors - let evaluation continue and mark test as failed
logger.warning(
f"Claude returned error (test will be marked as failed): {ao.result_text}"
)
return ao
183 changes: 115 additions & 68 deletions src/metacoder/coders/codex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,104 +3,151 @@
import time
import logging
import shutil
from typing import Any

from metacoder.coders.base_coder import (
BaseCoder,
CoderConfigObject,
CoderOutput,
FileType,
change_directory,
)
from metacoder.configuration import ConfigFileRole, MCPConfig, MCPType


logger = logging.getLogger(__name__)


class CodexCoder(BaseCoder):
"""
For AWS bedrock, you may need to copy ~/.aws/
OpenAI Codex CLI integration.

Codex-specific configuration:

You can provide the following files in your configuration directory:

- `AGENTS.md` - Primary instructions for the assistant
- `.codex/config.toml` - Configuration including MCP servers

MCP Support:

Codex CLI supports MCP (Model Context Protocol) servers through the
mcp_servers configuration in .codex/config.toml. When MCPs are configured
through Metacoder, they will be automatically added to the config file.

The Codex CLI expects MCP servers to be configured in TOML format:

[mcp_servers.server_name]
command = "uvx"
args = ["mcp-server-name"]
env = { "API_KEY" = "value" }

Note: Requires codex CLI to be installed.
"""

@classmethod
def is_available(cls) -> bool:
    """Report whether a ``codex`` executable can be located via ``shutil.which``."""
    codex_path = shutil.which("codex")
    return codex_path is not None

@classmethod
def supports_mcp(cls) -> bool:
    """Declare MCP extension support; this coder always answers ``True``."""
    return True

@classmethod
def default_config_paths(cls) -> dict[Path, ConfigFileRole]:
    """Map each Codex configuration file (as a relative path) to its role."""
    instructions_file = Path("AGENTS.md")
    settings_file = Path(".codex/config.toml")
    return {
        instructions_file: ConfigFileRole.PRIMARY_INSTRUCTION,
        settings_file: ConfigFileRole.CONFIG,
    }

@property
def instructions_path(self) -> Path:
    """Relative path of the primary instruction file, ``AGENTS.md``."""
    agents_file = Path("AGENTS.md")
    return agents_file

def default_config_objects(self) -> list[CoderConfigObject]:
"""
extensions:
developer:
bundled: true
display_name: Developer
enabled: true
name: developer
timeout: 300
type: builtin
pdfreader:
args:
- mcp-read-pdf
bundled: null
cmd: uvx
description: Read large and complex PDF documents
enabled: true
env_keys: []
envs: {}
name: pdfreader
timeout: 300
type: stdio
"""
return [
CoderConfigObject(
file_type=FileType.YAML,
relative_path=".config/goose/config.yaml",
content={
"GOOSE_MODEL": "gpt-4o",
"GOOSE_PROVIDER": "openai",
"extensions": {
"developer": {
"bundled": True,
"display_name": "Developer",
"enabled": True,
"name": "developer",
"timeout": 300,
"type": "builtin",
},
"pdfreader": {
"args": ["mcp-read-pdf"],
"bundled": None,
"cmd": "uvx",
"description": "Read large and complex PDF documents",
"enabled": True,
"env_keys": [],
"envs": {},
"name": "pdfreader",
"timeout": 300,
"type": "stdio",
},
},
},
def mcp_config_to_codex_format(self, mcp: MCPConfig) -> dict[str, Any]:
"""Convert MCPConfig to Codex's MCP server format."""
server_config: dict[str, Any] = {}

# For stdio type MCPs
if mcp.type == MCPType.STDIO and mcp.command:
server_config["command"] = mcp.command
if mcp.args:
server_config["args"] = mcp.args
if mcp.env:
server_config["env"] = mcp.env

# For HTTP type MCPs
elif mcp.type == MCPType.HTTP:
raise NotImplementedError(
"HTTP MCPs are not supported for Codex wrapper yet"
)
]

return server_config

def _generate_toml_config(self, mcp_servers: dict[str, dict[str, Any]]) -> str:
"""Generate TOML configuration string for Codex config.toml."""
lines = []

for server_name, server_config in mcp_servers.items():
lines.append(f"[mcp_servers.{server_name}]")
for key, value in server_config.items():
if key == "command":
lines.append(f'command = "{value}"')
elif key == "args":
args_str = ", ".join(f'"{arg}"' for arg in value)
lines.append(f"args = [{args_str}]")
elif key == "env":
env_parts = []
for env_key, env_val in value.items():
env_parts.append(f'"{env_key}" = "{env_val}"')
env_str = ", ".join(env_parts)
lines.append(f"env = {{ {env_str} }}")
lines.append("")

return "\n".join(lines)

def default_config_objects(self) -> list[CoderConfigObject]:
    """Build the config objects for this coder from its MCP extensions.

    Returns a single ``.codex/config.toml`` text object when at least one
    enabled extension is configured, and an empty list otherwise.
    """
    extensions = self.config.extensions if self.config else None
    if not extensions:
        return []

    # Only enabled extensions are exported to the Codex configuration.
    servers = {
        ext.name: self.mcp_config_to_codex_format(ext)
        for ext in extensions
        if ext.enabled
    }
    if not servers:
        return []

    codex_toml = CoderConfigObject(
        file_type=FileType.TEXT,
        relative_path=".codex/config.toml",
        content=self._generate_toml_config(servers),
    )
    return [codex_toml]

def run(self, input_text: str) -> CoderOutput:
"""
Run claude code with the given input text.
Run codex with the given input text.
"""
env = self.expand_env(self.env)
# important - ensure that only local config files are used
# we assume chdir has been called beforehand
env["HOME"] = "."
text = self.expand_prompt(input_text)
command = ["claude", "-p", "--verbose", "--output-format", "stream-json", text]

print(f"🤖 Running command: {' '.join(command)}")
# time the command
start_time = time.time()
ao = self.run_process(command, env)
self.prepare_workdir()

with change_directory(self.workdir):
# Codex reads .codex/config.toml from current directory automatically.
# Do NOT set HOME=. as this breaks authentication (401 Unauthorized).
text = self.expand_prompt(input_text)
command = ["codex", "exec", "--json", "--dangerously-bypass-approvals-and-sandbox", text]

print(f"📝 Running command: {' '.join(command)}")
# time the command
start_time = time.time()
ao = self.run_process(command, env)
# parse the jsonl output
ao.structured_messages = [
json.loads(line) for line in ao.stdout.split("\n") if line
Expand All @@ -115,9 +162,9 @@ def run(self, input_text: str) -> CoderOutput:
if "result" in message:
ao.result_text = message["result"]
end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"📝 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}")
raise ValueError(f"Codex failed with error: {ao.stderr} // {ao}")
return ao
Loading
Loading