3 changes: 2 additions & 1 deletion src/metacoder/coders/claude.py
@@ -260,5 +260,6 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
f"Claude authentication failed. Try setting ANTHROPIC_AUTH_TOKEN environment variable or run 'claude setup-token'. "
f"For custom endpoints, also set ANTHROPIC_BASE_URL. Original error: {ao.stderr} // {ao}"
)
raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}")
# Don't raise for other errors - let evaluation continue and mark test as failed
logger.warning(f"Claude returned error (test will be marked as failed): {ao.result_text}")
return ao
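The change above replaces a hard `raise` with a logged warning so a single failing coder invocation no longer aborts the whole evaluation run. A minimal, generic sketch of that pattern, not the project's actual code (the `run_coder` callable and `error` field are illustrative assumptions):

```python
import logging

logger = logging.getLogger(__name__)


def run_case(run_coder, case):
    """Run one eval case; record failures instead of raising so the batch continues."""
    output = run_coder(case)  # hypothetical callable returning an output object
    if output.error:          # hypothetical field; metacoder's CoderOutput differs
        # Log and fall through: the caller scores this case as failed.
        logger.warning("coder returned an error for %r: %s", case, output.error)
    return output
```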
77 changes: 73 additions & 4 deletions src/metacoder/evals/eval_model.py
@@ -1,8 +1,49 @@
from typing import Any, Optional, List, Dict
from pydantic import BaseModel, Field
from typing import Any, Optional, List, Dict, Union
from pydantic import BaseModel, Field, model_validator
from metacoder.configuration import AIModelConfig, MCPConfig


class RubricItem(BaseModel):
"""A single rubric scoring guideline."""

score: float = Field(..., description="Score value (typically 0.0 or 1.0)")
criteria: str = Field(..., description="Criteria for this score")


class MetricConfig(BaseModel):
"""Configuration for a metric with optional custom rubric."""

name: str = Field(..., description="Metric name (e.g., CorrectnessMetric)")
rubric: Optional[List[RubricItem]] = Field(
default=None, description="Custom rubric for evaluation"
)
criteria: Optional[str] = Field(
default=None, description="Custom criteria for evaluation (mutually exclusive with evaluation_steps)"
)
evaluation_steps: Optional[List[str]] = Field(
default=None, description="Custom evaluation steps (mutually exclusive with criteria)"
)

@model_validator(mode='after')
def validate_mutual_exclusivity(self):
"""Ensure criteria and evaluation_steps are mutually exclusive and at least one is provided."""
# Check mutual exclusivity
if self.criteria is not None and self.evaluation_steps is not None:
raise ValueError(
"Cannot specify both 'criteria' and 'evaluation_steps'. "
"Use one or the other. evaluation_steps provides more control, "
"while criteria auto-generates steps."
)

# Check that at least one is provided
if self.criteria is None and self.evaluation_steps is None and self.rubric is None:
raise ValueError(
"Must provide at least one of: criteria, evaluation_steps, or rubric"
)

return self


class EvalCase(BaseModel):
"""
A single evaluation test case.
@@ -18,15 +59,43 @@ class EvalCase(BaseModel):
expected_output: "Example Paper Title"
threshold: 0.9
```

Example with custom rubric:
```yaml
name: "retraction_check"
metrics:
- name: CorrectnessMetric
rubric:
- score: 0.0
criteria: "Output indicates paper not retracted"
- score: 1.0
criteria: "Output indicates paper is retracted"
input: "Is PMC4831113 retracted?"
expected_output: "Yes"
```

Example with custom evaluation_steps:
```yaml
name: "exact_text_extraction"
metrics:
- name: CorrectnessMetric
evaluation_steps:
- "Check whether the actual output contains the exact text from expected output"
- "Heavily penalize any deviation, paraphrasing, or added explanations"
- "The text must be verbatim, not approximate"
input: "What is the first sentence of section 2?"
expected_output: "This is the exact sentence."
threshold: 0.9
```
"""

name: str = Field(..., description="Unique identifier for the test case")
group: Optional[str] = Field(
default="Default", description="Test category for result grouping."
)
metrics: List[str] = Field(
metrics: List[Union[str, MetricConfig]] = Field(
...,
description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)",
description="List of metric names or metric configurations with custom rubrics",
)
input: str = Field(
..., description="The prompt or question to send to the AI coder"
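Because `metrics` now accepts `Union[str, MetricConfig]`, a YAML or dict test case can mix plain metric names with inline metric configurations, and pydantic coerces the dicts. A rough sketch of loading such a case directly, assuming `expected_output` and `threshold` are the only other fields required (as in the docstring examples):

```python
from metacoder.evals.eval_model import EvalCase

case = EvalCase.model_validate(
    {
        "name": "retraction_check",
        "metrics": [
            "FaithfulnessMetric",  # plain string: original behaviour
            {                      # dict coerced into MetricConfig
                "name": "CorrectnessMetric",
                "rubric": [
                    {"score": 0.0, "criteria": "Output indicates paper not retracted"},
                    {"score": 1.0, "criteria": "Output indicates paper is retracted"},
                ],
            },
        ],
        "input": "Is PMC4831113 retracted?",
        "expected_output": "Yes",
        "threshold": 0.9,
    }
)
```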
97 changes: 87 additions & 10 deletions src/metacoder/evals/runner.py
@@ -28,7 +28,7 @@

from metacoder.coders.base_coder import BaseCoder, CoderOutput
from metacoder.registry import AVAILABLE_CODERS
from metacoder.evals.eval_model import EvalCase, EvalDataset
from metacoder.evals.eval_model import EvalCase, EvalDataset, MetricConfig
from metacoder.configuration import AIModelConfig, CoderConfig

logger = logging.getLogger(__name__)
@@ -65,11 +65,14 @@ def is_successful(self) -> bool:


def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
"""Creates a GEval instance with the specified model."""
"""Creates a GEval instance with the specified model.

Uses evaluation_steps (not criteria) for more reliable scoring across runs.
"""
return GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
# NOTE: you can only provide either criteria or evaluation_steps, and not both
# Using evaluation_steps for more control and reliability
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also heavily penalize omission of detail",
@@ -85,6 +88,63 @@ def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
)


def make_custom_geval(
metric_config: MetricConfig, model: Optional[DeepEvalBaseLLM] = None
) -> GEval:
"""Creates a GEval instance with custom criteria/rubric/evaluation_steps from MetricConfig.

Args:
metric_config: Configuration with custom evaluation parameters
model: Optional LLM model (defaults to OpenAI GPT-4)

Returns:
Configured GEval instance

Note:
criteria and evaluation_steps are mutually exclusive (enforced by MetricConfig validator).
evaluation_steps provides more control and reliability, while criteria auto-generates steps.
"""
from deepeval.metrics.g_eval.utils import Rubric

# Convert rubric if provided
rubrics = []
if metric_config.rubric:
for item in metric_config.rubric:
rubrics.append(
Rubric(
score_range=(item.score, item.score),
expected_outcome=item.criteria,
)
)

# Build kwargs for GEval
kwargs = {
"name": metric_config.name,
"evaluation_params": [
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
"model": model,
}

# Add evaluation_steps OR criteria (mutually exclusive)
# Note: Pydantic validator already ensures mutual exclusivity
if metric_config.evaluation_steps:
kwargs["evaluation_steps"] = metric_config.evaluation_steps
elif metric_config.criteria:
kwargs["criteria"] = metric_config.criteria
else:
# Default criteria if only rubric provided
kwargs["criteria"] = "Evaluate the actual output based on the rubric criteria."

# Add rubric if provided
if rubrics:
kwargs["rubric"] = rubrics

return GEval(**kwargs)


def get_default_metrics(
model: Optional[DeepEvalBaseLLM] = None,
) -> Dict[str, BaseMetric]:
@@ -282,14 +342,31 @@ def run_single_eval(
execution_time = time.time() - start_time

# Run each metric
for metric_name in case.metrics:
default_metrics = get_default_metrics()
if metric_name in default_metrics:
metric = default_metrics[metric_name]
for metric_item in case.metrics:
# Handle both string metrics and MetricConfig objects
if isinstance(metric_item, str):
# Original behavior: string metric name
metric_name = metric_item
metric_config = None
else:
# New behavior: MetricConfig object with potential custom rubric
metric_name = metric_item.name
metric_config = metric_item

# Create the metric instance
if metric_config and (metric_config.rubric or metric_config.criteria or metric_config.evaluation_steps):
# Use custom configuration if provided
logger.info(f"Using custom configuration for {metric_name}")
metric = make_custom_geval(metric_config, model=None)
else:
# Get metric class and instantiate
metric_class = self.get_metric_class(metric_name)
metric = metric_class(threshold=case.threshold) # type: ignore
# Use default metric behavior
default_metrics = get_default_metrics()
if metric_name in default_metrics:
metric = default_metrics[metric_name]
else:
# Get metric class and instantiate
metric_class = self.get_metric_class(metric_name)
metric = metric_class(threshold=case.threshold) # type: ignore

# Create test case
test_case = self.create_test_case(case, actual_output)
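For reference, `make_custom_geval` can be exercised on its own. A quick sketch, assuming a deepeval model backend (e.g. `OPENAI_API_KEY`) is configured, since `GEval` falls back to its default LLM when `model` is `None`:

```python
from metacoder.evals.eval_model import MetricConfig, RubricItem
from metacoder.evals.runner import make_custom_geval

config = MetricConfig(
    name="CorrectnessMetric",
    rubric=[
        RubricItem(score=0.0, criteria="Output indicates paper not retracted"),
        RubricItem(score=1.0, criteria="Output indicates paper is retracted"),
    ],
)

# Rubric-only config: make_custom_geval falls back to its default criteria string.
metric = make_custom_geval(config)
```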
70 changes: 70 additions & 0 deletions tests/test_custom_geval.py
@@ -0,0 +1,70 @@
"""Test custom GEval metric configuration."""

import pytest
from metacoder.evals.eval_model import MetricConfig, RubricItem


def test_evaluation_steps_only():
"""Test creating MetricConfig with only evaluation_steps."""
m = MetricConfig(
name="CorrectnessMetric", evaluation_steps=["Check if output is correct"]
)
assert m.evaluation_steps == ["Check if output is correct"]
assert m.criteria is None
assert m.rubric is None


def test_criteria_only():
"""Test creating MetricConfig with only criteria."""
m = MetricConfig(name="CorrectnessMetric", criteria="Check correctness")
assert m.criteria == "Check correctness"
assert m.evaluation_steps is None
assert m.rubric is None


def test_rubric_only():
"""Test creating MetricConfig with only rubric."""
rubric = [
RubricItem(score=0.0, criteria="Wrong"),
RubricItem(score=1.0, criteria="Correct"),
]
m = MetricConfig(name="CorrectnessMetric", rubric=rubric)
assert len(m.rubric) == 2
assert m.criteria is None
assert m.evaluation_steps is None


def test_criteria_and_evaluation_steps_mutually_exclusive():
"""Test that providing both criteria and evaluation_steps raises ValueError."""
with pytest.raises(ValueError, match="Cannot specify both"):
MetricConfig(
name="CorrectnessMetric",
criteria="Check correctness",
evaluation_steps=["Step 1"],
)


def test_requires_at_least_one():
"""Test that at least one of criteria/evaluation_steps/rubric is required."""
with pytest.raises(ValueError, match="Must provide at least one"):
MetricConfig(name="CorrectnessMetric")


def test_criteria_with_rubric():
"""Test that criteria can be combined with rubric."""
rubric = [RubricItem(score=0.0, criteria="Wrong")]
m = MetricConfig(
name="CorrectnessMetric", criteria="Check correctness", rubric=rubric
)
assert m.criteria == "Check correctness"
assert len(m.rubric) == 1


def test_evaluation_steps_with_rubric():
"""Test that evaluation_steps can be combined with rubric."""
rubric = [RubricItem(score=1.0, criteria="Correct")]
m = MetricConfig(
name="CorrectnessMetric", evaluation_steps=["Step 1"], rubric=rubric
)
assert m.evaluation_steps == ["Step 1"]
assert len(m.rubric) == 1