3 changes: 2 additions & 1 deletion src/metacoder/coders/claude.py
@@ -260,5 +260,6 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
f"Claude authentication failed. Try setting ANTHROPIC_AUTH_TOKEN environment variable or run 'claude setup-token'. "
f"For custom endpoints, also set ANTHROPIC_BASE_URL. Original error: {ao.stderr} // {ao}"
)
raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}")
# Don't raise for other errors - let evaluation continue and mark test as failed
logger.warning(f"Claude returned error (test will be marked as failed): {ao.result_text}")
return ao
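The change above replaces a hard `raise` with a logged warning so a single failing coder invocation no longer aborts the whole evaluation run. A minimal, generic sketch of that pattern, not the project's actual code (the `run_coder` callable and `error` field are illustrative assumptions):

```python
import logging

logger = logging.getLogger(__name__)


def run_case(run_coder, case):
    """Run one eval case; record failures instead of raising so the batch continues."""
    output = run_coder(case)  # hypothetical callable returning an output object
    if output.error:          # hypothetical field; metacoder's CoderOutput differs
        # Log and fall through: the caller scores this case as failed.
        logger.warning("coder returned an error for %r: %s", case, output.error)
    return output
```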
77 changes: 73 additions & 4 deletions src/metacoder/evals/eval_model.py
@@ -1,8 +1,49 @@
from typing import Any, Optional, List, Dict
from pydantic import BaseModel, Field
from typing import Any, Optional, List, Dict, Union
from pydantic import BaseModel, Field, model_validator
from metacoder.configuration import AIModelConfig, MCPConfig


class RubricItem(BaseModel):
"""A single rubric scoring guideline."""

score: float = Field(..., description="Score value (typically 0.0 or 1.0)")
criteria: str = Field(..., description="Criteria for this score")


class MetricConfig(BaseModel):
"""Configuration for a metric with optional custom rubric."""

name: str = Field(..., description="Metric name (e.g., CorrectnessMetric)")
rubric: Optional[List[RubricItem]] = Field(
default=None, description="Custom rubric for evaluation"
)
criteria: Optional[str] = Field(
default=None, description="Custom criteria for evaluation (mutually exclusive with evaluation_steps)"
)
evaluation_steps: Optional[List[str]] = Field(
default=None, description="Custom evaluation steps (mutually exclusive with criteria)"
)

@model_validator(mode='after')
def validate_mutual_exclusivity(self):
"""Ensure criteria and evaluation_steps are mutually exclusive and at least one is provided."""
# Check mutual exclusivity
if self.criteria is not None and self.evaluation_steps is not None:
raise ValueError(
"Cannot specify both 'criteria' and 'evaluation_steps'. "
"Use one or the other. evaluation_steps provides more control, "
"while criteria auto-generates steps."
)

# Check that at least one is provided
if self.criteria is None and self.evaluation_steps is None and self.rubric is None:
raise ValueError(
"Must provide at least one of: criteria, evaluation_steps, or rubric"
)

return self


class EvalCase(BaseModel):
"""
A single evaluation test case.
@@ -18,15 +59,43 @@ class EvalCase(BaseModel):
expected_output: "Example Paper Title"
threshold: 0.9
```

Example with custom rubric:
```yaml
name: "retraction_check"
metrics:
- name: CorrectnessMetric
rubric:
- score: 0.0
criteria: "Output indicates paper not retracted"
- score: 1.0
criteria: "Output indicates paper is retracted"
input: "Is PMC4831113 retracted?"
expected_output: "Yes"
```

Example with custom evaluation_steps:
```yaml
name: "exact_text_extraction"
metrics:
- name: CorrectnessMetric
evaluation_steps:
- "Check whether the actual output contains the exact text from expected output"
- "Heavily penalize any deviation, paraphrasing, or added explanations"
- "The text must be verbatim, not approximate"
input: "What is the first sentence of section 2?"
expected_output: "This is the exact sentence."
threshold: 0.9
```
"""

name: str = Field(..., description="Unique identifier for the test case")
group: Optional[str] = Field(
default="Default", description="Test category for result grouping."
)
metrics: List[str] = Field(
metrics: List[Union[str, MetricConfig]] = Field(
...,
description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)",
description="List of metric names or metric configurations with custom rubrics",
)
input: str = Field(
..., description="The prompt or question to send to the AI coder"
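Because `metrics` now accepts `Union[str, MetricConfig]`, a YAML or dict test case can mix plain metric names with inline metric configurations, and pydantic coerces the dicts. A rough sketch of loading such a case directly, assuming `expected_output` and `threshold` are the only other fields required (as in the docstring examples):

```python
from metacoder.evals.eval_model import EvalCase

case = EvalCase.model_validate(
    {
        "name": "retraction_check",
        "metrics": [
            "FaithfulnessMetric",  # plain string: original behaviour
            {                      # dict coerced into MetricConfig
                "name": "CorrectnessMetric",
                "rubric": [
                    {"score": 0.0, "criteria": "Output indicates paper not retracted"},
                    {"score": 1.0, "criteria": "Output indicates paper is retracted"},
                ],
            },
        ],
        "input": "Is PMC4831113 retracted?",
        "expected_output": "Yes",
        "threshold": 0.9,
    }
)
```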
97 changes: 87 additions & 10 deletions src/metacoder/evals/runner.py
@@ -28,7 +28,7 @@

from metacoder.coders.base_coder import BaseCoder, CoderOutput
from metacoder.registry import AVAILABLE_CODERS
from metacoder.evals.eval_model import EvalCase, EvalDataset
from metacoder.evals.eval_model import EvalCase, EvalDataset, MetricConfig
from metacoder.configuration import AIModelConfig, CoderConfig

logger = logging.getLogger(__name__)
@@ -65,11 +65,14 @@ def is_successful(self) -> bool:


def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
"""Creates a GEval instance with the specified model."""
"""Creates a GEval instance with the specified model.

Uses evaluation_steps (not criteria) for more reliable scoring across runs.
"""
return GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
# NOTE: you can only provide either criteria or evaluation_steps, and not both
# Using evaluation_steps for more control and reliability
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also heavily penalize omission of detail",
@@ -85,6 +88,63 @@ def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
)


def make_custom_geval(
metric_config: MetricConfig, model: Optional[DeepEvalBaseLLM] = None
) -> GEval:
"""Creates a GEval instance with custom criteria/rubric/evaluation_steps from MetricConfig.

Args:
metric_config: Configuration with custom evaluation parameters
model: Optional LLM model (defaults to OpenAI GPT-4)

Returns:
Configured GEval instance

Note:
criteria and evaluation_steps are mutually exclusive (enforced by MetricConfig validator).
evaluation_steps provides more control and reliability, while criteria auto-generates steps.
"""
from deepeval.metrics.g_eval.utils import Rubric

# Convert rubric if provided
rubrics = []
if metric_config.rubric:
for item in metric_config.rubric:
rubrics.append(
Rubric(
score_range=(item.score, item.score),
expected_outcome=item.criteria,
)
)

# Build kwargs for GEval
kwargs = {
"name": metric_config.name,
"evaluation_params": [
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
"model": model,
}

# Add evaluation_steps OR criteria (mutually exclusive)
# Note: Pydantic validator already ensures mutual exclusivity
if metric_config.evaluation_steps:
kwargs["evaluation_steps"] = metric_config.evaluation_steps
elif metric_config.criteria:
kwargs["criteria"] = metric_config.criteria
else:
# Default criteria if only rubric provided
kwargs["criteria"] = "Evaluate the actual output based on the rubric criteria."

# Add rubric if provided
if rubrics:
kwargs["rubric"] = rubrics

return GEval(**kwargs)


def get_default_metrics(
model: Optional[DeepEvalBaseLLM] = None,
) -> Dict[str, BaseMetric]:
@@ -282,14 +342,31 @@ def run_single_eval(
execution_time = time.time() - start_time

# Run each metric
for metric_name in case.metrics:
default_metrics = get_default_metrics()
if metric_name in default_metrics:
metric = default_metrics[metric_name]
for metric_item in case.metrics:
# Handle both string metrics and MetricConfig objects
if isinstance(metric_item, str):
# Original behavior: string metric name
metric_name = metric_item
metric_config = None
else:
# New behavior: MetricConfig object with potential custom rubric
metric_name = metric_item.name
metric_config = metric_item

# Create the metric instance
if metric_config and (metric_config.rubric or metric_config.criteria or metric_config.evaluation_steps):
# Use custom configuration if provided
logger.info(f"Using custom configuration for {metric_name}")
metric = make_custom_geval(metric_config, model=None)
else:
# Get metric class and instantiate
metric_class = self.get_metric_class(metric_name)
metric = metric_class(threshold=case.threshold) # type: ignore
# Use default metric behavior
default_metrics = get_default_metrics()
if metric_name in default_metrics:
metric = default_metrics[metric_name]
else:
# Get metric class and instantiate
metric_class = self.get_metric_class(metric_name)
metric = metric_class(threshold=case.threshold) # type: ignore

# Create test case
test_case = self.create_test_case(case, actual_output)
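For reference, `make_custom_geval` can be exercised on its own. A quick sketch, assuming a deepeval model backend (e.g. `OPENAI_API_KEY`) is configured, since `GEval` falls back to its default LLM when `model` is `None`:

```python
from metacoder.evals.eval_model import MetricConfig, RubricItem
from metacoder.evals.runner import make_custom_geval

config = MetricConfig(
    name="CorrectnessMetric",
    rubric=[
        RubricItem(score=0.0, criteria="Output indicates paper not retracted"),
        RubricItem(score=1.0, criteria="Output indicates paper is retracted"),
    ],
)

# Rubric-only config: make_custom_geval falls back to its default criteria string.
metric = make_custom_geval(config)
```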
70 changes: 70 additions & 0 deletions tests/test_custom_geval.py
@@ -0,0 +1,70 @@
"""Test custom GEval metric configuration."""

import pytest
from metacoder.evals.eval_model import MetricConfig, RubricItem


def test_evaluation_steps_only():
"""Test creating MetricConfig with only evaluation_steps."""
m = MetricConfig(
name="CorrectnessMetric", evaluation_steps=["Check if output is correct"]
)
assert m.evaluation_steps == ["Check if output is correct"]
assert m.criteria is None
assert m.rubric is None


def test_criteria_only():
"""Test creating MetricConfig with only criteria."""
m = MetricConfig(name="CorrectnessMetric", criteria="Check correctness")
assert m.criteria == "Check correctness"
assert m.evaluation_steps is None
assert m.rubric is None


def test_rubric_only():
"""Test creating MetricConfig with only rubric."""
rubric = [
RubricItem(score=0.0, criteria="Wrong"),
RubricItem(score=1.0, criteria="Correct"),
]
m = MetricConfig(name="CorrectnessMetric", rubric=rubric)
assert len(m.rubric) == 2
assert m.criteria is None
assert m.evaluation_steps is None


def test_criteria_and_evaluation_steps_mutually_exclusive():
"""Test that providing both criteria and evaluation_steps raises ValueError."""
with pytest.raises(ValueError, match="Cannot specify both"):
MetricConfig(
name="CorrectnessMetric",
criteria="Check correctness",
evaluation_steps=["Step 1"],
)


def test_requires_at_least_one():
"""Test that at least one of criteria/evaluation_steps/rubric is required."""
with pytest.raises(ValueError, match="Must provide at least one"):
MetricConfig(name="CorrectnessMetric")


def test_criteria_with_rubric():
"""Test that criteria can be combined with rubric."""
rubric = [RubricItem(score=0.0, criteria="Wrong")]
m = MetricConfig(
name="CorrectnessMetric", criteria="Check correctness", rubric=rubric
)
assert m.criteria == "Check correctness"
assert len(m.rubric) == 1


def test_evaluation_steps_with_rubric():
"""Test that evaluation_steps can be combined with rubric."""
rubric = [RubricItem(score=1.0, criteria="Correct")]
m = MetricConfig(
name="CorrectnessMetric", evaluation_steps=["Step 1"], rubric=rubric
)
assert m.evaluation_steps == ["Step 1"]
assert len(m.rubric) == 1