diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
index 1a43295..d380032 100644
--- a/src/metacoder/coders/claude.py
+++ b/src/metacoder/coders/claude.py
@@ -260,5 +260,6 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                     f"Claude authentication failed. Try setting ANTHROPIC_AUTH_TOKEN environment variable or run 'claude setup-token'. "
                     f"For custom endpoints, also set ANTHROPIC_BASE_URL. Original error: {ao.stderr} // {ao}"
                 )
-            raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}")
+            # Don't raise for other errors - let evaluation continue and mark test as failed
+            logger.warning(f"Claude returned error (test will be marked as failed): {ao.result_text}")
         return ao
diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py
index 471c13d..6a60f03 100644
--- a/src/metacoder/evals/eval_model.py
+++ b/src/metacoder/evals/eval_model.py
@@ -1,8 +1,49 @@
-from typing import Any, Optional, List, Dict
-from pydantic import BaseModel, Field
+from typing import Any, Optional, List, Dict, Union
+from pydantic import BaseModel, Field, model_validator
 from metacoder.configuration import AIModelConfig, MCPConfig
 
 
+class RubricItem(BaseModel):
+    """A single rubric scoring guideline."""
+
+    score: float = Field(..., description="Score value (typically 0.0 or 1.0)")
+    criteria: str = Field(..., description="Criteria for this score")
+
+
+class MetricConfig(BaseModel):
+    """Configuration for a metric with optional custom rubric."""
+
+    name: str = Field(..., description="Metric name (e.g., CorrectnessMetric)")
+    rubric: Optional[List[RubricItem]] = Field(
+        default=None, description="Custom rubric for evaluation"
+    )
+    criteria: Optional[str] = Field(
+        default=None, description="Custom criteria for evaluation (mutually exclusive with evaluation_steps)"
+    )
+    evaluation_steps: Optional[List[str]] = Field(
+        default=None, description="Custom evaluation steps (mutually exclusive with criteria)"
+    )
+
+    @model_validator(mode='after')
+    def validate_mutual_exclusivity(self):
+        """Ensure criteria and evaluation_steps are mutually exclusive and at least one is provided."""
+        # Check mutual exclusivity
+        if self.criteria is not None and self.evaluation_steps is not None:
+            raise ValueError(
+                "Cannot specify both 'criteria' and 'evaluation_steps'. "
+                "Use one or the other. evaluation_steps provides more control, "
+                "while criteria auto-generates steps."
+            )
+
+        # Check that at least one is provided
+        if self.criteria is None and self.evaluation_steps is None and self.rubric is None:
+            raise ValueError(
+                "Must provide at least one of: criteria, evaluation_steps, or rubric"
+            )
+
+        return self
+
+
 class EvalCase(BaseModel):
     """
     A single evaluation test case.
@@ -18,15 +59,43 @@ class EvalCase(BaseModel):
         expected_output: "Example Paper Title"
         threshold: 0.9
         ```
+
+    Example with custom rubric:
+        ```yaml
+        name: "retraction_check"
+        metrics:
+          - name: CorrectnessMetric
+            rubric:
+              - score: 0.0
+                criteria: "Output indicates paper not retracted"
+              - score: 1.0
+                criteria: "Output indicates paper is retracted"
+        input: "Is PMC4831113 retracted?"
+        expected_output: "Yes"
+        ```
+
+    Example with custom evaluation_steps:
+        ```yaml
+        name: "exact_text_extraction"
+        metrics:
+          - name: CorrectnessMetric
+            evaluation_steps:
+              - "Check whether the actual output contains the exact text from expected output"
+              - "Heavily penalize any deviation, paraphrasing, or added explanations"
+              - "The text must be verbatim, not approximate"
+        input: "What is the first sentence of section 2?"
+        expected_output: "This is the exact sentence."
+        threshold: 0.9
+        ```
     """
 
     name: str = Field(..., description="Unique identifier for the test case")
     group: Optional[str] = Field(
         default="Default", description="Test category for result grouping."
     )
-    metrics: List[str] = Field(
+    metrics: List[Union[str, MetricConfig]] = Field(
         ...,
-        description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)",
+        description="List of metric names or metric configurations with custom rubrics",
     )
     input: str = Field(
         ..., description="The prompt or question to send to the AI coder"
diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py
index a80060a..f65ca03 100644
--- a/src/metacoder/evals/runner.py
+++ b/src/metacoder/evals/runner.py
@@ -28,7 +28,7 @@
 
 from metacoder.coders.base_coder import BaseCoder, CoderOutput
 from metacoder.registry import AVAILABLE_CODERS
-from metacoder.evals.eval_model import EvalCase, EvalDataset
+from metacoder.evals.eval_model import EvalCase, EvalDataset, MetricConfig
 from metacoder.configuration import AIModelConfig, CoderConfig
 
 logger = logging.getLogger(__name__)
@@ -65,11 +65,14 @@ def is_successful(self) -> bool:
 
 
 def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
-    """Creates a GEval instance with the specified model."""
+    """Creates a GEval instance with the specified model.
+
+    Uses evaluation_steps (not criteria) for more reliable scoring across runs.
+    """
     return GEval(
         name="Correctness",
-        criteria="Determine whether the actual output is factually correct based on the expected output.",  # NOTE: you can only provide either criteria or evaluation_steps, and not both
+        # Using evaluation_steps for more control and reliability
         evaluation_steps=[
             "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
             "You should also heavily penalize omission of detail",
@@ -85,6 +88,63 @@ def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
     )
 
 
+def make_custom_geval(
+    metric_config: MetricConfig, model: Optional[DeepEvalBaseLLM] = None
+) -> GEval:
+    """Creates a GEval instance with custom criteria/rubric/evaluation_steps from MetricConfig.
+
+    Args:
+        metric_config: Configuration with custom evaluation parameters
+        model: Optional LLM model (defaults to OpenAI GPT-4)
+
+    Returns:
+        Configured GEval instance
+
+    Note:
+        criteria and evaluation_steps are mutually exclusive (enforced by MetricConfig validator).
+        evaluation_steps provides more control and reliability, while criteria auto-generates steps.
+    """
+    from deepeval.metrics.g_eval.utils import Rubric
+
+    # Convert rubric if provided
+    rubrics = []
+    if metric_config.rubric:
+        for item in metric_config.rubric:
+            rubrics.append(
+                Rubric(
+                    score_range=(item.score, item.score),
+                    expected_outcome=item.criteria,
+                )
+            )
+
+    # Build kwargs for GEval
+    kwargs = {
+        "name": metric_config.name,
+        "evaluation_params": [
+            LLMTestCaseParams.INPUT,
+            LLMTestCaseParams.ACTUAL_OUTPUT,
+            LLMTestCaseParams.EXPECTED_OUTPUT,
+        ],
+        "model": model,
+    }
+
+    # Add evaluation_steps OR criteria (mutually exclusive)
+    # Note: Pydantic validator already ensures mutual exclusivity
+    if metric_config.evaluation_steps:
+        kwargs["evaluation_steps"] = metric_config.evaluation_steps
+    elif metric_config.criteria:
+        kwargs["criteria"] = metric_config.criteria
+    else:
+        # Default criteria if only rubric provided
+        kwargs["criteria"] = "Evaluate the actual output based on the rubric criteria."
+
+    # Add rubric if provided
+    if rubrics:
+        kwargs["rubric"] = rubrics
+
+    return GEval(**kwargs)
+
+
 def get_default_metrics(
     model: Optional[DeepEvalBaseLLM] = None,
 ) -> Dict[str, BaseMetric]:
@@ -282,14 +342,31 @@ def run_single_eval(
         execution_time = time.time() - start_time
 
         # Run each metric
-        for metric_name in case.metrics:
-            default_metrics = get_default_metrics()
-            if metric_name in default_metrics:
-                metric = default_metrics[metric_name]
+        for metric_item in case.metrics:
+            # Handle both string metrics and MetricConfig objects
+            if isinstance(metric_item, str):
+                # Original behavior: string metric name
+                metric_name = metric_item
+                metric_config = None
+            else:
+                # New behavior: MetricConfig object with potential custom rubric
+                metric_name = metric_item.name
+                metric_config = metric_item
+
+            # Create the metric instance
+            if metric_config and (metric_config.rubric or metric_config.criteria or metric_config.evaluation_steps):
+                # Use custom configuration if provided
+                logger.info(f"Using custom configuration for {metric_name}")
+                metric = make_custom_geval(metric_config, model=None)
             else:
-                # Get metric class and instantiate
-                metric_class = self.get_metric_class(metric_name)
-                metric = metric_class(threshold=case.threshold)  # type: ignore
+                # Use default metric behavior
+                default_metrics = get_default_metrics()
+                if metric_name in default_metrics:
+                    metric = default_metrics[metric_name]
+                else:
+                    # Get metric class and instantiate
+                    metric_class = self.get_metric_class(metric_name)
+                    metric = metric_class(threshold=case.threshold)  # type: ignore
 
             # Create test case
             test_case = self.create_test_case(case, actual_output)
diff --git a/tests/test_custom_geval.py b/tests/test_custom_geval.py
new file mode 100644
index 0000000..83cdbf1
--- /dev/null
+++ b/tests/test_custom_geval.py
@@ -0,0 +1,70 @@
+"""Test custom GEval metric configuration."""
+
+import pytest
+from metacoder.evals.eval_model import MetricConfig, RubricItem
+
+
+def test_evaluation_steps_only():
+    """Test creating MetricConfig with only evaluation_steps."""
+    m = MetricConfig(
+        name="CorrectnessMetric", evaluation_steps=["Check if output is correct"]
+    )
+    assert m.evaluation_steps == ["Check if output is correct"]
+    assert m.criteria is None
+    assert m.rubric is None
+
+
+def test_criteria_only():
+    """Test creating MetricConfig with only criteria."""
+    m = MetricConfig(name="CorrectnessMetric", criteria="Check correctness")
+    assert m.criteria == "Check correctness"
+    assert m.evaluation_steps is None
+    assert m.rubric is None
+
+
+def test_rubric_only():
+    """Test creating MetricConfig with only rubric."""
+    rubric = [
+        RubricItem(score=0.0, criteria="Wrong"),
+        RubricItem(score=1.0, criteria="Correct"),
+    ]
+    m = MetricConfig(name="CorrectnessMetric", rubric=rubric)
+    assert len(m.rubric) == 2
+    assert m.criteria is None
+    assert m.evaluation_steps is None
+
+
+def test_criteria_and_evaluation_steps_mutually_exclusive():
+    """Test that providing both criteria and evaluation_steps raises ValueError."""
+    with pytest.raises(ValueError, match="Cannot specify both"):
+        MetricConfig(
+            name="CorrectnessMetric",
+            criteria="Check correctness",
+            evaluation_steps=["Step 1"],
+        )
+
+
+def test_requires_at_least_one():
+    """Test that at least one of criteria/evaluation_steps/rubric is required."""
+    with pytest.raises(ValueError, match="Must provide at least one"):
+        MetricConfig(name="CorrectnessMetric")
+
+
+def test_criteria_with_rubric():
+    """Test that criteria can be combined with rubric."""
+    rubric = [RubricItem(score=0.0, criteria="Wrong")]
+    m = MetricConfig(
+        name="CorrectnessMetric", criteria="Check correctness", rubric=rubric
+    )
+    assert m.criteria == "Check correctness"
+    assert len(m.rubric) == 1
+
+
+def test_evaluation_steps_with_rubric():
+    """Test that evaluation_steps can be combined with rubric."""
+    rubric = [RubricItem(score=1.0, criteria="Correct")]
+    m = MetricConfig(
+        name="CorrectnessMetric", evaluation_steps=["Step 1"], rubric=rubric
+    )
+    assert m.evaluation_steps == ["Step 1"]
+    assert len(m.rubric) == 1