diff --git a/backend/src/evalassist/judges/base.py b/backend/src/evalassist/judges/base.py index 6cd06331..7e52f166 100644 --- a/backend/src/evalassist/judges/base.py +++ b/backend/src/evalassist/judges/base.py @@ -505,7 +505,7 @@ def _evaluate( result=positional_bias_instance_result, ) - return results + return results[:results_len] else: return self._run(instances=instances, criteria=criteria) diff --git a/backend/src/evalassist/judges/mprometheus_judge.py b/backend/src/evalassist/judges/mprometheus_judge.py index 9f5fc35a..5a049b10 100644 @@ -10,7 +10,9 @@ class MPrometheusJudge: m_prometheus_model_name: str - def __init__(self, billions_of_params: Literal[3, 7, 14] = 3, **kwargs): + def __init__( + self, billions_of_params: Literal[3, 7, 14, "3", "7", "14"] = 3, **kwargs + ): super().__init__(**kwargs) self.m_prometheus_model_name = ( f"Unbabel/M-Prometheus-{str(billions_of_params)}B" diff --git a/backend/src/evalassist/notebook_generation.py b/backend/src/evalassist/notebook_generation.py index 08e0a126..95870078 100644 --- a/backend/src/evalassist/notebook_generation.py +++ b/backend/src/evalassist/notebook_generation.py @@ -1,5 +1,4 @@ import json -import re from abc import ABC, abstractmethod from typing import Literal, cast @@ -208,17 +207,26 @@ def get_evaluation_type(self): return "direct" def get_import_code(self): - return """\ -from unitxt.inference import CrossProviderInferenceEngine -from evalassist.judges import DirectJudge, Instance, Criteria, DirectInstanceResult, DirectInstanceResult + if self.judge_requires_model: + inference_engine_import = ( + "from unitxt.inference import CrossProviderInferenceEngine\n" + ) + else: + inference_engine_import = "" + return f"""\ +{inference_engine_import}from evalassist.judges import {self.judge_class.__name__}, Instance, Criteria, DirectInstanceResult import nest_asyncio 
nest_asyncio.apply()\ """ def get_setup_and_run_eval_code(self): if self.judge_requires_model: - inference_engine_construct_str = generate_constructor_code( - "CrossProviderInferenceEngine", params=self.inference_engine_params + inference_engine_construct_str = ( + "inference_engine = " + + generate_constructor_code( + "CrossProviderInferenceEngine", params=self.inference_engine_params + ) + + "\n" + ) # type: ignore else: inference_engine_construct_str = "" @@ -231,8 +239,7 @@ def get_setup_and_run_eval_code(self): ) return f"""\ -inference_engine = {inference_engine_construct_str} - +{inference_engine_construct_str} judge = {judge_construct_str} results: list[DirectInstanceResult] = judge(instances, criteria) @@ -250,23 +257,35 @@ def get_evaluation_type(self): return "pairwise" def get_import_code(self): - return """\ + return f"""\ from unitxt.inference import CrossProviderInferenceEngine -from evalassist.judges import PairwiseJudge, Instance, Criteria, PairwiseInstanceResult +from evalassist.judges import {self.judge_class.__name__}, Instance, Criteria, PairwiseInstanceResult import nest_asyncio nest_asyncio.apply()\ """ def get_setup_and_run_eval_code(self): - params = re.sub( - r"\btrue\b", "True", json.dumps(self.inference_engine_params, indent=4) + if self.judge_requires_model: + inference_engine_construct_str = ( + "inference_engine = " + + generate_constructor_code( + "CrossProviderInferenceEngine", params=self.inference_engine_params + ) + + "\n" + ) # type: ignore + else: + inference_engine_construct_str = "" + + judge_params = self.judge_params + if self.judge_requires_model: + judge_params["inference_engine"] = VariableRef("inference_engine") + judge_construct_str = generate_constructor_code( + self.judge_class.__name__, params=judge_params ) - return f"""\ -inference_engine = CrossProviderInferenceEngine(**{params}) -judge = PairwiseJudge( - inference_engine=inference_engine, -) + return f"""\ +{inference_engine_construct_str} +judge = {judge_construct_str} results: list[PairwiseInstanceResult] = judge(instances, criteria)