Skip to content
Merged

Fixes #141

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/src/evalassist/judges/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ def _evaluate(
result=positional_bias_instance_result,
)

return results
return results[:results_len]
else:
return self._run(instances=instances, criteria=criteria)

Expand Down
4 changes: 3 additions & 1 deletion backend/src/evalassist/judges/mprometheus_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
class MPrometheusJudge:
m_prometheus_model_name: str

def __init__(self, billions_of_params: Literal[3, 7, 14] = 3, **kwargs):
def __init__(
self, billions_of_params: Literal[3, 7, 14, "3", "7", "14"] = 3, **kwargs
):
super().__init__(**kwargs)
self.m_prometheus_model_name = (
f"Unbabel/M-Prometheus-{str(billions_of_params)}B"
Expand Down
53 changes: 36 additions & 17 deletions backend/src/evalassist/notebook_generation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import json
import re
from abc import ABC, abstractmethod
from typing import Literal, cast

Expand Down Expand Up @@ -208,17 +207,26 @@ def get_evaluation_type(self):
return "direct"

def get_import_code(self):
return """\
from unitxt.inference import CrossProviderInferenceEngine
from evalassist.judges import DirectJudge, Instance, Criteria, DirectInstanceResult, DirectInstanceResult
if self.judge_requires_model:
inference_engine_import = (
"from unitxt.inference import CrossProviderInferenceEngine\n"
)
else:
inference_engine_import = ""
return f"""\
{inference_engine_import}from evalassist.judges import {self.judge_class.__name__}, Instance, Criteria, DirectInstanceResult, DirectInstanceResult
import nest_asyncio
nest_asyncio.apply()\
"""

def get_setup_and_run_eval_code(self):
if self.judge_requires_model:
inference_engine_construct_str = generate_constructor_code(
"CrossProviderInferenceEngine", params=self.inference_engine_params
inference_engine_construct_str = (
"inference_engine = "
+ generate_constructor_code(
"CrossProviderInferenceEngine", params=self.inference_engine_params
)
+ "\n"
) # type: ignore
else:
inference_engine_construct_str = ""
Expand All @@ -231,8 +239,7 @@ def get_setup_and_run_eval_code(self):
)

return f"""\
inference_engine = {inference_engine_construct_str}

{inference_engine_construct_str}
judge = {judge_construct_str}

results: list[DirectInstanceResult] = judge(instances, criteria)
Expand All @@ -250,23 +257,35 @@ def get_evaluation_type(self):
return "pairwise"

def get_import_code(self):
return """\
return f"""\
from unitxt.inference import CrossProviderInferenceEngine
from evalassist.judges import PairwiseJudge, Instance, Criteria, PairwiseInstanceResult
from evalassist.judges import {self.judge_class.__name__}, Instance, Criteria, PairwiseInstanceResult
import nest_asyncio
nest_asyncio.apply()\
"""

def get_setup_and_run_eval_code(self):
params = re.sub(
r"\btrue\b", "True", json.dumps(self.inference_engine_params, indent=4)
if self.judge_requires_model:
inference_engine_construct_str = (
"inference_engine = "
+ generate_constructor_code(
"CrossProviderInferenceEngine", params=self.inference_engine_params
)
+ "\n"
) # type: ignore
else:
inference_engine_construct_str = ""

judge_params = self.judge_params
if self.judge_requires_model:
judge_params["inference_engine"] = VariableRef("inference_engine")
judge_construct_str = generate_constructor_code(
self.judge_class.__name__, params=judge_params
)
return f"""\
inference_engine = CrossProviderInferenceEngine(**{params})

judge = PairwiseJudge(
inference_engine=inference_engine,
)
return f"""\
{inference_engine_construct_str}
judge = {judge_construct_str}

results: list[PairwiseInstanceResult] = judge(instances, criteria)

Expand Down