From 8e41fe3d5aa51cd75e155e86bb6bb22a79219f9a Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Wed, 16 Oct 2024 15:08:33 +0530
Subject: [PATCH] fix: import error for TestsetGeneration and small fixes
 (#1516)

---
 src/ragas/dataset_schema.py                 | 114 ++++++++++--------
 .../testset/synthesizers/testset_schema.py  |  55 ++++++---
 tests/unit/test_dataset_schema.py           |  64 ++++++----
 tests/unit/test_testset_schema.py           |  50 ++++++++
 4 files changed, 197 insertions(+), 86 deletions(-)
 create mode 100644 tests/unit/test_testset_schema.py

diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py
index 3a32da2e0..060db899b 100644
--- a/src/ragas/dataset_schema.py
+++ b/src/ragas/dataset_schema.py
@@ -2,6 +2,7 @@
 
 import json
 import typing as t
+from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 
 from datasets import Dataset as HFDataset
@@ -12,6 +13,8 @@
 from ragas.utils import safe_nanmean
 
 if t.TYPE_CHECKING:
+    from pathlib import Path
+
     from datasets import Dataset as HFDataset
     from pandas import DataFrame as PandasDataframe
 
@@ -136,9 +139,20 @@ def pretty_repr(self):
 Sample = t.TypeVar("Sample", bound=BaseSample)
 
 
-class RagasDataset(BaseModel, t.Generic[Sample]):
+class RagasDataset(ABC, BaseModel, t.Generic[Sample]):
     samples: t.List[Sample]
 
+    @abstractmethod
+    def to_list(self) -> t.List[t.Dict]:
+        """Converts the samples to a list of dictionaries."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def from_list(cls, data: t.List[t.Dict]) -> RagasDataset[Sample]:
+        """Creates a dataset from a list of dictionaries."""
+        pass
+
     @field_validator("samples")
     def validate_samples(cls, samples: t.List[BaseSample]) -> t.List[BaseSample]:
         """Validates that all samples are of the same type."""
@@ -155,20 +169,6 @@ def get_sample_type(self) -> t.Type[Sample]:
         """Returns the type of the samples in the dataset."""
         return type(self.samples[0])
 
-    def _to_list(self) -> t.List[t.Dict]:
-        """Converts the samples to a list of dictionaries."""
-        rows = [sample.to_dict() for sample in self.samples]
-
-        if self.get_sample_type() == MultiTurnSample:
-            for sample in rows:
-                for item in sample["user_input"]:
-                    if not isinstance(item["content"], str):
-                        item["content"] = json.dumps(
-                            item["content"], ensure_ascii=False
-                        )
-
-        return rows
-
     def to_hf_dataset(self) -> HFDataset:
         """Converts the dataset to a Hugging Face Dataset."""
         try:
@@ -178,7 +178,7 @@ def to_hf_dataset(self) -> HFDataset:
                 "datasets is not installed. Please install it to use this function."
             )
 
-        return HFDataset.from_list(self._to_list())
+        return HFDataset.from_list(self.to_list())
 
     @classmethod
     def from_hf_dataset(cls, dataset: HFDataset):
@@ -194,26 +194,13 @@ def to_pandas(self) -> PandasDataframe:
                 "pandas is not installed. Please install it to use this function."
            )
 
-        data = self._to_list()
+        data = self.to_list()
         return pd.DataFrame(data)
 
     def features(self):
         """Returns the features of the samples."""
         return self.samples[0].get_features()
 
-    @classmethod
-    def from_list(cls, mapping: t.List[t.Dict]):
-        """Creates an EvaluationDataset from a list of dictionaries."""
-        samples = []
-        if all(
-            "user_input" in item and isinstance(mapping[0]["user_input"], list)
-            for item in mapping
-        ):
-            samples.extend(MultiTurnSample(**sample) for sample in mapping)
-        else:
-            samples.extend(SingleTurnSample(**sample) for sample in mapping)
-        return cls(samples=samples)
-
     @classmethod
     def from_dict(cls, mapping: t.Dict):
         """Creates an EvaluationDataset from a dictionary."""
@@ -227,25 +214,15 @@ def from_dict(cls, mapping: t.Dict):
         samples.extend(SingleTurnSample(**sample) for sample in mapping)
         return cls(samples=samples)
 
-    @classmethod
-    def from_csv(cls, path: str):
-        """Creates an EvaluationDataset from a CSV file."""
-        import csv
-
-        with open(path, "r", newline="") as csvfile:
-            reader = csv.DictReader(csvfile)
-            data = [row for row in reader]
-        return cls.from_list(data)
-
-    def to_csv(self, path: str):
+    def to_csv(self, path: t.Union[str, Path]):
         """Converts the dataset to a CSV file."""
         import csv
 
-        data = self._to_list()
+        data = self.to_list()
         if not data:
             return
 
-        fieldnames = self.features()
+        fieldnames = data[0].keys()
 
         with open(path, "w", newline="") as csvfile:
             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
@@ -253,14 +230,14 @@ def to_csv(self, path: str):
             writer.writeheader()
             for row in data:
                 writer.writerow(row)
 
-    def to_jsonl(self, path: str):
+    def to_jsonl(self, path: t.Union[str, Path]):
         """Converts the dataset to a JSONL file."""
         with open(path, "w") as jsonlfile:
             for sample in self.samples:
                 jsonlfile.write(json.dumps(sample.to_dict(), ensure_ascii=False) + "\n")
 
     @classmethod
-    def from_jsonl(cls, path: str):
+    def from_jsonl(cls, path: t.Union[str, Path]):
         """Creates an EvaluationDataset from a JSONL file."""
         with open(path, "r") as jsonlfile:
             data = [json.loads(line) for line in jsonlfile]
@@ -307,8 +284,6 @@ class EvaluationDataset(RagasDataset[SingleTurnSampleOrMultiTurnSample]):
         Creates an EvaluationDataset from a list of dictionaries.
     from_dict(mapping)
         Creates an EvaluationDataset from a dictionary.
-    from_csv(path)
-        Creates an EvaluationDataset from a CSV file.
     to_csv(path)
         Converts the dataset to a CSV file.
    to_jsonl(path)
@@ -333,6 +308,37 @@ def __getitem__(
         else:
             raise TypeError("Index must be int or slice")
 
+    def to_list(self) -> t.List[t.Dict]:
+        rows = [sample.to_dict() for sample in self.samples]
+
+        if self.get_sample_type() == MultiTurnSample:
+            for sample in rows:
+                for item in sample["user_input"]:
+                    if not isinstance(item["content"], str):
+                        item["content"] = json.dumps(
+                            item["content"], ensure_ascii=False
+                        )
+
+        return rows
+
+    @classmethod
+    def from_list(cls, data: t.List[t.Dict]) -> EvaluationDataset:
+        samples = []
+        if all(
+            "user_input" in item and isinstance(item["user_input"], list)
+            for item in data
+        ):
+            samples.extend(MultiTurnSample(**sample) for sample in data)
+        else:
+            samples.extend(SingleTurnSample(**sample) for sample in data)
+        return cls(samples=samples)
+
+
+class EvaluationResultRow(BaseModel):
+    dataset_row: t.Dict
+    scores: t.Dict[str, t.Any]
+    trace: t.Dict[str, t.Any] = field(default_factory=dict)  # none for now
+
 
 @dataclass
 class EvaluationResult:
@@ -352,7 +358,7 @@ class EvaluationResult:
     """
 
     scores: t.List[t.Dict[str, t.Any]]
-    dataset: t.Optional[EvaluationDataset] = None
+    dataset: EvaluationDataset
     binary_columns: t.List[str] = field(default_factory=list)
     cost_cb: t.Optional[CostCallbackHandler] = None
 
@@ -407,6 +413,18 @@ def to_pandas(self, batch_size: int | None = None, batched: bool = False):
         dataset_df = self.dataset.to_pandas()
         return pd.concat([dataset_df, scores_df], axis=1)
 
+    def serialized(self) -> t.List[EvaluationResultRow]:
+        """
+        Convert the result to a list of EvaluationResultRow.
+        """
+        return [
+            EvaluationResultRow(
+                dataset_row=self.dataset[i].to_dict(),
+                scores=self.scores[i],
+            )
+            for i in range(len(self.scores))
+        ]
+
     def total_tokens(self) -> t.Union[t.List[TokenUsage], TokenUsage]:
         """
         Compute the total tokens used in the evaluation.
diff --git a/src/ragas/testset/synthesizers/testset_schema.py b/src/ragas/testset/synthesizers/testset_schema.py
index 9ec9be9ad..5a81389ce 100644
--- a/src/ragas/testset/synthesizers/testset_schema.py
+++ b/src/ragas/testset/synthesizers/testset_schema.py
@@ -2,14 +2,16 @@
 
 import typing as t
 
-from ragas.dataset_schema import BaseSample, RagasDataset
+from ragas.dataset_schema import (
+    BaseSample,
+    EvaluationDataset,
+    MultiTurnSample,
+    RagasDataset,
+    SingleTurnSample,
+)
 
 if t.TYPE_CHECKING:
-    from ragas.dataset_schema import (
-        EvaluationDataset,
-        MultiTurnSample,
-        SingleTurnSample,
-    )
+    from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
 
 
 class TestsetSample(BaseSample):
@@ -48,13 +50,34 @@ def to_evaluation_dataset(self) -> EvaluationDataset:
             samples=[sample.eval_sample for sample in self.samples]
         )
 
-    def _to_list(self) -> t.List[t.Dict]:
-        eval_list = self.to_evaluation_dataset()._to_list()
-        testset_list_without_eval_sample = [
-            sample.model_dump(exclude={"eval_sample"}) for sample in self.samples
-        ]
-        testset_list = [
-            {**eval_sample, **sample}
-            for eval_sample, sample in zip(eval_list, testset_list_without_eval_sample)
-        ]
-        return testset_list
+    def to_list(self) -> t.List[t.Dict]:
+        """
+        Converts the Testset to a list of dictionaries.
+        """
+        return [sample.model_dump() for sample in self.samples]
+
+    @classmethod
+    def from_list(cls, data: t.List[t.Dict]) -> Testset:
+        """
+        Converts a list of dictionaries to a Testset.
+        """
+        # first create the samples
+        samples = []
+        for sample in data:
+            eval_sample = sample["eval_sample"]
+
+            # user_input: str -> SingleTurnSample, list of messages -> MultiTurnSample
+            if "user_input" in eval_sample and not isinstance(
+                eval_sample.get("user_input"), list
+            ):
+                eval_sample = SingleTurnSample(**sample["eval_sample"])
+            else:
+                eval_sample = MultiTurnSample(**sample["eval_sample"])
+
+            samples.append(
+                TestsetSample(
+                    eval_sample=eval_sample, synthesizer_name=sample["synthesizer_name"]
+                )
+            )
+        # then create the testset
+        return Testset(samples=samples)
diff --git a/tests/unit/test_dataset_schema.py b/tests/unit/test_dataset_schema.py
index ccb55654d..b1bac953e 100644
--- a/tests/unit/test_dataset_schema.py
+++ b/tests/unit/test_dataset_schema.py
@@ -3,46 +3,66 @@
 import pytest
 from pydantic import ValidationError
 
-from ragas.dataset_schema import EvaluationDataset, MultiTurnSample, SingleTurnSample
-
-
-def test_evaluation_dataset():
-    single_turn_sample = SingleTurnSample(user_input="What is X", response="Y")
-
-    dataset = EvaluationDataset(samples=[single_turn_sample, single_turn_sample])
+from ragas.dataset_schema import (
+    EvaluationDataset,
+    HumanMessage,
+    MultiTurnSample,
+    SingleTurnSample,
+)
+
+samples = [
+    SingleTurnSample(user_input="What is X", response="Y"),
+    MultiTurnSample(
+        user_input=[HumanMessage(content="What is X")],
+        reference="Y",
+    ),
+]
+
+
+@pytest.mark.parametrize("eval_sample", samples)
+def test_evaluation_dataset(eval_sample):
+    dataset = EvaluationDataset(samples=[eval_sample, eval_sample])
 
     hf_dataset = dataset.to_hf_dataset()
 
-    assert dataset.get_sample_type() == SingleTurnSample
+    assert dataset.get_sample_type() is type(eval_sample)
     assert len(hf_dataset) == 2
-    assert dataset.features() == ["user_input", "response"]
     assert len(dataset) == 2
 
-    assert dataset[0] == single_turn_sample
+    assert dataset[0] == eval_sample
 
 
-def test_evaluation_dataset_save_load(tmpdir):
-    single_turn_sample = SingleTurnSample(user_input="What is X", response="Y")
+@pytest.mark.parametrize("eval_sample", samples)
+def test_evaluation_dataset_save_load_csv(tmpdir, eval_sample):
+    dataset = EvaluationDataset(samples=[eval_sample, eval_sample])
 
-    dataset = EvaluationDataset(samples=[single_turn_sample, single_turn_sample])
+    # save to csv (from_csv was removed, so CSV is export-only)
+    csv_path = tmpdir / "csvfile.csv"
+    dataset.to_csv(csv_path)
 
-    hf_dataset = dataset.to_hf_dataset()
 
-    # save and load to csv
-    dataset.to_csv(tmpdir / "csvfile.csv")
-    loaded_dataset = EvaluationDataset.from_csv(tmpdir / "csvfile.csv")
-    assert loaded_dataset == dataset
+@pytest.mark.parametrize("eval_sample", samples)
+def test_evaluation_dataset_save_load_jsonl(tmpdir, eval_sample):
+    dataset = EvaluationDataset(samples=[eval_sample, eval_sample])
 
     # save and load to jsonl
-    dataset.to_jsonl(tmpdir / "jsonlfile.jsonl")
-    loaded_dataset = EvaluationDataset.from_jsonl(tmpdir / "jsonlfile.jsonl")
+    jsonl_path = tmpdir / "jsonlfile.jsonl"
+    dataset.to_jsonl(jsonl_path)
+    loaded_dataset = EvaluationDataset.from_jsonl(jsonl_path)
     assert loaded_dataset == dataset
 
-    # load from hf dataset
+
+@pytest.mark.parametrize("eval_sample", samples)
+def test_evaluation_dataset_load_from_hf(eval_sample):
+    dataset = EvaluationDataset(samples=[eval_sample, eval_sample])
+
+    # convert to and load from hf dataset
+    hf_dataset = dataset.to_hf_dataset()
     loaded_dataset = EvaluationDataset.from_hf_dataset(hf_dataset)
     assert loaded_dataset == dataset
 
 
-def test_single_type_evaluation_dataset():
+@pytest.mark.parametrize("eval_sample", samples)
+def test_single_type_evaluation_dataset(eval_sample):
     single_turn_sample = SingleTurnSample(user_input="What is X", response="Y")
     multi_turn_sample = MultiTurnSample(
         user_input=[{"content": "What is X"}],
diff --git a/tests/unit/test_testset_schema.py b/tests/unit/test_testset_schema.py
new file mode 100644
index 000000000..470e5f4df
--- /dev/null
+++ b/tests/unit/test_testset_schema.py
@@ -0,0 +1,50 @@
+import pytest
+
+from ragas.dataset_schema import (
+    EvaluationDataset,
+    HumanMessage,
+    MultiTurnSample,
+    SingleTurnSample,
+)
+from ragas.testset.synthesizers.testset_schema import Testset, TestsetSample
+
+samples = [
+    SingleTurnSample(user_input="What is X", response="Y"),
+    MultiTurnSample(
+        user_input=[HumanMessage(content="What is X")],
+        reference="Y",
+    ),
+]
+
+
+@pytest.mark.parametrize("eval_sample", samples)
+def test_testset_to_evaluation_dataset(eval_sample):
+    testset_sample = TestsetSample(eval_sample=eval_sample, synthesizer_name="test")
+    testset = Testset(samples=[testset_sample, testset_sample])
+    evaluation_dataset = testset.to_evaluation_dataset()
+    assert evaluation_dataset == EvaluationDataset(samples=[eval_sample, eval_sample])
+
+
+@pytest.mark.parametrize("eval_sample", samples)
+def test_testset_save_load_csv(tmpdir, eval_sample):
+    testset_sample = TestsetSample(eval_sample=eval_sample, synthesizer_name="test")
+    testset = Testset(samples=[testset_sample, testset_sample])
+    testset.to_csv(tmpdir / "csvfile.csv")
+
+
+@pytest.mark.parametrize("eval_sample", samples)
+def test_testset_save_load_jsonl(tmpdir, eval_sample):
+    testset_sample = TestsetSample(eval_sample=eval_sample, synthesizer_name="test")
+    testset = Testset(samples=[testset_sample, testset_sample])
+    testset.to_jsonl(tmpdir / "jsonlfile.jsonl")
+    loaded_testset = Testset.from_jsonl(tmpdir / "jsonlfile.jsonl")
+    assert loaded_testset == testset
+
+
+@pytest.mark.parametrize("eval_sample", samples)
+def test_testset_save_load_hf(tmpdir, eval_sample):
+    testset_sample = TestsetSample(eval_sample=eval_sample, synthesizer_name="test")
+    testset = Testset(samples=[testset_sample, testset_sample])
+    hf_testset = testset.to_hf_dataset()
+    loaded_testset = Testset.from_hf_dataset(hf_testset)
+    assert loaded_testset == testset
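
Editor's note (not part of the patch): a minimal usage sketch of the serialization API this commit reshapes. It assumes the patched modules are importable as below; the sample values, the synthesizer name "demo", and the output path "eval.csv" are illustrative only.

    from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
    from ragas.testset.synthesizers.testset_schema import Testset, TestsetSample

    # to_list()/from_list() are now an abstract pair on RagasDataset, so every
    # dataset type round-trips through plain dicts the same way.
    dataset = EvaluationDataset(
        samples=[SingleTurnSample(user_input="What is X", response="Y")]
    )
    rows = dataset.to_list()
    assert EvaluationDataset.from_list(rows) == dataset

    # Testset.to_list() keeps each eval sample nested under "eval_sample"
    # alongside its synthesizer metadata; from_list() rebuilds the right sample
    # type by inspecting user_input (str vs list of messages).
    testset = Testset(
        samples=[TestsetSample(eval_sample=dataset[0], synthesizer_name="demo")]
    )
    assert Testset.from_list(testset.to_list()) == testset

    # CSV is export-only after this patch: to_csv() remains, from_csv() is gone.
    dataset.to_csv("eval.csv")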