Commit 6780faf

feat: make EvaluationResult more streamlined

jjmachan committed Oct 14, 2024
1 parent ab91a8f commit 6780faf
Showing 4 changed files with 72 additions and 41 deletions.
70 changes: 50 additions & 20 deletions src/ragas/dataset_schema.py
@@ -5,17 +5,17 @@
from dataclasses import dataclass, field

from datasets import Dataset as HFDataset
from datasets import concatenate_datasets
from pydantic import BaseModel, field_validator

from ragas.cost import CostCallbackHandler
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
from ragas.utils import safe_nanmean

if t.TYPE_CHECKING:
from datasets import Dataset as HFDataset
from pandas import DataFrame as PandasDataframe

from ragas.cost import CostCallbackHandler, TokenUsage
from ragas.cost import TokenUsage


class BaseSample(BaseModel):
@@ -270,17 +270,17 @@ def __iter__(self) -> t.Iterator[Sample]: # type: ignore
def __len__(self) -> int:
return len(self.samples)

def __getitem__(self, idx: int) -> Sample:
return self.samples[idx]

def __str__(self) -> str:
return f"EvaluationDataset(features={self.features()}, len={len(self.samples)})"

def __repr__(self) -> str:
return self.__str__()


class EvaluationDataset(RagasDataset[t.Union[SingleTurnSample, MultiTurnSample]]):
SingleTurnSampleOrMultiTurnSample = t.Union[SingleTurnSample, MultiTurnSample]


class EvaluationDataset(RagasDataset[SingleTurnSampleOrMultiTurnSample]):
"""
Represents a dataset of evaluation samples.
@@ -315,11 +315,25 @@ class EvaluationDataset(RagasDataset[t.Union[SingleTurnSample, MultiTurnSample]]
Creates an EvaluationDataset from a JSONL file.
"""

pass
@t.overload
def __getitem__(self, idx: int) -> SingleTurnSampleOrMultiTurnSample: ...

@t.overload
def __getitem__(self, idx: slice) -> "EvaluationDataset": ...

def __getitem__(
self, idx: t.Union[int, slice]
) -> t.Union[SingleTurnSampleOrMultiTurnSample, "EvaluationDataset"]:
if isinstance(idx, int):
return self.samples[idx]
elif isinstance(idx, slice):
return type(self)(samples=self.samples[idx])
else:
raise TypeError("Index must be int or slice")
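
A minimal usage sketch of the new int/slice indexing, assuming samples constructed as in the unit tests below:

samples = [SingleTurnSample(user_input=f"Q{i}", response=f"A{i}") for i in range(3)]
dataset = EvaluationDataset(samples=samples)
dataset[0]    # an int index returns a single sample
dataset[1:3]  # a slice returns a new EvaluationDataset with two samples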


@dataclass
class EvaluationResult(dict):
class EvaluationResult:
"""
A class to store and process the results of the evaluation.
@@ -335,17 +349,23 @@ class EvaluationResult(dict):
The callback handler for cost computation. Default is None.
"""

scores: HFDataset
dataset: t.Optional[HFDataset] = None
scores: t.List[t.Dict[str, t.Any]]
dataset: t.Optional[EvaluationDataset] = None
binary_columns: t.List[str] = field(default_factory=list)
cost_cb: t.Optional[CostCallbackHandler] = None

def __post_init__(self):
# transform scores from list of dicts to dict of lists
self._scores_dict = {
k: [d[k] for d in self.scores] for k in self.scores[0].keys()
}

values = []
for cn in self.scores[0].keys():
value = safe_nanmean(self.scores[cn])
self[cn] = value
if cn not in self.binary_columns:
self._repr_dict = {}
for metric_name in self._scores_dict.keys():
value = safe_nanmean(self._scores_dict[metric_name])
self._repr_dict[metric_name] = value
if metric_name not in self.binary_columns:
value = t.cast(float, value)
values.append(value + 1e-10)
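
As an illustration of the list-of-dicts to dict-of-lists transform in __post_init__ above (metric names and values are hypothetical):

scores = [{"faithfulness": 0.9, "answer_relevancy": 0.8},
          {"faithfulness": 1.0, "answer_relevancy": 0.7}]
scores_dict = {k: [d[k] for d in scores] for k in scores[0].keys()}
# -> {'faithfulness': [0.9, 1.0], 'answer_relevancy': [0.8, 0.7]}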

@@ -370,12 +390,20 @@ def to_pandas(self, batch_size: int | None = None, batched: bool = False):
ValueError
If the dataset is not provided.
"""
try:
import pandas as pd
except ImportError:
raise ImportError(
"pandas is not installed. Please install it to use this function."
)

if self.dataset is None:
raise ValueError("dataset is not provided for the results class")
assert self.scores.shape[0] == self.dataset.shape[0]
result_ds = concatenate_datasets([self.dataset, self.scores], axis=1)

return result_ds.to_pandas(batch_size=batch_size, batched=batched)
assert len(self.scores) == len(self.dataset)
# convert both to pandas dataframes and concatenate
scores_df = pd.DataFrame(self.scores)
dataset_df = self.dataset.to_pandas()
return pd.concat([dataset_df, scores_df], axis=1)

def total_tokens(self) -> t.Union[t.List[TokenUsage], TokenUsage]:
"""
@@ -434,6 +462,8 @@ def total_cost(
)

def __repr__(self) -> str:
scores = self.copy()
score_strs = [f"'{k}': {v:0.4f}" for k, v in scores.items()]
score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
return "{" + ", ".join(score_strs) + "}"

def __getitem__(self, key: str) -> t.List[float]:
return self._scores_dict[key]
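
A hedged usage sketch of the streamlined result; the metric name, score values, and two-sample dataset are illustrative, not part of the commit:

result = EvaluationResult(
    scores=[{"faithfulness": 0.9}, {"faithfulness": 1.0}],
    dataset=dataset,  # an EvaluationDataset with two samples
)
print(result)            # {'faithfulness': 0.9500} -- per-metric means via safe_nanmean
result["faithfulness"]   # [0.9, 1.0] -- per-row scores
df = result.to_pandas()  # dataset columns concatenated with one column per metric
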
21 changes: 4 additions & 17 deletions src/ragas/evaluation.py
@@ -1,10 +1,9 @@
from __future__ import annotations

import typing as t
from dataclasses import dataclass, field

import numpy as np
from datasets import Dataset, concatenate_datasets
from datasets import Dataset
from langchain_core.callbacks import BaseCallbackHandler, BaseCallbackManager
from langchain_core.embeddings import Embeddings as LangchainEmbeddings
from langchain_core.language_models import BaseLanguageModel as LangchainLLM
@@ -38,12 +37,7 @@
is_reproducable,
)
from ragas.run_config import RunConfig
from ragas.utils import (
convert_v1_to_v2_dataset,
convert_v2_to_v1_dataset,
get_feature_language,
safe_nanmean,
)
from ragas.utils import convert_v1_to_v2_dataset, get_feature_language
from ragas.validation import (
remap_column_names,
validate_required_columns,
@@ -171,10 +165,8 @@ def evaluate(

metrics = [answer_relevancy, context_precision, faithfulness, context_recall]

v1_input = False
if isinstance(dataset, Dataset):
# remap column names from the dataset
v1_input = True
dataset = remap_column_names(dataset, column_map)
dataset = convert_v1_to_v2_dataset(dataset)
# validation
@@ -293,7 +285,7 @@ def evaluate(
else:
raise ValueError(f"Unsupported sample type {sample_type}")

scores = []
scores: t.List[t.Dict[str, t.Any]] = []
try:
# get the results
results = executor.results()
@@ -320,14 +312,9 @@
else:
# evaluation run was successful
# now let's process the results
# convert to v.1 dataset
dataset = dataset.to_hf_dataset()
if v1_input:
dataset = convert_v2_to_v1_dataset(dataset)

cost_cb = ragas_callbacks["cost_cb"] if "cost_cb" in ragas_callbacks else None
result = EvaluationResult(
scores=Dataset.from_list(scores),
scores=scores,
dataset=dataset,
binary_columns=binary_metrics,
cost_cb=t.cast(
8 changes: 4 additions & 4 deletions src/ragas/utils.py
@@ -37,16 +37,16 @@ def get_debug_mode() -> bool:
return False


def safe_nanmean(arr):
def safe_nanmean(arr: t.List[float]) -> float:
if len(arr) == 0:
return np.nan # or some other value or behavior for empty arrays

arr = np.asarray(arr) # Ensure input is a numpy array
arr_numpy = np.asarray(arr) # Ensure input is a numpy array

if np.isnan(arr).all():
if np.isnan(arr_numpy).all():
return np.nan # or some other value or behavior for all-NaN arrays

return np.nanmean(arr)
return float(np.nanmean(arr_numpy))
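
A behaviour sketch of the typed helper, with illustrative inputs:

safe_nanmean([])                        # nan for empty input
safe_nanmean([float("nan")] * 3)        # nan when every value is NaN
safe_nanmean([1.0, float("nan"), 3.0])  # 2.0, NaNs are ignored by np.nanmean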


def check_if_sum_is_close(
14 changes: 14 additions & 0 deletions tests/unit/test_dataset_schema.py
@@ -75,3 +75,17 @@ def test_evaluation_dataset_iter():

for sample in dataset:
assert sample == single_turn_sample


def test_evaluation_dataset_type():
single_turn_sample = SingleTurnSample(user_input="What is X", response="Y")
multi_turn_sample = MultiTurnSample(
user_input=[{"content": "What is X"}],
response="Y", # type: ignore (this type error is what we want to test)
)

dataset = EvaluationDataset(samples=[single_turn_sample])
assert dataset.get_sample_type() == SingleTurnSample

dataset = EvaluationDataset(samples=[multi_turn_sample])
assert dataset.get_sample_type() == MultiTurnSample
