diff --git a/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py b/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py
index 4ada15f84c..062de4ac85 100644
--- a/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py
+++ b/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py
@@ -220,8 +220,8 @@ def __call__(
     def __call__(
         self,
         *,
-        query: str,
-        response: List[dict],
+        query: str | List[dict],
+        response: str | List[dict],
         tool_definitions: List[dict],
     ) -> Dict[str, Union[str, float]]:
         """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
@@ -308,6 +308,18 @@ def has_context(self, eval_input: dict) -> bool:
         Treats None, empty strings, empty lists, and lists of empty strings as no context.
         """
         context = eval_input.get("context", None)
+        return self._validate_context(context)
+
+    def _validate_context(self, context) -> bool:
+        """
+        Validate if the provided context is non-empty and meaningful.
+
+        Treats None, empty strings, empty lists, and lists of empty strings as no context.
+        :param context: The context to validate
+        :type context: Union[str, List, None]
+        :return: True if context is valid and non-empty, False otherwise
+        :rtype: bool
+        """
         if not context:
             return False
         if context == "<>":  # Special marker for no context
@@ -363,6 +375,14 @@ async def _real_call(self, **kwargs):
             else:
                 raise ex
 
+    def _is_single_entry(self, value):
+        """Determine if the input value represents a single entry; returns False when unsure."""
+        if isinstance(value, str):
+            return True
+        if isinstance(value, list) and len(value) == 1:
+            return True
+        return False
+
     def _convert_kwargs_to_eval_input(self, **kwargs):
         if kwargs.get("context") or kwargs.get("conversation"):
             return super()._convert_kwargs_to_eval_input(**kwargs)
@@ -386,7 +406,16 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             )
 
         context = self._get_context_from_agent_response(response, tool_definitions)
-        filtered_response = self._filter_file_search_results(response)
+        if not self._validate_context(context) and self._is_single_entry(response) and self._is_single_entry(query):
+            msg = f"{type(self).__name__}: No valid context was provided or could be extracted from the query or response."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        filtered_response = self._filter_file_search_results(response) if self._validate_context(context) else response
         return super()._convert_kwargs_to_eval_input(response=filtered_response, context=context, query=query)
 
     def _filter_file_search_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
diff --git a/assets/evaluators/builtin/groundedness/spec.yaml b/assets/evaluators/builtin/groundedness/spec.yaml
index 1c718a783a..0189706354 100644
--- a/assets/evaluators/builtin/groundedness/spec.yaml
+++ b/assets/evaluators/builtin/groundedness/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.groundedness"
-version: 3
+version: 4
 displayName: "Groundedness-Evaluator"
 description: "Assesses whether the response stays true to the given context in a retrieval-augmented generation scenario. It’s best used for retrieval-augmented generation (RAG) scenarios, including question and answering and summarization. Use the groundedness metric when you need to verify that ai-generated responses align with and are validated by the provided context."
 evaluatorType: "builtin"
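
Caller-facing sketch of the behavior this patch introduces (not part of the patch). The import path, class name, `model_config` keys, and `tool_definitions` shape below are assumptions based on the public azure-ai-evaluation SDK and are only illustrative; the diff itself defines the builtin evaluator's internals, not this calling surface.

```python
# Illustrative only: GroundednessEvaluator and model_config keys are assumed from the
# public azure-ai-evaluation SDK; placeholder values, not real credentials.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com/",  # placeholder
    "api_key": "<api-key>",                                     # placeholder
    "azure_deployment": "<deployment-name>",                    # placeholder
}

evaluator = GroundednessEvaluator(model_config=model_config)

# With this change, `query` and `response` may each be a plain string or a list of messages.
# If no file_search results can be extracted as context and both inputs are single entries,
# the evaluator raises an EvaluationException (USER_ERROR / NOT_APPLICABLE) instead of
# grading a response that has no grounding context.
result = evaluator(
    query="What is our refund policy?",
    response=[{"role": "assistant", "content": "Refunds are accepted within 30 days."}],
    tool_definitions=[{"name": "file_search", "type": "file_search"}],  # shape assumed for illustration
)
print(result)
```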