Skip to content

feat(testset_generator): support error catching in generation process #368

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/ragas/async_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,5 @@ async def _gather() -> List[Any]:
# run the operation w/o tqdm on hitting a fatal
# may occur in some environments where tqdm.asyncio
# is not supported
raise RuntimeError("Fatal error occurred while running async tasks.", e)
raise RuntimeError("Fatal error occurred while running async tasks.", e) from e
return outputs
143 changes: 87 additions & 56 deletions src/ragas/testset/testset_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"Please, install it with `pip install llama_index`."
)

# pydantic v2 re-exports the legacy v1 API under `pydantic.v1`; fall back to
# the top-level import when only pydantic v1 is installed, so the same
# ValidationError type is caught under either major version.
try:
    from pydantic.v1 import ValidationError
except ImportError:
    from pydantic import ValidationError

import numpy as np
import numpy.testing as npt
import pandas as pd
Expand Down Expand Up @@ -58,6 +63,10 @@
"conditional": "_condition_question",
}

# Exception types considered recoverable during test-set generation:
# `generate` re-raises a caught exception unless its `__cause__` is an
# instance of one of these, in which case the failing sample is skipped.
retry_errors = (
    ValidationError,
)

DataRow = namedtuple(
"DataRow",
[
Expand All @@ -69,6 +78,8 @@
],
)

Proposal = namedtuple("Proposal", ["question", "text_chunk"])


@dataclass
class TestDataset:
Expand Down Expand Up @@ -291,6 +302,70 @@ def _embed_nodes(self, nodes: t.List[BaseNode]) -> t.Dict[str, t.List[float]]:

return embeddings

def _make_proposal(
    self, cur_node: BaseNode, neighbor_nodes: t.List[BaseNode], evolve_type: str
) -> t.Union[Proposal, None]:
    """Build one (question, text_chunk) Proposal from a document node.

    Args:
        cur_node: the node to seed question generation from.
        neighbor_nodes: other nodes from the same source document.
        evolve_type: question style key, e.g. "simple", "multi_context",
            or a key of `question_deep_map` ("reasoning"/"conditional").

    Returns:
        A Proposal, or None when any intermediate filter rejects the
        candidate (low context score, invalid seed question, or — for
        "multi_context" — no sufficiently similar neighbour chunk).
    """
    # Append multiple nodes randomly to remove chunking bias
    size = self.rng.integers(1, 3)
    nodes = (
        self._get_neighbour_node(cur_node, neighbor_nodes)
        if size > 1 and evolve_type != "multi_context"
        else [cur_node]
    )

    text_chunk = " ".join([node.get_content() for node in nodes])
    score = self._filter_context(text_chunk)
    if not score:
        # context judged unsuitable for question generation
        return None
    seed_question = self._seed_question(text_chunk)
    is_valid_question = self._filter_question(seed_question)
    if not is_valid_question:
        return None

    if evolve_type == "multi_context":
        # Find most similar chunk in same document
        node_embedding = self._embed_nodes([nodes[-1]])
        neighbor_nodes = self._remove_nodes(neighbor_nodes, nodes)
        neighbor_emb = self._embed_nodes(neighbor_nodes)

        _, indices = get_top_k_embeddings(
            list(node_embedding.values())[0],
            list(neighbor_emb.values()),
            similarity_cutoff=self.threshold / 10,
        )
        if indices:
            # type cast indices from list[Any] to list[int]
            indices = t.cast(t.List[int], indices)
            best_neighbor = neighbor_nodes[indices[0]]
            question = self._multicontext_question(
                question=seed_question,
                context1=text_chunk,
                context2=best_neighbor.get_content(),
            )
            # the answer context now spans both chunks
            text_chunk = "\n".join([text_chunk, best_neighbor.get_content()])
        else:
            # no neighbour above the similarity cutoff
            return None

    # for reasoning and conditional modes, evolve question with the
    # functions from question_deep_map
    else:
        evolve_fun = question_deep_map.get(evolve_type)
        question = (
            getattr(self, evolve_fun)(seed_question, text_chunk)
            if evolve_fun
            else seed_question
        )

    # compress question or convert into conversational questions
    if evolve_type != "simple":
        prob = self.rng.uniform(0, 1)
        # self.chat_qa is the probability of producing a conversational
        # variant; otherwise the question is compressed.
        if self.chat_qa and prob <= self.chat_qa:
            question = self._conversational_question(question=question)
        else:
            question = self._compress_question(question=question)

    return Proposal(question=question, text_chunk=text_chunk)

def generate(
self,
documents: t.List[LlamaindexDocument] | t.List[LangchainDocument],
Expand Down Expand Up @@ -339,64 +414,20 @@ def generate(

neighbor_nodes = doc_nodes_map[curr_node.source_node.node_id]

# Append multiple nodes randomly to remove chunking bias
size = self.rng.integers(1, 3)
nodes = (
self._get_neighbour_node(curr_node, neighbor_nodes)
if size > 1 and evolve_type != "multi_context"
else [curr_node]
)

text_chunk = " ".join([node.get_content() for node in nodes])
score = self._filter_context(text_chunk)
if not score:
continue
seed_question = self._seed_question(text_chunk)
is_valid_question = self._filter_question(seed_question)
if not is_valid_question:
continue

if evolve_type == "multi_context":
# Find most similar chunk in same document
node_embedding = self._embed_nodes([nodes[-1]])
neighbor_nodes = self._remove_nodes(neighbor_nodes, nodes)
neighbor_emb = self._embed_nodes(neighbor_nodes)

_, indices = get_top_k_embeddings(
list(node_embedding.values())[0],
list(neighbor_emb.values()),
similarity_cutoff=self.threshold / 10,
)
if indices:
# type cast indices from list[Any] to list[int]
indices = t.cast(t.List[int], indices)
best_neighbor = neighbor_nodes[indices[0]]
question = self._multicontext_question(
question=seed_question,
context1=text_chunk,
context2=best_neighbor.get_content(),
)
text_chunk = "\n".join([text_chunk, best_neighbor.get_content()])
else:
continue

# for reasoning and conditional modes, evolve question with the
# functions from question_deep_map
else:
evolve_fun = question_deep_map.get(evolve_type)
question = (
getattr(self, evolve_fun)(seed_question, text_chunk)
if evolve_fun
else seed_question
proposal = None
try:
proposal = self._make_proposal(
curr_node, neighbor_nodes, evolve_type
)
except Exception as e:
err_cause = e.__cause__
if not isinstance(err_cause, retry_errors):
raise e

# compress question or convert into conversational questions
if evolve_type != "simple":
prob = self.rng.uniform(0, 1)
if self.chat_qa and prob <= self.chat_qa:
question = self._conversational_question(question=question)
else:
question = self._compress_question(question=question)
if proposal is None:
continue
question = proposal.question
text_chunk = proposal.text_chunk

is_valid_question = self._filter_question(question)
if is_valid_question:
Expand Down