Support for reading topic tree from JSONL file, issue #9

poppysec · poppysec · commit 86d54680f962 · 2025-01-16T18:02:14.000Z
Signed-off-by: poppysec <poppaea@stacklok.com>
diff --git a/promptwright/cli.py b/promptwright/cli.py
@@ -9,7 +9,8 @@
 from .config import PromptWrightConfig, construct_model_string
 from .engine import DataEngine
 from .hf_hub import HFUploader
-from .topic_tree import TopicTree
+from .topic_tree import TopicTree, TopicTreeArguments
+from .utils import read_topic_tree_from_jsonl
 
 
 def handle_error(ctx: click.Context, error: Exception) -> None:  # noqa: ARG001
@@ -27,6 +28,7 @@ def cli():
 @cli.command()
 @click.argument("config_file", type=click.Path(exists=True))
 @click.option("--topic-tree-save-as", help="Override the save path for the topic tree")
+@click.option('--topic-tree-jsonl', type=click.Path(exists=True), help='Path to the JSONL file containing the topic tree.')
 @click.option("--dataset-save-as", help="Override the save path for the dataset")
 @click.option("--provider", help="Override the LLM provider (e.g., ollama)")
 @click.option("--model", help="Override the model name (e.g., mistral:latest)")
@@ -55,6 +57,7 @@ def cli():
 def start(  # noqa: PLR0912
     config_file: str,
     topic_tree_save_as: str | None = None,
+    topic_tree_jsonl: str | None = None,
     dataset_save_as: str | None = None,
     provider: str | None = None,
     model: str | None = None,
@@ -85,6 +88,9 @@ def start(  # noqa: PLR0912
             handle_error(
                 click.get_current_context(), f"Error loading config file: {str(e)}"
             )
+        # Get dataset parameters
+        dataset_config = config.get_dataset_config()
+        dataset_params = dataset_config.get("creation", {})
 
         # Prepare topic tree overrides
         tree_overrides = {}
@@ -99,26 +105,53 @@ def start(  # noqa: PLR0912
         if tree_depth:
             tree_overrides["tree_depth"] = tree_depth
 
+        # Construct model name
+        model_name = construct_model_string(
+            provider or dataset_params.get("provider", "default_provider"),
+            model or dataset_params.get("model", "default_model")
+        )
+
         # Create and build topic tree
         try:
-            tree = TopicTree(args=config.get_topic_tree_args(**tree_overrides))
-            tree.build_tree()
+            print("Creating TopicTree object...")
+            if topic_tree_jsonl:
+                print(f"Reading topic tree from JSONL file: {topic_tree_jsonl}")
+                dict_list = read_topic_tree_from_jsonl(topic_tree_jsonl)
+                default_args = TopicTreeArguments(
+                    root_prompt="default",
+                    model_name=model_name
+                )
+                tree = TopicTree(args=default_args)
+                tree.from_dict_list(dict_list)
+            else:
+                if hasattr(config, 'topic_tree'):
+                    tree_args = config.get_topic_tree_args(**tree_overrides)
+                else:
+                    tree_args = TopicTreeArguments(
+                        root_prompt="default",
+                        model_name=model_name
+                    )
+                tree = TopicTree(args=tree_args)
+                print("Building topic tree...")
+                tree.build_tree()
         except Exception as e:
             handle_error(
                 click.get_current_context(), f"Error building topic tree: {str(e)}"
             )
 
-        # Save topic tree
-        try:
-            tree_save_path = topic_tree_save_as or config.topic_tree.get(
-                "save_as", "topic_tree.jsonl"
-            )
-            tree.save(tree_save_path)
-            click.echo(f"Topic tree saved to: {tree_save_path}")
-        except Exception as e:
-            handle_error(
-                click.get_current_context(), f"Error saving topic tree: {str(e)}"
-            )
+        # Save topic tree if JSONL file is not provided
+        if not topic_tree_jsonl:
+            try:
+                tree_save_path = topic_tree_save_as or config.topic_tree.get(
+                    "save_as", "topic_tree.jsonl"
+                )
+                print(f"Saving topic tree to: {tree_save_path}")
+                tree.save(tree_save_path)
+                click.echo(f"Topic tree saved to: {tree_save_path}")
+            except Exception as e:
+                handle_error(
+                    click.get_current_context(), f"Error saving topic tree: {str(e)}"
+                )
 
         # Prepare engine overrides
         engine_overrides = {}
@@ -137,17 +170,11 @@ def start(  # noqa: PLR0912
                 click.get_current_context(), f"Error creating data engine: {str(e)}"
             )
 
-        # Get dataset parameters
-        dataset_config = config.get_dataset_config()
-        dataset_params = dataset_config.get("creation", {})
-
         # Construct model name for dataset creation
-        if provider and model:
-            model_name = construct_model_string(provider, model)
-        else:
-            dataset_provider = dataset_params.get("provider", "ollama")
-            dataset_model = dataset_params.get("model", "mistral:latest")
-            model_name = construct_model_string(dataset_provider, dataset_model)
+        model_name = construct_model_string(
+            provider or dataset_params.get("provider", "ollama"),
+            model or dataset_params.get("model", "mistral:latest")
+        )
 
         # Create dataset with overrides
         try:
diff --git a/promptwright/topic_tree.py b/promptwright/topic_tree.py
@@ -299,3 +299,21 @@ def print_tree(self) -> None:
         print("Topic Tree Structure:")
         for path in self.tree_paths:
             print(" -> ".join(path))
+
+    def from_dict_list(self, dict_list: list[dict[str, Any]]) -> None:
+        """
+        Replace this tree's paths and failed generations with loaded entries.
+
+        Args:
+            dict_list (list[dict]): Entries to load; an entry's "path" value is
+                collected as a tree path and its "failed_generation" value as a
+                failed generation. Entries with neither key are ignored.
+        """
+        self.tree_paths = [entry['path'] for entry in dict_list if 'path' in entry]
+        self.failed_generations = [
+            entry['failed_generation']
+            for entry in dict_list
+            if 'failed_generation' in entry
+        ]
+        loaded, failed = len(self.tree_paths), len(self.failed_generations)
+        print(f"Loaded {loaded} paths and {failed} failed generations from JSONL file")
diff --git a/promptwright/utils.py b/promptwright/utils.py
@@ -96,3 +96,19 @@ def safe_literal_eval(list_string: str):
         except (SyntaxError, ValueError):
             print("Failed to parse the list due to syntax issues.")
             return None
+
+def read_topic_tree_from_jsonl(file_path: str) -> list[dict]:
+    """
+    Read the topic tree from a UTF-8 JSONL file, skipping blank lines.
+
+    Args:
+        file_path (str): The path to the JSONL file.
+
+    Returns:
+        list[dict]: The parsed JSON value of each non-blank line, in file order.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    with open(file_path, encoding="utf-8") as file:
+        return [json.loads(entry) for entry in map(str.strip, file) if entry]
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -282,6 +282,60 @@ def test_start_command_with_overrides(
     assert kwargs["model_name"] == "override/model"
     assert kwargs["sys_msg"] is False
 
+@patch("promptwright.cli.read_topic_tree_from_jsonl")
+@patch("promptwright.cli.TopicTree")
+@patch("promptwright.cli.DataEngine")
+# Patches are injected bottom-up: DataEngine, TopicTree, then the JSONL reader.
+def test_start_command_with_jsonl(
+    mock_data_engine,
+    mock_topic_tree,
+    mock_read_topic_tree_from_jsonl,
+    cli_runner,
+    sample_config_file,
+):
+    """
+    Verify that `start --topic-tree-jsonl` loads the tree from the file.
+
+    The JSONL reader, TopicTree and DataEngine are all mocked, so only the
+    CLI wiring is exercised: the reader receives the given path, its result
+    is handed to `from_dict_list`, and the loaded tree is never saved.
+    """
+    expected_entries = [{"path": ["root", "child"]}]
+
+    # Stand-ins for the tree and the engine built by the command
+    tree = Mock()
+    mock_topic_tree.return_value = tree
+    mock_read_topic_tree_from_jsonl.return_value = expected_entries
+    engine = Mock()
+    engine.create_data.return_value = Mock()
+    mock_data_engine.return_value = engine
+
+    # click validates the path with exists=True, so a real file is needed
+    # even though the mocked reader never opens it
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as handle:
+        handle.write('{"path": ["root", "child"]}\n')
+    jsonl_path = handle.name
+
+    try:
+        result = cli_runner.invoke(
+            cli,
+            ["start", sample_config_file, "--topic-tree-jsonl", jsonl_path],
+        )
+
+        # Show the CLI output when the command fails, to ease debugging
+        if result.exit_code != 0:
+            print(result.output)
+        assert result.exit_code == 0
+
+        # The reader gets the CLI path and its result populates the tree
+        mock_read_topic_tree_from_jsonl.assert_called_once_with(jsonl_path)
+        tree.from_dict_list.assert_called_once_with(expected_entries)
+        # A tree loaded from JSONL must not be written back out
+        tree.save.assert_not_called()
+    finally:
+        # Remove the temporary JSONL file
+        if os.path.exists(jsonl_path):
+            os.unlink(jsonl_path)
 
 def test_start_command_missing_config(cli_runner):
     """Test start command with missing config file."""