From b76fc7f38f4b1f459fea74325684b43f79537e11 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Wed, 24 Sep 2025 18:04:53 +0200 Subject: [PATCH 01/18] add a new function --- .../datasets/harmbench_multimodal_dataset.py | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 pyrit/datasets/harmbench_multimodal_dataset.py diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py new file mode 100644 index 000000000..8b38b4e93 --- /dev/null +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import uuid +from pathlib import Path +from typing import Literal, Optional + +from pyrit.datasets.dataset_helper import fetch_examples +from pyrit.models import SeedPromptDataset +from pyrit.models.seed_prompt import SeedPrompt + + +def fetch_harmbench_multimodal_dataset( + source: str = ( + "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/behavior_datasets/" + "harmbench_behaviors_multimodal_all.csv" + ), + source_type: Literal["public_url", "file"] = "public_url", + cache: bool = True, + data_home: Optional[Path] = None, +) -> SeedPromptDataset: + """ + Fetch HarmBench multimodal examples and create a SeedPromptDataset. + + Args: + source (str): The source from which to fetch examples. Defaults to the HarmBench repository. + source_type (Literal["public_url", "file"]): The type of source. Defaults to 'public_url'. + cache (bool): Whether to cache the fetched examples. Defaults to True. + data_home (Optional[Path]): Directory to store cached data. Defaults to None. + + Returns: + SeedPromptDataset: A SeedPromptDataset containing the multimodal examples. 
+ + Note: + For more information related to the HarmBench project and the original dataset, visit: https://www.harmbench.org/ + Paper: https://arxiv.org/abs/2402.04249 + Authors: + Mantas Mazeika & Long Phan & Xuwang Yin & Andy Zou & Zifan Wang & Norman Mu & Elham Sakhaee + & Nathaniel Li & Steven Basart & Bo Li & David Forsyth & Dan Hendrycks + """ + required_keys = {"Behavior", "BehaviorID", "FunctionalCategory", "SemanticCategory", "ImageFileName"} + examples = fetch_examples(source, source_type, cache, data_home) + prompts = [] + + for example in examples: + missing_keys = required_keys - example.keys() + if missing_keys: + raise ValueError(f"Missing keys in example: {', '.join(missing_keys)}") + + if example["FunctionalCategory"] != "multimodal": + continue + + behavior_text = example["Behavior"] + behavior_id = example["BehaviorID"] + semantic_category = example["SemanticCategory"] + image_filename = example["ImageFileName"] + image_description = example.get("ImageDescription", "") + redacted_description = example.get("RedactedImageDescription", "") + + # A unique group ID to link the text and image prompts + # since they are part of the same example + group_id = uuid.uuid4() + + text_prompt = SeedPrompt( + value=behavior_text, + data_type="text", + name=f"HarmBench Multimodal Text - {behavior_id}", + dataset_name="HarmBench Multimodal Examples", + harm_categories=list(semantic_category), + description=(f"A text prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}"), + source=source, + prompt_group_id=group_id, + sequence=0, + metadata={ + "behavior_id": behavior_id, + }, + ) + + image_prompt = SeedPrompt( + # Note: All images in the HarmBench dataset are stored as .png files, even if + # the ImageFileName field specifies a different extension (.jpg or .jpeg). 
+ # https://github.com/centerforaisafety/HarmBench/tree/c0423b9/data/multimodal_behavior_images + value=f"https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/{image_filename.rsplit('.', 1)[0]}.png", + data_type="image_path", + name=f"HarmBench Multimodal Image - {behavior_id}", + dataset_name="HarmBench Multimodal Examples", + harm_categories=list(semantic_category), + description=f"An image prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}", + source=example.get("Source", ""), + prompt_group_id=group_id, + sequence=0, + metadata={ + "behavior_id": behavior_id, + "image_description": image_description, + "redacted_image_description": redacted_description, + }, + ) + + prompts.append((text_prompt, image_prompt)) + + seed_prompt_dataset = SeedPromptDataset(prompts=prompts) + return seed_prompt_dataset From 14861463b15620e60d100b35651eb400a7283ec4 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Wed, 24 Sep 2025 18:21:25 +0200 Subject: [PATCH 02/18] fix --- pyrit/datasets/harmbench_multimodal_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index 8b38b4e93..cba2f4ee5 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -75,6 +75,7 @@ def fetch_harmbench_multimodal_dataset( "behavior_id": behavior_id, }, ) + prompts.append(text_prompt) image_prompt = SeedPrompt( # Note: All images in the HarmBench dataset are stored as .png files, even if @@ -95,8 +96,7 @@ def fetch_harmbench_multimodal_dataset( "redacted_image_description": redacted_description, }, ) - - prompts.append((text_prompt, image_prompt)) + prompts.append(image_prompt) seed_prompt_dataset = SeedPromptDataset(prompts=prompts) return seed_prompt_dataset From 9de39d971ef70a9ab239ab8482dbc06cdf53d40c Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Wed, 24 Sep 2025 20:48:19 +0200 Subject: [PATCH 03/18] add tests --- .../datasets/harmbench_multimodal_dataset.py | 6 +- .../test_harmbench_multimodal_dataset.py | 159 ++++++++++++++++++ 2 files changed, 162 insertions(+), 3 deletions(-) create mode 100644 tests/unit/datasets/test_harmbench_multimodal_dataset.py diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index cba2f4ee5..7c1f06b04 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -66,7 +66,7 @@ def fetch_harmbench_multimodal_dataset( data_type="text", name=f"HarmBench Multimodal Text - {behavior_id}", dataset_name="HarmBench Multimodal Examples", - harm_categories=list(semantic_category), + harm_categories=[semantic_category], description=(f"A text prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}"), source=source, prompt_group_id=group_id, @@ -85,9 +85,9 @@ def fetch_harmbench_multimodal_dataset( data_type="image_path", name=f"HarmBench Multimodal Image - {behavior_id}", dataset_name="HarmBench Multimodal Examples", - harm_categories=list(semantic_category), + harm_categories=[semantic_category], description=f"An image prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}", - source=example.get("Source", ""), + source=source, prompt_group_id=group_id, sequence=0, metadata={ diff --git 
a/tests/unit/datasets/test_harmbench_multimodal_dataset.py b/tests/unit/datasets/test_harmbench_multimodal_dataset.py new file mode 100644 index 000000000..262af9c16 --- /dev/null +++ b/tests/unit/datasets/test_harmbench_multimodal_dataset.py @@ -0,0 +1,159 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from unittest.mock import patch + +import pytest + +from pyrit.datasets.harmbench_multimodal_dataset import fetch_harmbench_multimodal_dataset +from pyrit.models import SeedPromptDataset + + +@pytest.mark.usefixtures("patch_central_database") +class TestHarmBenchMultimodalDataset: + @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") + def test_fetching_with_valid_inputs(self, mock_fetch): + mock_examples = [ + { + "Behavior": "Test harmful behavior with image", + "BehaviorID": "test_behavior_1", + "SemanticCategory": "harmful_category", + "FunctionalCategory": "multimodal", + "ImageFileName": "test_image.png", + "ImageDescription": "A test image description", + "RedactedImageDescription": "A redacted test image description", + }, + { + "Behavior": "Another harmful behavior", + "BehaviorID": "test_behavior_2", + "SemanticCategory": "illegal_activity", + "FunctionalCategory": "multimodal", + "ImageFileName": "another_image.jpg", + "ImageDescription": "Another image description", + "RedactedImageDescription": "Another redacted description", + }, + ] + + mock_fetch.return_value = mock_examples + result = fetch_harmbench_multimodal_dataset() + + assert isinstance(result, SeedPromptDataset) + assert len(result.prompts) == 4 # 2 text + 2 image prompts + + text_prompts = [p for p in result.prompts if p.data_type == "text"] + image_prompts = [p for p in result.prompts if p.data_type == "image_path"] + assert len(text_prompts) == 2 + assert len(image_prompts) == 2 + + # Verify each text prompt has corresponding image prompt with same group_id + for text_prompt in text_prompts: + matching_image = next(p for p in image_prompts if p.prompt_group_id == text_prompt.prompt_group_id) + assert matching_image is not None + assert text_prompt.prompt_group_id == matching_image.prompt_group_id + + # Test that the prompts have correct properties set + + # Text properties + assert text_prompts[0].value == "Test harmful behavior with image" + assert text_prompts[0].data_type == "text" + assert text_prompts[0].name == "HarmBench Multimodal Text - test_behavior_1" + assert text_prompts[0].dataset_name == "HarmBench Multimodal Examples" + assert text_prompts[0].harm_categories == ["harmful_category"] + assert ( + text_prompts[0].description + == "A text prompt from the HarmBench multimodal dataset, BehaviorID: test_behavior_1" + ) + assert text_prompts[0].sequence == 0 + assert text_prompts[0].metadata == {"behavior_id": "test_behavior_1"} + + # Image properties + assert ( + image_prompts[1].value + == "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/another_image.png" + ) # note the .png extension + assert image_prompts[1].data_type == "image_path" + assert image_prompts[1].name == "HarmBench Multimodal Image - test_behavior_2" + assert image_prompts[1].dataset_name == "HarmBench Multimodal Examples" + assert image_prompts[1].harm_categories == ["illegal_activity"] + assert ( + image_prompts[1].description + == "An image prompt from the HarmBench multimodal dataset, BehaviorID: test_behavior_2" + ) + assert image_prompts[1].sequence == 0 + expected_metadata = { + "behavior_id": "test_behavior_2", + 
"image_description": "Another image description", + "redacted_image_description": "Another redacted description", + } + assert image_prompts[1].metadata == expected_metadata + + @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") + def test_fetching_with_missing_required_keys(self, mock_fetch): + mock_examples = [ + { + "Behavior": "Test behavior", + "BehaviorID": "test_id", + "FunctionalCategory": "multimodal", + # Missing SemanticCategory and ImageFileName + } + ] + + mock_fetch.return_value = mock_examples + with pytest.raises(ValueError, match="Missing keys in example"): + fetch_harmbench_multimodal_dataset() + + @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") + def test_fetching_with_missing_optional_fields(self, mock_fetch): + mock_examples = [ + { + "Behavior": "Test behavior", + "BehaviorID": "test_optional", + "SemanticCategory": "test_category", + "FunctionalCategory": "multimodal", + "ImageFileName": "test_optional.png", + # Missing optional fields: ImageDescription, RedactedImageDescription + } + ] + + mock_fetch.return_value = mock_examples + result = fetch_harmbench_multimodal_dataset() + + assert isinstance(result, SeedPromptDataset) + assert len(result.prompts) == 2 + + # Verify image prompt handles missing optional fields + image_prompt = next(p for p in result.prompts if p.data_type == "image_path") + assert image_prompt.metadata["image_description"] == "" + assert image_prompt.metadata["redacted_image_description"] == "" + + @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") + def test_fetching_with_empty_examples(self, mock_fetch): + mock_fetch.return_value = [] + + with pytest.raises(ValueError, match="SeedPromptDataset cannot be empty"): + fetch_harmbench_multimodal_dataset() + + @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") + def test_filtering_out_non_multimodal_examples(self, mock_fetch): + mock_examples = [ + { + "Behavior": "Text only behavior", + "BehaviorID": "text_only", + "SemanticCategory": "harmful", + "FunctionalCategory": "text_generation", # Non-multimodal + "ImageFileName": "unused.png", + }, + { + "Behavior": "Multimodal behavior", + "BehaviorID": "multimodal_id", + "SemanticCategory": "harmful", + "FunctionalCategory": "multimodal", # This should be included + "ImageFileName": "valid.png", + }, + ] + + mock_fetch.return_value = mock_examples + result = fetch_harmbench_multimodal_dataset() + + assert len(result.prompts) == 2 # one example (1 text + 1 image) + assert all(p.metadata["behavior_id"] == "multimodal_id" for p in result.prompts) From 28918f4a2ff062430d3174581804121c36fb151a Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Wed, 24 Sep 2025 20:58:14 +0200 Subject: [PATCH 04/18] fix/improve docs formatting --- pyrit/datasets/harmbench_multimodal_dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index 7c1f06b04..70d379e56 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -32,8 +32,9 @@ def fetch_harmbench_multimodal_dataset( SeedPromptDataset: A SeedPromptDataset containing the multimodal examples. 
Note: - For more information related to the HarmBench project and the original dataset, visit: https://www.harmbench.org/ - Paper: https://arxiv.org/abs/2402.04249 + For more information related to the HarmBench project and the original dataset, visit: + https://www.harmbench.org/ \n + Paper: https://arxiv.org/abs/2402.04249 \n Authors: Mantas Mazeika & Long Phan & Xuwang Yin & Andy Zou & Zifan Wang & Norman Mu & Elham Sakhaee & Nathaniel Li & Steven Basart & Bo Li & David Forsyth & Dan Hendrycks From ccf1f96b84ddff9dea77d03cf4ff55915b957288 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Wed, 24 Sep 2025 23:01:21 +0200 Subject: [PATCH 05/18] add filtering based on `SemanticCategory` --- .../datasets/harmbench_multimodal_dataset.py | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index 70d379e56..f6651c2d8 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -3,14 +3,24 @@ import uuid from pathlib import Path -from typing import Literal, Optional +from typing import List, Literal, Optional from pyrit.datasets.dataset_helper import fetch_examples from pyrit.models import SeedPromptDataset from pyrit.models.seed_prompt import SeedPrompt +SemanticCategoriesLiteral = Literal[ + "cybercrime_intrusion", # n=54 + "illegal", # 36 + "harmful", # 9 + "chemical_biological", # 4 + "harassment_bullying", # 4 + "misinformation_disinformation", # 3 +] + def fetch_harmbench_multimodal_dataset( + *, source: str = ( "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/behavior_datasets/" "harmbench_behaviors_multimodal_all.csv" @@ -18,19 +28,30 @@ def fetch_harmbench_multimodal_dataset( source_type: Literal["public_url", "file"] = "public_url", cache: bool = True, data_home: Optional[Path] = None, + categories: Optional[List[SemanticCategoriesLiteral]] = None, ) -> SeedPromptDataset: """ Fetch HarmBench multimodal examples and create a SeedPromptDataset. + The HarmBench multimodal dataset contains 110 harmful multimodal behaviors. + Each example consists of an image ("image_path") and a behavior string referencing the image ("text"). + The text and image prompts that belong to the same example are linked using the same ``prompt_group_id``. + You can extract the grouped prompts using the ``group_seed_prompts_by_prompt_group_id`` method. + Args: source (str): The source from which to fetch examples. Defaults to the HarmBench repository. source_type (Literal["public_url", "file"]): The type of source. Defaults to 'public_url'. cache (bool): Whether to cache the fetched examples. Defaults to True. data_home (Optional[Path]): Directory to store cached data. Defaults to None. + categories (Optional[List[SemanticCategoriesLiteral]]): List of semantic categories + to filter examples. If None, all categories are included (default). Returns: SeedPromptDataset: A SeedPromptDataset containing the multimodal examples. + Raises: + ValueError: If any of the specified categories are invalid. 
+ Note: For more information related to the HarmBench project and the original dataset, visit: https://www.harmbench.org/ \n @@ -39,6 +60,11 @@ def fetch_harmbench_multimodal_dataset( Mantas Mazeika & Long Phan & Xuwang Yin & Andy Zou & Zifan Wang & Norman Mu & Elham Sakhaee & Nathaniel Li & Steven Basart & Bo Li & David Forsyth & Dan Hendrycks """ + if categories is not None: + invalid_categories = set(categories) - set(SemanticCategoriesLiteral.__args__) + if invalid_categories: + raise ValueError(f"Invalid semantic categories: {', '.join(invalid_categories)}") + required_keys = {"Behavior", "BehaviorID", "FunctionalCategory", "SemanticCategory", "ImageFileName"} examples = fetch_examples(source, source_type, cache, data_home) prompts = [] @@ -51,9 +77,13 @@ def fetch_harmbench_multimodal_dataset( if example["FunctionalCategory"] != "multimodal": continue + semantic_category = example["SemanticCategory"] + + if categories is not None and semantic_category not in categories: + continue + behavior_text = example["Behavior"] behavior_id = example["BehaviorID"] - semantic_category = example["SemanticCategory"] image_filename = example["ImageFileName"] image_description = example.get("ImageDescription", "") redacted_description = example.get("RedactedImageDescription", "") From bcef278600335f6e72ce621efa415ced33fcb57f Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Wed, 24 Sep 2025 23:03:59 +0200 Subject: [PATCH 06/18] improve docstring --- pyrit/datasets/harmbench_multimodal_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index f6651c2d8..4ff6ecead 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -34,9 +34,10 @@ def fetch_harmbench_multimodal_dataset( Fetch HarmBench multimodal examples and create a SeedPromptDataset. The HarmBench multimodal dataset contains 110 harmful multimodal behaviors. - Each example consists of an image ("image_path") and a behavior string referencing the image ("text"). - The text and image prompts that belong to the same example are linked using the same ``prompt_group_id``. - You can extract the grouped prompts using the ``group_seed_prompts_by_prompt_group_id`` method. + Each example consists of an image (with data type "image_path") and a behavior string + that references the image (with data type "text"). The text and image prompts that + belong to the same example are linked using the same ``prompt_group_id``. You can + extract the grouped prompts using the ``group_seed_prompts_by_prompt_group_id`` method. Args: source (str): The source from which to fetch examples. Defaults to the HarmBench repository. From c770e87a5e54183f9ee60fd2043a8db1d1644625 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Wed, 24 Sep 2025 23:07:01 +0200 Subject: [PATCH 07/18] refactor: improve docstring --- pyrit/datasets/harmbench_multimodal_dataset.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index 4ff6ecead..f6651c2d8 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -34,10 +34,9 @@ def fetch_harmbench_multimodal_dataset( Fetch HarmBench multimodal examples and create a SeedPromptDataset. 
The HarmBench multimodal dataset contains 110 harmful multimodal behaviors. - Each example consists of an image (with data type "image_path") and a behavior string - that references the image (with data type "text"). The text and image prompts that - belong to the same example are linked using the same ``prompt_group_id``. You can - extract the grouped prompts using the ``group_seed_prompts_by_prompt_group_id`` method. + Each example consists of an image ("image_path") and a behavior string referencing the image ("text"). + The text and image prompts that belong to the same example are linked using the same ``prompt_group_id``. + You can extract the grouped prompts using the ``group_seed_prompts_by_prompt_group_id`` method. Args: source (str): The source from which to fetch examples. Defaults to the HarmBench repository. From c71ea4ba4d53548597121cd7d91196972c94f715 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Wed, 24 Sep 2025 23:18:58 +0200 Subject: [PATCH 08/18] update tests --- .../test_harmbench_multimodal_dataset.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/unit/datasets/test_harmbench_multimodal_dataset.py b/tests/unit/datasets/test_harmbench_multimodal_dataset.py index 262af9c16..dfbb40e49 100644 --- a/tests/unit/datasets/test_harmbench_multimodal_dataset.py +++ b/tests/unit/datasets/test_harmbench_multimodal_dataset.py @@ -157,3 +157,59 @@ def test_filtering_out_non_multimodal_examples(self, mock_fetch): assert len(result.prompts) == 2 # one example (1 text + 1 image) assert all(p.metadata["behavior_id"] == "multimodal_id" for p in result.prompts) + + @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") + def test_filtering_by_semantic_categories(self, mock_fetch): + mock_examples = [ + { + "Behavior": "Illegal behavior", + "BehaviorID": "illegal_behavior", + "SemanticCategory": "illegal", + "FunctionalCategory": "multimodal", + "ImageFileName": "illegal.png", + }, + { + "Behavior": "Harmful behavior", + "BehaviorID": "harmful_behavior", + "SemanticCategory": "harmful", + "FunctionalCategory": "multimodal", + "ImageFileName": "harmful.png", + }, + { + "Behavior": "Cybercrime behavior", + "BehaviorID": "cybercrime_behavior", + "SemanticCategory": "cybercrime_intrusion", + "FunctionalCategory": "multimodal", + "ImageFileName": "cybercrime.png", + }, + { + "Behavior": "Harassment behavior", + "BehaviorID": "harassment_behavior", + "SemanticCategory": "harassment_bullying", + "FunctionalCategory": "multimodal", + "ImageFileName": "harassment.png", + }, + ] + mock_fetch.return_value = mock_examples + + # Filter by single category + result = fetch_harmbench_multimodal_dataset(categories=["illegal"]) + assert isinstance(result, SeedPromptDataset) + assert len(result.prompts) == 2 # 1 text + 1 image prompt for illegal category + assert all(p.metadata["behavior_id"] == "illegal_behavior" for p in result.prompts) + assert all(p.harm_categories == ["illegal"] for p in result.prompts) + + # Filter by multiple categories + result = fetch_harmbench_multimodal_dataset(categories=["cybercrime_intrusion", "harmful"]) + assert isinstance(result, SeedPromptDataset) + assert len(result.prompts) == 4 # 2 examples × 2 prompts each + behavior_ids = {p.metadata["behavior_id"] for p in result.prompts} + assert behavior_ids == {"cybercrime_behavior", "harmful_behavior"} + + # Filter with invalid category + with pytest.raises(ValueError, match="Invalid semantic categories"): + 
fetch_harmbench_multimodal_dataset(categories=["nonexistent_category", "illegal"]) + + # Filter with an empty list + with pytest.raises(ValueError, match="SeedPromptDataset cannot be empty"): + fetch_harmbench_multimodal_dataset(categories=[]) From eae6f17008be533e067235fae7ca24c70961e72a Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Thu, 25 Sep 2025 20:56:19 +0200 Subject: [PATCH 09/18] actually download images --- doc/api.rst | 1 + pyrit/datasets/__init__.py | 2 + .../datasets/harmbench_multimodal_dataset.py | 37 ++++++++++-- .../test_harmbench_multimodal_dataset.py | 60 ++++++++++++------- 4 files changed, 74 insertions(+), 26 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 85c1bb1b1..2e9a27a83 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -126,6 +126,7 @@ API Reference fetch_examples fetch_forbidden_questions_dataset fetch_harmbench_dataset + fetch_harmbench_multimodal_dataset_async fetch_librAI_do_not_answer_dataset fetch_llm_latent_adversarial_training_harmful_dataset fetch_jbb_behaviors_by_harm_category diff --git a/pyrit/datasets/__init__.py b/pyrit/datasets/__init__.py index f99d1922b..b95459558 100644 --- a/pyrit/datasets/__init__.py +++ b/pyrit/datasets/__init__.py @@ -10,6 +10,7 @@ from pyrit.datasets.dataset_helper import fetch_examples from pyrit.datasets.forbidden_questions_dataset import fetch_forbidden_questions_dataset from pyrit.datasets.harmbench_dataset import fetch_harmbench_dataset +from pyrit.datasets.harmbench_multimodal_dataset import fetch_harmbench_multimodal_dataset_async from pyrit.datasets.librAI_do_not_answer_dataset import fetch_librAI_do_not_answer_dataset from pyrit.datasets.llm_latent_adversarial_training_harmful_dataset import ( fetch_llm_latent_adversarial_training_harmful_dataset, @@ -47,6 +48,7 @@ "fetch_examples", "fetch_forbidden_questions_dataset", "fetch_harmbench_dataset", + "fetch_harmbench_multimodal_dataset_async", "fetch_librAI_do_not_answer_dataset", "fetch_llm_latent_adversarial_training_harmful_dataset", "fetch_many_shot_jailbreaking_dataset", diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index f6651c2d8..2ea060848 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -1,14 +1,18 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import logging import uuid from pathlib import Path from typing import List, Literal, Optional +from pyrit.common.net_utility import make_request_and_raise_if_error_async from pyrit.datasets.dataset_helper import fetch_examples -from pyrit.models import SeedPromptDataset +from pyrit.models import SeedPromptDataset, data_serializer_factory from pyrit.models.seed_prompt import SeedPrompt +logger = logging.getLogger(__name__) + SemanticCategoriesLiteral = Literal[ "cybercrime_intrusion", # n=54 "illegal", # 36 @@ -19,7 +23,7 @@ ] -def fetch_harmbench_multimodal_dataset( +async def fetch_harmbench_multimodal_dataset_async( *, source: str = ( "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/behavior_datasets/" @@ -108,11 +112,13 @@ def fetch_harmbench_multimodal_dataset( ) prompts.append(text_prompt) + # Note: All images in the HarmBench dataset are stored as .png files, even if the ImageFileName + # field specifies a different extension (.jpg or .jpeg). Hence we always use .png extension here. 
+ image_url = f"https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/{image_filename.rsplit('.', 1)[0]}.png" + local_image_path = await _fetch_and_save_image_async(image_url, behavior_id) + image_prompt = SeedPrompt( - # Note: All images in the HarmBench dataset are stored as .png files, even if - # the ImageFileName field specifies a different extension (.jpg or .jpeg). - # https://github.com/centerforaisafety/HarmBench/tree/c0423b9/data/multimodal_behavior_images - value=f"https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/{image_filename.rsplit('.', 1)[0]}.png", + value=local_image_path, data_type="image_path", name=f"HarmBench Multimodal Image - {behavior_id}", dataset_name="HarmBench Multimodal Examples", @@ -125,9 +131,28 @@ def fetch_harmbench_multimodal_dataset( "behavior_id": behavior_id, "image_description": image_description, "redacted_image_description": redacted_description, + "original_image_url": image_url, }, ) prompts.append(image_prompt) seed_prompt_dataset = SeedPromptDataset(prompts=prompts) return seed_prompt_dataset + + +async def _fetch_and_save_image_async(image_url: str, behavior_id: str) -> str: + filename = f"harmbench_{behavior_id}.png" + serializer = data_serializer_factory(category="seed-prompt-entries", data_type="image_path", extension="png") + + # Return existing path if image already exists for this BehaviorID + serializer.value = str(serializer._memory.results_path + serializer.data_sub_directory + f"/{filename}") + try: + if await serializer._memory.results_storage_io.path_exists(serializer.value): + return serializer.value + except Exception as e: + logger.warning(f"Failed to check whether image for {behavior_id} already exists: {e}") + + response = await make_request_and_raise_if_error_async(endpoint_uri=image_url, method="GET") + await serializer.save_data(data=response.content, output_filename=filename.replace(".png", "")) + + return str(serializer.value) diff --git a/tests/unit/datasets/test_harmbench_multimodal_dataset.py b/tests/unit/datasets/test_harmbench_multimodal_dataset.py index dfbb40e49..0ad0c4669 100644 --- a/tests/unit/datasets/test_harmbench_multimodal_dataset.py +++ b/tests/unit/datasets/test_harmbench_multimodal_dataset.py @@ -5,14 +5,16 @@ import pytest -from pyrit.datasets.harmbench_multimodal_dataset import fetch_harmbench_multimodal_dataset +from pyrit.datasets.harmbench_multimodal_dataset import fetch_harmbench_multimodal_dataset_async from pyrit.models import SeedPromptDataset +@pytest.mark.asyncio @pytest.mark.usefixtures("patch_central_database") class TestHarmBenchMultimodalDataset: + @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async") @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") - def test_fetching_with_valid_inputs(self, mock_fetch): + async def test_fetching_with_valid_inputs(self, mock_fetch, mock_fetch_image): mock_examples = [ { "Behavior": "Test harmful behavior with image", @@ -35,7 +37,12 @@ def test_fetching_with_valid_inputs(self, mock_fetch): ] mock_fetch.return_value = mock_examples - result = fetch_harmbench_multimodal_dataset() + mock_fetch_image.side_effect = [ + "/dbdata/seed-prompt-entries/images/harmbench_test_behavior_1.png", + "/dbdata/seed-prompt-entries/images/harmbench_test_behavior_2.png", + ] + + result = await fetch_harmbench_multimodal_dataset_async() assert isinstance(result, SeedPromptDataset) assert len(result.prompts) == 4 # 2 text + 2 image 
prompts @@ -67,10 +74,7 @@ def test_fetching_with_valid_inputs(self, mock_fetch): assert text_prompts[0].metadata == {"behavior_id": "test_behavior_1"} # Image properties - assert ( - image_prompts[1].value - == "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/another_image.png" - ) # note the .png extension + assert image_prompts[1].value == "/dbdata/seed-prompt-entries/images/harmbench_test_behavior_2.png" assert image_prompts[1].data_type == "image_path" assert image_prompts[1].name == "HarmBench Multimodal Image - test_behavior_2" assert image_prompts[1].dataset_name == "HarmBench Multimodal Examples" @@ -84,11 +88,13 @@ def test_fetching_with_valid_inputs(self, mock_fetch): "behavior_id": "test_behavior_2", "image_description": "Another image description", "redacted_image_description": "Another redacted description", + "original_image_url": "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/another_image.png", } assert image_prompts[1].metadata == expected_metadata + @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async") @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") - def test_fetching_with_missing_required_keys(self, mock_fetch): + async def test_fetching_with_missing_required_keys(self, mock_fetch, mock_fetch_image): mock_examples = [ { "Behavior": "Test behavior", @@ -100,10 +106,11 @@ def test_fetching_with_missing_required_keys(self, mock_fetch): mock_fetch.return_value = mock_examples with pytest.raises(ValueError, match="Missing keys in example"): - fetch_harmbench_multimodal_dataset() + await fetch_harmbench_multimodal_dataset_async() + @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async") @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") - def test_fetching_with_missing_optional_fields(self, mock_fetch): + async def test_fetching_with_missing_optional_fields(self, mock_fetch, mock_fetch_image): mock_examples = [ { "Behavior": "Test behavior", @@ -116,7 +123,9 @@ def test_fetching_with_missing_optional_fields(self, mock_fetch): ] mock_fetch.return_value = mock_examples - result = fetch_harmbench_multimodal_dataset() + mock_fetch_image.return_value = "/dbdata/seed-prompt-entries/images/harmbench_test_optional.png" + + result = await fetch_harmbench_multimodal_dataset_async() assert isinstance(result, SeedPromptDataset) assert len(result.prompts) == 2 @@ -126,15 +135,17 @@ def test_fetching_with_missing_optional_fields(self, mock_fetch): assert image_prompt.metadata["image_description"] == "" assert image_prompt.metadata["redacted_image_description"] == "" + @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async") @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") - def test_fetching_with_empty_examples(self, mock_fetch): + async def test_fetching_with_empty_examples(self, mock_fetch, mock_fetch_image): mock_fetch.return_value = [] with pytest.raises(ValueError, match="SeedPromptDataset cannot be empty"): - fetch_harmbench_multimodal_dataset() + await fetch_harmbench_multimodal_dataset_async() + @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async") @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") - def test_filtering_out_non_multimodal_examples(self, mock_fetch): + async def test_filtering_out_non_multimodal_examples(self, mock_fetch, mock_fetch_image): mock_examples = [ { "Behavior": "Text 
only behavior", @@ -153,13 +164,16 @@ def test_filtering_out_non_multimodal_examples(self, mock_fetch): ] mock_fetch.return_value = mock_examples - result = fetch_harmbench_multimodal_dataset() + mock_fetch_image.return_value = "/dbdata/seed-prompt-entries/images/harmbench_multimodal_id.png" + + result = await fetch_harmbench_multimodal_dataset_async() assert len(result.prompts) == 2 # one example (1 text + 1 image) assert all(p.metadata["behavior_id"] == "multimodal_id" for p in result.prompts) + @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async") @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") - def test_filtering_by_semantic_categories(self, mock_fetch): + async def test_filtering_by_semantic_categories(self, mock_fetch, mock_fetch_image): mock_examples = [ { "Behavior": "Illegal behavior", @@ -192,15 +206,21 @@ def test_filtering_by_semantic_categories(self, mock_fetch): ] mock_fetch.return_value = mock_examples + mock_fetch_image.side_effect = [ + "/dbdata/seed-prompt-entries/images/harmbench_illegal_behavior.png", + "/dbdata/seed-prompt-entries/images/harmbench_cybercrime_behavior.png", + "/dbdata/seed-prompt-entries/images/harmbench_harmful_behavior.png", + ] + # Filter by single category - result = fetch_harmbench_multimodal_dataset(categories=["illegal"]) + result = await fetch_harmbench_multimodal_dataset_async(categories=["illegal"]) assert isinstance(result, SeedPromptDataset) assert len(result.prompts) == 2 # 1 text + 1 image prompt for illegal category assert all(p.metadata["behavior_id"] == "illegal_behavior" for p in result.prompts) assert all(p.harm_categories == ["illegal"] for p in result.prompts) # Filter by multiple categories - result = fetch_harmbench_multimodal_dataset(categories=["cybercrime_intrusion", "harmful"]) + result = await fetch_harmbench_multimodal_dataset_async(categories=["cybercrime_intrusion", "harmful"]) assert isinstance(result, SeedPromptDataset) assert len(result.prompts) == 4 # 2 examples × 2 prompts each behavior_ids = {p.metadata["behavior_id"] for p in result.prompts} @@ -208,8 +228,8 @@ def test_filtering_by_semantic_categories(self, mock_fetch): # Filter with invalid category with pytest.raises(ValueError, match="Invalid semantic categories"): - fetch_harmbench_multimodal_dataset(categories=["nonexistent_category", "illegal"]) + await fetch_harmbench_multimodal_dataset_async(categories=["nonexistent_category", "illegal"]) # Filter with an empty list with pytest.raises(ValueError, match="SeedPromptDataset cannot be empty"): - fetch_harmbench_multimodal_dataset(categories=[]) + await fetch_harmbench_multimodal_dataset_async(categories=[]) From 6382cc3e7860a55d7ea409e787ae08dd8d84ae88 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:14:49 +0200 Subject: [PATCH 10/18] add to integration tests --- tests/integration/datasets/test_fetch_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/datasets/test_fetch_datasets.py b/tests/integration/datasets/test_fetch_datasets.py index 6d5fadaf3..822d414ee 100644 --- a/tests/integration/datasets/test_fetch_datasets.py +++ b/tests/integration/datasets/test_fetch_datasets.py @@ -13,6 +13,7 @@ fetch_equitymedqa_dataset_unique_values, fetch_forbidden_questions_dataset, fetch_harmbench_dataset, + fetch_harmbench_multimodal_dataset_async, fetch_jbb_behaviors_by_harm_category, fetch_jbb_behaviors_by_jbb_category, fetch_jbb_behaviors_dataset, @@ -46,6 +47,7 @@ 
(fetch_equitymedqa_dataset_unique_values, True), (fetch_forbidden_questions_dataset, True), (fetch_harmbench_dataset, True), + (fetch_harmbench_multimodal_dataset_async, True), (fetch_jbb_behaviors_dataset, True), (fetch_librAI_do_not_answer_dataset, True), (fetch_llm_latent_adversarial_training_harmful_dataset, True), From 648de3bd0b320ca640d8f61de45f563114c717f8 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:20:23 +0200 Subject: [PATCH 11/18] docs --- pyrit/datasets/harmbench_multimodal_dataset.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index 2ea060848..2ec8d7bb8 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -37,11 +37,14 @@ async def fetch_harmbench_multimodal_dataset_async( """ Fetch HarmBench multimodal examples and create a SeedPromptDataset. - The HarmBench multimodal dataset contains 110 harmful multimodal behaviors. + The HarmBench multimodal dataset contains 110 harmful behaviors. Each example consists of an image ("image_path") and a behavior string referencing the image ("text"). The text and image prompts that belong to the same example are linked using the same ``prompt_group_id``. You can extract the grouped prompts using the ``group_seed_prompts_by_prompt_group_id`` method. + Note: The first call may be slow as images need to be downloaded from the remote repository. + Subsequent calls will be faster since images are cached locally and won't need to be re-downloaded. + Args: source (str): The source from which to fetch examples. Defaults to the HarmBench repository. source_type (Literal["public_url", "file"]): The type of source. Defaults to 'public_url'. 
@@ -92,8 +95,7 @@ async def fetch_harmbench_multimodal_dataset_async(
         image_description = example.get("ImageDescription", "")
         redacted_description = example.get("RedactedImageDescription", "")
 
-        # A unique group ID to link the text and image prompts
-        # since they are part of the same example
+        # A unique group ID to link the text and image prompts since they are part of the same example
         group_id = uuid.uuid4()
 
         text_prompt = SeedPrompt(

From 7f4f3cc5f9a648908368f11dd50f0c9a65ba3b28 Mon Sep 17 00:00:00 2001
From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com>
Date: Thu, 25 Sep 2025 21:24:47 +0200
Subject: [PATCH 12/18] pre-commit for the test file

---
 tests/unit/datasets/test_harmbench_multimodal_dataset.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/unit/datasets/test_harmbench_multimodal_dataset.py b/tests/unit/datasets/test_harmbench_multimodal_dataset.py
index 0ad0c4669..6a1a700d5 100644
--- a/tests/unit/datasets/test_harmbench_multimodal_dataset.py
+++ b/tests/unit/datasets/test_harmbench_multimodal_dataset.py
@@ -5,7 +5,9 @@
 
 import pytest
 
-from pyrit.datasets.harmbench_multimodal_dataset import fetch_harmbench_multimodal_dataset_async
+from pyrit.datasets.harmbench_multimodal_dataset import (
+    fetch_harmbench_multimodal_dataset_async,
+)
 from pyrit.models import SeedPromptDataset
 
 
@@ -88,7 +90,8 @@ async def test_fetching_with_valid_inputs(self, mock_fetch, mock_fetch_image):
             "behavior_id": "test_behavior_2",
             "image_description": "Another image description",
             "redacted_image_description": "Another redacted description",
-            "original_image_url": "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/another_image.png",
+            "original_image_url": "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9"
+            + "/data/multimodal_behavior_images/another_image.png",
         }
         assert image_prompts[1].metadata == expected_metadata

From 444e834a2fd9da2f28d2e698c5be9757ce4d4050 Mon Sep 17 00:00:00 2001
From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com>
Date: Thu, 25 Sep 2025 21:36:06 +0200
Subject: [PATCH 13/18] fix line too long & use enum for SemanticCategory

---
 .../datasets/harmbench_multimodal_dataset.py  | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py
index 2ec8d7bb8..8b7c09a90 100644
--- a/pyrit/datasets/harmbench_multimodal_dataset.py
+++ b/pyrit/datasets/harmbench_multimodal_dataset.py
@@ -3,6 +3,7 @@
 
 import logging
 import uuid
+from enum import Enum
 from pathlib import Path
 from typing import List, Literal, Optional
 
@@ -13,14 +14,14 @@
 
 logger = logging.getLogger(__name__)
 
-SemanticCategoriesLiteral = Literal[
-    "cybercrime_intrusion",  # n=54
-    "illegal",  # 36
-    "harmful",  # 9
-    "chemical_biological",  # 4
-    "harassment_bullying",  # 4
-    "misinformation_disinformation",  # 3
-]
+
+class SemanticCategory(Enum):
+    CYBERCRIME_INTRUSION = "cybercrime_intrusion"  # n=54
+    ILLEGAL = "illegal"  # 36
+    HARMFUL = "harmful"  # 9
+    CHEMICAL_BIOLOGICAL = "chemical_biological"  # 4
+    HARASSMENT_BULLYING = "harassment_bullying"  # 4
+    MISINFORMATION_DISINFORMATION = "misinformation_disinformation"  # 3
 
 
 async def fetch_harmbench_multimodal_dataset_async(
@@ -32,7 +33,7 @@ async def fetch_harmbench_multimodal_dataset_async(
     source_type: Literal["public_url", "file"] = "public_url",
     cache: bool = True,
     data_home: Optional[Path] = None,
-    categories: Optional[List[SemanticCategoriesLiteral]] = None,
+    categories: Optional[List[SemanticCategory]] = None,
 ) -> SeedPromptDataset:
     """
     Fetch HarmBench multimodal examples and create a SeedPromptDataset.
@@ -50,7 +51,7 @@ async def fetch_harmbench_multimodal_dataset_async(
         source_type (Literal["public_url", "file"]): The type of source. Defaults to 'public_url'.
         cache (bool): Whether to cache the fetched examples. Defaults to True.
         data_home (Optional[Path]): Directory to store cached data. Defaults to None.
-        categories (Optional[List[SemanticCategoriesLiteral]]): List of semantic categories
+        categories (Optional[List[SemanticCategory]]): List of semantic categories
             to filter examples. If None, all categories are included (default).
 
     Returns:
@@ -68,7 +69,10 @@ async def fetch_harmbench_multimodal_dataset_async(
         & Nathaniel Li & Steven Basart & Bo Li & David Forsyth & Dan Hendrycks
     """
     if categories is not None:
-        invalid_categories = set(categories) - set(SemanticCategoriesLiteral.__args__)
+        valid_categories = {category.value for category in SemanticCategory}
+        invalid_categories = (
+            set(cat.value if isinstance(cat, SemanticCategory) else cat for cat in categories) - valid_categories
+        )
         if invalid_categories:
             raise ValueError(f"Invalid semantic categories: {', '.join(invalid_categories)}")
 
@@ -86,8 +90,10 @@ async def fetch_harmbench_multimodal_dataset_async(
 
         semantic_category = example["SemanticCategory"]
 
-        if categories is not None and semantic_category not in categories:
-            continue
+        if categories is not None:
+            category_values = {cat.value for cat in categories}
+            if semantic_category not in category_values:
+                continue
 
         behavior_text = example["Behavior"]
         behavior_id = example["BehaviorID"]
@@ -116,7 +122,10 @@ async def fetch_harmbench_multimodal_dataset_async(
 
         # Note: All images in the HarmBench dataset are stored as .png files, even if the ImageFileName
         # field specifies a different extension (.jpg or .jpeg). Hence we always use .png extension here.
- image_url = f"https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/{image_filename.rsplit('.', 1)[0]}.png" + image_url = ( + "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/" + f"{image_filename.rsplit('.', 1)[0]}.png" + ) local_image_path = await _fetch_and_save_image_async(image_url, behavior_id) image_prompt = SeedPrompt( From 0a9f91f83c76ef3fd1586855bd8da8b71eebd580 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:42:53 +0200 Subject: [PATCH 14/18] fix tests --- .../datasets/test_harmbench_multimodal_dataset.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/unit/datasets/test_harmbench_multimodal_dataset.py b/tests/unit/datasets/test_harmbench_multimodal_dataset.py index 6a1a700d5..549e6a11b 100644 --- a/tests/unit/datasets/test_harmbench_multimodal_dataset.py +++ b/tests/unit/datasets/test_harmbench_multimodal_dataset.py @@ -6,6 +6,7 @@ import pytest from pyrit.datasets.harmbench_multimodal_dataset import ( + SemanticCategory, fetch_harmbench_multimodal_dataset_async, ) from pyrit.models import SeedPromptDataset @@ -216,23 +217,21 @@ async def test_filtering_by_semantic_categories(self, mock_fetch, mock_fetch_ima ] # Filter by single category - result = await fetch_harmbench_multimodal_dataset_async(categories=["illegal"]) + result = await fetch_harmbench_multimodal_dataset_async(categories=[SemanticCategory.ILLEGAL]) assert isinstance(result, SeedPromptDataset) assert len(result.prompts) == 2 # 1 text + 1 image prompt for illegal category assert all(p.metadata["behavior_id"] == "illegal_behavior" for p in result.prompts) assert all(p.harm_categories == ["illegal"] for p in result.prompts) # Filter by multiple categories - result = await fetch_harmbench_multimodal_dataset_async(categories=["cybercrime_intrusion", "harmful"]) + result = await fetch_harmbench_multimodal_dataset_async( + categories=[SemanticCategory.CYBERCRIME_INTRUSION, SemanticCategory.HARMFUL] + ) assert isinstance(result, SeedPromptDataset) assert len(result.prompts) == 4 # 2 examples × 2 prompts each behavior_ids = {p.metadata["behavior_id"] for p in result.prompts} assert behavior_ids == {"cybercrime_behavior", "harmful_behavior"} - # Filter with invalid category - with pytest.raises(ValueError, match="Invalid semantic categories"): - await fetch_harmbench_multimodal_dataset_async(categories=["nonexistent_category", "illegal"]) - # Filter with an empty list with pytest.raises(ValueError, match="SeedPromptDataset cannot be empty"): await fetch_harmbench_multimodal_dataset_async(categories=[]) From 7a5cc4d71a6d3f78e170f17adc82e3c4795e5cc9 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Sat, 4 Oct 2025 21:27:04 +0200 Subject: [PATCH 15/18] add authors, groups --- .../datasets/harmbench_multimodal_dataset.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index 8b7c09a90..e6284d6a9 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -117,6 +117,27 @@ async def fetch_harmbench_multimodal_dataset_async( metadata={ "behavior_id": behavior_id, }, + authors=[ + "Mantas Mazeika", + "Long Phan", + "Xuwang Yin", + "Andy Zou", + "Zifan Wang", + "Norman Mu", + "Elham Sakhaee", + 
"Nathaniel Li", + "Steven Basart", + "Bo Li", + "David Forsyth", + "Dan Hendrycks", + ], + groups=[ + "University of Illinois Urbana-Champaign", + "Center for AI Safety", + "Carnegie Mellon University", + "UC Berkeley", + "Microsoft", + ], ) prompts.append(text_prompt) From 329a57269c977fbf970e5d14591b539b542a6b7e Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Sat, 4 Oct 2025 21:55:28 +0200 Subject: [PATCH 16/18] only include examples where image fetch is successful --- .../datasets/harmbench_multimodal_dataset.py | 119 +++++++++--------- .../test_harmbench_multimodal_dataset.py | 34 +++++ 2 files changed, 96 insertions(+), 57 deletions(-) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index e6284d6a9..3f5e09db2 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -104,69 +104,74 @@ async def fetch_harmbench_multimodal_dataset_async( # A unique group ID to link the text and image prompts since they are part of the same example group_id = uuid.uuid4() - text_prompt = SeedPrompt( - value=behavior_text, - data_type="text", - name=f"HarmBench Multimodal Text - {behavior_id}", - dataset_name="HarmBench Multimodal Examples", - harm_categories=[semantic_category], - description=(f"A text prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}"), - source=source, - prompt_group_id=group_id, - sequence=0, - metadata={ - "behavior_id": behavior_id, - }, - authors=[ - "Mantas Mazeika", - "Long Phan", - "Xuwang Yin", - "Andy Zou", - "Zifan Wang", - "Norman Mu", - "Elham Sakhaee", - "Nathaniel Li", - "Steven Basart", - "Bo Li", - "David Forsyth", - "Dan Hendrycks", - ], - groups=[ - "University of Illinois Urbana-Champaign", - "Center for AI Safety", - "Carnegie Mellon University", - "UC Berkeley", - "Microsoft", - ], - ) - prompts.append(text_prompt) - # Note: All images in the HarmBench dataset are stored as .png files, even if the ImageFileName # field specifies a different extension (.jpg or .jpeg). Hence we always use .png extension here. 
image_url = ( "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/" f"{image_filename.rsplit('.', 1)[0]}.png" ) - local_image_path = await _fetch_and_save_image_async(image_url, behavior_id) - - image_prompt = SeedPrompt( - value=local_image_path, - data_type="image_path", - name=f"HarmBench Multimodal Image - {behavior_id}", - dataset_name="HarmBench Multimodal Examples", - harm_categories=[semantic_category], - description=f"An image prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}", - source=source, - prompt_group_id=group_id, - sequence=0, - metadata={ - "behavior_id": behavior_id, - "image_description": image_description, - "redacted_image_description": redacted_description, - "original_image_url": image_url, - }, - ) - prompts.append(image_prompt) + + try: + # Only include examples where image fetch is successful + local_image_path = await _fetch_and_save_image_async(image_url, behavior_id) + + image_prompt = SeedPrompt( + value=local_image_path, + data_type="image_path", + name=f"HarmBench Multimodal Image - {behavior_id}", + dataset_name="HarmBench Multimodal Examples", + harm_categories=[semantic_category], + description=f"An image prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}", + source=source, + prompt_group_id=group_id, + sequence=0, + metadata={ + "behavior_id": behavior_id, + "image_description": image_description, + "redacted_image_description": redacted_description, + "original_image_url": image_url, + }, + ) + prompts.append(image_prompt) + except Exception as e: + logger.warning(f"Failed to fetch image for behavior {behavior_id}: {e}. Skipping this example.") + else: + text_prompt = SeedPrompt( + value=behavior_text, + data_type="text", + name=f"HarmBench Multimodal Text - {behavior_id}", + dataset_name="HarmBench Multimodal Examples", + harm_categories=[semantic_category], + description=(f"A text prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}"), + source=source, + prompt_group_id=group_id, + sequence=0, + metadata={ + "behavior_id": behavior_id, + }, + authors=[ + "Mantas Mazeika", + "Long Phan", + "Xuwang Yin", + "Andy Zou", + "Zifan Wang", + "Norman Mu", + "Elham Sakhaee", + "Nathaniel Li", + "Steven Basart", + "Bo Li", + "David Forsyth", + "Dan Hendrycks", + ], + groups=[ + "University of Illinois Urbana-Champaign", + "Center for AI Safety", + "Carnegie Mellon University", + "UC Berkeley", + "Microsoft", + ], + ) + prompts.append(text_prompt) seed_prompt_dataset = SeedPromptDataset(prompts=prompts) return seed_prompt_dataset diff --git a/tests/unit/datasets/test_harmbench_multimodal_dataset.py b/tests/unit/datasets/test_harmbench_multimodal_dataset.py index 549e6a11b..b8741f77f 100644 --- a/tests/unit/datasets/test_harmbench_multimodal_dataset.py +++ b/tests/unit/datasets/test_harmbench_multimodal_dataset.py @@ -235,3 +235,37 @@ async def test_filtering_by_semantic_categories(self, mock_fetch, mock_fetch_ima # Filter with an empty list with pytest.raises(ValueError, match="SeedPromptDataset cannot be empty"): await fetch_harmbench_multimodal_dataset_async(categories=[]) + + @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async") + @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") + async def test_skips_example_when_image_fetch_fails(self, mock_fetch, mock_fetch_image): + mock_examples = [ + { + "Behavior": "Successful behavior", + "BehaviorID": "success_behavior", + "SemanticCategory": "harmful", + 
"FunctionalCategory": "multimodal", + "ImageFileName": "success.png", + }, + { + "Behavior": "Failed behavior", + "BehaviorID": "failed_behavior", + "SemanticCategory": "harmful", + "FunctionalCategory": "multimodal", + "ImageFileName": "failed.png", + }, + ] + + mock_fetch.return_value = mock_examples + mock_fetch_image.side_effect = [ # first call succeeds, second call fails + "/dbdata/seed-prompt-entries/images/harmbench_success_behavior.png", + Exception("Network error - image not found"), + ] + + result = await fetch_harmbench_multimodal_dataset_async() + + # Only the successful example should be included (1 text + 1 image = 2 prompts) + assert isinstance(result, SeedPromptDataset) + assert len(result.prompts) == 2 + behavior_ids = {p.metadata["behavior_id"] for p in result.prompts} + assert behavior_ids == {"success_behavior"} From 7d99a65746d6ce9b36a0cac3404f7aaaf3d2a085 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:09:10 +0200 Subject: [PATCH 17/18] update integration test --- .../integration/datasets/test_fetch_datasets.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/integration/datasets/test_fetch_datasets.py b/tests/integration/datasets/test_fetch_datasets.py index 822d414ee..3d898f61c 100644 --- a/tests/integration/datasets/test_fetch_datasets.py +++ b/tests/integration/datasets/test_fetch_datasets.py @@ -47,7 +47,6 @@ (fetch_equitymedqa_dataset_unique_values, True), (fetch_forbidden_questions_dataset, True), (fetch_harmbench_dataset, True), - (fetch_harmbench_multimodal_dataset_async, True), (fetch_jbb_behaviors_dataset, True), (fetch_librAI_do_not_answer_dataset, True), (fetch_llm_latent_adversarial_training_harmful_dataset, True), @@ -74,6 +73,21 @@ def test_fetch_datasets(fetch_function, is_seed_prompt_dataset): assert len(data.prompts) > 0 +@pytest.mark.asyncio +@pytest.mark.parametrize( + "fetch_function, number_of_prompts", + [ + (fetch_harmbench_multimodal_dataset_async, 110 * 2), + ], +) +async def test_fetch_multimodal_datasets(fetch_function, number_of_prompts): + data = await fetch_function() + + assert data is not None + assert isinstance(data, SeedPromptDataset) + assert len(data.prompts) == number_of_prompts + + @pytest.mark.integration def test_fetch_jbb_behaviors_by_harm_category(): """Integration test for filtering by harm category with real data.""" From d43830bf389193a09b59c95fb8c08d8d479b5021 Mon Sep 17 00:00:00 2001 From: Paulina Kalicka <71526180+paulinek13@users.noreply.github.com> Date: Sun, 12 Oct 2025 15:55:40 +0200 Subject: [PATCH 18/18] add a warning --- pyrit/datasets/harmbench_multimodal_dataset.py | 5 +++++ tests/unit/datasets/test_harmbench_multimodal_dataset.py | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py index 3f5e09db2..f6d7c1b81 100644 --- a/pyrit/datasets/harmbench_multimodal_dataset.py +++ b/pyrit/datasets/harmbench_multimodal_dataset.py @@ -79,6 +79,7 @@ async def fetch_harmbench_multimodal_dataset_async( required_keys = {"Behavior", "BehaviorID", "FunctionalCategory", "SemanticCategory", "ImageFileName"} examples = fetch_examples(source, source_type, cache, data_home) prompts = [] + failed_image_count = 0 for example in examples: missing_keys = required_keys - example.keys() @@ -134,6 +135,7 @@ async def fetch_harmbench_multimodal_dataset_async( ) prompts.append(image_prompt) except Exception as e: + 
failed_image_count += 1 logger.warning(f"Failed to fetch image for behavior {behavior_id}: {e}. Skipping this example.") else: text_prompt = SeedPrompt( @@ -173,6 +175,9 @@ async def fetch_harmbench_multimodal_dataset_async( ) prompts.append(text_prompt) + if failed_image_count > 0: + logger.warning(f"Total skipped examples: {failed_image_count} (image fetch failures)") + seed_prompt_dataset = SeedPromptDataset(prompts=prompts) return seed_prompt_dataset diff --git a/tests/unit/datasets/test_harmbench_multimodal_dataset.py b/tests/unit/datasets/test_harmbench_multimodal_dataset.py index b8741f77f..bbc3a6748 100644 --- a/tests/unit/datasets/test_harmbench_multimodal_dataset.py +++ b/tests/unit/datasets/test_harmbench_multimodal_dataset.py @@ -238,7 +238,7 @@ async def test_filtering_by_semantic_categories(self, mock_fetch, mock_fetch_ima @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async") @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples") - async def test_skips_example_when_image_fetch_fails(self, mock_fetch, mock_fetch_image): + async def test_skips_example_when_image_fetch_fails(self, mock_fetch, mock_fetch_image, caplog): mock_examples = [ { "Behavior": "Successful behavior", @@ -269,3 +269,8 @@ async def test_skips_example_when_image_fetch_fails(self, mock_fetch, mock_fetch assert len(result.prompts) == 2 behavior_ids = {p.metadata["behavior_id"] for p in result.prompts} assert behavior_ids == {"success_behavior"} + + warning_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] + assert len(warning_messages) == 2 + assert any("Failed to fetch image for behavior failed_behavior" in msg for msg in warning_messages) + assert any("Total skipped examples: 1 (image fetch failures)" in msg for msg in warning_messages)
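
Usage sketch (reviewer aid, not part of the patch series): the snippet below drives the new fetcher end to end. `fetch_harmbench_multimodal_dataset_async` and `SemanticCategory` come from the patches above; `initialize_pyrit`/`IN_MEMORY` and `SeedPromptDataset.group_seed_prompts_by_prompt_group_id` (the helper the function's docstring points to) are assumed from the existing PyRIT API, so treat those exact names as assumptions rather than part of this change.

import asyncio

from pyrit.common import IN_MEMORY, initialize_pyrit  # assumed PyRIT init helpers
from pyrit.datasets import fetch_harmbench_multimodal_dataset_async
from pyrit.datasets.harmbench_multimodal_dataset import SemanticCategory
from pyrit.models import SeedPromptDataset


async def main() -> None:
    # The fetcher stores downloaded images through PyRIT's central memory,
    # so memory must be initialized before calling it (assumed API).
    initialize_pyrit(memory_db_type=IN_MEMORY)

    # The first run downloads the images; later runs reuse the cached files.
    dataset = await fetch_harmbench_multimodal_dataset_async(
        categories=[SemanticCategory.CYBERCRIME_INTRUSION, SemanticCategory.ILLEGAL],
    )

    # The text and image prompts of one behavior share a prompt_group_id;
    # regroup them into pairs as suggested by the function's docstring.
    groups = SeedPromptDataset.group_seed_prompts_by_prompt_group_id(dataset.prompts)
    for group in groups:
        text = next(p for p in group.prompts if p.data_type == "text")
        image = next(p for p in group.prompts if p.data_type == "image_path")
        print(f"{text.value[:60]!r} -> {image.value}")


if __name__ == "__main__":
    asyncio.run(main())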