From c700d9dbb93e7b1e2dc0e3b07d46ee1a6c7ebb05 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 15 May 2025 17:46:02 +0300 Subject: [PATCH 01/18] Implemented preprocessing for datasets --- src/guidellm/__main__.py | 316 ++++++++++++++++++++------ src/guidellm/preprocess/__init__.py | 6 + tests/unit/preprocess/__init__.py | 0 tests/unit/preprocess/test_dataset.py | 193 ++++++++++++++++ 4 files changed, 449 insertions(+), 66 deletions(-) create mode 100644 src/guidellm/preprocess/__init__.py create mode 100644 tests/unit/preprocess/__init__.py create mode 100644 tests/unit/preprocess/test_dataset.py diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index f38b11aa..79cb97a3 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -8,6 +8,7 @@ from guidellm.backend import BackendType from guidellm.benchmark import ProfileType, benchmark_generative_text from guidellm.config import print_config +from guidellm.preprocess.dataset import process_dataset, ShortPromptStrategy from guidellm.scheduler import StrategyType STRATEGY_PROFILE_CHOICES = set( @@ -40,6 +41,7 @@ def parse_number_str(ctx, param, value): # noqa: ARG001 @click.group() def cli(): + """GuideLLM CLI tools.""" pass @@ -56,8 +58,8 @@ def cli(): "--backend-type", type=click.Choice(list(get_args(BackendType))), help=( - "The type of backend to use to run requests against. Defaults to 'openai_http'." - f" Supported types: {', '.join(get_args(BackendType))}" + "The type of backend to use to run requests against. Defaults to 'openai_http'." + f" Supported types: {', '.join(get_args(BackendType))}" ), default="openai_http", ) @@ -66,8 +68,8 @@ def cli(): callback=parse_json, default=None, help=( - "A JSON string containing any arguments to pass to the backend as a " - "dict with **kwargs." + "A JSON string containing any arguments to pass to the backend as a " + "dict with **kwargs." ), ) @click.option( @@ -75,8 +77,8 @@ def cli(): default=None, type=str, help=( - "The ID of the model to benchmark within the backend. " - "If None provided (default), then it will use the first model available." + "The ID of the model to benchmark within the backend. " + "If None provided (default), then it will use the first model available." ), ) @click.option( @@ -84,9 +86,9 @@ def cli(): default=None, type=str, help=( - "The processor or tokenizer to use to calculate token counts for statistics " - "and synthetic data generation. If None provided (default), will load " - "using the model arg, if needed." + "The processor or tokenizer to use to calculate token counts for statistics " + "and synthetic data generation. If None provided (default), will load " + "using the model arg, if needed." ), ) @click.option( @@ -94,8 +96,8 @@ def cli(): default=None, callback=parse_json, help=( - "A JSON string containing any arguments to pass to the processor constructor " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the processor constructor " + "as a dict with **kwargs." ), ) @click.option( @@ -103,17 +105,17 @@ def cli(): required=True, type=str, help=( - "The HuggingFace dataset ID, a path to a HuggingFace dataset, " - "a path to a data file csv, json, jsonl, or txt, " - "or a synthetic data config as a json or key=value string." + "The HuggingFace dataset ID, a path to a HuggingFace dataset, " + "a path to a data file csv, json, jsonl, or txt, " + "or a synthetic data config as a json or key=value string." 
), ) @click.option( "--data-args", callback=parse_json, help=( - "A JSON string containing any arguments to pass to the dataset creation " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the dataset creation " + "as a dict with **kwargs." ), ) @click.option( @@ -121,8 +123,8 @@ def cli(): default=None, type=click.Choice(["random"]), help=( - "The data sampler type to use. 'random' will add a random shuffle on the data. " - "Defaults to None" + "The data sampler type to use. 'random' will add a random shuffle on the data. " + "Defaults to None" ), ) @click.option( @@ -130,8 +132,8 @@ def cli(): required=True, type=click.Choice(STRATEGY_PROFILE_CHOICES), help=( - "The type of benchmark to run. " - f"Supported types {', '.join(STRATEGY_PROFILE_CHOICES)}. " + "The type of benchmark to run. " + f"Supported types {', '.join(STRATEGY_PROFILE_CHOICES)}. " ), ) @click.option( @@ -139,28 +141,28 @@ def cli(): default=None, callback=parse_number_str, help=( - "The rates to run the benchmark at. " - "Can be a single number or a comma-separated list of numbers. " - "For rate-type=sweep, this is the number of benchmarks it runs in the sweep. " - "For rate-type=concurrent, this is the number of concurrent requests. " - "For rate-type=async,constant,poisson, this is the rate requests per second. " - "For rate-type=synchronous,throughput, this must not be set." + "The rates to run the benchmark at. " + "Can be a single number or a comma-separated list of numbers. " + "For rate-type=sweep, this is the number of benchmarks it runs in the sweep. " + "For rate-type=concurrent, this is the number of concurrent requests. " + "For rate-type=async,constant,poisson, this is the rate requests per second. " + "For rate-type=synchronous,throughput, this must not be set." ), ) @click.option( "--max-seconds", type=float, help=( - "The maximum number of seconds each benchmark can run for. " - "If None, will run until max_requests or the data is exhausted." + "The maximum number of seconds each benchmark can run for. " + "If None, will run until max_requests or the data is exhausted." ), ) @click.option( "--max-requests", type=int, help=( - "The maximum number of requests each benchmark can run for. " - "If None, will run until max_seconds or the data is exhausted." + "The maximum number of requests each benchmark can run for. " + "If None, will run until max_seconds or the data is exhausted." ), ) @click.option( @@ -168,18 +170,18 @@ def cli(): type=float, default=None, help=( - "The percent of the benchmark (based on max-seconds, max-requets, " - "or lenth of dataset) to run as a warmup and not include in the final results. " - "Defaults to None." + "The percent of the benchmark (based on max-seconds, max-requets, " + "or lenth of dataset) to run as a warmup and not include in the final results. " + "Defaults to None." ), ) @click.option( "--cooldown-percent", type=float, help=( - "The percent of the benchmark (based on max-seconds, max-requets, or lenth " - "of dataset) to run as a cooldown and not include in the final results. " - "Defaults to None." + "The percent of the benchmark (based on max-seconds, max-requets, or lenth " + "of dataset) to run as a cooldown and not include in the final results. " + "Defaults to None." ), ) @click.option( @@ -202,10 +204,10 @@ def cli(): type=click.Path(), default=Path.cwd() / "benchmarks.json", help=( - "The path to save the output to. If it is a directory, " - "it will save benchmarks.json under it. 
" - "Otherwise, json, yaml, or csv files are supported for output types " - "which will be read from the extension for the file path." + "The path to save the output to. If it is a directory, " + "it will save benchmarks.json under it. " + "Otherwise, json, yaml, or csv files are supported for output types " + "which will be read from the extension for the file path." ), ) @click.option( @@ -217,8 +219,8 @@ def cli(): "--output-sampling", type=int, help=( - "The number of samples to save in the output file. " - "If None (default), will save all samples." + "The number of samples to save in the output file. " + "If None (default), will save all samples." ), default=None, ) @@ -229,28 +231,28 @@ def cli(): help="The random seed to use for benchmarking to ensure reproducibility.", ) def benchmark( - target, - backend_type, - backend_args, - model, - processor, - processor_args, - data, - data_args, - data_sampler, - rate_type, - rate, - max_seconds, - max_requests, - warmup_percent, - cooldown_percent, - disable_progress, - display_scheduler_stats, - disable_console_outputs, - output_path, - output_extras, - output_sampling, - random_seed, + target, + backend_type, + backend_args, + model, + processor, + processor_args, + data, + data_args, + data_sampler, + rate_type, + rate, + max_seconds, + max_requests, + warmup_percent, + cooldown_percent, + disable_progress, + display_scheduler_stats, + disable_console_outputs, + output_path, + output_extras, + output_sampling, + random_seed, ): asyncio.run( benchmark_generative_text( @@ -282,13 +284,195 @@ def benchmark( @cli.command( help=( - "Print out the available configuration settings that can be set " - "through environment variables." + "Print out the available configuration settings that can be set " + "through environment variables." ) ) def config(): print_config() +@cli.group(help="Preprocessing utilities for datasets.") +def preprocess(): + pass + + +@preprocess.command( + help="Convert a dataset to have specific prompt and output token sizes.\n\n" + "INPUT_DATA: Path to the input dataset or dataset ID.\n" + "OUTPUT_PATH: Directory to save the converted dataset. " + "The dataset will be saved as an Arrow dataset (.arrow) inside the directory." +) +@click.argument( + "input_data", + type=str, + metavar="INPUT_DATA", + required=True, +) +@click.argument( + "output_path", + type=click.Path(file_okay=False, dir_okay=True, writable=True, resolve_path=True), + metavar="OUTPUT_PATH", + required=True, +) +@click.option( + "--processor", + type=str, + required=True, + help=( + "The processor or tokenizer to use to calculate token counts for statistics " + "and synthetic data generation." + ), +) +@click.option( + "--processor-args", + default=None, + callback=parse_json, + help=( + "A JSON string containing any arguments to pass to the processor constructor " + "as a dict with **kwargs." + ), +) +@click.option( + "--data-args", + callback=parse_json, + help=( + "A JSON string containing any arguments to pass to the dataset creation " + "as a dict with **kwargs." + ), +) +@click.option( + "--short-prompt-strategy", + type=click.Choice([s.value for s in ShortPromptStrategy]), + default=ShortPromptStrategy.IGNORE.value, + show_default=True, + help="Strategy to handle prompts shorter than the target length. " +) +@click.option( + "--pad-token", + type=str, + default=None, + help="The token to pad short prompts with when using the 'pad' strategy." 
+) +@click.option( + "--prompt-tokens-average", + type=int, + default=10, + show_default=True, + help="Average target number of tokens for prompts." +) +@click.option( + "--prompt-tokens-stdev", + type=int, + default=None, + help="Standard deviation for prompt tokens sampling." +) +@click.option( + "--prompt-tokens-min", + type=int, + default=None, + help="Minimum number of prompt tokens." +) +@click.option( + "--prompt-tokens-max", + type=int, + default=None, + help="Maximum number of prompt tokens." +) +@click.option( + "--prompt-random-seed", + type=int, + default=42, + show_default=True, + help="Random seed for prompt token sampling." +) +@click.option( + "--output-tokens-average", + type=int, + default=10, + show_default=True, + help="Average target number of tokens for outputs." +) +@click.option( + "--output-tokens-stdev", + type=int, + default=None, + help="Standard deviation for output tokens sampling." +) +@click.option( + "--output-tokens-min", + type=int, + default=None, + help="Minimum number of output tokens." +) +@click.option( + "--output-tokens-max", + type=int, + default=None, + help="Maximum number of output tokens." +) +@click.option( + "--output-random-seed", + type=int, + default=123, + show_default=True, + help="Random seed for output token sampling." +) +@click.option( + "--push-to-hub", + is_flag=True, + help="Set this flag to push the converted dataset to the Hugging Face Hub." +) +@click.option( + "--hub-dataset-id", + type=str, + default=None, + help="The Hugging Face Hub dataset ID to push to. " + "Required if --push-to-hub is used." +) +def dataset( + input_data, + output_path, + processor, + processor_args, + data_args, + short_prompt_strategy, + pad_token, + prompt_tokens_average, + prompt_tokens_stdev, + prompt_tokens_min, + prompt_tokens_max, + prompt_random_seed, + output_tokens_average, + output_tokens_stdev, + output_tokens_min, + output_tokens_max, + output_random_seed, + push_to_hub, + hub_dataset_id, +): + process_dataset( + input_data=input_data, + output_path=output_path, + processor=processor, + processor_args=processor_args, + data_args=data_args, + short_prompt_strategy=short_prompt_strategy, + pad_token=pad_token, + prompt_tokens_average=prompt_tokens_average, + prompt_tokens_stdev=prompt_tokens_stdev, + prompt_tokens_min=prompt_tokens_min, + prompt_tokens_max=prompt_tokens_max, + prompt_random_seed=prompt_random_seed, + output_tokens_average=output_tokens_average, + output_tokens_stdev=output_tokens_stdev, + output_tokens_min=output_tokens_min, + output_tokens_max=output_tokens_max, + output_random_seed=output_random_seed, + push_to_hub=push_to_hub, + hub_dataset_id=hub_dataset_id, + ) + + if __name__ == "__main__": cli() diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py new file mode 100644 index 00000000..669bf409 --- /dev/null +++ b/src/guidellm/preprocess/__init__.py @@ -0,0 +1,6 @@ +from .dataset import process_dataset, ShortPromptStrategy + +__all__ = [ + "process_dataset", + "ShortPromptStrategy" +] diff --git a/tests/unit/preprocess/__init__.py b/tests/unit/preprocess/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py new file mode 100644 index 00000000..e14bffc3 --- /dev/null +++ b/tests/unit/preprocess/test_dataset.py @@ -0,0 +1,193 @@ +import os +from unittest import mock +from unittest.mock import patch, MagicMock + +import pytest +from datasets import Dataset +from transformers import 
PreTrainedTokenizerBase + +from guidellm.preprocess.dataset import ( + handle_ignore_strategy, + handle_concatenate_strategy, + handle_pad_strategy, + process_dataset, + push_dataset_to_hub, + ShortPromptStrategy, + STRATEGY_HANDLERS, +) + + +@pytest.fixture +def tokenizer_mock(): + tokenizer = MagicMock(spec=PreTrainedTokenizerBase) + tokenizer.encode.side_effect = lambda x: [1] * len(x) + tokenizer.decode.side_effect = lambda x: ''.join(str(item) for item in x) + return tokenizer + + +@patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: MagicMock(return_value="processed_prompt")}) +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_strategy_handler_called( + mock_sampler, mock_dataset_class, mock_check_processor, mock_load_dataset, tokenizer_mock +): + mock_handler = STRATEGY_HANDLERS[ShortPromptStrategy.IGNORE] + mock_dataset = [{"prompt": "abc"}, {"prompt": "def"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [10, 10] + + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj + + process_dataset("input", "output_dir", tokenizer_mock, short_prompt_strategy=ShortPromptStrategy.IGNORE) + + assert mock_handler.call_count == 2 + mock_load_dataset.assert_called_once() + mock_check_processor.assert_called_once() + + +def test_handle_ignore_strategy_too_short(tokenizer_mock): + result = handle_ignore_strategy("short", 10, tokenizer_mock) + assert result is None + tokenizer_mock.encode.assert_called_with("short") + + +def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): + result = handle_ignore_strategy("long prompt", 5, tokenizer_mock) + assert result == "long prompt" + tokenizer_mock.encode.assert_called_with("long prompt") + + +def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): + dataset_iter = iter([{"prompt": "longer"}]) + result = handle_concatenate_strategy("short", 10, dataset_iter, "prompt", tokenizer_mock) + assert result == "shortlonger" + + +def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): + dataset_iter = iter([]) + result = handle_concatenate_strategy("short", 10, dataset_iter, "prompt", tokenizer_mock) + assert result is None + + +def test_handle_pad_strategy(tokenizer_mock): + result = handle_pad_strategy("short", 10, tokenizer_mock, "p") + assert result == "shortppppp" + + +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_process_dataset_non_empty( + mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, tokenizer_mock +): + mock_dataset = [{"prompt": "Hello"}, {"prompt": "How are you?"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [3, 3, 3] + + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj + + process_dataset("input", "output_dir", tokenizer_mock) + + # Validate + mock_load_dataset.assert_called_once() + 
mock_check_processor.assert_called_once() + mock_dataset_class.from_list.assert_called_once() + mock_dataset_obj.save_to_disk.assert_called_once_with(mock.ANY) + + # Check that generated dataset is correct (extracted from the mock call) + args, kwargs = mock_dataset_class.from_list.call_args + processed_list = args[0] + assert len(processed_list) == 2 + for item in processed_list: + assert "prompt" in item + assert "prompt_tokens_count" in item + assert "output_tokens_count" in item + assert len(tokenizer_mock.encode(item["prompt"])) <= 3 + + +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_process_dataset_empty_after_processing( + mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, tokenizer_mock +): + mock_dataset = [{"prompt": ""}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [10] + + process_dataset("input", "output_dir", tokenizer_mock) + + mock_load_dataset.assert_called_once() + mock_check_processor.assert_called_once() + mock_dataset_class.from_list.assert_not_called() + + +@patch(f"{process_dataset.__module__}.push_dataset_to_hub") +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_process_dataset_push_to_hub_called( + mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, mock_push, tokenizer_mock +): + mock_dataset = [{"prompt": "abc"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [3] + + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj + + process_dataset("input", "output_dir", tokenizer_mock, push_to_hub=True, hub_dataset_id="id123") + mock_push.assert_called_once_with("id123", mock_dataset_obj) + + +@patch(f"{process_dataset.__module__}.push_dataset_to_hub") +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_process_dataset_push_to_hub_not_called( + mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, mock_push, tokenizer_mock +): + mock_dataset = [{"prompt": "abc"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [3] + + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj + + process_dataset("input", "output_dir", tokenizer_mock, push_to_hub=False) + mock_push.assert_not_called() + + +def test_push_dataset_to_hub_success(): + os.environ["HF_TOKEN"] = "token" + mock_dataset = MagicMock(spec=Dataset) + push_dataset_to_hub("dataset_id", mock_dataset) + mock_dataset.push_to_hub.assert_called_once_with("dataset_id", token="token") + + +def test_push_dataset_to_hub_error_no_env(): + if "HF_TOKEN" in os.environ: + del os.environ["HF_TOKEN"] + 
mock_dataset = MagicMock(spec=Dataset) + with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): + push_dataset_to_hub("dataset_id", mock_dataset) + + +def test_push_dataset_to_hub_error_no_id(): + os.environ["HF_TOKEN"] = "token" + mock_dataset = MagicMock(spec=Dataset) + with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): + push_dataset_to_hub(None, mock_dataset) From 91027e1b40923c5fc572fba9c1af490e408fd93c Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 15 May 2025 17:46:12 +0300 Subject: [PATCH 02/18] Implemented preprocessing for datasets --- src/guidellm/preprocess/dataset.py | 171 +++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 src/guidellm/preprocess/dataset.py diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py new file mode 100644 index 00000000..1112470c --- /dev/null +++ b/src/guidellm/preprocess/dataset.py @@ -0,0 +1,171 @@ +import os +from enum import Enum +from pathlib import Path +from typing import Any, Optional, Union, Dict, Callable, Iterator + +from datasets import Dataset +from loguru import logger +from transformers import PreTrainedTokenizerBase + +from guidellm.dataset import load_dataset as guidellm_load_dataset +from guidellm.utils import check_load_processor, IntegerRangeSampler + + +class ShortPromptStrategy(str, Enum): + IGNORE = "ignore" + CONCATENATE = "concatenate" + PAD = "pad" + + +def handle_ignore_strategy( + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **kwargs +) -> Optional[str]: + if len(tokenizer.encode(current_prompt)) < min_prompt_tokens: + logger.warning("Prompt too short, ignoring") + return None + return current_prompt + + +def handle_concatenate_strategy( + current_prompt: str, + min_prompt_tokens: int, + dataset_iterator: Iterator[Dict[str, Any]], + prompt_column: str, + tokenizer: PreTrainedTokenizerBase, + **kwargs +) -> Optional[str]: + tokens_len = len(tokenizer.encode(current_prompt)) + while tokens_len < min_prompt_tokens: + try: + next_row = next(dataset_iterator) + except StopIteration: + logger.warning("Could not concatenate enough prompts to reach minimum length, ignoring") + return None + current_prompt += next_row[prompt_column] + tokens_len = len(tokenizer.encode(current_prompt)) + return current_prompt + + +def handle_pad_strategy( + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + pad_token: str, + **kwargs +) -> str: + while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: + current_prompt += pad_token + return current_prompt + + +STRATEGY_HANDLERS: Dict[ShortPromptStrategy, Callable] = { + ShortPromptStrategy.IGNORE: handle_ignore_strategy, + ShortPromptStrategy.CONCATENATE: handle_concatenate_strategy, + ShortPromptStrategy.PAD: handle_pad_strategy, +} + + +def process_dataset( + input_data: Union[str, Path], + output_path: Union[str, Path], + processor: Union[str, Path, PreTrainedTokenizerBase], + processor_args: Optional[dict[str, Any]] = None, + data_args: Optional[dict[str, Any]] = None, + short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, + pad_token: Optional[str] = None, + prompt_tokens_average: int = 10, + prompt_tokens_stdev: Optional[int] = None, + prompt_tokens_min: Optional[int] = None, + prompt_tokens_max: Optional[int] = None, + prompt_random_seed: int = 42, + output_tokens_average: int = 10, + output_tokens_stdev: Optional[int] = None, + output_tokens_min: Optional[int] = None, + 
output_tokens_max: Optional[int] = None, + output_random_seed: int = 123, + push_to_hub: bool = False, + hub_dataset_id: Optional[str] = None, +) -> None: + logger.info(f"Starting dataset conversion | Input: {input_data} | Output directory: {output_path}") + + dataset, column_mappings = guidellm_load_dataset(input_data, data_args, processor, processor_args) + tokenizer = check_load_processor(processor, processor_args, "Processor/tokenizer required for dataset conversion.") + prompt_column = column_mappings.get("prompt_column") + output_column = column_mappings.get("output_tokens_count_column", "output_tokens_count") + + prompt_token_sampler = iter( + IntegerRangeSampler( + average=prompt_tokens_average, + variance=prompt_tokens_stdev, + min_value=prompt_tokens_min, + max_value=prompt_tokens_max, + random_seed=prompt_random_seed, + ) + ) + + output_token_sampler = iter( + IntegerRangeSampler( + average=output_tokens_average, + variance=output_tokens_stdev, + min_value=output_tokens_min, + max_value=output_tokens_max, + random_seed=output_random_seed, + ) + ) + + dataset_iterator = iter(dataset) + processed_prompts = [] + handler = STRATEGY_HANDLERS[short_prompt_strategy] + + for prompt_row in dataset_iterator: + prompt_text = prompt_row[prompt_column] + target_prompt_len = next(prompt_token_sampler) + + if len(tokenizer.encode(prompt_text)) < target_prompt_len: + prompt_text = handler( + current_prompt=prompt_text, + min_prompt_tokens=target_prompt_len, + dataset_iterator=dataset_iterator, + prompt_column=prompt_column, + tokenizer=tokenizer, + pad_token=pad_token, + ) + if prompt_text is None: + continue + + if len(tokenizer.encode(prompt_text)) > target_prompt_len: + tokens = tokenizer.encode(prompt_text) + prompt_text = tokenizer.decode(tokens[:target_prompt_len]) + + processed_prompt = prompt_row.copy() + processed_prompt[prompt_column] = prompt_text + processed_prompt["prompt_tokens_count"] = target_prompt_len + processed_prompt[output_column] = next(output_token_sampler) + + processed_prompts.append(processed_prompt) + + if not processed_prompts: + logger.error("No prompts remained after processing") + return + + logger.info(f"Generated processed dataset with {len(processed_prompts)} prompts") + + processed_dataset = Dataset.from_list(processed_prompts) + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + processed_dataset.save_to_disk(output_path) + logger.info(f"Conversion complete. 
Dataset saved to: {output_path}") + + if push_to_hub: + push_dataset_to_hub(hub_dataset_id, processed_dataset) + logger.info(f"Pushed dataset to: {hub_dataset_id}") + + +def push_dataset_to_hub(hub_dataset_id: str, processed_dataset: Dataset) -> None: + hf_token = os.environ.get("HF_TOKEN") + if not hub_dataset_id or not hf_token: + raise ValueError("hub_dataset_id and HF_TOKEN env var must be provided when push_to_hub is True") + processed_dataset.push_to_hub(hub_dataset_id, token=hf_token) From e02ec8a933a50515c948f2abd1d1f9f3abb83ce8 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 15 May 2025 17:49:07 +0300 Subject: [PATCH 03/18] Reverted irrelevant formatting --- src/guidellm/__main__.py | 133 +++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 79cb97a3..1afb6720 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -41,7 +41,6 @@ def parse_number_str(ctx, param, value): # noqa: ARG001 @click.group() def cli(): - """GuideLLM CLI tools.""" pass @@ -58,8 +57,8 @@ def cli(): "--backend-type", type=click.Choice(list(get_args(BackendType))), help=( - "The type of backend to use to run requests against. Defaults to 'openai_http'." - f" Supported types: {', '.join(get_args(BackendType))}" + "The type of backend to use to run requests against. Defaults to 'openai_http'." + f" Supported types: {', '.join(get_args(BackendType))}" ), default="openai_http", ) @@ -68,8 +67,8 @@ def cli(): callback=parse_json, default=None, help=( - "A JSON string containing any arguments to pass to the backend as a " - "dict with **kwargs." + "A JSON string containing any arguments to pass to the backend as a " + "dict with **kwargs." ), ) @click.option( @@ -77,8 +76,8 @@ def cli(): default=None, type=str, help=( - "The ID of the model to benchmark within the backend. " - "If None provided (default), then it will use the first model available." + "The ID of the model to benchmark within the backend. " + "If None provided (default), then it will use the first model available." ), ) @click.option( @@ -86,9 +85,9 @@ def cli(): default=None, type=str, help=( - "The processor or tokenizer to use to calculate token counts for statistics " - "and synthetic data generation. If None provided (default), will load " - "using the model arg, if needed." + "The processor or tokenizer to use to calculate token counts for statistics " + "and synthetic data generation. If None provided (default), will load " + "using the model arg, if needed." ), ) @click.option( @@ -96,8 +95,8 @@ def cli(): default=None, callback=parse_json, help=( - "A JSON string containing any arguments to pass to the processor constructor " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the processor constructor " + "as a dict with **kwargs." ), ) @click.option( @@ -105,17 +104,17 @@ def cli(): required=True, type=str, help=( - "The HuggingFace dataset ID, a path to a HuggingFace dataset, " - "a path to a data file csv, json, jsonl, or txt, " - "or a synthetic data config as a json or key=value string." + "The HuggingFace dataset ID, a path to a HuggingFace dataset, " + "a path to a data file csv, json, jsonl, or txt, " + "or a synthetic data config as a json or key=value string." ), ) @click.option( "--data-args", callback=parse_json, help=( - "A JSON string containing any arguments to pass to the dataset creation " - "as a dict with **kwargs." 
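For reference, the process_dataset API introduced in the preprocessing module above can be driven directly from Python. The sketch below is illustrative only: the dataset path, output directory, and tokenizer name are placeholders rather than anything from the patch, and at this stage of the series the result is written with Dataset.save_to_disk.

    from guidellm.preprocess import ShortPromptStrategy, process_dataset

    # Rewrite prompts to roughly 128 tokens each and attach a target output length,
    # concatenating rows that are too short instead of dropping them.
    process_dataset(
        input_data="my_prompts.jsonl",       # any source guidellm's dataset loader accepts
        output_path="processed_dataset",     # saved as an Arrow dataset directory at this stage
        processor="gpt2",                    # tokenizer used for counting and trimming tokens
        short_prompt_strategy=ShortPromptStrategy.CONCATENATE,
        prompt_tokens_average=128,
        output_tokens_average=256,
    )
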
+ "A JSON string containing any arguments to pass to the dataset creation " + "as a dict with **kwargs." ), ) @click.option( @@ -123,8 +122,8 @@ def cli(): default=None, type=click.Choice(["random"]), help=( - "The data sampler type to use. 'random' will add a random shuffle on the data. " - "Defaults to None" + "The data sampler type to use. 'random' will add a random shuffle on the data. " + "Defaults to None" ), ) @click.option( @@ -132,8 +131,8 @@ def cli(): required=True, type=click.Choice(STRATEGY_PROFILE_CHOICES), help=( - "The type of benchmark to run. " - f"Supported types {', '.join(STRATEGY_PROFILE_CHOICES)}. " + "The type of benchmark to run. " + f"Supported types {', '.join(STRATEGY_PROFILE_CHOICES)}. " ), ) @click.option( @@ -141,28 +140,28 @@ def cli(): default=None, callback=parse_number_str, help=( - "The rates to run the benchmark at. " - "Can be a single number or a comma-separated list of numbers. " - "For rate-type=sweep, this is the number of benchmarks it runs in the sweep. " - "For rate-type=concurrent, this is the number of concurrent requests. " - "For rate-type=async,constant,poisson, this is the rate requests per second. " - "For rate-type=synchronous,throughput, this must not be set." + "The rates to run the benchmark at. " + "Can be a single number or a comma-separated list of numbers. " + "For rate-type=sweep, this is the number of benchmarks it runs in the sweep. " + "For rate-type=concurrent, this is the number of concurrent requests. " + "For rate-type=async,constant,poisson, this is the rate requests per second. " + "For rate-type=synchronous,throughput, this must not be set." ), ) @click.option( "--max-seconds", type=float, help=( - "The maximum number of seconds each benchmark can run for. " - "If None, will run until max_requests or the data is exhausted." + "The maximum number of seconds each benchmark can run for. " + "If None, will run until max_requests or the data is exhausted." ), ) @click.option( "--max-requests", type=int, help=( - "The maximum number of requests each benchmark can run for. " - "If None, will run until max_seconds or the data is exhausted." + "The maximum number of requests each benchmark can run for. " + "If None, will run until max_seconds or the data is exhausted." ), ) @click.option( @@ -170,18 +169,18 @@ def cli(): type=float, default=None, help=( - "The percent of the benchmark (based on max-seconds, max-requets, " - "or lenth of dataset) to run as a warmup and not include in the final results. " - "Defaults to None." + "The percent of the benchmark (based on max-seconds, max-requets, " + "or lenth of dataset) to run as a warmup and not include in the final results. " + "Defaults to None." ), ) @click.option( "--cooldown-percent", type=float, help=( - "The percent of the benchmark (based on max-seconds, max-requets, or lenth " - "of dataset) to run as a cooldown and not include in the final results. " - "Defaults to None." + "The percent of the benchmark (based on max-seconds, max-requets, or lenth " + "of dataset) to run as a cooldown and not include in the final results. " + "Defaults to None." ), ) @click.option( @@ -204,10 +203,10 @@ def cli(): type=click.Path(), default=Path.cwd() / "benchmarks.json", help=( - "The path to save the output to. If it is a directory, " - "it will save benchmarks.json under it. " - "Otherwise, json, yaml, or csv files are supported for output types " - "which will be read from the extension for the file path." + "The path to save the output to. 
If it is a directory, " + "it will save benchmarks.json under it. " + "Otherwise, json, yaml, or csv files are supported for output types " + "which will be read from the extension for the file path." ), ) @click.option( @@ -219,8 +218,8 @@ def cli(): "--output-sampling", type=int, help=( - "The number of samples to save in the output file. " - "If None (default), will save all samples." + "The number of samples to save in the output file. " + "If None (default), will save all samples." ), default=None, ) @@ -231,28 +230,28 @@ def cli(): help="The random seed to use for benchmarking to ensure reproducibility.", ) def benchmark( - target, - backend_type, - backend_args, - model, - processor, - processor_args, - data, - data_args, - data_sampler, - rate_type, - rate, - max_seconds, - max_requests, - warmup_percent, - cooldown_percent, - disable_progress, - display_scheduler_stats, - disable_console_outputs, - output_path, - output_extras, - output_sampling, - random_seed, + target, + backend_type, + backend_args, + model, + processor, + processor_args, + data, + data_args, + data_sampler, + rate_type, + rate, + max_seconds, + max_requests, + warmup_percent, + cooldown_percent, + disable_progress, + display_scheduler_stats, + disable_console_outputs, + output_path, + output_extras, + output_sampling, + random_seed, ): asyncio.run( benchmark_generative_text( @@ -284,8 +283,8 @@ def benchmark( @cli.command( help=( - "Print out the available configuration settings that can be set " - "through environment variables." + "Print out the available configuration settings that can be set " + "through environment variables." ) ) def config(): From fc4bfb2fd7389814c7c1ec8e6bbe7e8390c00ebd Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 15 May 2025 17:57:36 +0300 Subject: [PATCH 04/18] Removed redundant comments --- tests/unit/preprocess/test_dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index e14bffc3..126e2d0e 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -95,13 +95,11 @@ def test_process_dataset_non_empty( process_dataset("input", "output_dir", tokenizer_mock) - # Validate mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() mock_dataset_class.from_list.assert_called_once() mock_dataset_obj.save_to_disk.assert_called_once_with(mock.ANY) - # Check that generated dataset is correct (extracted from the mock call) args, kwargs = mock_dataset_class.from_list.call_args processed_list = args[0] assert len(processed_list) == 2 From 2bf436c68ed1db4be53dbbac1d42893e69013546 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 18 May 2025 14:40:30 +0300 Subject: [PATCH 05/18] Added support for saving in specific file format --- src/guidellm/__main__.py | 2 +- src/guidellm/preprocess/dataset.py | 40 ++++++++++++-- tests/unit/preprocess/test_dataset.py | 75 +++++++++++++++++++++------ 3 files changed, 97 insertions(+), 20 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 1afb6720..68a63f37 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -310,7 +310,7 @@ def preprocess(): ) @click.argument( "output_path", - type=click.Path(file_okay=False, dir_okay=True, writable=True, resolve_path=True), + type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True), metavar="OUTPUT_PATH", required=True, ) diff --git a/src/guidellm/preprocess/dataset.py 
b/src/guidellm/preprocess/dataset.py index 1112470c..6ab5c077 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -10,6 +10,12 @@ from guidellm.dataset import load_dataset as guidellm_load_dataset from guidellm.utils import check_load_processor, IntegerRangeSampler +SUPPORTED_TYPES = { + ".json", + ".csv", + ".parquet", +} + class ShortPromptStrategy(str, Enum): IGNORE = "ignore" @@ -68,6 +74,33 @@ def handle_pad_strategy( } +def save_dataset_to_file(dataset: Dataset, output_path: Path) -> None: + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + if output_path.suffix == ".csv": + dataset.to_csv(str(output_path)) + elif output_path.suffix == ".json": + dataset.to_json(str(output_path)) + elif output_path.suffix == ".parquet": + dataset.to_parquet(str(output_path)) + else: + raise ValueError( + f"Unsupported file suffix '{output_path.suffix}' in output_path '{output_path}'. " + f"Only {SUPPORTED_TYPES} are supported." + ) + + +def _validate_output_suffix(output_path: str) -> None: + output_path = Path(output_path) + suffix = output_path.suffix.lower() + if suffix not in SUPPORTED_TYPES: + raise ValueError( + f"Unsupported file suffix '{suffix}' in output_path '{output_path}'. " + f"Only {SUPPORTED_TYPES} are supported." + ) + + def process_dataset( input_data: Union[str, Path], output_path: Union[str, Path], @@ -89,6 +122,7 @@ def process_dataset( push_to_hub: bool = False, hub_dataset_id: Optional[str] = None, ) -> None: + _validate_output_suffix(output_path) logger.info(f"Starting dataset conversion | Input: {input_data} | Output directory: {output_path}") dataset, column_mappings = guidellm_load_dataset(input_data, data_args, processor, processor_args) @@ -138,7 +172,7 @@ def process_dataset( if len(tokenizer.encode(prompt_text)) > target_prompt_len: tokens = tokenizer.encode(prompt_text) - prompt_text = tokenizer.decode(tokens[:target_prompt_len]) + prompt_text = tokenizer.decode(tokens[:target_prompt_len], skip_special_tokens=True) processed_prompt = prompt_row.copy() processed_prompt[prompt_column] = prompt_text @@ -154,9 +188,7 @@ def process_dataset( logger.info(f"Generated processed dataset with {len(processed_prompts)} prompts") processed_dataset = Dataset.from_list(processed_prompts) - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - processed_dataset.save_to_disk(output_path) + save_dataset_to_file(processed_dataset, output_path) logger.info(f"Conversion complete. 
Dataset saved to: {output_path}") if push_to_hub: diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 126e2d0e..2b6c4665 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -1,5 +1,5 @@ import os -from unittest import mock +from pathlib import Path from unittest.mock import patch, MagicMock import pytest @@ -13,7 +13,7 @@ process_dataset, push_dataset_to_hub, ShortPromptStrategy, - STRATEGY_HANDLERS, + STRATEGY_HANDLERS, save_dataset_to_file, ) @@ -21,7 +21,7 @@ def tokenizer_mock(): tokenizer = MagicMock(spec=PreTrainedTokenizerBase) tokenizer.encode.side_effect = lambda x: [1] * len(x) - tokenizer.decode.side_effect = lambda x: ''.join(str(item) for item in x) + tokenizer.decode.side_effect = lambda x, *args, **kwargs: ''.join(str(item) for item in x) return tokenizer @@ -42,7 +42,7 @@ def test_strategy_handler_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir", tokenizer_mock, short_prompt_strategy=ShortPromptStrategy.IGNORE) + process_dataset("input", "output_dir/data.json", tokenizer_mock, short_prompt_strategy=ShortPromptStrategy.IGNORE) assert mock_handler.call_count == 2 mock_load_dataset.assert_called_once() @@ -78,13 +78,21 @@ def test_handle_pad_strategy(tokenizer_mock): assert result == "shortppppp" -@patch(f"{process_dataset.__module__}.Dataset") -@patch(f"{process_dataset.__module__}.guidellm_load_dataset") -@patch(f"{process_dataset.__module__}.check_load_processor") -@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +@patch("guidellm.preprocess.dataset.save_dataset_to_file") +@patch("guidellm.preprocess.dataset.Dataset") +@patch("guidellm.preprocess.dataset.guidellm_load_dataset") +@patch("guidellm.preprocess.dataset.check_load_processor") +@patch("guidellm.preprocess.dataset.IntegerRangeSampler") def test_process_dataset_non_empty( - mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, tokenizer_mock + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_save_to_file, + tokenizer_mock, ): + from guidellm.preprocess.dataset import process_dataset + mock_dataset = [{"prompt": "Hello"}, {"prompt": "How are you?"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) mock_check_processor.return_value = tokenizer_mock @@ -93,14 +101,15 @@ def test_process_dataset_non_empty( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir", tokenizer_mock) + output_path = "output_dir/data.json" + process_dataset("input", output_path, tokenizer_mock) mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() mock_dataset_class.from_list.assert_called_once() - mock_dataset_obj.save_to_disk.assert_called_once_with(mock.ANY) + mock_save_to_file.assert_called_once_with(mock_dataset_obj, output_path) - args, kwargs = mock_dataset_class.from_list.call_args + args, _ = mock_dataset_class.from_list.call_args processed_list = args[0] assert len(processed_list) == 2 for item in processed_list: @@ -122,7 +131,7 @@ def test_process_dataset_empty_after_processing( mock_check_processor.return_value = tokenizer_mock mock_sampler.side_effect = lambda **kwargs: [10] - process_dataset("input", "output_dir", tokenizer_mock) + process_dataset("input", "output_dir/data.json", tokenizer_mock) 
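With the file-format support added in this patch, the output path is validated up front and must carry a .json, .csv, or .parquet suffix; save_dataset_to_file dispatches on that suffix. A minimal sketch with illustrative file names (the CLI equivalent would pass a file path such as processed/data.json as OUTPUT_PATH):

    from datasets import Dataset
    from guidellm.preprocess.dataset import save_dataset_to_file

    # Build a tiny in-memory dataset in the shape process_dataset produces.
    ds = Dataset.from_list(
        [{"prompt": "hello", "prompt_tokens_count": 1, "output_tokens_count": 3}]
    )
    save_dataset_to_file(ds, "processed/data.parquet")  # routed to Dataset.to_parquet
    # An unsupported suffix such as ".txt" raises ValueError listing the supported types.
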
mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() @@ -145,7 +154,7 @@ def test_process_dataset_push_to_hub_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir", tokenizer_mock, push_to_hub=True, hub_dataset_id="id123") + process_dataset("input", "output_dir/data.json", tokenizer_mock, push_to_hub=True, hub_dataset_id="id123") mock_push.assert_called_once_with("id123", mock_dataset_obj) @@ -165,7 +174,7 @@ def test_process_dataset_push_to_hub_not_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir", tokenizer_mock, push_to_hub=False) + process_dataset("input", "output_dir/data.json", tokenizer_mock, push_to_hub=False) mock_push.assert_not_called() @@ -189,3 +198,39 @@ def test_push_dataset_to_hub_error_no_id(): mock_dataset = MagicMock(spec=Dataset) with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub(None, mock_dataset) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_csv(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.csv") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_csv.assert_called_once_with(str(output_path)) + mock_mkdir.assert_called_once() + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_json(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.json") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(str(output_path)) + mock_mkdir.assert_called_once() + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_parquet(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.parquet") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_parquet.assert_called_once_with(str(output_path)) + mock_mkdir.assert_called_once() + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_unsupported_type(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.txt") + with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): + save_dataset_to_file(mock_dataset, output_path) + mock_mkdir.assert_called_once() From 64a91de7aceeeaaeff38d558eda04435cf988c1d Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 18 May 2025 14:56:53 +0300 Subject: [PATCH 06/18] Fixed code styling --- src/guidellm/__main__.py | 86 +++++++++---------- src/guidellm/preprocess/__init__.py | 7 +- src/guidellm/preprocess/dataset.py | 117 +++++++++++++++----------- tests/unit/preprocess/test_dataset.py | 104 ++++++++++++++++------- 4 files changed, 186 insertions(+), 128 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 68a63f37..6df1089d 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -8,7 +8,7 @@ from guidellm.backend import BackendType from guidellm.benchmark import ProfileType, benchmark_generative_text from guidellm.config import print_config -from guidellm.preprocess.dataset import process_dataset, ShortPromptStrategy +from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset from guidellm.scheduler import StrategyType STRATEGY_PROFILE_CHOICES = set( @@ -298,9 +298,9 @@ def preprocess(): @preprocess.command( help="Convert a dataset to have specific prompt and output token sizes.\n\n" - 
"INPUT_DATA: Path to the input dataset or dataset ID.\n" - "OUTPUT_PATH: Directory to save the converted dataset. " - "The dataset will be saved as an Arrow dataset (.arrow) inside the directory." + "INPUT_DATA: Path to the input dataset or dataset ID.\n" + "OUTPUT_PATH: Directory to save the converted dataset. " + "The dataset will be saved as an Arrow dataset (.arrow) inside the directory." ) @click.argument( "input_data", @@ -319,8 +319,8 @@ def preprocess(): type=str, required=True, help=( - "The processor or tokenizer to use to calculate token counts for statistics " - "and synthetic data generation." + "The processor or tokenizer to use to calculate token counts for statistics " + "and synthetic data generation." ), ) @click.option( @@ -328,16 +328,16 @@ def preprocess(): default=None, callback=parse_json, help=( - "A JSON string containing any arguments to pass to the processor constructor " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the processor constructor " + "as a dict with **kwargs." ), ) @click.option( "--data-args", callback=parse_json, help=( - "A JSON string containing any arguments to pass to the dataset creation " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the dataset creation " + "as a dict with **kwargs." ), ) @click.option( @@ -345,110 +345,110 @@ def preprocess(): type=click.Choice([s.value for s in ShortPromptStrategy]), default=ShortPromptStrategy.IGNORE.value, show_default=True, - help="Strategy to handle prompts shorter than the target length. " + help="Strategy to handle prompts shorter than the target length. ", ) @click.option( "--pad-token", type=str, default=None, - help="The token to pad short prompts with when using the 'pad' strategy." + help="The token to pad short prompts with when using the 'pad' strategy.", ) @click.option( "--prompt-tokens-average", type=int, default=10, show_default=True, - help="Average target number of tokens for prompts." + help="Average target number of tokens for prompts.", ) @click.option( "--prompt-tokens-stdev", type=int, default=None, - help="Standard deviation for prompt tokens sampling." + help="Standard deviation for prompt tokens sampling.", ) @click.option( "--prompt-tokens-min", type=int, default=None, - help="Minimum number of prompt tokens." + help="Minimum number of prompt tokens.", ) @click.option( "--prompt-tokens-max", type=int, default=None, - help="Maximum number of prompt tokens." + help="Maximum number of prompt tokens.", ) @click.option( "--prompt-random-seed", type=int, default=42, show_default=True, - help="Random seed for prompt token sampling." + help="Random seed for prompt token sampling.", ) @click.option( "--output-tokens-average", type=int, default=10, show_default=True, - help="Average target number of tokens for outputs." + help="Average target number of tokens for outputs.", ) @click.option( "--output-tokens-stdev", type=int, default=None, - help="Standard deviation for output tokens sampling." + help="Standard deviation for output tokens sampling.", ) @click.option( "--output-tokens-min", type=int, default=None, - help="Minimum number of output tokens." + help="Minimum number of output tokens.", ) @click.option( "--output-tokens-max", type=int, default=None, - help="Maximum number of output tokens." + help="Maximum number of output tokens.", ) @click.option( "--output-random-seed", type=int, default=123, show_default=True, - help="Random seed for output token sampling." 
+ help="Random seed for output token sampling.", ) @click.option( "--push-to-hub", is_flag=True, - help="Set this flag to push the converted dataset to the Hugging Face Hub." + help="Set this flag to push the converted dataset to the Hugging Face Hub.", ) @click.option( "--hub-dataset-id", type=str, default=None, help="The Hugging Face Hub dataset ID to push to. " - "Required if --push-to-hub is used." + "Required if --push-to-hub is used.", ) def dataset( - input_data, - output_path, - processor, - processor_args, - data_args, - short_prompt_strategy, - pad_token, - prompt_tokens_average, - prompt_tokens_stdev, - prompt_tokens_min, - prompt_tokens_max, - prompt_random_seed, - output_tokens_average, - output_tokens_stdev, - output_tokens_min, - output_tokens_max, - output_random_seed, - push_to_hub, - hub_dataset_id, + input_data, + output_path, + processor, + processor_args, + data_args, + short_prompt_strategy, + pad_token, + prompt_tokens_average, + prompt_tokens_stdev, + prompt_tokens_min, + prompt_tokens_max, + prompt_random_seed, + output_tokens_average, + output_tokens_stdev, + output_tokens_min, + output_tokens_max, + output_random_seed, + push_to_hub, + hub_dataset_id, ): process_dataset( input_data=input_data, diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py index 669bf409..95d01e5f 100644 --- a/src/guidellm/preprocess/__init__.py +++ b/src/guidellm/preprocess/__init__.py @@ -1,6 +1,3 @@ -from .dataset import process_dataset, ShortPromptStrategy +from .dataset import ShortPromptStrategy, process_dataset -__all__ = [ - "process_dataset", - "ShortPromptStrategy" -] +__all__ = ["ShortPromptStrategy", "process_dataset"] diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index 6ab5c077..f330b537 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -1,14 +1,15 @@ import os +from collections.abc import Iterator from enum import Enum from pathlib import Path -from typing import Any, Optional, Union, Dict, Callable, Iterator +from typing import Any, Callable, Optional, Union from datasets import Dataset from loguru import logger from transformers import PreTrainedTokenizerBase from guidellm.dataset import load_dataset as guidellm_load_dataset -from guidellm.utils import check_load_processor, IntegerRangeSampler +from guidellm.utils import IntegerRangeSampler, check_load_processor SUPPORTED_TYPES = { ".json", @@ -24,10 +25,10 @@ class ShortPromptStrategy(str, Enum): def handle_ignore_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - **kwargs + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: if len(tokenizer.encode(current_prompt)) < min_prompt_tokens: logger.warning("Prompt too short, ignoring") @@ -36,19 +37,21 @@ def handle_ignore_strategy( def handle_concatenate_strategy( - current_prompt: str, - min_prompt_tokens: int, - dataset_iterator: Iterator[Dict[str, Any]], - prompt_column: str, - tokenizer: PreTrainedTokenizerBase, - **kwargs + current_prompt: str, + min_prompt_tokens: int, + dataset_iterator: Iterator[dict[str, Any]], + prompt_column: str, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: tokens_len = len(tokenizer.encode(current_prompt)) while tokens_len < min_prompt_tokens: try: next_row = next(dataset_iterator) except StopIteration: - logger.warning("Could not concatenate enough prompts to reach minimum length, ignoring") + 
logger.warning( + "Could not concatenate enough prompts to reach minimum length, ignoring" + ) return None current_prompt += next_row[prompt_column] tokens_len = len(tokenizer.encode(current_prompt)) @@ -56,25 +59,25 @@ def handle_concatenate_strategy( def handle_pad_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - pad_token: str, - **kwargs + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + pad_token: str, + **_kwargs, ) -> str: while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: current_prompt += pad_token return current_prompt -STRATEGY_HANDLERS: Dict[ShortPromptStrategy, Callable] = { +STRATEGY_HANDLERS: dict[ShortPromptStrategy, Callable] = { ShortPromptStrategy.IGNORE: handle_ignore_strategy, ShortPromptStrategy.CONCATENATE: handle_concatenate_strategy, ShortPromptStrategy.PAD: handle_pad_strategy, } -def save_dataset_to_file(dataset: Dataset, output_path: Path) -> None: +def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) @@ -86,12 +89,12 @@ def save_dataset_to_file(dataset: Dataset, output_path: Path) -> None: dataset.to_parquet(str(output_path)) else: raise ValueError( - f"Unsupported file suffix '{output_path.suffix}' in output_path '{output_path}'. " - f"Only {SUPPORTED_TYPES} are supported." + f"Unsupported file suffix '{output_path.suffix}' in output_path " + f"'{output_path}'. Only {SUPPORTED_TYPES} are supported." ) -def _validate_output_suffix(output_path: str) -> None: +def _validate_output_suffix(output_path: Union[str, Path]) -> None: output_path = Path(output_path) suffix = output_path.suffix.lower() if suffix not in SUPPORTED_TYPES: @@ -102,33 +105,44 @@ def _validate_output_suffix(output_path: str) -> None: def process_dataset( - input_data: Union[str, Path], - output_path: Union[str, Path], - processor: Union[str, Path, PreTrainedTokenizerBase], - processor_args: Optional[dict[str, Any]] = None, - data_args: Optional[dict[str, Any]] = None, - short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, - pad_token: Optional[str] = None, - prompt_tokens_average: int = 10, - prompt_tokens_stdev: Optional[int] = None, - prompt_tokens_min: Optional[int] = None, - prompt_tokens_max: Optional[int] = None, - prompt_random_seed: int = 42, - output_tokens_average: int = 10, - output_tokens_stdev: Optional[int] = None, - output_tokens_min: Optional[int] = None, - output_tokens_max: Optional[int] = None, - output_random_seed: int = 123, - push_to_hub: bool = False, - hub_dataset_id: Optional[str] = None, + input_data: Union[str, Path], + output_path: Union[str, Path], + processor: Union[str, Path, PreTrainedTokenizerBase], + processor_args: Optional[dict[str, Any]] = None, + data_args: Optional[dict[str, Any]] = None, + short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, + pad_token: Optional[str] = None, + prompt_tokens_average: int = 10, + prompt_tokens_stdev: Optional[int] = None, + prompt_tokens_min: Optional[int] = None, + prompt_tokens_max: Optional[int] = None, + prompt_random_seed: int = 42, + output_tokens_average: int = 10, + output_tokens_stdev: Optional[int] = None, + output_tokens_min: Optional[int] = None, + output_tokens_max: Optional[int] = None, + output_random_seed: int = 123, + push_to_hub: bool = False, + hub_dataset_id: Optional[str] = None, ) -> None: _validate_output_suffix(output_path) - 
logger.info(f"Starting dataset conversion | Input: {input_data} | Output directory: {output_path}") + logger.info( + f"Starting dataset conversion | Input: {input_data} | " + f"Output directory: {output_path}" + ) - dataset, column_mappings = guidellm_load_dataset(input_data, data_args, processor, processor_args) - tokenizer = check_load_processor(processor, processor_args, "Processor/tokenizer required for dataset conversion.") + dataset, column_mappings = guidellm_load_dataset( + input_data, data_args, processor, processor_args + ) + tokenizer = check_load_processor( + processor, + processor_args, + "Processor/tokenizer required for dataset conversion.", + ) prompt_column = column_mappings.get("prompt_column") - output_column = column_mappings.get("output_tokens_count_column", "output_tokens_count") + output_column = column_mappings.get( + "output_tokens_count_column", "output_tokens_count" + ) prompt_token_sampler = iter( IntegerRangeSampler( @@ -172,7 +186,9 @@ def process_dataset( if len(tokenizer.encode(prompt_text)) > target_prompt_len: tokens = tokenizer.encode(prompt_text) - prompt_text = tokenizer.decode(tokens[:target_prompt_len], skip_special_tokens=True) + prompt_text = tokenizer.decode( + tokens[:target_prompt_len], skip_special_tokens=True + ) processed_prompt = prompt_row.copy() processed_prompt[prompt_column] = prompt_text @@ -196,8 +212,11 @@ def process_dataset( logger.info(f"Pushed dataset to: {hub_dataset_id}") -def push_dataset_to_hub(hub_dataset_id: str, processed_dataset: Dataset) -> None: +def push_dataset_to_hub(hub_dataset_id: Optional[str], processed_dataset: Dataset) -> None: hf_token = os.environ.get("HF_TOKEN") if not hub_dataset_id or not hf_token: - raise ValueError("hub_dataset_id and HF_TOKEN env var must be provided when push_to_hub is True") + raise ValueError( + "hub_dataset_id and HF_TOKEN env var must be provided when push_to_hub" + " is True" + ) processed_dataset.push_to_hub(hub_dataset_id, token=hf_token) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 2b6c4665..756817a5 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -1,19 +1,21 @@ import os from pathlib import Path -from unittest.mock import patch, MagicMock +from typing import Iterator +from unittest.mock import MagicMock, patch import pytest from datasets import Dataset from transformers import PreTrainedTokenizerBase from guidellm.preprocess.dataset import ( - handle_ignore_strategy, + STRATEGY_HANDLERS, + ShortPromptStrategy, handle_concatenate_strategy, + handle_ignore_strategy, handle_pad_strategy, process_dataset, push_dataset_to_hub, - ShortPromptStrategy, - STRATEGY_HANDLERS, save_dataset_to_file, + save_dataset_to_file, ) @@ -21,32 +23,48 @@ def tokenizer_mock(): tokenizer = MagicMock(spec=PreTrainedTokenizerBase) tokenizer.encode.side_effect = lambda x: [1] * len(x) - tokenizer.decode.side_effect = lambda x, *args, **kwargs: ''.join(str(item) for item in x) + tokenizer.decode.side_effect = lambda x, *args, **kwargs: "".join( + str(item) for item in x + ) return tokenizer -@patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: MagicMock(return_value="processed_prompt")}) +from unittest.mock import MagicMock, patch +from guidellm.preprocess.dataset import process_dataset, STRATEGY_HANDLERS, ShortPromptStrategy +from datasets import Dataset + + @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") 
@patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_strategy_handler_called( - mock_sampler, mock_dataset_class, mock_check_processor, mock_load_dataset, tokenizer_mock + mock_sampler, + mock_dataset_class, + mock_check_processor, + mock_load_dataset, + tokenizer_mock, ): - mock_handler = STRATEGY_HANDLERS[ShortPromptStrategy.IGNORE] - mock_dataset = [{"prompt": "abc"}, {"prompt": "def"}] - mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) - mock_check_processor.return_value = tokenizer_mock - mock_sampler.side_effect = lambda **kwargs: [10, 10] + mock_handler = MagicMock(return_value="processed_prompt") + with patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: mock_handler}): + mock_dataset = [{"prompt": "abc"}, {"prompt": "def"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [10, 10] - mock_dataset_obj = MagicMock(spec=Dataset) - mock_dataset_class.from_list.return_value = mock_dataset_obj + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir/data.json", tokenizer_mock, short_prompt_strategy=ShortPromptStrategy.IGNORE) + process_dataset( + "input", + "output_dir/data.json", + tokenizer_mock, + short_prompt_strategy=ShortPromptStrategy.IGNORE, + ) - assert mock_handler.call_count == 2 - mock_load_dataset.assert_called_once() - mock_check_processor.assert_called_once() + assert mock_handler.call_count == 2 + mock_load_dataset.assert_called_once() + mock_check_processor.assert_called_once() def test_handle_ignore_strategy_too_short(tokenizer_mock): @@ -63,13 +81,17 @@ def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): dataset_iter = iter([{"prompt": "longer"}]) - result = handle_concatenate_strategy("short", 10, dataset_iter, "prompt", tokenizer_mock) + result = handle_concatenate_strategy( + "short", 10, dataset_iter, "prompt", tokenizer_mock + ) assert result == "shortlonger" def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): - dataset_iter = iter([]) - result = handle_concatenate_strategy("short", 10, dataset_iter, "prompt", tokenizer_mock) + dataset_iter: Iterator = iter([]) + result = handle_concatenate_strategy( + "short", 10, dataset_iter, "prompt", tokenizer_mock + ) assert result is None @@ -84,12 +106,12 @@ def test_handle_pad_strategy(tokenizer_mock): @patch("guidellm.preprocess.dataset.check_load_processor") @patch("guidellm.preprocess.dataset.IntegerRangeSampler") def test_process_dataset_non_empty( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_save_to_file, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_save_to_file, + tokenizer_mock, ): from guidellm.preprocess.dataset import process_dataset @@ -124,7 +146,11 @@ def test_process_dataset_non_empty( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_empty_after_processing( - mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, tokenizer_mock + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + tokenizer_mock, ): mock_dataset = [{"prompt": ""}] 
mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -144,7 +170,12 @@ def test_process_dataset_empty_after_processing( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_called( - mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, mock_push, tokenizer_mock + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -154,7 +185,13 @@ def test_process_dataset_push_to_hub_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir/data.json", tokenizer_mock, push_to_hub=True, hub_dataset_id="id123") + process_dataset( + "input", + "output_dir/data.json", + tokenizer_mock, + push_to_hub=True, + hub_dataset_id="id123", + ) mock_push.assert_called_once_with("id123", mock_dataset_obj) @@ -164,7 +201,12 @@ def test_process_dataset_push_to_hub_called( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_not_called( - mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, mock_push, tokenizer_mock + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) From b5a4877b577abb75aee3c4ba7f0b9258f2a8eb80 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 18 May 2025 16:14:32 +0300 Subject: [PATCH 07/18] Improved error message --- src/guidellm/preprocess/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index f330b537..9daa5256 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -137,7 +137,7 @@ def process_dataset( tokenizer = check_load_processor( processor, processor_args, - "Processor/tokenizer required for dataset conversion.", + "dataset conversion.", ) prompt_column = column_mappings.get("prompt_column") output_column = column_mappings.get( @@ -185,7 +185,7 @@ def process_dataset( continue if len(tokenizer.encode(prompt_text)) > target_prompt_len: - tokens = tokenizer.encode(prompt_text) + tokens = tokenizer.encode(prompt_text, add_special_tokens=True) prompt_text = tokenizer.decode( tokens[:target_prompt_len], skip_special_tokens=True ) From 4d624a28c8c5f8c65d89ddd3163d81bc0e62811e Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Tue, 20 May 2025 09:43:53 +0300 Subject: [PATCH 08/18] Fixed code styling --- src/guidellm/preprocess/dataset.py | 4 +- tests/unit/preprocess/test_dataset.py | 66 +++++++++++++-------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index 9daa5256..baae943c 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -212,7 +212,9 @@ def process_dataset( logger.info(f"Pushed dataset to: {hub_dataset_id}") -def push_dataset_to_hub(hub_dataset_id: Optional[str], processed_dataset: Dataset) -> None: +def push_dataset_to_hub( + hub_dataset_id: Optional[str], processed_dataset: Dataset, +) -> None: hf_token 
= os.environ.get("HF_TOKEN") if not hub_dataset_id or not hf_token: raise ValueError( diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 756817a5..dd4ab645 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -1,8 +1,11 @@ import os from pathlib import Path -from typing import Iterator +from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch +if TYPE_CHECKING: + from collections.abc import Iterator + import pytest from datasets import Dataset from transformers import PreTrainedTokenizerBase @@ -29,21 +32,16 @@ def tokenizer_mock(): return tokenizer -from unittest.mock import MagicMock, patch -from guidellm.preprocess.dataset import process_dataset, STRATEGY_HANDLERS, ShortPromptStrategy -from datasets import Dataset - - @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_strategy_handler_called( - mock_sampler, - mock_dataset_class, - mock_check_processor, - mock_load_dataset, - tokenizer_mock, + mock_sampler, + mock_dataset_class, + mock_check_processor, + mock_load_dataset, + tokenizer_mock, ): mock_handler = MagicMock(return_value="processed_prompt") with patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: mock_handler}): @@ -106,12 +104,12 @@ def test_handle_pad_strategy(tokenizer_mock): @patch("guidellm.preprocess.dataset.check_load_processor") @patch("guidellm.preprocess.dataset.IntegerRangeSampler") def test_process_dataset_non_empty( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_save_to_file, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_save_to_file, + tokenizer_mock, ): from guidellm.preprocess.dataset import process_dataset @@ -146,11 +144,11 @@ def test_process_dataset_non_empty( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_empty_after_processing( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + tokenizer_mock, ): mock_dataset = [{"prompt": ""}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -170,12 +168,12 @@ def test_process_dataset_empty_after_processing( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_called( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_push, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -201,12 +199,12 @@ def test_process_dataset_push_to_hub_called( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_not_called( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_push, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + 
tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) From e2ca9193fd9bc38c10eab4bc67a295efe5d4def5 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 15:32:40 +0300 Subject: [PATCH 09/18] Fixed CR comments --- pyproject.toml | 3 +- src/guidellm/__main__.py | 79 ++++++++------ src/guidellm/preprocess/dataset.py | 150 +++++++++++++++----------- tests/unit/preprocess/test_dataset.py | 69 ++++++++++-- 4 files changed, 193 insertions(+), 108 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a78b1fc5..9f86d993 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,8 +21,7 @@ name = "guidellm" description = "Guidance platform for deploying and managing large language models." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.9.0,<4.0" -license = "Apache-2.0" -license-files = ["LICENSE"] +license = { text = "Apache-2.0" } authors = [ { name = "Red Hat" } ] keywords = [ "ai", diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 6df1089d..51ac7049 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -1,4 +1,5 @@ import asyncio +import codecs import json from pathlib import Path from typing import get_args @@ -280,6 +281,18 @@ def benchmark( ) ) +def decode_escaped_str(_ctx, _param, value): + """ + Click auto adds characters. For example, when using --pad-char "\n", + it parses it as "\\n". This method decodes the string to handle escape + sequences correctly. + """ + if value is None: + return None + try: + return codecs.decode(value, "unicode_escape") + except Exception as e: + raise click.BadParameter(f"Could not decode escape sequences: {e}") from e @cli.command( help=( @@ -291,27 +304,26 @@ def config(): print_config() -@cli.group(help="Preprocessing utilities for datasets.") +@cli.group(help="General preprocessing tools and utilities.") def preprocess(): pass @preprocess.command( - help="Convert a dataset to have specific prompt and output token sizes.\n\n" - "INPUT_DATA: Path to the input dataset or dataset ID.\n" - "OUTPUT_PATH: Directory to save the converted dataset. " - "The dataset will be saved as an Arrow dataset (.arrow) inside the directory." + help=( + "Convert a dataset to have specific prompt and output token sizes.\n\n" + "INPUT_DATA: Path to the input dataset or dataset ID.\n" + "OUTPUT_PATH: Path to save the converted dataset, including file suffix. " + ) ) @click.argument( - "input_data", + "data", type=str, - metavar="INPUT_DATA", required=True, ) @click.argument( "output_path", type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True), - metavar="OUTPUT_PATH", required=True, ) @click.option( @@ -348,11 +360,21 @@ def preprocess(): help="Strategy to handle prompts shorter than the target length. ", ) @click.option( - "--pad-token", + "--pad-char", type=str, - default=None, + default="", + callback=decode_escaped_str, help="The token to pad short prompts with when using the 'pad' strategy.", ) +@click.option( + "--concat-delimiter", + type=str, + default="", + help=( + "The delimiter to use when concatenating prompts that are too short." + " Used when strategy is 'concatenate'." 
+ ) +) @click.option( "--prompt-tokens-average", type=int, @@ -378,13 +400,6 @@ def preprocess(): default=None, help="Maximum number of prompt tokens.", ) -@click.option( - "--prompt-random-seed", - type=int, - default=42, - show_default=True, - help="Random seed for prompt token sampling.", -) @click.option( "--output-tokens-average", type=int, @@ -410,13 +425,6 @@ def preprocess(): default=None, help="Maximum number of output tokens.", ) -@click.option( - "--output-random-seed", - type=int, - default=123, - show_default=True, - help="Random seed for output token sampling.", -) @click.option( "--push-to-hub", is_flag=True, @@ -429,47 +437,54 @@ def preprocess(): help="The Hugging Face Hub dataset ID to push to. " "Required if --push-to-hub is used.", ) +@click.option( + "--random-seed", + type=int, + default=42, + show_default=True, + help="Random seed for prompt token sampling and output tokens sampling.", +) def dataset( - input_data, + data, output_path, processor, processor_args, data_args, short_prompt_strategy, - pad_token, + pad_char, + concat_delimiter, prompt_tokens_average, prompt_tokens_stdev, prompt_tokens_min, prompt_tokens_max, - prompt_random_seed, output_tokens_average, output_tokens_stdev, output_tokens_min, output_tokens_max, - output_random_seed, push_to_hub, hub_dataset_id, + random_seed, ): process_dataset( - input_data=input_data, + data=data, output_path=output_path, processor=processor, processor_args=processor_args, data_args=data_args, short_prompt_strategy=short_prompt_strategy, - pad_token=pad_token, + pad_char=pad_char, + concat_delimiter=concat_delimiter, prompt_tokens_average=prompt_tokens_average, prompt_tokens_stdev=prompt_tokens_stdev, prompt_tokens_min=prompt_tokens_min, prompt_tokens_max=prompt_tokens_max, - prompt_random_seed=prompt_random_seed, output_tokens_average=output_tokens_average, output_tokens_stdev=output_tokens_stdev, output_tokens_min=output_tokens_min, output_tokens_max=output_tokens_max, - output_random_seed=output_random_seed, push_to_hub=push_to_hub, hub_dataset_id=hub_dataset_id, + random_seed=random_seed, ) diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index baae943c..b3f3db7a 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -13,22 +13,28 @@ SUPPORTED_TYPES = { ".json", + ".jsonl", ".csv", ".parquet", } +class PromptTooShortError(Exception): + pass + + class ShortPromptStrategy(str, Enum): IGNORE = "ignore" CONCATENATE = "concatenate" PAD = "pad" + ERROR = "error" def handle_ignore_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: if len(tokenizer.encode(current_prompt)) < min_prompt_tokens: logger.warning("Prompt too short, ignoring") @@ -37,12 +43,13 @@ def handle_ignore_strategy( def handle_concatenate_strategy( - current_prompt: str, - min_prompt_tokens: int, - dataset_iterator: Iterator[dict[str, Any]], - prompt_column: str, - tokenizer: PreTrainedTokenizerBase, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + dataset_iterator: Iterator[dict[str, Any]], + prompt_column: str, + tokenizer: PreTrainedTokenizerBase, + concat_delimiter: str, + **_kwargs, ) -> Optional[str]: tokens_len = len(tokenizer.encode(current_prompt)) while tokens_len < min_prompt_tokens: @@ -53,20 +60,35 @@ def handle_concatenate_strategy( "Could not concatenate enough prompts to reach 
minimum length, ignoring" ) return None - current_prompt += next_row[prompt_column] + current_prompt += concat_delimiter + next_row[prompt_column] tokens_len = len(tokenizer.encode(current_prompt)) return current_prompt def handle_pad_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - pad_token: str, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + pad_char: str, + **_kwargs, ) -> str: while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: - current_prompt += pad_token + current_prompt += pad_char + return current_prompt + + +def handle_error_strategy( + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, +) -> Optional[str]: + prompt_len = len(tokenizer.encode(current_prompt)) + if prompt_len < min_prompt_tokens: + raise PromptTooShortError( + f"Found too short prompt: {current_prompt}, with length: {prompt_len}. " + f"Minimum length required: {min_prompt_tokens}.", + ) return current_prompt @@ -74,23 +96,25 @@ def handle_pad_strategy( ShortPromptStrategy.IGNORE: handle_ignore_strategy, ShortPromptStrategy.CONCATENATE: handle_concatenate_strategy, ShortPromptStrategy.PAD: handle_pad_strategy, + ShortPromptStrategy.ERROR: handle_error_strategy, } def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) + suffix = output_path.suffix.lower() - if output_path.suffix == ".csv": - dataset.to_csv(str(output_path)) - elif output_path.suffix == ".json": - dataset.to_json(str(output_path)) - elif output_path.suffix == ".parquet": - dataset.to_parquet(str(output_path)) + if suffix == ".csv": + dataset.to_csv(output_path) + elif suffix in {".json", ".jsonl"}: + dataset.to_json(output_path) + elif suffix == ".parquet": + dataset.to_parquet(output_path) else: raise ValueError( - f"Unsupported file suffix '{output_path.suffix}' in output_path " - f"'{output_path}'. Only {SUPPORTED_TYPES} are supported." + f"Unsupported file suffix '{suffix}' in output_path'{output_path}'." + f" Only {SUPPORTED_TYPES} are supported." 
) @@ -105,34 +129,34 @@ def _validate_output_suffix(output_path: Union[str, Path]) -> None: def process_dataset( - input_data: Union[str, Path], - output_path: Union[str, Path], - processor: Union[str, Path, PreTrainedTokenizerBase], - processor_args: Optional[dict[str, Any]] = None, - data_args: Optional[dict[str, Any]] = None, - short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, - pad_token: Optional[str] = None, - prompt_tokens_average: int = 10, - prompt_tokens_stdev: Optional[int] = None, - prompt_tokens_min: Optional[int] = None, - prompt_tokens_max: Optional[int] = None, - prompt_random_seed: int = 42, - output_tokens_average: int = 10, - output_tokens_stdev: Optional[int] = None, - output_tokens_min: Optional[int] = None, - output_tokens_max: Optional[int] = None, - output_random_seed: int = 123, - push_to_hub: bool = False, - hub_dataset_id: Optional[str] = None, + data: Union[str, Path], + output_path: Union[str, Path], + processor: Union[str, Path, PreTrainedTokenizerBase], + processor_args: Optional[dict[str, Any]] = None, + data_args: Optional[dict[str, Any]] = None, + short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, + pad_char: Optional[str] = None, + concat_delimiter: Optional[str] = None, + prompt_tokens_average: int = 10, + prompt_tokens_stdev: Optional[int] = None, + prompt_tokens_min: Optional[int] = None, + prompt_tokens_max: Optional[int] = None, + output_tokens_average: int = 10, + output_tokens_stdev: Optional[int] = None, + output_tokens_min: Optional[int] = None, + output_tokens_max: Optional[int] = None, + push_to_hub: bool = False, + hub_dataset_id: Optional[str] = None, + random_seed: int = 42, ) -> None: _validate_output_suffix(output_path) logger.info( - f"Starting dataset conversion | Input: {input_data} | " + f"Starting dataset conversion | Input: {data} | " f"Output directory: {output_path}" ) dataset, column_mappings = guidellm_load_dataset( - input_data, data_args, processor, processor_args + data, data_args, processor, processor_args ) tokenizer = check_load_processor( processor, @@ -150,7 +174,7 @@ def process_dataset( variance=prompt_tokens_stdev, min_value=prompt_tokens_min, max_value=prompt_tokens_max, - random_seed=prompt_random_seed, + random_seed=random_seed, ) ) @@ -160,35 +184,33 @@ def process_dataset( variance=output_tokens_stdev, min_value=output_tokens_min, max_value=output_tokens_max, - random_seed=output_random_seed, + random_seed=random_seed, ) ) dataset_iterator = iter(dataset) processed_prompts = [] - handler = STRATEGY_HANDLERS[short_prompt_strategy] + prompt_handler = STRATEGY_HANDLERS[short_prompt_strategy] for prompt_row in dataset_iterator: prompt_text = prompt_row[prompt_column] target_prompt_len = next(prompt_token_sampler) - if len(tokenizer.encode(prompt_text)) < target_prompt_len: - prompt_text = handler( - current_prompt=prompt_text, - min_prompt_tokens=target_prompt_len, - dataset_iterator=dataset_iterator, - prompt_column=prompt_column, - tokenizer=tokenizer, - pad_token=pad_token, - ) - if prompt_text is None: - continue + prompt_text = prompt_handler( + current_prompt=prompt_text, + min_prompt_tokens=target_prompt_len, + dataset_iterator=dataset_iterator, + prompt_column=prompt_column, + tokenizer=tokenizer, + pad_char=pad_char, + concat_delimiter=concat_delimiter, + ) + if prompt_text is None: + continue if len(tokenizer.encode(prompt_text)) > target_prompt_len: - tokens = tokenizer.encode(prompt_text, add_special_tokens=True) - prompt_text = tokenizer.decode( - 
tokens[:target_prompt_len], skip_special_tokens=True - ) + tokens = tokenizer.encode(prompt_text) + prompt_text = tokenizer.decode(tokens[:target_prompt_len]) processed_prompt = prompt_row.copy() processed_prompt[prompt_column] = prompt_text diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index dd4ab645..401a9e23 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -12,8 +12,10 @@ from guidellm.preprocess.dataset import ( STRATEGY_HANDLERS, + PromptTooShortError, ShortPromptStrategy, handle_concatenate_strategy, + handle_error_strategy, handle_ignore_strategy, handle_pad_strategy, process_dataset, @@ -80,15 +82,15 @@ def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): dataset_iter = iter([{"prompt": "longer"}]) result = handle_concatenate_strategy( - "short", 10, dataset_iter, "prompt", tokenizer_mock + "short", 10, dataset_iter, "prompt", tokenizer_mock, "\n" ) - assert result == "shortlonger" + assert result == "short\nlonger" def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): dataset_iter: Iterator = iter([]) result = handle_concatenate_strategy( - "short", 10, dataset_iter, "prompt", tokenizer_mock + "short", 10, dataset_iter, "prompt", tokenizer_mock, "" ) assert result is None @@ -98,6 +100,17 @@ def test_handle_pad_strategy(tokenizer_mock): assert result == "shortppppp" +def test_handle_error_strategy_valid_prompt(tokenizer_mock): + result = handle_error_strategy("valid prompt", 5, tokenizer_mock) + assert result == "valid prompt" + tokenizer_mock.encode.assert_called_with("valid prompt") + + +def test_handle_error_strategy_too_short_prompt(tokenizer_mock): + with pytest.raises(PromptTooShortError): + handle_error_strategy("short", 10, tokenizer_mock) + + @patch("guidellm.preprocess.dataset.save_dataset_to_file") @patch("guidellm.preprocess.dataset.Dataset") @patch("guidellm.preprocess.dataset.guidellm_load_dataset") @@ -245,8 +258,17 @@ def test_save_dataset_to_file_csv(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) output_path = Path("some/path/output.csv") save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_csv.assert_called_once_with(str(output_path)) - mock_mkdir.assert_called_once() + mock_dataset.to_csv.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_csv_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.CSV") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_csv.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) @patch.object(Path, "mkdir") @@ -254,8 +276,35 @@ def test_save_dataset_to_file_json(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) output_path = Path("some/path/output.json") save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(str(output_path)) - mock_mkdir.assert_called_once() + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_json_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.JSON") + save_dataset_to_file(mock_dataset, output_path) + 
mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_jsonl(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.jsonl") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.JSONL") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) @patch.object(Path, "mkdir") @@ -263,8 +312,8 @@ def test_save_dataset_to_file_parquet(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) output_path = Path("some/path/output.parquet") save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_parquet.assert_called_once_with(str(output_path)) - mock_mkdir.assert_called_once() + mock_dataset.to_parquet.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) @patch.object(Path, "mkdir") @@ -273,4 +322,4 @@ def test_save_dataset_to_file_unsupported_type(mock_mkdir): output_path = Path("some/path/output.txt") with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): save_dataset_to_file(mock_dataset, output_path) - mock_mkdir.assert_called_once() + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) From a9f4fa6f90bd59600cfddfeab750f756fe2b7809 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 16:00:14 +0300 Subject: [PATCH 10/18] Fixed UTs --- src/guidellm/__main__.py | 70 +++---------------- src/guidellm/preprocess/dataset.py | 97 ++++++++++++++++++++++----- tests/unit/preprocess/test_dataset.py | 41 ++++++++--- 3 files changed, 123 insertions(+), 85 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index cca18048..aac0efa4 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -376,54 +376,16 @@ def preprocess(): ) ) @click.option( - "--prompt-tokens-average", - type=int, - default=10, - show_default=True, - help="Average target number of tokens for prompts.", -) -@click.option( - "--prompt-tokens-stdev", - type=int, - default=None, - help="Standard deviation for prompt tokens sampling.", -) -@click.option( - "--prompt-tokens-min", - type=int, - default=None, - help="Minimum number of prompt tokens.", -) -@click.option( - "--prompt-tokens-max", - type=int, - default=None, - help="Maximum number of prompt tokens.", -) -@click.option( - "--output-tokens-average", - type=int, - default=10, - show_default=True, - help="Average target number of tokens for outputs.", -) -@click.option( - "--output-tokens-stdev", - type=int, - default=None, - help="Standard deviation for output tokens sampling.", -) -@click.option( - "--output-tokens-min", - type=int, + "--prompt-tokens", + type=str, default=None, - help="Minimum number of output tokens.", + help="Prompt tokens config (JSON, YAML file or key=value string)", ) @click.option( - "--output-tokens-max", - type=int, + "--output-tokens", + type=str, default=None, - help="Maximum number of output tokens.", + help="Output tokens config (JSON, YAML file or key=value string)", ) @click.option( "--push-to-hub", @@ -453,14 +415,8 @@ def dataset( 
short_prompt_strategy, pad_char, concat_delimiter, - prompt_tokens_average, - prompt_tokens_stdev, - prompt_tokens_min, - prompt_tokens_max, - output_tokens_average, - output_tokens_stdev, - output_tokens_min, - output_tokens_max, + prompt_tokens, + output_tokens, push_to_hub, hub_dataset_id, random_seed, @@ -469,19 +425,13 @@ def dataset( data=data, output_path=output_path, processor=processor, + prompt_tokens=prompt_tokens, + output_tokens=output_tokens, processor_args=processor_args, data_args=data_args, short_prompt_strategy=short_prompt_strategy, pad_char=pad_char, concat_delimiter=concat_delimiter, - prompt_tokens_average=prompt_tokens_average, - prompt_tokens_stdev=prompt_tokens_stdev, - prompt_tokens_min=prompt_tokens_min, - prompt_tokens_max=prompt_tokens_max, - output_tokens_average=output_tokens_average, - output_tokens_stdev=output_tokens_stdev, - output_tokens_min=output_tokens_min, - output_tokens_max=output_tokens_max, push_to_hub=push_to_hub, hub_dataset_id=hub_dataset_id, random_seed=random_seed, diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index b3f3db7a..b2a45e16 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -1,11 +1,14 @@ +import json import os from collections.abc import Iterator from enum import Enum from pathlib import Path from typing import Any, Callable, Optional, Union +import yaml from datasets import Dataset from loguru import logger +from pydantic import BaseModel, Field from transformers import PreTrainedTokenizerBase from guidellm.dataset import load_dataset as guidellm_load_dataset @@ -100,6 +103,71 @@ def handle_error_strategy( } +class TokensConfig(BaseModel): + average: int = Field( + description="The average number of tokens.", + gt=0, + ) + stdev: Optional[int] = Field( + description="The standard deviation of the tokens.", + gt=0, + default=None, + ) + min: Optional[int] = Field( + description="The minimum number of tokens.", + gt=0, + default=None, + ) + max: Optional[int] = Field( + description="The maximum number of tokens.", + gt=0, + default=None, + ) + + @staticmethod + def parse_str(data: Union[str, Path]) -> "TokensConfig": + if ( + isinstance(data, Path) + or data.strip().endswith(".config") + or data.strip().endswith(".yaml") + ): + return TokensConfig.parse_config_file(data) + + if data.strip().startswith("{"): + return TokensConfig.parse_json(data) + + if data.count("=") > 1: + return TokensConfig.parse_key_value_pairs(data) + + raise ValueError( + f"Unsupported data format. 
Expected JSON or key-value pairs, got {data}" + ) + + @staticmethod + def parse_json(data: str) -> "TokensConfig": + config_dict = json.loads(data.strip()) + + return TokensConfig(**config_dict) + + @staticmethod + def parse_key_value_pairs(data: str) -> "TokensConfig": + config_dict = {} + items = data.strip().split(",") + for item in items: + key, value = item.split("=") + config_dict[key.strip()] = ( + int(value.strip()) if value.strip().isnumeric() else value.strip() + ) + + return TokensConfig(**config_dict) # type: ignore[arg-type] + + @staticmethod + def parse_config_file(data: Union[str, Path]) -> "TokensConfig": + with Path(data).open("r") as file: + config_dict = yaml.safe_load(file) + + return TokensConfig(**config_dict) + def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) @@ -132,19 +200,13 @@ def process_dataset( data: Union[str, Path], output_path: Union[str, Path], processor: Union[str, Path, PreTrainedTokenizerBase], + prompt_tokens: Union[str, Path], + output_tokens: Union[str, Path], processor_args: Optional[dict[str, Any]] = None, data_args: Optional[dict[str, Any]] = None, short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, pad_char: Optional[str] = None, concat_delimiter: Optional[str] = None, - prompt_tokens_average: int = 10, - prompt_tokens_stdev: Optional[int] = None, - prompt_tokens_min: Optional[int] = None, - prompt_tokens_max: Optional[int] = None, - output_tokens_average: int = 10, - output_tokens_stdev: Optional[int] = None, - output_tokens_min: Optional[int] = None, - output_tokens_max: Optional[int] = None, push_to_hub: bool = False, hub_dataset_id: Optional[str] = None, random_seed: int = 42, @@ -168,22 +230,25 @@ def process_dataset( "output_tokens_count_column", "output_tokens_count" ) + prompt_tokens_cfg = TokensConfig.parse_str(prompt_tokens) + output_tokens_cfg = TokensConfig.parse_str(output_tokens) + prompt_token_sampler = iter( IntegerRangeSampler( - average=prompt_tokens_average, - variance=prompt_tokens_stdev, - min_value=prompt_tokens_min, - max_value=prompt_tokens_max, + average=prompt_tokens_cfg.average, + variance=prompt_tokens_cfg.stdev, + min_value=prompt_tokens_cfg.min, + max_value=prompt_tokens_cfg.max, random_seed=random_seed, ) ) output_token_sampler = iter( IntegerRangeSampler( - average=output_tokens_average, - variance=output_tokens_stdev, - min_value=output_tokens_min, - max_value=output_tokens_max, + average=output_tokens_cfg.average, + variance=output_tokens_cfg.stdev, + min_value=output_tokens_cfg.min, + max_value=output_tokens_cfg.max, random_seed=random_seed, ) ) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 401a9e23..38e5bcf9 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -56,9 +56,11 @@ def test_strategy_handler_called( mock_dataset_class.from_list.return_value = mock_dataset_obj process_dataset( - "input", - "output_dir/data.json", - tokenizer_mock, + data="input", + output_path="output_dir/data.json", + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", short_prompt_strategy=ShortPromptStrategy.IGNORE, ) @@ -135,7 +137,13 @@ def test_process_dataset_non_empty( mock_dataset_class.from_list.return_value = mock_dataset_obj output_path = "output_dir/data.json" - process_dataset("input", output_path, tokenizer_mock) + process_dataset( + 
data="input", + output_path=output_path, + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", + ) mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() @@ -168,7 +176,13 @@ def test_process_dataset_empty_after_processing( mock_check_processor.return_value = tokenizer_mock mock_sampler.side_effect = lambda **kwargs: [10] - process_dataset("input", "output_dir/data.json", tokenizer_mock) + process_dataset( + data="input", + output_path="output_dir/data.json", + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", + ) mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() @@ -197,9 +211,11 @@ def test_process_dataset_push_to_hub_called( mock_dataset_class.from_list.return_value = mock_dataset_obj process_dataset( - "input", - "output_dir/data.json", - tokenizer_mock, + data="input", + output_path="output_dir/data.json", + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", push_to_hub=True, hub_dataset_id="id123", ) @@ -227,7 +243,14 @@ def test_process_dataset_push_to_hub_not_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir/data.json", tokenizer_mock, push_to_hub=False) + process_dataset( + data="input", + output_path="output_dir/data.json", + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", + push_to_hub=False, + ) mock_push.assert_not_called() From 40c11182702c8ffb17cbfe4431e2c6b063ae9cf1 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 16:04:03 +0300 Subject: [PATCH 11/18] Added pytest mark to UTs --- tests/unit/preprocess/test_dataset.py | 46 +++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 38e5bcf9..8e9ddd58 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -33,7 +33,7 @@ def tokenizer_mock(): ) return tokenizer - +@pytest.mark.smoke @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.Dataset") @@ -68,19 +68,19 @@ def test_strategy_handler_called( mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() - +@pytest.mark.sanity def test_handle_ignore_strategy_too_short(tokenizer_mock): result = handle_ignore_strategy("short", 10, tokenizer_mock) assert result is None tokenizer_mock.encode.assert_called_with("short") - +@pytest.mark.sanity def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): result = handle_ignore_strategy("long prompt", 5, tokenizer_mock) assert result == "long prompt" tokenizer_mock.encode.assert_called_with("long prompt") - +@pytest.mark.sanity def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): dataset_iter = iter([{"prompt": "longer"}]) result = handle_concatenate_strategy( @@ -88,7 +88,7 @@ def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): ) assert result == "short\nlonger" - +@pytest.mark.sanity def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): dataset_iter: Iterator = iter([]) result = handle_concatenate_strategy( @@ -96,23 +96,23 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): ) assert result is None - 
+@pytest.mark.sanity def test_handle_pad_strategy(tokenizer_mock): result = handle_pad_strategy("short", 10, tokenizer_mock, "p") assert result == "shortppppp" - +@pytest.mark.sanity def test_handle_error_strategy_valid_prompt(tokenizer_mock): result = handle_error_strategy("valid prompt", 5, tokenizer_mock) assert result == "valid prompt" tokenizer_mock.encode.assert_called_with("valid prompt") - +@pytest.mark.sanity def test_handle_error_strategy_too_short_prompt(tokenizer_mock): with pytest.raises(PromptTooShortError): handle_error_strategy("short", 10, tokenizer_mock) - +@pytest.mark.smoke @patch("guidellm.preprocess.dataset.save_dataset_to_file") @patch("guidellm.preprocess.dataset.Dataset") @patch("guidellm.preprocess.dataset.guidellm_load_dataset") @@ -159,7 +159,7 @@ def test_process_dataset_non_empty( assert "output_tokens_count" in item assert len(tokenizer_mock.encode(item["prompt"])) <= 3 - +@pytest.mark.sanity @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") @@ -188,7 +188,7 @@ def test_process_dataset_empty_after_processing( mock_check_processor.assert_called_once() mock_dataset_class.from_list.assert_not_called() - +@pytest.mark.smoke @patch(f"{process_dataset.__module__}.push_dataset_to_hub") @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @@ -221,7 +221,7 @@ def test_process_dataset_push_to_hub_called( ) mock_push.assert_called_once_with("id123", mock_dataset_obj) - +@pytest.mark.sanity @patch(f"{process_dataset.__module__}.push_dataset_to_hub") @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @@ -253,14 +253,14 @@ def test_process_dataset_push_to_hub_not_called( ) mock_push.assert_not_called() - +@pytest.mark.regression def test_push_dataset_to_hub_success(): os.environ["HF_TOKEN"] = "token" mock_dataset = MagicMock(spec=Dataset) push_dataset_to_hub("dataset_id", mock_dataset) mock_dataset.push_to_hub.assert_called_once_with("dataset_id", token="token") - +@pytest.mark.regression def test_push_dataset_to_hub_error_no_env(): if "HF_TOKEN" in os.environ: del os.environ["HF_TOKEN"] @@ -268,14 +268,14 @@ def test_push_dataset_to_hub_error_no_env(): with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub("dataset_id", mock_dataset) - +@pytest.mark.regression def test_push_dataset_to_hub_error_no_id(): os.environ["HF_TOKEN"] = "token" mock_dataset = MagicMock(spec=Dataset) with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub(None, mock_dataset) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_csv(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -284,7 +284,7 @@ def test_save_dataset_to_file_csv(mock_mkdir): mock_dataset.to_csv.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_csv_capitalized(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -293,7 +293,7 @@ def test_save_dataset_to_file_csv_capitalized(mock_mkdir): mock_dataset.to_csv.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_json(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -302,7 
+302,7 @@ def test_save_dataset_to_file_json(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_json_capitalized(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -311,7 +311,7 @@ def test_save_dataset_to_file_json_capitalized(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_jsonl(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -320,7 +320,7 @@ def test_save_dataset_to_file_jsonl(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -329,7 +329,7 @@ def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_parquet(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -338,7 +338,7 @@ def test_save_dataset_to_file_parquet(mock_mkdir): mock_dataset.to_parquet.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_unsupported_type(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) From f61b7f07238c16071362b1a82130272ee007e6d8 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 16:08:03 +0300 Subject: [PATCH 12/18] Added docs --- src/guidellm/preprocess/dataset.py | 154 ++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 34 deletions(-) diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index b2a45e16..f2853910 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -34,11 +34,20 @@ class ShortPromptStrategy(str, Enum): def handle_ignore_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: + """ + Ignores prompts that are shorter than the required minimum token length. + + :param current_prompt: The input prompt string. + :param min_prompt_tokens: Minimum required token count. + :param tokenizer: Tokenizer used to count tokens. + :return: The prompt if it meets the length, otherwise None. + """ + if len(tokenizer.encode(current_prompt)) < min_prompt_tokens: logger.warning("Prompt too short, ignoring") return None @@ -46,14 +55,26 @@ def handle_ignore_strategy( def handle_concatenate_strategy( - current_prompt: str, - min_prompt_tokens: int, - dataset_iterator: Iterator[dict[str, Any]], - prompt_column: str, - tokenizer: PreTrainedTokenizerBase, - concat_delimiter: str, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + dataset_iterator: Iterator[dict[str, Any]], + prompt_column: str, + tokenizer: PreTrainedTokenizerBase, + concat_delimiter: str, + **_kwargs, ) -> Optional[str]: + """ + Concatenates prompts until the minimum token requirement is met. 
+ + :param current_prompt: The initial prompt. + :param min_prompt_tokens: Target minimum token length. + :param dataset_iterator: Iterator to fetch more prompts. + :param prompt_column: Column key for prompt extraction. + :param tokenizer: Tokenizer used to count tokens. + :param concat_delimiter: Delimiter to use between prompts. + :return: Concatenated prompt or None if not enough data. + """ + tokens_len = len(tokenizer.encode(current_prompt)) while tokens_len < min_prompt_tokens: try: @@ -69,23 +90,43 @@ def handle_concatenate_strategy( def handle_pad_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - pad_char: str, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + pad_char: str, + **_kwargs, ) -> str: + """ + Pads the prompt with a character until it reaches the minimum token length. + + :param current_prompt: The input prompt. + :param min_prompt_tokens: Desired minimum token count. + :param tokenizer: Tokenizer used to count tokens. + :param pad_char: Character used for padding. + :return: Padded prompt string. + """ + while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: current_prompt += pad_char return current_prompt def handle_error_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: + """ + Raises an error if the prompt is too short. + + :param current_prompt: The input prompt. + :param min_prompt_tokens: Required token count. + :param tokenizer: Tokenizer used to count tokens. + :return: The input prompt if valid. + :raises PromptTooShortError: If the prompt is too short. + """ + prompt_len = len(tokenizer.encode(current_prompt)) if prompt_len < min_prompt_tokens: raise PromptTooShortError( @@ -126,6 +167,17 @@ class TokensConfig(BaseModel): @staticmethod def parse_str(data: Union[str, Path]) -> "TokensConfig": + """ + Parses a string or path into a TokensConfig object. Supports: + - JSON string + - key=value pairs + - file path to .yaml/.config + + :param data: String or path containing configuration. + :return: Parsed TokensConfig instance. + :raises ValueError: If the format is not recognized. + """ + if ( isinstance(data, Path) or data.strip().endswith(".config") @@ -169,6 +221,13 @@ def parse_config_file(data: Union[str, Path]) -> "TokensConfig": return TokensConfig(**config_dict) def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: + """ + Saves a HuggingFace Dataset to file in a supported format. + + :param dataset: Dataset to save. + :param output_path: Output file path (.json, .jsonl, .csv, .parquet). + :raises ValueError: If the file extension is not supported. 
+ """ output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) suffix = output_path.suffix.lower() @@ -197,20 +256,39 @@ def _validate_output_suffix(output_path: Union[str, Path]) -> None: def process_dataset( - data: Union[str, Path], - output_path: Union[str, Path], - processor: Union[str, Path, PreTrainedTokenizerBase], - prompt_tokens: Union[str, Path], - output_tokens: Union[str, Path], - processor_args: Optional[dict[str, Any]] = None, - data_args: Optional[dict[str, Any]] = None, - short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, - pad_char: Optional[str] = None, - concat_delimiter: Optional[str] = None, - push_to_hub: bool = False, - hub_dataset_id: Optional[str] = None, - random_seed: int = 42, + data: Union[str, Path], + output_path: Union[str, Path], + processor: Union[str, Path, PreTrainedTokenizerBase], + prompt_tokens: Union[str, Path], + output_tokens: Union[str, Path], + processor_args: Optional[dict[str, Any]] = None, + data_args: Optional[dict[str, Any]] = None, + short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, + pad_char: Optional[str] = None, + concat_delimiter: Optional[str] = None, + push_to_hub: bool = False, + hub_dataset_id: Optional[str] = None, + random_seed: int = 42, ) -> None: + """ + Main method to process and save a dataset with sampled prompt/output token counts. + + :param data: Path or identifier for dataset input. + :param output_path: File path to save the processed dataset. + :param processor: Tokenizer object or its config. + :param prompt_tokens: Prompt token config string or file. + :param output_tokens: Output token config string or file. + :param processor_args: Optional processor arguments. + :param data_args: Optional data loading arguments. + :param short_prompt_strategy: Strategy for handling short prompts. + :param pad_char: Character used when padding short prompts. + :param concat_delimiter: Delimiter for concatenation strategy. + :param push_to_hub: Whether to push to Hugging Face Hub. + :param hub_dataset_id: Dataset ID on Hugging Face Hub. + :param random_seed: Seed for random sampling. + :raises ValueError: If output path is invalid or pushing conditions unmet. + """ + _validate_output_suffix(output_path) logger.info( f"Starting dataset conversion | Input: {data} | " @@ -300,8 +378,16 @@ def process_dataset( def push_dataset_to_hub( - hub_dataset_id: Optional[str], processed_dataset: Dataset, + hub_dataset_id: Optional[str], processed_dataset: Dataset, ) -> None: + """ + Pushes the processed dataset to Hugging Face Hub using HF_TOKEN. + + :param hub_dataset_id: Identifier on the Hub to push to. + :param processed_dataset: HuggingFace Dataset object. + :raises ValueError: If hub_dataset_id or HF_TOKEN is not available. 
+ """ + hf_token = os.environ.get("HF_TOKEN") if not hub_dataset_id or not hf_token: raise ValueError( From b6146c921bc0b95ace0a4d1080fccee409d9f483 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 17:59:31 +0300 Subject: [PATCH 13/18] Ran tox -e style --- pyproject.toml | 3 +- src/guidellm/__main__.py | 4 +- src/guidellm/benchmark/benchmark.py | 5 +- src/guidellm/preprocess/dataset.py | 7 ++- tests/unit/preprocess/test_dataset.py | 79 +++++++++++++++++---------- 5 files changed, 61 insertions(+), 37 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9f86d993..a78b1fc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,8 @@ name = "guidellm" description = "Guidance platform for deploying and managing large language models." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.9.0,<4.0" -license = { text = "Apache-2.0" } +license = "Apache-2.0" +license-files = ["LICENSE"] authors = [ { name = "Red Hat" } ] keywords = [ "ai", diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index aac0efa4..66c3daa4 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -281,6 +281,7 @@ def benchmark( ) ) + def decode_escaped_str(_ctx, _param, value): """ Click auto adds characters. For example, when using --pad-char "\n", @@ -294,6 +295,7 @@ def decode_escaped_str(_ctx, _param, value): except Exception as e: raise click.BadParameter(f"Could not decode escape sequences: {e}") from e + @cli.command( help=( "Print out the available configuration settings that can be set " @@ -373,7 +375,7 @@ def preprocess(): help=( "The delimiter to use when concatenating prompts that are too short." " Used when strategy is 'concatenate'." - ) + ), ) @click.option( "--prompt-tokens", diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 9f683f8e..1e2a5f4b 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -815,10 +815,7 @@ def from_stats( req.first_token_time or req.start_time for req in total_with_output_first ], - iter_counts=[ - req.output_tokens - for req in total_with_output_first - ], + iter_counts=[req.output_tokens for req in total_with_output_first], first_iter_counts=[ req.prompt_tokens for req in total_with_output_first ], diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index f2853910..601aab31 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -220,6 +220,7 @@ def parse_config_file(data: Union[str, Path]) -> "TokensConfig": return TokensConfig(**config_dict) + def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: """ Saves a HuggingFace Dataset to file in a supported format. @@ -291,8 +292,7 @@ def process_dataset( _validate_output_suffix(output_path) logger.info( - f"Starting dataset conversion | Input: {data} | " - f"Output directory: {output_path}" + f"Starting dataset conversion | Input: {data} | Output directory: {output_path}" ) dataset, column_mappings = guidellm_load_dataset( @@ -378,7 +378,8 @@ def process_dataset( def push_dataset_to_hub( - hub_dataset_id: Optional[str], processed_dataset: Dataset, + hub_dataset_id: Optional[str], + processed_dataset: Dataset, ) -> None: """ Pushes the processed dataset to Hugging Face Hub using HF_TOKEN. 
diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 8e9ddd58..49a8e417 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -33,17 +33,18 @@ def tokenizer_mock(): ) return tokenizer + @pytest.mark.smoke @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_strategy_handler_called( - mock_sampler, - mock_dataset_class, - mock_check_processor, - mock_load_dataset, - tokenizer_mock, + mock_sampler, + mock_dataset_class, + mock_check_processor, + mock_load_dataset, + tokenizer_mock, ): mock_handler = MagicMock(return_value="processed_prompt") with patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: mock_handler}): @@ -68,18 +69,21 @@ def test_strategy_handler_called( mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() + @pytest.mark.sanity def test_handle_ignore_strategy_too_short(tokenizer_mock): result = handle_ignore_strategy("short", 10, tokenizer_mock) assert result is None tokenizer_mock.encode.assert_called_with("short") + @pytest.mark.sanity def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): result = handle_ignore_strategy("long prompt", 5, tokenizer_mock) assert result == "long prompt" tokenizer_mock.encode.assert_called_with("long prompt") + @pytest.mark.sanity def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): dataset_iter = iter([{"prompt": "longer"}]) @@ -88,6 +92,7 @@ def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): ) assert result == "short\nlonger" + @pytest.mark.sanity def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): dataset_iter: Iterator = iter([]) @@ -96,22 +101,26 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): ) assert result is None + @pytest.mark.sanity def test_handle_pad_strategy(tokenizer_mock): result = handle_pad_strategy("short", 10, tokenizer_mock, "p") assert result == "shortppppp" + @pytest.mark.sanity def test_handle_error_strategy_valid_prompt(tokenizer_mock): result = handle_error_strategy("valid prompt", 5, tokenizer_mock) assert result == "valid prompt" tokenizer_mock.encode.assert_called_with("valid prompt") + @pytest.mark.sanity def test_handle_error_strategy_too_short_prompt(tokenizer_mock): with pytest.raises(PromptTooShortError): handle_error_strategy("short", 10, tokenizer_mock) + @pytest.mark.smoke @patch("guidellm.preprocess.dataset.save_dataset_to_file") @patch("guidellm.preprocess.dataset.Dataset") @@ -119,12 +128,12 @@ def test_handle_error_strategy_too_short_prompt(tokenizer_mock): @patch("guidellm.preprocess.dataset.check_load_processor") @patch("guidellm.preprocess.dataset.IntegerRangeSampler") def test_process_dataset_non_empty( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_save_to_file, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_save_to_file, + tokenizer_mock, ): from guidellm.preprocess.dataset import process_dataset @@ -159,17 +168,18 @@ def test_process_dataset_non_empty( assert "output_tokens_count" in item assert len(tokenizer_mock.encode(item["prompt"])) <= 3 + @pytest.mark.sanity @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.guidellm_load_dataset") 
@patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_empty_after_processing( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + tokenizer_mock, ): mock_dataset = [{"prompt": ""}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -188,6 +198,7 @@ def test_process_dataset_empty_after_processing( mock_check_processor.assert_called_once() mock_dataset_class.from_list.assert_not_called() + @pytest.mark.smoke @patch(f"{process_dataset.__module__}.push_dataset_to_hub") @patch(f"{process_dataset.__module__}.Dataset") @@ -195,12 +206,12 @@ def test_process_dataset_empty_after_processing( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_called( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_push, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -221,6 +232,7 @@ def test_process_dataset_push_to_hub_called( ) mock_push.assert_called_once_with("id123", mock_dataset_obj) + @pytest.mark.sanity @patch(f"{process_dataset.__module__}.push_dataset_to_hub") @patch(f"{process_dataset.__module__}.Dataset") @@ -228,12 +240,12 @@ def test_process_dataset_push_to_hub_called( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_not_called( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_push, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -253,6 +265,7 @@ def test_process_dataset_push_to_hub_not_called( ) mock_push.assert_not_called() + @pytest.mark.regression def test_push_dataset_to_hub_success(): os.environ["HF_TOKEN"] = "token" @@ -260,6 +273,7 @@ def test_push_dataset_to_hub_success(): push_dataset_to_hub("dataset_id", mock_dataset) mock_dataset.push_to_hub.assert_called_once_with("dataset_id", token="token") + @pytest.mark.regression def test_push_dataset_to_hub_error_no_env(): if "HF_TOKEN" in os.environ: @@ -268,6 +282,7 @@ def test_push_dataset_to_hub_error_no_env(): with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub("dataset_id", mock_dataset) + @pytest.mark.regression def test_push_dataset_to_hub_error_no_id(): os.environ["HF_TOKEN"] = "token" @@ -275,6 +290,7 @@ def test_push_dataset_to_hub_error_no_id(): with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub(None, mock_dataset) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_csv(mock_mkdir): @@ -284,6 +300,7 @@ def test_save_dataset_to_file_csv(mock_mkdir): mock_dataset.to_csv.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_csv_capitalized(mock_mkdir): @@ -293,6 
+310,7 @@ def test_save_dataset_to_file_csv_capitalized(mock_mkdir): mock_dataset.to_csv.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_json(mock_mkdir): @@ -302,6 +320,7 @@ def test_save_dataset_to_file_json(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_json_capitalized(mock_mkdir): @@ -311,6 +330,7 @@ def test_save_dataset_to_file_json_capitalized(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_jsonl(mock_mkdir): @@ -320,6 +340,7 @@ def test_save_dataset_to_file_jsonl(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): @@ -329,6 +350,7 @@ def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_parquet(mock_mkdir): @@ -338,6 +360,7 @@ def test_save_dataset_to_file_parquet(mock_mkdir): mock_dataset.to_parquet.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_unsupported_type(mock_mkdir): From f3a3cd78f0bfd166b5c6525a10c6195baf2190b6 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 18:00:50 +0300 Subject: [PATCH 14/18] Fixed help for preprocess dataset subcommand --- src/guidellm/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 66c3daa4..7dc06835 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -313,9 +313,9 @@ def preprocess(): @preprocess.command( help=( - "Convert a dataset to have specific prompt and output token sizes.\n\n" - "INPUT_DATA: Path to the input dataset or dataset ID.\n" - "OUTPUT_PATH: Path to save the converted dataset, including file suffix. " + "Convert a dataset to have specific prompt and output token sizes.\n" + "DATA: Path to the input dataset or dataset ID.\n" + "OUTPUT_PATH: Path to save the converted dataset, including file suffix." 
) ) @click.argument( From 448d609ad884934b704f6cb57e4e99c381c23ff0 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 18:02:02 +0300 Subject: [PATCH 15/18] Fixed help for preprocess dataset subcommand --- src/guidellm/benchmark/benchmark.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 1e2a5f4b..9f683f8e 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -815,7 +815,10 @@ def from_stats( req.first_token_time or req.start_time for req in total_with_output_first ], - iter_counts=[req.output_tokens for req in total_with_output_first], + iter_counts=[ + req.output_tokens + for req in total_with_output_first + ], first_iter_counts=[ req.prompt_tokens for req in total_with_output_first ], From c48472a113f0e4cdb4b177e5bea840f4e54e331c Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 5 Jun 2025 12:48:22 +0300 Subject: [PATCH 16/18] Fixed CR comments --- src/guidellm/preprocess/dataset.py | 53 ++++----------- src/guidellm/utils/__init__.py | 6 ++ src/guidellm/utils/hf_datasets.py | 36 +++++++++++ tests/unit/preprocess/test_dataset.py | 92 ++------------------------- tests/unit/utils/__init__.py | 0 tests/unit/utils/test_hf_datasets.py | 87 +++++++++++++++++++++++++ 6 files changed, 149 insertions(+), 125 deletions(-) create mode 100644 src/guidellm/utils/hf_datasets.py create mode 100644 tests/unit/utils/__init__.py create mode 100644 tests/unit/utils/test_hf_datasets.py diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index 601aab31..c6957863 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -13,13 +13,7 @@ from guidellm.dataset import load_dataset as guidellm_load_dataset from guidellm.utils import IntegerRangeSampler, check_load_processor - -SUPPORTED_TYPES = { - ".json", - ".jsonl", - ".csv", - ".parquet", -} +from guidellm.utils.hf_datasets import SUPPORTED_TYPES, save_dataset_to_file class PromptTooShortError(Exception): @@ -94,6 +88,7 @@ def handle_pad_strategy( min_prompt_tokens: int, tokenizer: PreTrainedTokenizerBase, pad_char: str, + pad_multiplier: int = 2, **_kwargs, ) -> str: """ @@ -103,13 +98,18 @@ def handle_pad_strategy( :param min_prompt_tokens: Desired minimum token count. :param tokenizer: Tokenizer used to count tokens. :param pad_char: Character used for padding. + :param pad_multiplier: Multiplier for padding character length. :return: Padded prompt string. """ - while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: - current_prompt += pad_char - return current_prompt - + tokens = tokenizer.encode(current_prompt) + pad_count = 1 + prompt = current_prompt + while len(tokens) < min_prompt_tokens: + prompt += pad_char * pad_count + tokens = tokenizer.encode(prompt) + pad_count *= pad_multiplier + return prompt def handle_error_strategy( current_prompt: str, @@ -221,31 +221,6 @@ def parse_config_file(data: Union[str, Path]) -> "TokensConfig": return TokensConfig(**config_dict) -def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: - """ - Saves a HuggingFace Dataset to file in a supported format. - - :param dataset: Dataset to save. - :param output_path: Output file path (.json, .jsonl, .csv, .parquet). - :raises ValueError: If the file extension is not supported. 
- """ - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - suffix = output_path.suffix.lower() - - if suffix == ".csv": - dataset.to_csv(output_path) - elif suffix in {".json", ".jsonl"}: - dataset.to_json(output_path) - elif suffix == ".parquet": - dataset.to_parquet(output_path) - else: - raise ValueError( - f"Unsupported file suffix '{suffix}' in output_path'{output_path}'." - f" Only {SUPPORTED_TYPES} are supported." - ) - - def _validate_output_suffix(output_path: Union[str, Path]) -> None: output_path = Path(output_path) suffix = output_path.suffix.lower() @@ -351,8 +326,8 @@ def process_dataset( if prompt_text is None: continue - if len(tokenizer.encode(prompt_text)) > target_prompt_len: - tokens = tokenizer.encode(prompt_text) + tokens = tokenizer.encode(prompt_text) + if len(tokens) > target_prompt_len: prompt_text = tokenizer.decode(tokens[:target_prompt_len]) processed_prompt = prompt_row.copy() @@ -370,7 +345,7 @@ def process_dataset( processed_dataset = Dataset.from_list(processed_prompts) save_dataset_to_file(processed_dataset, output_path) - logger.info(f"Conversion complete. Dataset saved to: {output_path}") + logger.info(f"Conversion completed. Dataset saved to: {output_path}") if push_to_hub: push_dataset_to_hub(hub_dataset_id, processed_dataset) diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 753bef02..11751c0d 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,4 +1,8 @@ from .colors import Colors +from .hf_datasets import ( + save_dataset_to_file, + SUPPORTED_TYPES, +) from .hf_transformers import ( check_load_processor, ) @@ -22,6 +26,8 @@ "filter_text", "is_puncutation", "load_text", + "save_dataset_to_file", "split_text", "split_text_list_by_length", + "SUPPORTED_TYPES", ] diff --git a/src/guidellm/utils/hf_datasets.py b/src/guidellm/utils/hf_datasets.py new file mode 100644 index 00000000..73e55ebc --- /dev/null +++ b/src/guidellm/utils/hf_datasets.py @@ -0,0 +1,36 @@ +from pathlib import Path +from typing import Union + +from datasets import Dataset + +SUPPORTED_TYPES = { + ".json", + ".jsonl", + ".csv", + ".parquet", +} + + +def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: + """ + Saves a HuggingFace Dataset to file in a supported format. + + :param dataset: Dataset to save. + :param output_path: Output file path (.json, .jsonl, .csv, .parquet). + :raises ValueError: If the file extension is not supported. + """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + suffix = output_path.suffix.lower() + + if suffix == ".csv": + dataset.to_csv(output_path) + elif suffix in {".json", ".jsonl"}: + dataset.to_json(output_path) + elif suffix == ".parquet": + dataset.to_parquet(output_path) + else: + raise ValueError( + f"Unsupported file suffix '{suffix}' in output_path'{output_path}'." + f" Only {SUPPORTED_TYPES} are supported." 
+ ) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 49a8e417..b3878565 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch @@ -20,7 +19,6 @@ handle_pad_strategy, process_dataset, push_dataset_to_hub, - save_dataset_to_file, ) @@ -105,7 +103,7 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): @pytest.mark.sanity def test_handle_pad_strategy(tokenizer_mock): result = handle_pad_strategy("short", 10, tokenizer_mock, "p") - assert result == "shortppppp" + assert result.startswith("shortppppp") @pytest.mark.sanity @@ -122,11 +120,11 @@ def test_handle_error_strategy_too_short_prompt(tokenizer_mock): @pytest.mark.smoke -@patch("guidellm.preprocess.dataset.save_dataset_to_file") -@patch("guidellm.preprocess.dataset.Dataset") -@patch("guidellm.preprocess.dataset.guidellm_load_dataset") -@patch("guidellm.preprocess.dataset.check_load_processor") -@patch("guidellm.preprocess.dataset.IntegerRangeSampler") +@patch(f"{process_dataset.__module__}.save_dataset_to_file") +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_non_empty( mock_sampler, mock_check_processor, @@ -291,81 +289,3 @@ def test_push_dataset_to_hub_error_no_id(): push_dataset_to_hub(None, mock_dataset) -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_csv(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.csv") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_csv.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_csv_capitalized(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.CSV") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_csv.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_json(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.json") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_json_capitalized(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.JSON") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_jsonl(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.jsonl") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, 
"mkdir") -def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.JSONL") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_parquet(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.parquet") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_parquet.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_unsupported_type(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.txt") - with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): - save_dataset_to_file(mock_dataset, output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) diff --git a/tests/unit/utils/__init__.py b/tests/unit/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/utils/test_hf_datasets.py b/tests/unit/utils/test_hf_datasets.py new file mode 100644 index 00000000..a59f6a13 --- /dev/null +++ b/tests/unit/utils/test_hf_datasets.py @@ -0,0 +1,87 @@ +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest +from datasets import Dataset + +from guidellm.utils import save_dataset_to_file + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_csv(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.csv") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_csv.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_csv_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.CSV") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_csv.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_json(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.json") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_json_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.JSON") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_jsonl(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.jsonl") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def 
test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.JSONL") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_parquet(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.parquet") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_parquet.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_unsupported_type(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.txt") + with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): + save_dataset_to_file(mock_dataset, output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) \ No newline at end of file From 0cc3ffefdcb18155179c6bd9c87d12267b3cc9fe Mon Sep 17 00:00:00 2001 From: TomerG711 Date: Thu, 5 Jun 2025 12:54:39 +0300 Subject: [PATCH 17/18] Linters --- src/guidellm/benchmark/benchmark.py | 5 +---- src/guidellm/preprocess/dataset.py | 1 + src/guidellm/utils/__init__.py | 4 ++-- tests/unit/preprocess/test_dataset.py | 2 -- tests/unit/utils/test_hf_datasets.py | 4 ++-- 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 9f683f8e..1e2a5f4b 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -815,10 +815,7 @@ def from_stats( req.first_token_time or req.start_time for req in total_with_output_first ], - iter_counts=[ - req.output_tokens - for req in total_with_output_first - ], + iter_counts=[req.output_tokens for req in total_with_output_first], first_iter_counts=[ req.prompt_tokens for req in total_with_output_first ], diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index c6957863..a94b8a14 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -111,6 +111,7 @@ def handle_pad_strategy( pad_count *= pad_multiplier return prompt + def handle_error_strategy( current_prompt: str, min_prompt_tokens: int, diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 11751c0d..399c021d 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,7 +1,7 @@ from .colors import Colors from .hf_datasets import ( - save_dataset_to_file, SUPPORTED_TYPES, + save_dataset_to_file, ) from .hf_transformers import ( check_load_processor, @@ -18,6 +18,7 @@ ) __all__ = [ + "SUPPORTED_TYPES", "Colors", "EndlessTextCreator", "IntegerRangeSampler", @@ -29,5 +30,4 @@ "save_dataset_to_file", "split_text", "split_text_list_by_length", - "SUPPORTED_TYPES", ] diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index b3878565..2a2a8293 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -287,5 +287,3 @@ def test_push_dataset_to_hub_error_no_id(): mock_dataset = MagicMock(spec=Dataset) with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub(None, mock_dataset) - - diff --git a/tests/unit/utils/test_hf_datasets.py 
b/tests/unit/utils/test_hf_datasets.py index a59f6a13..bbd41c4f 100644 --- a/tests/unit/utils/test_hf_datasets.py +++ b/tests/unit/utils/test_hf_datasets.py @@ -1,5 +1,5 @@ from pathlib import Path -from unittest.mock import patch, MagicMock +from unittest.mock import MagicMock, patch import pytest from datasets import Dataset @@ -84,4 +84,4 @@ def test_save_dataset_to_file_unsupported_type(mock_mkdir): output_path = Path("some/path/output.txt") with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): save_dataset_to_file(mock_dataset, output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) \ No newline at end of file + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) From c0cd1c9936d6e503b9fddffafabf4380c73f98b7 Mon Sep 17 00:00:00 2001 From: TomerG711 Date: Thu, 5 Jun 2025 12:55:58 +0300 Subject: [PATCH 18/18] Linters --- src/guidellm/benchmark/benchmark.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 1e2a5f4b..9f683f8e 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -815,7 +815,10 @@ def from_stats( req.first_token_time or req.start_time for req in total_with_output_first ], - iter_counts=[req.output_tokens for req in total_with_output_first], + iter_counts=[ + req.output_tokens + for req in total_with_output_first + ], first_iter_counts=[ req.prompt_tokens for req in total_with_output_first ],
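
Taken together, the final padding logic appends the pad character in geometrically growing chunks, so the tokenizer is re-invoked only a logarithmic number of times instead of once per added character. The sketch below reproduces that loop with a toy whitespace tokenizer purely for illustration; the toy tokenizer class, the pad string " p", and the numbers are assumptions of this example, not code from the patches above.

class ToyWhitespaceTokenizer:
    """Stand-in for PreTrainedTokenizerBase: one token per whitespace-separated word."""

    def encode(self, text):
        return text.split()


def pad_prompt(prompt, min_tokens, tokenizer, pad_chunk=" p", multiplier=2):
    # Mirrors the geometric-growth loop of handle_pad_strategy: append the pad
    # pad_count times, re-encode, then multiply pad_count (multiplier defaults to 2).
    tokens = tokenizer.encode(prompt)
    pad_count = 1
    padded = prompt
    while len(tokens) < min_tokens:
        padded += pad_chunk * pad_count
        tokens = tokenizer.encode(padded)
        pad_count *= multiplier
    return padded


if __name__ == "__main__":
    tokenizer = ToyWhitespaceTokenizer()
    padded = pad_prompt("short prompt", 10, tokenizer)
    print(len(tokenizer.encode(padded)))  # at least 10 once the loop exits

In the real function the pad character is user-supplied (per decode_escaped_str's docstring, the CLI decodes escape sequences such as "\n" passed to --pad-char), and pad_multiplier defaults to 2.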