From c700d9dbb93e7b1e2dc0e3b07d46ee1a6c7ebb05 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 15 May 2025 17:46:02 +0300 Subject: [PATCH 01/18] Implemented preprocessing for datasets --- src/guidellm/__main__.py | 316 ++++++++++++++++++++------ src/guidellm/preprocess/__init__.py | 6 + tests/unit/preprocess/__init__.py | 0 tests/unit/preprocess/test_dataset.py | 193 ++++++++++++++++ 4 files changed, 449 insertions(+), 66 deletions(-) create mode 100644 src/guidellm/preprocess/__init__.py create mode 100644 tests/unit/preprocess/__init__.py create mode 100644 tests/unit/preprocess/test_dataset.py diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index f38b11aa..79cb97a3 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -8,6 +8,7 @@ from guidellm.backend import BackendType from guidellm.benchmark import ProfileType, benchmark_generative_text from guidellm.config import print_config +from guidellm.preprocess.dataset import process_dataset, ShortPromptStrategy from guidellm.scheduler import StrategyType STRATEGY_PROFILE_CHOICES = set( @@ -40,6 +41,7 @@ def parse_number_str(ctx, param, value): # noqa: ARG001 @click.group() def cli(): + """GuideLLM CLI tools.""" pass @@ -56,8 +58,8 @@ def cli(): "--backend-type", type=click.Choice(list(get_args(BackendType))), help=( - "The type of backend to use to run requests against. Defaults to 'openai_http'." - f" Supported types: {', '.join(get_args(BackendType))}" + "The type of backend to use to run requests against. Defaults to 'openai_http'." + f" Supported types: {', '.join(get_args(BackendType))}" ), default="openai_http", ) @@ -66,8 +68,8 @@ def cli(): callback=parse_json, default=None, help=( - "A JSON string containing any arguments to pass to the backend as a " - "dict with **kwargs." + "A JSON string containing any arguments to pass to the backend as a " + "dict with **kwargs." ), ) @click.option( @@ -75,8 +77,8 @@ def cli(): default=None, type=str, help=( - "The ID of the model to benchmark within the backend. " - "If None provided (default), then it will use the first model available." + "The ID of the model to benchmark within the backend. " + "If None provided (default), then it will use the first model available." ), ) @click.option( @@ -84,9 +86,9 @@ def cli(): default=None, type=str, help=( - "The processor or tokenizer to use to calculate token counts for statistics " - "and synthetic data generation. If None provided (default), will load " - "using the model arg, if needed." + "The processor or tokenizer to use to calculate token counts for statistics " + "and synthetic data generation. If None provided (default), will load " + "using the model arg, if needed." ), ) @click.option( @@ -94,8 +96,8 @@ def cli(): default=None, callback=parse_json, help=( - "A JSON string containing any arguments to pass to the processor constructor " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the processor constructor " + "as a dict with **kwargs." ), ) @click.option( @@ -103,17 +105,17 @@ def cli(): required=True, type=str, help=( - "The HuggingFace dataset ID, a path to a HuggingFace dataset, " - "a path to a data file csv, json, jsonl, or txt, " - "or a synthetic data config as a json or key=value string." + "The HuggingFace dataset ID, a path to a HuggingFace dataset, " + "a path to a data file csv, json, jsonl, or txt, " + "or a synthetic data config as a json or key=value string." 
), ) @click.option( "--data-args", callback=parse_json, help=( - "A JSON string containing any arguments to pass to the dataset creation " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the dataset creation " + "as a dict with **kwargs." ), ) @click.option( @@ -121,8 +123,8 @@ def cli(): default=None, type=click.Choice(["random"]), help=( - "The data sampler type to use. 'random' will add a random shuffle on the data. " - "Defaults to None" + "The data sampler type to use. 'random' will add a random shuffle on the data. " + "Defaults to None" ), ) @click.option( @@ -130,8 +132,8 @@ def cli(): required=True, type=click.Choice(STRATEGY_PROFILE_CHOICES), help=( - "The type of benchmark to run. " - f"Supported types {', '.join(STRATEGY_PROFILE_CHOICES)}. " + "The type of benchmark to run. " + f"Supported types {', '.join(STRATEGY_PROFILE_CHOICES)}. " ), ) @click.option( @@ -139,28 +141,28 @@ def cli(): default=None, callback=parse_number_str, help=( - "The rates to run the benchmark at. " - "Can be a single number or a comma-separated list of numbers. " - "For rate-type=sweep, this is the number of benchmarks it runs in the sweep. " - "For rate-type=concurrent, this is the number of concurrent requests. " - "For rate-type=async,constant,poisson, this is the rate requests per second. " - "For rate-type=synchronous,throughput, this must not be set." + "The rates to run the benchmark at. " + "Can be a single number or a comma-separated list of numbers. " + "For rate-type=sweep, this is the number of benchmarks it runs in the sweep. " + "For rate-type=concurrent, this is the number of concurrent requests. " + "For rate-type=async,constant,poisson, this is the rate requests per second. " + "For rate-type=synchronous,throughput, this must not be set." ), ) @click.option( "--max-seconds", type=float, help=( - "The maximum number of seconds each benchmark can run for. " - "If None, will run until max_requests or the data is exhausted." + "The maximum number of seconds each benchmark can run for. " + "If None, will run until max_requests or the data is exhausted." ), ) @click.option( "--max-requests", type=int, help=( - "The maximum number of requests each benchmark can run for. " - "If None, will run until max_seconds or the data is exhausted." + "The maximum number of requests each benchmark can run for. " + "If None, will run until max_seconds or the data is exhausted." ), ) @click.option( @@ -168,18 +170,18 @@ def cli(): type=float, default=None, help=( - "The percent of the benchmark (based on max-seconds, max-requets, " - "or lenth of dataset) to run as a warmup and not include in the final results. " - "Defaults to None." + "The percent of the benchmark (based on max-seconds, max-requets, " + "or lenth of dataset) to run as a warmup and not include in the final results. " + "Defaults to None." ), ) @click.option( "--cooldown-percent", type=float, help=( - "The percent of the benchmark (based on max-seconds, max-requets, or lenth " - "of dataset) to run as a cooldown and not include in the final results. " - "Defaults to None." + "The percent of the benchmark (based on max-seconds, max-requets, or lenth " + "of dataset) to run as a cooldown and not include in the final results. " + "Defaults to None." ), ) @click.option( @@ -202,10 +204,10 @@ def cli(): type=click.Path(), default=Path.cwd() / "benchmarks.json", help=( - "The path to save the output to. If it is a directory, " - "it will save benchmarks.json under it. 
" - "Otherwise, json, yaml, or csv files are supported for output types " - "which will be read from the extension for the file path." + "The path to save the output to. If it is a directory, " + "it will save benchmarks.json under it. " + "Otherwise, json, yaml, or csv files are supported for output types " + "which will be read from the extension for the file path." ), ) @click.option( @@ -217,8 +219,8 @@ def cli(): "--output-sampling", type=int, help=( - "The number of samples to save in the output file. " - "If None (default), will save all samples." + "The number of samples to save in the output file. " + "If None (default), will save all samples." ), default=None, ) @@ -229,28 +231,28 @@ def cli(): help="The random seed to use for benchmarking to ensure reproducibility.", ) def benchmark( - target, - backend_type, - backend_args, - model, - processor, - processor_args, - data, - data_args, - data_sampler, - rate_type, - rate, - max_seconds, - max_requests, - warmup_percent, - cooldown_percent, - disable_progress, - display_scheduler_stats, - disable_console_outputs, - output_path, - output_extras, - output_sampling, - random_seed, + target, + backend_type, + backend_args, + model, + processor, + processor_args, + data, + data_args, + data_sampler, + rate_type, + rate, + max_seconds, + max_requests, + warmup_percent, + cooldown_percent, + disable_progress, + display_scheduler_stats, + disable_console_outputs, + output_path, + output_extras, + output_sampling, + random_seed, ): asyncio.run( benchmark_generative_text( @@ -282,13 +284,195 @@ def benchmark( @cli.command( help=( - "Print out the available configuration settings that can be set " - "through environment variables." + "Print out the available configuration settings that can be set " + "through environment variables." ) ) def config(): print_config() +@cli.group(help="Preprocessing utilities for datasets.") +def preprocess(): + pass + + +@preprocess.command( + help="Convert a dataset to have specific prompt and output token sizes.\n\n" + "INPUT_DATA: Path to the input dataset or dataset ID.\n" + "OUTPUT_PATH: Directory to save the converted dataset. " + "The dataset will be saved as an Arrow dataset (.arrow) inside the directory." +) +@click.argument( + "input_data", + type=str, + metavar="INPUT_DATA", + required=True, +) +@click.argument( + "output_path", + type=click.Path(file_okay=False, dir_okay=True, writable=True, resolve_path=True), + metavar="OUTPUT_PATH", + required=True, +) +@click.option( + "--processor", + type=str, + required=True, + help=( + "The processor or tokenizer to use to calculate token counts for statistics " + "and synthetic data generation." + ), +) +@click.option( + "--processor-args", + default=None, + callback=parse_json, + help=( + "A JSON string containing any arguments to pass to the processor constructor " + "as a dict with **kwargs." + ), +) +@click.option( + "--data-args", + callback=parse_json, + help=( + "A JSON string containing any arguments to pass to the dataset creation " + "as a dict with **kwargs." + ), +) +@click.option( + "--short-prompt-strategy", + type=click.Choice([s.value for s in ShortPromptStrategy]), + default=ShortPromptStrategy.IGNORE.value, + show_default=True, + help="Strategy to handle prompts shorter than the target length. " +) +@click.option( + "--pad-token", + type=str, + default=None, + help="The token to pad short prompts with when using the 'pad' strategy." 
+) +@click.option( + "--prompt-tokens-average", + type=int, + default=10, + show_default=True, + help="Average target number of tokens for prompts." +) +@click.option( + "--prompt-tokens-stdev", + type=int, + default=None, + help="Standard deviation for prompt tokens sampling." +) +@click.option( + "--prompt-tokens-min", + type=int, + default=None, + help="Minimum number of prompt tokens." +) +@click.option( + "--prompt-tokens-max", + type=int, + default=None, + help="Maximum number of prompt tokens." +) +@click.option( + "--prompt-random-seed", + type=int, + default=42, + show_default=True, + help="Random seed for prompt token sampling." +) +@click.option( + "--output-tokens-average", + type=int, + default=10, + show_default=True, + help="Average target number of tokens for outputs." +) +@click.option( + "--output-tokens-stdev", + type=int, + default=None, + help="Standard deviation for output tokens sampling." +) +@click.option( + "--output-tokens-min", + type=int, + default=None, + help="Minimum number of output tokens." +) +@click.option( + "--output-tokens-max", + type=int, + default=None, + help="Maximum number of output tokens." +) +@click.option( + "--output-random-seed", + type=int, + default=123, + show_default=True, + help="Random seed for output token sampling." +) +@click.option( + "--push-to-hub", + is_flag=True, + help="Set this flag to push the converted dataset to the Hugging Face Hub." +) +@click.option( + "--hub-dataset-id", + type=str, + default=None, + help="The Hugging Face Hub dataset ID to push to. " + "Required if --push-to-hub is used." +) +def dataset( + input_data, + output_path, + processor, + processor_args, + data_args, + short_prompt_strategy, + pad_token, + prompt_tokens_average, + prompt_tokens_stdev, + prompt_tokens_min, + prompt_tokens_max, + prompt_random_seed, + output_tokens_average, + output_tokens_stdev, + output_tokens_min, + output_tokens_max, + output_random_seed, + push_to_hub, + hub_dataset_id, +): + process_dataset( + input_data=input_data, + output_path=output_path, + processor=processor, + processor_args=processor_args, + data_args=data_args, + short_prompt_strategy=short_prompt_strategy, + pad_token=pad_token, + prompt_tokens_average=prompt_tokens_average, + prompt_tokens_stdev=prompt_tokens_stdev, + prompt_tokens_min=prompt_tokens_min, + prompt_tokens_max=prompt_tokens_max, + prompt_random_seed=prompt_random_seed, + output_tokens_average=output_tokens_average, + output_tokens_stdev=output_tokens_stdev, + output_tokens_min=output_tokens_min, + output_tokens_max=output_tokens_max, + output_random_seed=output_random_seed, + push_to_hub=push_to_hub, + hub_dataset_id=hub_dataset_id, + ) + + if __name__ == "__main__": cli() diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py new file mode 100644 index 00000000..669bf409 --- /dev/null +++ b/src/guidellm/preprocess/__init__.py @@ -0,0 +1,6 @@ +from .dataset import process_dataset, ShortPromptStrategy + +__all__ = [ + "process_dataset", + "ShortPromptStrategy" +] diff --git a/tests/unit/preprocess/__init__.py b/tests/unit/preprocess/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py new file mode 100644 index 00000000..e14bffc3 --- /dev/null +++ b/tests/unit/preprocess/test_dataset.py @@ -0,0 +1,193 @@ +import os +from unittest import mock +from unittest.mock import patch, MagicMock + +import pytest +from datasets import Dataset +from transformers import 
PreTrainedTokenizerBase + +from guidellm.preprocess.dataset import ( + handle_ignore_strategy, + handle_concatenate_strategy, + handle_pad_strategy, + process_dataset, + push_dataset_to_hub, + ShortPromptStrategy, + STRATEGY_HANDLERS, +) + + +@pytest.fixture +def tokenizer_mock(): + tokenizer = MagicMock(spec=PreTrainedTokenizerBase) + tokenizer.encode.side_effect = lambda x: [1] * len(x) + tokenizer.decode.side_effect = lambda x: ''.join(str(item) for item in x) + return tokenizer + + +@patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: MagicMock(return_value="processed_prompt")}) +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_strategy_handler_called( + mock_sampler, mock_dataset_class, mock_check_processor, mock_load_dataset, tokenizer_mock +): + mock_handler = STRATEGY_HANDLERS[ShortPromptStrategy.IGNORE] + mock_dataset = [{"prompt": "abc"}, {"prompt": "def"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [10, 10] + + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj + + process_dataset("input", "output_dir", tokenizer_mock, short_prompt_strategy=ShortPromptStrategy.IGNORE) + + assert mock_handler.call_count == 2 + mock_load_dataset.assert_called_once() + mock_check_processor.assert_called_once() + + +def test_handle_ignore_strategy_too_short(tokenizer_mock): + result = handle_ignore_strategy("short", 10, tokenizer_mock) + assert result is None + tokenizer_mock.encode.assert_called_with("short") + + +def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): + result = handle_ignore_strategy("long prompt", 5, tokenizer_mock) + assert result == "long prompt" + tokenizer_mock.encode.assert_called_with("long prompt") + + +def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): + dataset_iter = iter([{"prompt": "longer"}]) + result = handle_concatenate_strategy("short", 10, dataset_iter, "prompt", tokenizer_mock) + assert result == "shortlonger" + + +def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): + dataset_iter = iter([]) + result = handle_concatenate_strategy("short", 10, dataset_iter, "prompt", tokenizer_mock) + assert result is None + + +def test_handle_pad_strategy(tokenizer_mock): + result = handle_pad_strategy("short", 10, tokenizer_mock, "p") + assert result == "shortppppp" + + +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_process_dataset_non_empty( + mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, tokenizer_mock +): + mock_dataset = [{"prompt": "Hello"}, {"prompt": "How are you?"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [3, 3, 3] + + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj + + process_dataset("input", "output_dir", tokenizer_mock) + + # Validate + mock_load_dataset.assert_called_once() + 
mock_check_processor.assert_called_once() + mock_dataset_class.from_list.assert_called_once() + mock_dataset_obj.save_to_disk.assert_called_once_with(mock.ANY) + + # Check that generated dataset is correct (extracted from the mock call) + args, kwargs = mock_dataset_class.from_list.call_args + processed_list = args[0] + assert len(processed_list) == 2 + for item in processed_list: + assert "prompt" in item + assert "prompt_tokens_count" in item + assert "output_tokens_count" in item + assert len(tokenizer_mock.encode(item["prompt"])) <= 3 + + +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_process_dataset_empty_after_processing( + mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, tokenizer_mock +): + mock_dataset = [{"prompt": ""}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [10] + + process_dataset("input", "output_dir", tokenizer_mock) + + mock_load_dataset.assert_called_once() + mock_check_processor.assert_called_once() + mock_dataset_class.from_list.assert_not_called() + + +@patch(f"{process_dataset.__module__}.push_dataset_to_hub") +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_process_dataset_push_to_hub_called( + mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, mock_push, tokenizer_mock +): + mock_dataset = [{"prompt": "abc"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [3] + + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj + + process_dataset("input", "output_dir", tokenizer_mock, push_to_hub=True, hub_dataset_id="id123") + mock_push.assert_called_once_with("id123", mock_dataset_obj) + + +@patch(f"{process_dataset.__module__}.push_dataset_to_hub") +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +def test_process_dataset_push_to_hub_not_called( + mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, mock_push, tokenizer_mock +): + mock_dataset = [{"prompt": "abc"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [3] + + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj + + process_dataset("input", "output_dir", tokenizer_mock, push_to_hub=False) + mock_push.assert_not_called() + + +def test_push_dataset_to_hub_success(): + os.environ["HF_TOKEN"] = "token" + mock_dataset = MagicMock(spec=Dataset) + push_dataset_to_hub("dataset_id", mock_dataset) + mock_dataset.push_to_hub.assert_called_once_with("dataset_id", token="token") + + +def test_push_dataset_to_hub_error_no_env(): + if "HF_TOKEN" in os.environ: + del os.environ["HF_TOKEN"] + 
mock_dataset = MagicMock(spec=Dataset) + with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): + push_dataset_to_hub("dataset_id", mock_dataset) + + +def test_push_dataset_to_hub_error_no_id(): + os.environ["HF_TOKEN"] = "token" + mock_dataset = MagicMock(spec=Dataset) + with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): + push_dataset_to_hub(None, mock_dataset) From 91027e1b40923c5fc572fba9c1af490e408fd93c Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 15 May 2025 17:46:12 +0300 Subject: [PATCH 02/18] Implemented preprocessing for datasets --- src/guidellm/preprocess/dataset.py | 171 +++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 src/guidellm/preprocess/dataset.py diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py new file mode 100644 index 00000000..1112470c --- /dev/null +++ b/src/guidellm/preprocess/dataset.py @@ -0,0 +1,171 @@ +import os +from enum import Enum +from pathlib import Path +from typing import Any, Optional, Union, Dict, Callable, Iterator + +from datasets import Dataset +from loguru import logger +from transformers import PreTrainedTokenizerBase + +from guidellm.dataset import load_dataset as guidellm_load_dataset +from guidellm.utils import check_load_processor, IntegerRangeSampler + + +class ShortPromptStrategy(str, Enum): + IGNORE = "ignore" + CONCATENATE = "concatenate" + PAD = "pad" + + +def handle_ignore_strategy( + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **kwargs +) -> Optional[str]: + if len(tokenizer.encode(current_prompt)) < min_prompt_tokens: + logger.warning("Prompt too short, ignoring") + return None + return current_prompt + + +def handle_concatenate_strategy( + current_prompt: str, + min_prompt_tokens: int, + dataset_iterator: Iterator[Dict[str, Any]], + prompt_column: str, + tokenizer: PreTrainedTokenizerBase, + **kwargs +) -> Optional[str]: + tokens_len = len(tokenizer.encode(current_prompt)) + while tokens_len < min_prompt_tokens: + try: + next_row = next(dataset_iterator) + except StopIteration: + logger.warning("Could not concatenate enough prompts to reach minimum length, ignoring") + return None + current_prompt += next_row[prompt_column] + tokens_len = len(tokenizer.encode(current_prompt)) + return current_prompt + + +def handle_pad_strategy( + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + pad_token: str, + **kwargs +) -> str: + while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: + current_prompt += pad_token + return current_prompt + + +STRATEGY_HANDLERS: Dict[ShortPromptStrategy, Callable] = { + ShortPromptStrategy.IGNORE: handle_ignore_strategy, + ShortPromptStrategy.CONCATENATE: handle_concatenate_strategy, + ShortPromptStrategy.PAD: handle_pad_strategy, +} + + +def process_dataset( + input_data: Union[str, Path], + output_path: Union[str, Path], + processor: Union[str, Path, PreTrainedTokenizerBase], + processor_args: Optional[dict[str, Any]] = None, + data_args: Optional[dict[str, Any]] = None, + short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, + pad_token: Optional[str] = None, + prompt_tokens_average: int = 10, + prompt_tokens_stdev: Optional[int] = None, + prompt_tokens_min: Optional[int] = None, + prompt_tokens_max: Optional[int] = None, + prompt_random_seed: int = 42, + output_tokens_average: int = 10, + output_tokens_stdev: Optional[int] = None, + output_tokens_min: Optional[int] = None, + 
output_tokens_max: Optional[int] = None, + output_random_seed: int = 123, + push_to_hub: bool = False, + hub_dataset_id: Optional[str] = None, +) -> None: + logger.info(f"Starting dataset conversion | Input: {input_data} | Output directory: {output_path}") + + dataset, column_mappings = guidellm_load_dataset(input_data, data_args, processor, processor_args) + tokenizer = check_load_processor(processor, processor_args, "Processor/tokenizer required for dataset conversion.") + prompt_column = column_mappings.get("prompt_column") + output_column = column_mappings.get("output_tokens_count_column", "output_tokens_count") + + prompt_token_sampler = iter( + IntegerRangeSampler( + average=prompt_tokens_average, + variance=prompt_tokens_stdev, + min_value=prompt_tokens_min, + max_value=prompt_tokens_max, + random_seed=prompt_random_seed, + ) + ) + + output_token_sampler = iter( + IntegerRangeSampler( + average=output_tokens_average, + variance=output_tokens_stdev, + min_value=output_tokens_min, + max_value=output_tokens_max, + random_seed=output_random_seed, + ) + ) + + dataset_iterator = iter(dataset) + processed_prompts = [] + handler = STRATEGY_HANDLERS[short_prompt_strategy] + + for prompt_row in dataset_iterator: + prompt_text = prompt_row[prompt_column] + target_prompt_len = next(prompt_token_sampler) + + if len(tokenizer.encode(prompt_text)) < target_prompt_len: + prompt_text = handler( + current_prompt=prompt_text, + min_prompt_tokens=target_prompt_len, + dataset_iterator=dataset_iterator, + prompt_column=prompt_column, + tokenizer=tokenizer, + pad_token=pad_token, + ) + if prompt_text is None: + continue + + if len(tokenizer.encode(prompt_text)) > target_prompt_len: + tokens = tokenizer.encode(prompt_text) + prompt_text = tokenizer.decode(tokens[:target_prompt_len]) + + processed_prompt = prompt_row.copy() + processed_prompt[prompt_column] = prompt_text + processed_prompt["prompt_tokens_count"] = target_prompt_len + processed_prompt[output_column] = next(output_token_sampler) + + processed_prompts.append(processed_prompt) + + if not processed_prompts: + logger.error("No prompts remained after processing") + return + + logger.info(f"Generated processed dataset with {len(processed_prompts)} prompts") + + processed_dataset = Dataset.from_list(processed_prompts) + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + processed_dataset.save_to_disk(output_path) + logger.info(f"Conversion complete. 
Dataset saved to: {output_path}") + + if push_to_hub: + push_dataset_to_hub(hub_dataset_id, processed_dataset) + logger.info(f"Pushed dataset to: {hub_dataset_id}") + + +def push_dataset_to_hub(hub_dataset_id: str, processed_dataset: Dataset) -> None: + hf_token = os.environ.get("HF_TOKEN") + if not hub_dataset_id or not hf_token: + raise ValueError("hub_dataset_id and HF_TOKEN env var must be provided when push_to_hub is True") + processed_dataset.push_to_hub(hub_dataset_id, token=hf_token) From e02ec8a933a50515c948f2abd1d1f9f3abb83ce8 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 15 May 2025 17:49:07 +0300 Subject: [PATCH 03/18] Reverted irrelevant formatting --- src/guidellm/__main__.py | 133 +++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 79cb97a3..1afb6720 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -41,7 +41,6 @@ def parse_number_str(ctx, param, value): # noqa: ARG001 @click.group() def cli(): - """GuideLLM CLI tools.""" pass @@ -58,8 +57,8 @@ def cli(): "--backend-type", type=click.Choice(list(get_args(BackendType))), help=( - "The type of backend to use to run requests against. Defaults to 'openai_http'." - f" Supported types: {', '.join(get_args(BackendType))}" + "The type of backend to use to run requests against. Defaults to 'openai_http'." + f" Supported types: {', '.join(get_args(BackendType))}" ), default="openai_http", ) @@ -68,8 +67,8 @@ def cli(): callback=parse_json, default=None, help=( - "A JSON string containing any arguments to pass to the backend as a " - "dict with **kwargs." + "A JSON string containing any arguments to pass to the backend as a " + "dict with **kwargs." ), ) @click.option( @@ -77,8 +76,8 @@ def cli(): default=None, type=str, help=( - "The ID of the model to benchmark within the backend. " - "If None provided (default), then it will use the first model available." + "The ID of the model to benchmark within the backend. " + "If None provided (default), then it will use the first model available." ), ) @click.option( @@ -86,9 +85,9 @@ def cli(): default=None, type=str, help=( - "The processor or tokenizer to use to calculate token counts for statistics " - "and synthetic data generation. If None provided (default), will load " - "using the model arg, if needed." + "The processor or tokenizer to use to calculate token counts for statistics " + "and synthetic data generation. If None provided (default), will load " + "using the model arg, if needed." ), ) @click.option( @@ -96,8 +95,8 @@ def cli(): default=None, callback=parse_json, help=( - "A JSON string containing any arguments to pass to the processor constructor " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the processor constructor " + "as a dict with **kwargs." ), ) @click.option( @@ -105,17 +104,17 @@ def cli(): required=True, type=str, help=( - "The HuggingFace dataset ID, a path to a HuggingFace dataset, " - "a path to a data file csv, json, jsonl, or txt, " - "or a synthetic data config as a json or key=value string." + "The HuggingFace dataset ID, a path to a HuggingFace dataset, " + "a path to a data file csv, json, jsonl, or txt, " + "or a synthetic data config as a json or key=value string." ), ) @click.option( "--data-args", callback=parse_json, help=( - "A JSON string containing any arguments to pass to the dataset creation " - "as a dict with **kwargs." 
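For reference, the process_dataset API introduced in the preprocessing module above can be driven directly from Python. The sketch below is illustrative only: the dataset path, output directory, and tokenizer name are placeholders rather than anything from the patch, and at this stage of the series the result is written with Dataset.save_to_disk.

    from guidellm.preprocess import ShortPromptStrategy, process_dataset

    # Rewrite prompts to roughly 128 tokens each and attach a target output length,
    # concatenating rows that are too short instead of dropping them.
    process_dataset(
        input_data="my_prompts.jsonl",       # any source guidellm's dataset loader accepts
        output_path="processed_dataset",     # saved as an Arrow dataset directory at this stage
        processor="gpt2",                    # tokenizer used for counting and trimming tokens
        short_prompt_strategy=ShortPromptStrategy.CONCATENATE,
        prompt_tokens_average=128,
        output_tokens_average=256,
    )
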
+ "A JSON string containing any arguments to pass to the dataset creation " + "as a dict with **kwargs." ), ) @click.option( @@ -123,8 +122,8 @@ def cli(): default=None, type=click.Choice(["random"]), help=( - "The data sampler type to use. 'random' will add a random shuffle on the data. " - "Defaults to None" + "The data sampler type to use. 'random' will add a random shuffle on the data. " + "Defaults to None" ), ) @click.option( @@ -132,8 +131,8 @@ def cli(): required=True, type=click.Choice(STRATEGY_PROFILE_CHOICES), help=( - "The type of benchmark to run. " - f"Supported types {', '.join(STRATEGY_PROFILE_CHOICES)}. " + "The type of benchmark to run. " + f"Supported types {', '.join(STRATEGY_PROFILE_CHOICES)}. " ), ) @click.option( @@ -141,28 +140,28 @@ def cli(): default=None, callback=parse_number_str, help=( - "The rates to run the benchmark at. " - "Can be a single number or a comma-separated list of numbers. " - "For rate-type=sweep, this is the number of benchmarks it runs in the sweep. " - "For rate-type=concurrent, this is the number of concurrent requests. " - "For rate-type=async,constant,poisson, this is the rate requests per second. " - "For rate-type=synchronous,throughput, this must not be set." + "The rates to run the benchmark at. " + "Can be a single number or a comma-separated list of numbers. " + "For rate-type=sweep, this is the number of benchmarks it runs in the sweep. " + "For rate-type=concurrent, this is the number of concurrent requests. " + "For rate-type=async,constant,poisson, this is the rate requests per second. " + "For rate-type=synchronous,throughput, this must not be set." ), ) @click.option( "--max-seconds", type=float, help=( - "The maximum number of seconds each benchmark can run for. " - "If None, will run until max_requests or the data is exhausted." + "The maximum number of seconds each benchmark can run for. " + "If None, will run until max_requests or the data is exhausted." ), ) @click.option( "--max-requests", type=int, help=( - "The maximum number of requests each benchmark can run for. " - "If None, will run until max_seconds or the data is exhausted." + "The maximum number of requests each benchmark can run for. " + "If None, will run until max_seconds or the data is exhausted." ), ) @click.option( @@ -170,18 +169,18 @@ def cli(): type=float, default=None, help=( - "The percent of the benchmark (based on max-seconds, max-requets, " - "or lenth of dataset) to run as a warmup and not include in the final results. " - "Defaults to None." + "The percent of the benchmark (based on max-seconds, max-requets, " + "or lenth of dataset) to run as a warmup and not include in the final results. " + "Defaults to None." ), ) @click.option( "--cooldown-percent", type=float, help=( - "The percent of the benchmark (based on max-seconds, max-requets, or lenth " - "of dataset) to run as a cooldown and not include in the final results. " - "Defaults to None." + "The percent of the benchmark (based on max-seconds, max-requets, or lenth " + "of dataset) to run as a cooldown and not include in the final results. " + "Defaults to None." ), ) @click.option( @@ -204,10 +203,10 @@ def cli(): type=click.Path(), default=Path.cwd() / "benchmarks.json", help=( - "The path to save the output to. If it is a directory, " - "it will save benchmarks.json under it. " - "Otherwise, json, yaml, or csv files are supported for output types " - "which will be read from the extension for the file path." + "The path to save the output to. 
If it is a directory, " + "it will save benchmarks.json under it. " + "Otherwise, json, yaml, or csv files are supported for output types " + "which will be read from the extension for the file path." ), ) @click.option( @@ -219,8 +218,8 @@ def cli(): "--output-sampling", type=int, help=( - "The number of samples to save in the output file. " - "If None (default), will save all samples." + "The number of samples to save in the output file. " + "If None (default), will save all samples." ), default=None, ) @@ -231,28 +230,28 @@ def cli(): help="The random seed to use for benchmarking to ensure reproducibility.", ) def benchmark( - target, - backend_type, - backend_args, - model, - processor, - processor_args, - data, - data_args, - data_sampler, - rate_type, - rate, - max_seconds, - max_requests, - warmup_percent, - cooldown_percent, - disable_progress, - display_scheduler_stats, - disable_console_outputs, - output_path, - output_extras, - output_sampling, - random_seed, + target, + backend_type, + backend_args, + model, + processor, + processor_args, + data, + data_args, + data_sampler, + rate_type, + rate, + max_seconds, + max_requests, + warmup_percent, + cooldown_percent, + disable_progress, + display_scheduler_stats, + disable_console_outputs, + output_path, + output_extras, + output_sampling, + random_seed, ): asyncio.run( benchmark_generative_text( @@ -284,8 +283,8 @@ def benchmark( @cli.command( help=( - "Print out the available configuration settings that can be set " - "through environment variables." + "Print out the available configuration settings that can be set " + "through environment variables." ) ) def config(): From fc4bfb2fd7389814c7c1ec8e6bbe7e8390c00ebd Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 15 May 2025 17:57:36 +0300 Subject: [PATCH 04/18] Removed redundant comments --- tests/unit/preprocess/test_dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index e14bffc3..126e2d0e 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -95,13 +95,11 @@ def test_process_dataset_non_empty( process_dataset("input", "output_dir", tokenizer_mock) - # Validate mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() mock_dataset_class.from_list.assert_called_once() mock_dataset_obj.save_to_disk.assert_called_once_with(mock.ANY) - # Check that generated dataset is correct (extracted from the mock call) args, kwargs = mock_dataset_class.from_list.call_args processed_list = args[0] assert len(processed_list) == 2 From 2bf436c68ed1db4be53dbbac1d42893e69013546 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 18 May 2025 14:40:30 +0300 Subject: [PATCH 05/18] Added support for saving in specific file format --- src/guidellm/__main__.py | 2 +- src/guidellm/preprocess/dataset.py | 40 ++++++++++++-- tests/unit/preprocess/test_dataset.py | 75 +++++++++++++++++++++------ 3 files changed, 97 insertions(+), 20 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 1afb6720..68a63f37 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -310,7 +310,7 @@ def preprocess(): ) @click.argument( "output_path", - type=click.Path(file_okay=False, dir_okay=True, writable=True, resolve_path=True), + type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True), metavar="OUTPUT_PATH", required=True, ) diff --git a/src/guidellm/preprocess/dataset.py 
b/src/guidellm/preprocess/dataset.py index 1112470c..6ab5c077 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -10,6 +10,12 @@ from guidellm.dataset import load_dataset as guidellm_load_dataset from guidellm.utils import check_load_processor, IntegerRangeSampler +SUPPORTED_TYPES = { + ".json", + ".csv", + ".parquet", +} + class ShortPromptStrategy(str, Enum): IGNORE = "ignore" @@ -68,6 +74,33 @@ def handle_pad_strategy( } +def save_dataset_to_file(dataset: Dataset, output_path: Path) -> None: + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + if output_path.suffix == ".csv": + dataset.to_csv(str(output_path)) + elif output_path.suffix == ".json": + dataset.to_json(str(output_path)) + elif output_path.suffix == ".parquet": + dataset.to_parquet(str(output_path)) + else: + raise ValueError( + f"Unsupported file suffix '{output_path.suffix}' in output_path '{output_path}'. " + f"Only {SUPPORTED_TYPES} are supported." + ) + + +def _validate_output_suffix(output_path: str) -> None: + output_path = Path(output_path) + suffix = output_path.suffix.lower() + if suffix not in SUPPORTED_TYPES: + raise ValueError( + f"Unsupported file suffix '{suffix}' in output_path '{output_path}'. " + f"Only {SUPPORTED_TYPES} are supported." + ) + + def process_dataset( input_data: Union[str, Path], output_path: Union[str, Path], @@ -89,6 +122,7 @@ def process_dataset( push_to_hub: bool = False, hub_dataset_id: Optional[str] = None, ) -> None: + _validate_output_suffix(output_path) logger.info(f"Starting dataset conversion | Input: {input_data} | Output directory: {output_path}") dataset, column_mappings = guidellm_load_dataset(input_data, data_args, processor, processor_args) @@ -138,7 +172,7 @@ def process_dataset( if len(tokenizer.encode(prompt_text)) > target_prompt_len: tokens = tokenizer.encode(prompt_text) - prompt_text = tokenizer.decode(tokens[:target_prompt_len]) + prompt_text = tokenizer.decode(tokens[:target_prompt_len], skip_special_tokens=True) processed_prompt = prompt_row.copy() processed_prompt[prompt_column] = prompt_text @@ -154,9 +188,7 @@ def process_dataset( logger.info(f"Generated processed dataset with {len(processed_prompts)} prompts") processed_dataset = Dataset.from_list(processed_prompts) - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - processed_dataset.save_to_disk(output_path) + save_dataset_to_file(processed_dataset, output_path) logger.info(f"Conversion complete. 
Dataset saved to: {output_path}") if push_to_hub: diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 126e2d0e..2b6c4665 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -1,5 +1,5 @@ import os -from unittest import mock +from pathlib import Path from unittest.mock import patch, MagicMock import pytest @@ -13,7 +13,7 @@ process_dataset, push_dataset_to_hub, ShortPromptStrategy, - STRATEGY_HANDLERS, + STRATEGY_HANDLERS, save_dataset_to_file, ) @@ -21,7 +21,7 @@ def tokenizer_mock(): tokenizer = MagicMock(spec=PreTrainedTokenizerBase) tokenizer.encode.side_effect = lambda x: [1] * len(x) - tokenizer.decode.side_effect = lambda x: ''.join(str(item) for item in x) + tokenizer.decode.side_effect = lambda x, *args, **kwargs: ''.join(str(item) for item in x) return tokenizer @@ -42,7 +42,7 @@ def test_strategy_handler_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir", tokenizer_mock, short_prompt_strategy=ShortPromptStrategy.IGNORE) + process_dataset("input", "output_dir/data.json", tokenizer_mock, short_prompt_strategy=ShortPromptStrategy.IGNORE) assert mock_handler.call_count == 2 mock_load_dataset.assert_called_once() @@ -78,13 +78,21 @@ def test_handle_pad_strategy(tokenizer_mock): assert result == "shortppppp" -@patch(f"{process_dataset.__module__}.Dataset") -@patch(f"{process_dataset.__module__}.guidellm_load_dataset") -@patch(f"{process_dataset.__module__}.check_load_processor") -@patch(f"{process_dataset.__module__}.IntegerRangeSampler") +@patch("guidellm.preprocess.dataset.save_dataset_to_file") +@patch("guidellm.preprocess.dataset.Dataset") +@patch("guidellm.preprocess.dataset.guidellm_load_dataset") +@patch("guidellm.preprocess.dataset.check_load_processor") +@patch("guidellm.preprocess.dataset.IntegerRangeSampler") def test_process_dataset_non_empty( - mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, tokenizer_mock + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_save_to_file, + tokenizer_mock, ): + from guidellm.preprocess.dataset import process_dataset + mock_dataset = [{"prompt": "Hello"}, {"prompt": "How are you?"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) mock_check_processor.return_value = tokenizer_mock @@ -93,14 +101,15 @@ def test_process_dataset_non_empty( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir", tokenizer_mock) + output_path = "output_dir/data.json" + process_dataset("input", output_path, tokenizer_mock) mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() mock_dataset_class.from_list.assert_called_once() - mock_dataset_obj.save_to_disk.assert_called_once_with(mock.ANY) + mock_save_to_file.assert_called_once_with(mock_dataset_obj, output_path) - args, kwargs = mock_dataset_class.from_list.call_args + args, _ = mock_dataset_class.from_list.call_args processed_list = args[0] assert len(processed_list) == 2 for item in processed_list: @@ -122,7 +131,7 @@ def test_process_dataset_empty_after_processing( mock_check_processor.return_value = tokenizer_mock mock_sampler.side_effect = lambda **kwargs: [10] - process_dataset("input", "output_dir", tokenizer_mock) + process_dataset("input", "output_dir/data.json", tokenizer_mock) 
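With the file-format support added in this patch, the output path is validated up front and must carry a .json, .csv, or .parquet suffix; save_dataset_to_file dispatches on that suffix. A minimal sketch with illustrative file names (the CLI equivalent would pass a file path such as processed/data.json as OUTPUT_PATH):

    from datasets import Dataset
    from guidellm.preprocess.dataset import save_dataset_to_file

    # Build a tiny in-memory dataset in the shape process_dataset produces.
    ds = Dataset.from_list(
        [{"prompt": "hello", "prompt_tokens_count": 1, "output_tokens_count": 3}]
    )
    save_dataset_to_file(ds, "processed/data.parquet")  # routed to Dataset.to_parquet
    # An unsupported suffix such as ".txt" raises ValueError listing the supported types.
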
mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() @@ -145,7 +154,7 @@ def test_process_dataset_push_to_hub_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir", tokenizer_mock, push_to_hub=True, hub_dataset_id="id123") + process_dataset("input", "output_dir/data.json", tokenizer_mock, push_to_hub=True, hub_dataset_id="id123") mock_push.assert_called_once_with("id123", mock_dataset_obj) @@ -165,7 +174,7 @@ def test_process_dataset_push_to_hub_not_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir", tokenizer_mock, push_to_hub=False) + process_dataset("input", "output_dir/data.json", tokenizer_mock, push_to_hub=False) mock_push.assert_not_called() @@ -189,3 +198,39 @@ def test_push_dataset_to_hub_error_no_id(): mock_dataset = MagicMock(spec=Dataset) with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub(None, mock_dataset) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_csv(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.csv") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_csv.assert_called_once_with(str(output_path)) + mock_mkdir.assert_called_once() + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_json(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.json") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(str(output_path)) + mock_mkdir.assert_called_once() + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_parquet(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.parquet") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_parquet.assert_called_once_with(str(output_path)) + mock_mkdir.assert_called_once() + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_unsupported_type(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.txt") + with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): + save_dataset_to_file(mock_dataset, output_path) + mock_mkdir.assert_called_once() From 64a91de7aceeeaaeff38d558eda04435cf988c1d Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 18 May 2025 14:56:53 +0300 Subject: [PATCH 06/18] Fixed code styling --- src/guidellm/__main__.py | 86 +++++++++---------- src/guidellm/preprocess/__init__.py | 7 +- src/guidellm/preprocess/dataset.py | 117 +++++++++++++++----------- tests/unit/preprocess/test_dataset.py | 104 ++++++++++++++++------- 4 files changed, 186 insertions(+), 128 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 68a63f37..6df1089d 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -8,7 +8,7 @@ from guidellm.backend import BackendType from guidellm.benchmark import ProfileType, benchmark_generative_text from guidellm.config import print_config -from guidellm.preprocess.dataset import process_dataset, ShortPromptStrategy +from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset from guidellm.scheduler import StrategyType STRATEGY_PROFILE_CHOICES = set( @@ -298,9 +298,9 @@ def preprocess(): @preprocess.command( help="Convert a dataset to have specific prompt and output token sizes.\n\n" - 
"INPUT_DATA: Path to the input dataset or dataset ID.\n" - "OUTPUT_PATH: Directory to save the converted dataset. " - "The dataset will be saved as an Arrow dataset (.arrow) inside the directory." + "INPUT_DATA: Path to the input dataset or dataset ID.\n" + "OUTPUT_PATH: Directory to save the converted dataset. " + "The dataset will be saved as an Arrow dataset (.arrow) inside the directory." ) @click.argument( "input_data", @@ -319,8 +319,8 @@ def preprocess(): type=str, required=True, help=( - "The processor or tokenizer to use to calculate token counts for statistics " - "and synthetic data generation." + "The processor or tokenizer to use to calculate token counts for statistics " + "and synthetic data generation." ), ) @click.option( @@ -328,16 +328,16 @@ def preprocess(): default=None, callback=parse_json, help=( - "A JSON string containing any arguments to pass to the processor constructor " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the processor constructor " + "as a dict with **kwargs." ), ) @click.option( "--data-args", callback=parse_json, help=( - "A JSON string containing any arguments to pass to the dataset creation " - "as a dict with **kwargs." + "A JSON string containing any arguments to pass to the dataset creation " + "as a dict with **kwargs." ), ) @click.option( @@ -345,110 +345,110 @@ def preprocess(): type=click.Choice([s.value for s in ShortPromptStrategy]), default=ShortPromptStrategy.IGNORE.value, show_default=True, - help="Strategy to handle prompts shorter than the target length. " + help="Strategy to handle prompts shorter than the target length. ", ) @click.option( "--pad-token", type=str, default=None, - help="The token to pad short prompts with when using the 'pad' strategy." + help="The token to pad short prompts with when using the 'pad' strategy.", ) @click.option( "--prompt-tokens-average", type=int, default=10, show_default=True, - help="Average target number of tokens for prompts." + help="Average target number of tokens for prompts.", ) @click.option( "--prompt-tokens-stdev", type=int, default=None, - help="Standard deviation for prompt tokens sampling." + help="Standard deviation for prompt tokens sampling.", ) @click.option( "--prompt-tokens-min", type=int, default=None, - help="Minimum number of prompt tokens." + help="Minimum number of prompt tokens.", ) @click.option( "--prompt-tokens-max", type=int, default=None, - help="Maximum number of prompt tokens." + help="Maximum number of prompt tokens.", ) @click.option( "--prompt-random-seed", type=int, default=42, show_default=True, - help="Random seed for prompt token sampling." + help="Random seed for prompt token sampling.", ) @click.option( "--output-tokens-average", type=int, default=10, show_default=True, - help="Average target number of tokens for outputs." + help="Average target number of tokens for outputs.", ) @click.option( "--output-tokens-stdev", type=int, default=None, - help="Standard deviation for output tokens sampling." + help="Standard deviation for output tokens sampling.", ) @click.option( "--output-tokens-min", type=int, default=None, - help="Minimum number of output tokens." + help="Minimum number of output tokens.", ) @click.option( "--output-tokens-max", type=int, default=None, - help="Maximum number of output tokens." + help="Maximum number of output tokens.", ) @click.option( "--output-random-seed", type=int, default=123, show_default=True, - help="Random seed for output token sampling." 
+ help="Random seed for output token sampling.", ) @click.option( "--push-to-hub", is_flag=True, - help="Set this flag to push the converted dataset to the Hugging Face Hub." + help="Set this flag to push the converted dataset to the Hugging Face Hub.", ) @click.option( "--hub-dataset-id", type=str, default=None, help="The Hugging Face Hub dataset ID to push to. " - "Required if --push-to-hub is used." + "Required if --push-to-hub is used.", ) def dataset( - input_data, - output_path, - processor, - processor_args, - data_args, - short_prompt_strategy, - pad_token, - prompt_tokens_average, - prompt_tokens_stdev, - prompt_tokens_min, - prompt_tokens_max, - prompt_random_seed, - output_tokens_average, - output_tokens_stdev, - output_tokens_min, - output_tokens_max, - output_random_seed, - push_to_hub, - hub_dataset_id, + input_data, + output_path, + processor, + processor_args, + data_args, + short_prompt_strategy, + pad_token, + prompt_tokens_average, + prompt_tokens_stdev, + prompt_tokens_min, + prompt_tokens_max, + prompt_random_seed, + output_tokens_average, + output_tokens_stdev, + output_tokens_min, + output_tokens_max, + output_random_seed, + push_to_hub, + hub_dataset_id, ): process_dataset( input_data=input_data, diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py index 669bf409..95d01e5f 100644 --- a/src/guidellm/preprocess/__init__.py +++ b/src/guidellm/preprocess/__init__.py @@ -1,6 +1,3 @@ -from .dataset import process_dataset, ShortPromptStrategy +from .dataset import ShortPromptStrategy, process_dataset -__all__ = [ - "process_dataset", - "ShortPromptStrategy" -] +__all__ = ["ShortPromptStrategy", "process_dataset"] diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index 6ab5c077..f330b537 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -1,14 +1,15 @@ import os +from collections.abc import Iterator from enum import Enum from pathlib import Path -from typing import Any, Optional, Union, Dict, Callable, Iterator +from typing import Any, Callable, Optional, Union from datasets import Dataset from loguru import logger from transformers import PreTrainedTokenizerBase from guidellm.dataset import load_dataset as guidellm_load_dataset -from guidellm.utils import check_load_processor, IntegerRangeSampler +from guidellm.utils import IntegerRangeSampler, check_load_processor SUPPORTED_TYPES = { ".json", @@ -24,10 +25,10 @@ class ShortPromptStrategy(str, Enum): def handle_ignore_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - **kwargs + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: if len(tokenizer.encode(current_prompt)) < min_prompt_tokens: logger.warning("Prompt too short, ignoring") @@ -36,19 +37,21 @@ def handle_ignore_strategy( def handle_concatenate_strategy( - current_prompt: str, - min_prompt_tokens: int, - dataset_iterator: Iterator[Dict[str, Any]], - prompt_column: str, - tokenizer: PreTrainedTokenizerBase, - **kwargs + current_prompt: str, + min_prompt_tokens: int, + dataset_iterator: Iterator[dict[str, Any]], + prompt_column: str, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: tokens_len = len(tokenizer.encode(current_prompt)) while tokens_len < min_prompt_tokens: try: next_row = next(dataset_iterator) except StopIteration: - logger.warning("Could not concatenate enough prompts to reach minimum length, ignoring") + 
logger.warning( + "Could not concatenate enough prompts to reach minimum length, ignoring" + ) return None current_prompt += next_row[prompt_column] tokens_len = len(tokenizer.encode(current_prompt)) @@ -56,25 +59,25 @@ def handle_concatenate_strategy( def handle_pad_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - pad_token: str, - **kwargs + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + pad_token: str, + **_kwargs, ) -> str: while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: current_prompt += pad_token return current_prompt -STRATEGY_HANDLERS: Dict[ShortPromptStrategy, Callable] = { +STRATEGY_HANDLERS: dict[ShortPromptStrategy, Callable] = { ShortPromptStrategy.IGNORE: handle_ignore_strategy, ShortPromptStrategy.CONCATENATE: handle_concatenate_strategy, ShortPromptStrategy.PAD: handle_pad_strategy, } -def save_dataset_to_file(dataset: Dataset, output_path: Path) -> None: +def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) @@ -86,12 +89,12 @@ def save_dataset_to_file(dataset: Dataset, output_path: Path) -> None: dataset.to_parquet(str(output_path)) else: raise ValueError( - f"Unsupported file suffix '{output_path.suffix}' in output_path '{output_path}'. " - f"Only {SUPPORTED_TYPES} are supported." + f"Unsupported file suffix '{output_path.suffix}' in output_path " + f"'{output_path}'. Only {SUPPORTED_TYPES} are supported." ) -def _validate_output_suffix(output_path: str) -> None: +def _validate_output_suffix(output_path: Union[str, Path]) -> None: output_path = Path(output_path) suffix = output_path.suffix.lower() if suffix not in SUPPORTED_TYPES: @@ -102,33 +105,44 @@ def _validate_output_suffix(output_path: str) -> None: def process_dataset( - input_data: Union[str, Path], - output_path: Union[str, Path], - processor: Union[str, Path, PreTrainedTokenizerBase], - processor_args: Optional[dict[str, Any]] = None, - data_args: Optional[dict[str, Any]] = None, - short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, - pad_token: Optional[str] = None, - prompt_tokens_average: int = 10, - prompt_tokens_stdev: Optional[int] = None, - prompt_tokens_min: Optional[int] = None, - prompt_tokens_max: Optional[int] = None, - prompt_random_seed: int = 42, - output_tokens_average: int = 10, - output_tokens_stdev: Optional[int] = None, - output_tokens_min: Optional[int] = None, - output_tokens_max: Optional[int] = None, - output_random_seed: int = 123, - push_to_hub: bool = False, - hub_dataset_id: Optional[str] = None, + input_data: Union[str, Path], + output_path: Union[str, Path], + processor: Union[str, Path, PreTrainedTokenizerBase], + processor_args: Optional[dict[str, Any]] = None, + data_args: Optional[dict[str, Any]] = None, + short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, + pad_token: Optional[str] = None, + prompt_tokens_average: int = 10, + prompt_tokens_stdev: Optional[int] = None, + prompt_tokens_min: Optional[int] = None, + prompt_tokens_max: Optional[int] = None, + prompt_random_seed: int = 42, + output_tokens_average: int = 10, + output_tokens_stdev: Optional[int] = None, + output_tokens_min: Optional[int] = None, + output_tokens_max: Optional[int] = None, + output_random_seed: int = 123, + push_to_hub: bool = False, + hub_dataset_id: Optional[str] = None, ) -> None: _validate_output_suffix(output_path) - 
logger.info(f"Starting dataset conversion | Input: {input_data} | Output directory: {output_path}") + logger.info( + f"Starting dataset conversion | Input: {input_data} | " + f"Output directory: {output_path}" + ) - dataset, column_mappings = guidellm_load_dataset(input_data, data_args, processor, processor_args) - tokenizer = check_load_processor(processor, processor_args, "Processor/tokenizer required for dataset conversion.") + dataset, column_mappings = guidellm_load_dataset( + input_data, data_args, processor, processor_args + ) + tokenizer = check_load_processor( + processor, + processor_args, + "Processor/tokenizer required for dataset conversion.", + ) prompt_column = column_mappings.get("prompt_column") - output_column = column_mappings.get("output_tokens_count_column", "output_tokens_count") + output_column = column_mappings.get( + "output_tokens_count_column", "output_tokens_count" + ) prompt_token_sampler = iter( IntegerRangeSampler( @@ -172,7 +186,9 @@ def process_dataset( if len(tokenizer.encode(prompt_text)) > target_prompt_len: tokens = tokenizer.encode(prompt_text) - prompt_text = tokenizer.decode(tokens[:target_prompt_len], skip_special_tokens=True) + prompt_text = tokenizer.decode( + tokens[:target_prompt_len], skip_special_tokens=True + ) processed_prompt = prompt_row.copy() processed_prompt[prompt_column] = prompt_text @@ -196,8 +212,11 @@ def process_dataset( logger.info(f"Pushed dataset to: {hub_dataset_id}") -def push_dataset_to_hub(hub_dataset_id: str, processed_dataset: Dataset) -> None: +def push_dataset_to_hub(hub_dataset_id: Optional[str], processed_dataset: Dataset) -> None: hf_token = os.environ.get("HF_TOKEN") if not hub_dataset_id or not hf_token: - raise ValueError("hub_dataset_id and HF_TOKEN env var must be provided when push_to_hub is True") + raise ValueError( + "hub_dataset_id and HF_TOKEN env var must be provided when push_to_hub" + " is True" + ) processed_dataset.push_to_hub(hub_dataset_id, token=hf_token) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 2b6c4665..756817a5 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -1,19 +1,21 @@ import os from pathlib import Path -from unittest.mock import patch, MagicMock +from typing import Iterator +from unittest.mock import MagicMock, patch import pytest from datasets import Dataset from transformers import PreTrainedTokenizerBase from guidellm.preprocess.dataset import ( - handle_ignore_strategy, + STRATEGY_HANDLERS, + ShortPromptStrategy, handle_concatenate_strategy, + handle_ignore_strategy, handle_pad_strategy, process_dataset, push_dataset_to_hub, - ShortPromptStrategy, - STRATEGY_HANDLERS, save_dataset_to_file, + save_dataset_to_file, ) @@ -21,32 +23,48 @@ def tokenizer_mock(): tokenizer = MagicMock(spec=PreTrainedTokenizerBase) tokenizer.encode.side_effect = lambda x: [1] * len(x) - tokenizer.decode.side_effect = lambda x, *args, **kwargs: ''.join(str(item) for item in x) + tokenizer.decode.side_effect = lambda x, *args, **kwargs: "".join( + str(item) for item in x + ) return tokenizer -@patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: MagicMock(return_value="processed_prompt")}) +from unittest.mock import MagicMock, patch +from guidellm.preprocess.dataset import process_dataset, STRATEGY_HANDLERS, ShortPromptStrategy +from datasets import Dataset + + @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") 
@patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_strategy_handler_called( - mock_sampler, mock_dataset_class, mock_check_processor, mock_load_dataset, tokenizer_mock + mock_sampler, + mock_dataset_class, + mock_check_processor, + mock_load_dataset, + tokenizer_mock, ): - mock_handler = STRATEGY_HANDLERS[ShortPromptStrategy.IGNORE] - mock_dataset = [{"prompt": "abc"}, {"prompt": "def"}] - mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) - mock_check_processor.return_value = tokenizer_mock - mock_sampler.side_effect = lambda **kwargs: [10, 10] + mock_handler = MagicMock(return_value="processed_prompt") + with patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: mock_handler}): + mock_dataset = [{"prompt": "abc"}, {"prompt": "def"}] + mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) + mock_check_processor.return_value = tokenizer_mock + mock_sampler.side_effect = lambda **kwargs: [10, 10] - mock_dataset_obj = MagicMock(spec=Dataset) - mock_dataset_class.from_list.return_value = mock_dataset_obj + mock_dataset_obj = MagicMock(spec=Dataset) + mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir/data.json", tokenizer_mock, short_prompt_strategy=ShortPromptStrategy.IGNORE) + process_dataset( + "input", + "output_dir/data.json", + tokenizer_mock, + short_prompt_strategy=ShortPromptStrategy.IGNORE, + ) - assert mock_handler.call_count == 2 - mock_load_dataset.assert_called_once() - mock_check_processor.assert_called_once() + assert mock_handler.call_count == 2 + mock_load_dataset.assert_called_once() + mock_check_processor.assert_called_once() def test_handle_ignore_strategy_too_short(tokenizer_mock): @@ -63,13 +81,17 @@ def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): dataset_iter = iter([{"prompt": "longer"}]) - result = handle_concatenate_strategy("short", 10, dataset_iter, "prompt", tokenizer_mock) + result = handle_concatenate_strategy( + "short", 10, dataset_iter, "prompt", tokenizer_mock + ) assert result == "shortlonger" def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): - dataset_iter = iter([]) - result = handle_concatenate_strategy("short", 10, dataset_iter, "prompt", tokenizer_mock) + dataset_iter: Iterator = iter([]) + result = handle_concatenate_strategy( + "short", 10, dataset_iter, "prompt", tokenizer_mock + ) assert result is None @@ -84,12 +106,12 @@ def test_handle_pad_strategy(tokenizer_mock): @patch("guidellm.preprocess.dataset.check_load_processor") @patch("guidellm.preprocess.dataset.IntegerRangeSampler") def test_process_dataset_non_empty( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_save_to_file, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_save_to_file, + tokenizer_mock, ): from guidellm.preprocess.dataset import process_dataset @@ -124,7 +146,11 @@ def test_process_dataset_non_empty( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_empty_after_processing( - mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, tokenizer_mock + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + tokenizer_mock, ): mock_dataset = [{"prompt": ""}] 
mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -144,7 +170,12 @@ def test_process_dataset_empty_after_processing( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_called( - mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, mock_push, tokenizer_mock + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -154,7 +185,13 @@ def test_process_dataset_push_to_hub_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir/data.json", tokenizer_mock, push_to_hub=True, hub_dataset_id="id123") + process_dataset( + "input", + "output_dir/data.json", + tokenizer_mock, + push_to_hub=True, + hub_dataset_id="id123", + ) mock_push.assert_called_once_with("id123", mock_dataset_obj) @@ -164,7 +201,12 @@ def test_process_dataset_push_to_hub_called( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_not_called( - mock_sampler, mock_check_processor, mock_load_dataset, mock_dataset_class, mock_push, tokenizer_mock + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) From b5a4877b577abb75aee3c4ba7f0b9258f2a8eb80 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 18 May 2025 16:14:32 +0300 Subject: [PATCH 07/18] Improved error message --- src/guidellm/preprocess/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index f330b537..9daa5256 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -137,7 +137,7 @@ def process_dataset( tokenizer = check_load_processor( processor, processor_args, - "Processor/tokenizer required for dataset conversion.", + "dataset conversion.", ) prompt_column = column_mappings.get("prompt_column") output_column = column_mappings.get( @@ -185,7 +185,7 @@ def process_dataset( continue if len(tokenizer.encode(prompt_text)) > target_prompt_len: - tokens = tokenizer.encode(prompt_text) + tokens = tokenizer.encode(prompt_text, add_special_tokens=True) prompt_text = tokenizer.decode( tokens[:target_prompt_len], skip_special_tokens=True ) From 4d624a28c8c5f8c65d89ddd3163d81bc0e62811e Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Tue, 20 May 2025 09:43:53 +0300 Subject: [PATCH 08/18] Fixed code styling --- src/guidellm/preprocess/dataset.py | 4 +- tests/unit/preprocess/test_dataset.py | 66 +++++++++++++-------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index 9daa5256..baae943c 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -212,7 +212,9 @@ def process_dataset( logger.info(f"Pushed dataset to: {hub_dataset_id}") -def push_dataset_to_hub(hub_dataset_id: Optional[str], processed_dataset: Dataset) -> None: +def push_dataset_to_hub( + hub_dataset_id: Optional[str], processed_dataset: Dataset, +) -> None: hf_token 
= os.environ.get("HF_TOKEN") if not hub_dataset_id or not hf_token: raise ValueError( diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 756817a5..dd4ab645 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -1,8 +1,11 @@ import os from pathlib import Path -from typing import Iterator +from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch +if TYPE_CHECKING: + from collections.abc import Iterator + import pytest from datasets import Dataset from transformers import PreTrainedTokenizerBase @@ -29,21 +32,16 @@ def tokenizer_mock(): return tokenizer -from unittest.mock import MagicMock, patch -from guidellm.preprocess.dataset import process_dataset, STRATEGY_HANDLERS, ShortPromptStrategy -from datasets import Dataset - - @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_strategy_handler_called( - mock_sampler, - mock_dataset_class, - mock_check_processor, - mock_load_dataset, - tokenizer_mock, + mock_sampler, + mock_dataset_class, + mock_check_processor, + mock_load_dataset, + tokenizer_mock, ): mock_handler = MagicMock(return_value="processed_prompt") with patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: mock_handler}): @@ -106,12 +104,12 @@ def test_handle_pad_strategy(tokenizer_mock): @patch("guidellm.preprocess.dataset.check_load_processor") @patch("guidellm.preprocess.dataset.IntegerRangeSampler") def test_process_dataset_non_empty( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_save_to_file, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_save_to_file, + tokenizer_mock, ): from guidellm.preprocess.dataset import process_dataset @@ -146,11 +144,11 @@ def test_process_dataset_non_empty( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_empty_after_processing( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + tokenizer_mock, ): mock_dataset = [{"prompt": ""}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -170,12 +168,12 @@ def test_process_dataset_empty_after_processing( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_called( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_push, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -201,12 +199,12 @@ def test_process_dataset_push_to_hub_called( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_not_called( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_push, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + 
tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) From e2ca9193fd9bc38c10eab4bc67a295efe5d4def5 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 15:32:40 +0300 Subject: [PATCH 09/18] Fixed CR comments --- pyproject.toml | 3 +- src/guidellm/__main__.py | 79 ++++++++------ src/guidellm/preprocess/dataset.py | 150 +++++++++++++++----------- tests/unit/preprocess/test_dataset.py | 69 ++++++++++-- 4 files changed, 193 insertions(+), 108 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a78b1fc5..9f86d993 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,8 +21,7 @@ name = "guidellm" description = "Guidance platform for deploying and managing large language models." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.9.0,<4.0" -license = "Apache-2.0" -license-files = ["LICENSE"] +license = { text = "Apache-2.0" } authors = [ { name = "Red Hat" } ] keywords = [ "ai", diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 6df1089d..51ac7049 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -1,4 +1,5 @@ import asyncio +import codecs import json from pathlib import Path from typing import get_args @@ -280,6 +281,18 @@ def benchmark( ) ) +def decode_escaped_str(_ctx, _param, value): + """ + Click auto adds characters. For example, when using --pad-char "\n", + it parses it as "\\n". This method decodes the string to handle escape + sequences correctly. + """ + if value is None: + return None + try: + return codecs.decode(value, "unicode_escape") + except Exception as e: + raise click.BadParameter(f"Could not decode escape sequences: {e}") from e @cli.command( help=( @@ -291,27 +304,26 @@ def config(): print_config() -@cli.group(help="Preprocessing utilities for datasets.") +@cli.group(help="General preprocessing tools and utilities.") def preprocess(): pass @preprocess.command( - help="Convert a dataset to have specific prompt and output token sizes.\n\n" - "INPUT_DATA: Path to the input dataset or dataset ID.\n" - "OUTPUT_PATH: Directory to save the converted dataset. " - "The dataset will be saved as an Arrow dataset (.arrow) inside the directory." + help=( + "Convert a dataset to have specific prompt and output token sizes.\n\n" + "INPUT_DATA: Path to the input dataset or dataset ID.\n" + "OUTPUT_PATH: Path to save the converted dataset, including file suffix. " + ) ) @click.argument( - "input_data", + "data", type=str, - metavar="INPUT_DATA", required=True, ) @click.argument( "output_path", type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True), - metavar="OUTPUT_PATH", required=True, ) @click.option( @@ -348,11 +360,21 @@ def preprocess(): help="Strategy to handle prompts shorter than the target length. ", ) @click.option( - "--pad-token", + "--pad-char", type=str, - default=None, + default="", + callback=decode_escaped_str, help="The token to pad short prompts with when using the 'pad' strategy.", ) +@click.option( + "--concat-delimiter", + type=str, + default="", + help=( + "The delimiter to use when concatenating prompts that are too short." + " Used when strategy is 'concatenate'." 
+ ) +) @click.option( "--prompt-tokens-average", type=int, @@ -378,13 +400,6 @@ def preprocess(): default=None, help="Maximum number of prompt tokens.", ) -@click.option( - "--prompt-random-seed", - type=int, - default=42, - show_default=True, - help="Random seed for prompt token sampling.", -) @click.option( "--output-tokens-average", type=int, @@ -410,13 +425,6 @@ def preprocess(): default=None, help="Maximum number of output tokens.", ) -@click.option( - "--output-random-seed", - type=int, - default=123, - show_default=True, - help="Random seed for output token sampling.", -) @click.option( "--push-to-hub", is_flag=True, @@ -429,47 +437,54 @@ def preprocess(): help="The Hugging Face Hub dataset ID to push to. " "Required if --push-to-hub is used.", ) +@click.option( + "--random-seed", + type=int, + default=42, + show_default=True, + help="Random seed for prompt token sampling and output tokens sampling.", +) def dataset( - input_data, + data, output_path, processor, processor_args, data_args, short_prompt_strategy, - pad_token, + pad_char, + concat_delimiter, prompt_tokens_average, prompt_tokens_stdev, prompt_tokens_min, prompt_tokens_max, - prompt_random_seed, output_tokens_average, output_tokens_stdev, output_tokens_min, output_tokens_max, - output_random_seed, push_to_hub, hub_dataset_id, + random_seed, ): process_dataset( - input_data=input_data, + data=data, output_path=output_path, processor=processor, processor_args=processor_args, data_args=data_args, short_prompt_strategy=short_prompt_strategy, - pad_token=pad_token, + pad_char=pad_char, + concat_delimiter=concat_delimiter, prompt_tokens_average=prompt_tokens_average, prompt_tokens_stdev=prompt_tokens_stdev, prompt_tokens_min=prompt_tokens_min, prompt_tokens_max=prompt_tokens_max, - prompt_random_seed=prompt_random_seed, output_tokens_average=output_tokens_average, output_tokens_stdev=output_tokens_stdev, output_tokens_min=output_tokens_min, output_tokens_max=output_tokens_max, - output_random_seed=output_random_seed, push_to_hub=push_to_hub, hub_dataset_id=hub_dataset_id, + random_seed=random_seed, ) diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index baae943c..b3f3db7a 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -13,22 +13,28 @@ SUPPORTED_TYPES = { ".json", + ".jsonl", ".csv", ".parquet", } +class PromptTooShortError(Exception): + pass + + class ShortPromptStrategy(str, Enum): IGNORE = "ignore" CONCATENATE = "concatenate" PAD = "pad" + ERROR = "error" def handle_ignore_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: if len(tokenizer.encode(current_prompt)) < min_prompt_tokens: logger.warning("Prompt too short, ignoring") @@ -37,12 +43,13 @@ def handle_ignore_strategy( def handle_concatenate_strategy( - current_prompt: str, - min_prompt_tokens: int, - dataset_iterator: Iterator[dict[str, Any]], - prompt_column: str, - tokenizer: PreTrainedTokenizerBase, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + dataset_iterator: Iterator[dict[str, Any]], + prompt_column: str, + tokenizer: PreTrainedTokenizerBase, + concat_delimiter: str, + **_kwargs, ) -> Optional[str]: tokens_len = len(tokenizer.encode(current_prompt)) while tokens_len < min_prompt_tokens: @@ -53,20 +60,35 @@ def handle_concatenate_strategy( "Could not concatenate enough prompts to reach 
minimum length, ignoring" ) return None - current_prompt += next_row[prompt_column] + current_prompt += concat_delimiter + next_row[prompt_column] tokens_len = len(tokenizer.encode(current_prompt)) return current_prompt def handle_pad_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - pad_token: str, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + pad_char: str, + **_kwargs, ) -> str: while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: - current_prompt += pad_token + current_prompt += pad_char + return current_prompt + + +def handle_error_strategy( + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, +) -> Optional[str]: + prompt_len = len(tokenizer.encode(current_prompt)) + if prompt_len < min_prompt_tokens: + raise PromptTooShortError( + f"Found too short prompt: {current_prompt}, with length: {prompt_len}. " + f"Minimum length required: {min_prompt_tokens}.", + ) return current_prompt @@ -74,23 +96,25 @@ def handle_pad_strategy( ShortPromptStrategy.IGNORE: handle_ignore_strategy, ShortPromptStrategy.CONCATENATE: handle_concatenate_strategy, ShortPromptStrategy.PAD: handle_pad_strategy, + ShortPromptStrategy.ERROR: handle_error_strategy, } def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) + suffix = output_path.suffix.lower() - if output_path.suffix == ".csv": - dataset.to_csv(str(output_path)) - elif output_path.suffix == ".json": - dataset.to_json(str(output_path)) - elif output_path.suffix == ".parquet": - dataset.to_parquet(str(output_path)) + if suffix == ".csv": + dataset.to_csv(output_path) + elif suffix in {".json", ".jsonl"}: + dataset.to_json(output_path) + elif suffix == ".parquet": + dataset.to_parquet(output_path) else: raise ValueError( - f"Unsupported file suffix '{output_path.suffix}' in output_path " - f"'{output_path}'. Only {SUPPORTED_TYPES} are supported." + f"Unsupported file suffix '{suffix}' in output_path'{output_path}'." + f" Only {SUPPORTED_TYPES} are supported." 
) @@ -105,34 +129,34 @@ def _validate_output_suffix(output_path: Union[str, Path]) -> None: def process_dataset( - input_data: Union[str, Path], - output_path: Union[str, Path], - processor: Union[str, Path, PreTrainedTokenizerBase], - processor_args: Optional[dict[str, Any]] = None, - data_args: Optional[dict[str, Any]] = None, - short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, - pad_token: Optional[str] = None, - prompt_tokens_average: int = 10, - prompt_tokens_stdev: Optional[int] = None, - prompt_tokens_min: Optional[int] = None, - prompt_tokens_max: Optional[int] = None, - prompt_random_seed: int = 42, - output_tokens_average: int = 10, - output_tokens_stdev: Optional[int] = None, - output_tokens_min: Optional[int] = None, - output_tokens_max: Optional[int] = None, - output_random_seed: int = 123, - push_to_hub: bool = False, - hub_dataset_id: Optional[str] = None, + data: Union[str, Path], + output_path: Union[str, Path], + processor: Union[str, Path, PreTrainedTokenizerBase], + processor_args: Optional[dict[str, Any]] = None, + data_args: Optional[dict[str, Any]] = None, + short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, + pad_char: Optional[str] = None, + concat_delimiter: Optional[str] = None, + prompt_tokens_average: int = 10, + prompt_tokens_stdev: Optional[int] = None, + prompt_tokens_min: Optional[int] = None, + prompt_tokens_max: Optional[int] = None, + output_tokens_average: int = 10, + output_tokens_stdev: Optional[int] = None, + output_tokens_min: Optional[int] = None, + output_tokens_max: Optional[int] = None, + push_to_hub: bool = False, + hub_dataset_id: Optional[str] = None, + random_seed: int = 42, ) -> None: _validate_output_suffix(output_path) logger.info( - f"Starting dataset conversion | Input: {input_data} | " + f"Starting dataset conversion | Input: {data} | " f"Output directory: {output_path}" ) dataset, column_mappings = guidellm_load_dataset( - input_data, data_args, processor, processor_args + data, data_args, processor, processor_args ) tokenizer = check_load_processor( processor, @@ -150,7 +174,7 @@ def process_dataset( variance=prompt_tokens_stdev, min_value=prompt_tokens_min, max_value=prompt_tokens_max, - random_seed=prompt_random_seed, + random_seed=random_seed, ) ) @@ -160,35 +184,33 @@ def process_dataset( variance=output_tokens_stdev, min_value=output_tokens_min, max_value=output_tokens_max, - random_seed=output_random_seed, + random_seed=random_seed, ) ) dataset_iterator = iter(dataset) processed_prompts = [] - handler = STRATEGY_HANDLERS[short_prompt_strategy] + prompt_handler = STRATEGY_HANDLERS[short_prompt_strategy] for prompt_row in dataset_iterator: prompt_text = prompt_row[prompt_column] target_prompt_len = next(prompt_token_sampler) - if len(tokenizer.encode(prompt_text)) < target_prompt_len: - prompt_text = handler( - current_prompt=prompt_text, - min_prompt_tokens=target_prompt_len, - dataset_iterator=dataset_iterator, - prompt_column=prompt_column, - tokenizer=tokenizer, - pad_token=pad_token, - ) - if prompt_text is None: - continue + prompt_text = prompt_handler( + current_prompt=prompt_text, + min_prompt_tokens=target_prompt_len, + dataset_iterator=dataset_iterator, + prompt_column=prompt_column, + tokenizer=tokenizer, + pad_char=pad_char, + concat_delimiter=concat_delimiter, + ) + if prompt_text is None: + continue if len(tokenizer.encode(prompt_text)) > target_prompt_len: - tokens = tokenizer.encode(prompt_text, add_special_tokens=True) - prompt_text = tokenizer.decode( - 
tokens[:target_prompt_len], skip_special_tokens=True - ) + tokens = tokenizer.encode(prompt_text) + prompt_text = tokenizer.decode(tokens[:target_prompt_len]) processed_prompt = prompt_row.copy() processed_prompt[prompt_column] = prompt_text diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index dd4ab645..401a9e23 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -12,8 +12,10 @@ from guidellm.preprocess.dataset import ( STRATEGY_HANDLERS, + PromptTooShortError, ShortPromptStrategy, handle_concatenate_strategy, + handle_error_strategy, handle_ignore_strategy, handle_pad_strategy, process_dataset, @@ -80,15 +82,15 @@ def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): dataset_iter = iter([{"prompt": "longer"}]) result = handle_concatenate_strategy( - "short", 10, dataset_iter, "prompt", tokenizer_mock + "short", 10, dataset_iter, "prompt", tokenizer_mock, "\n" ) - assert result == "shortlonger" + assert result == "short\nlonger" def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): dataset_iter: Iterator = iter([]) result = handle_concatenate_strategy( - "short", 10, dataset_iter, "prompt", tokenizer_mock + "short", 10, dataset_iter, "prompt", tokenizer_mock, "" ) assert result is None @@ -98,6 +100,17 @@ def test_handle_pad_strategy(tokenizer_mock): assert result == "shortppppp" +def test_handle_error_strategy_valid_prompt(tokenizer_mock): + result = handle_error_strategy("valid prompt", 5, tokenizer_mock) + assert result == "valid prompt" + tokenizer_mock.encode.assert_called_with("valid prompt") + + +def test_handle_error_strategy_too_short_prompt(tokenizer_mock): + with pytest.raises(PromptTooShortError): + handle_error_strategy("short", 10, tokenizer_mock) + + @patch("guidellm.preprocess.dataset.save_dataset_to_file") @patch("guidellm.preprocess.dataset.Dataset") @patch("guidellm.preprocess.dataset.guidellm_load_dataset") @@ -245,8 +258,17 @@ def test_save_dataset_to_file_csv(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) output_path = Path("some/path/output.csv") save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_csv.assert_called_once_with(str(output_path)) - mock_mkdir.assert_called_once() + mock_dataset.to_csv.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_csv_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.CSV") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_csv.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) @patch.object(Path, "mkdir") @@ -254,8 +276,35 @@ def test_save_dataset_to_file_json(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) output_path = Path("some/path/output.json") save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(str(output_path)) - mock_mkdir.assert_called_once() + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_json_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.JSON") + save_dataset_to_file(mock_dataset, output_path) + 
mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_jsonl(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.jsonl") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.JSONL") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) @patch.object(Path, "mkdir") @@ -263,8 +312,8 @@ def test_save_dataset_to_file_parquet(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) output_path = Path("some/path/output.parquet") save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_parquet.assert_called_once_with(str(output_path)) - mock_mkdir.assert_called_once() + mock_dataset.to_parquet.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) @patch.object(Path, "mkdir") @@ -273,4 +322,4 @@ def test_save_dataset_to_file_unsupported_type(mock_mkdir): output_path = Path("some/path/output.txt") with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): save_dataset_to_file(mock_dataset, output_path) - mock_mkdir.assert_called_once() + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) From a9f4fa6f90bd59600cfddfeab750f756fe2b7809 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 16:00:14 +0300 Subject: [PATCH 10/18] Fixed UTs --- src/guidellm/__main__.py | 70 +++---------------- src/guidellm/preprocess/dataset.py | 97 ++++++++++++++++++++++----- tests/unit/preprocess/test_dataset.py | 41 ++++++++--- 3 files changed, 123 insertions(+), 85 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index cca18048..aac0efa4 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -376,54 +376,16 @@ def preprocess(): ) ) @click.option( - "--prompt-tokens-average", - type=int, - default=10, - show_default=True, - help="Average target number of tokens for prompts.", -) -@click.option( - "--prompt-tokens-stdev", - type=int, - default=None, - help="Standard deviation for prompt tokens sampling.", -) -@click.option( - "--prompt-tokens-min", - type=int, - default=None, - help="Minimum number of prompt tokens.", -) -@click.option( - "--prompt-tokens-max", - type=int, - default=None, - help="Maximum number of prompt tokens.", -) -@click.option( - "--output-tokens-average", - type=int, - default=10, - show_default=True, - help="Average target number of tokens for outputs.", -) -@click.option( - "--output-tokens-stdev", - type=int, - default=None, - help="Standard deviation for output tokens sampling.", -) -@click.option( - "--output-tokens-min", - type=int, + "--prompt-tokens", + type=str, default=None, - help="Minimum number of output tokens.", + help="Prompt tokens config (JSON, YAML file or key=value string)", ) @click.option( - "--output-tokens-max", - type=int, + "--output-tokens", + type=str, default=None, - help="Maximum number of output tokens.", + help="Output tokens config (JSON, YAML file or key=value string)", ) @click.option( "--push-to-hub", @@ -453,14 +415,8 @@ def dataset( 
short_prompt_strategy, pad_char, concat_delimiter, - prompt_tokens_average, - prompt_tokens_stdev, - prompt_tokens_min, - prompt_tokens_max, - output_tokens_average, - output_tokens_stdev, - output_tokens_min, - output_tokens_max, + prompt_tokens, + output_tokens, push_to_hub, hub_dataset_id, random_seed, @@ -469,19 +425,13 @@ def dataset( data=data, output_path=output_path, processor=processor, + prompt_tokens=prompt_tokens, + output_tokens=output_tokens, processor_args=processor_args, data_args=data_args, short_prompt_strategy=short_prompt_strategy, pad_char=pad_char, concat_delimiter=concat_delimiter, - prompt_tokens_average=prompt_tokens_average, - prompt_tokens_stdev=prompt_tokens_stdev, - prompt_tokens_min=prompt_tokens_min, - prompt_tokens_max=prompt_tokens_max, - output_tokens_average=output_tokens_average, - output_tokens_stdev=output_tokens_stdev, - output_tokens_min=output_tokens_min, - output_tokens_max=output_tokens_max, push_to_hub=push_to_hub, hub_dataset_id=hub_dataset_id, random_seed=random_seed, diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index b3f3db7a..b2a45e16 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -1,11 +1,14 @@ +import json import os from collections.abc import Iterator from enum import Enum from pathlib import Path from typing import Any, Callable, Optional, Union +import yaml from datasets import Dataset from loguru import logger +from pydantic import BaseModel, Field from transformers import PreTrainedTokenizerBase from guidellm.dataset import load_dataset as guidellm_load_dataset @@ -100,6 +103,71 @@ def handle_error_strategy( } +class TokensConfig(BaseModel): + average: int = Field( + description="The average number of tokens.", + gt=0, + ) + stdev: Optional[int] = Field( + description="The standard deviation of the tokens.", + gt=0, + default=None, + ) + min: Optional[int] = Field( + description="The minimum number of tokens.", + gt=0, + default=None, + ) + max: Optional[int] = Field( + description="The maximum number of tokens.", + gt=0, + default=None, + ) + + @staticmethod + def parse_str(data: Union[str, Path]) -> "TokensConfig": + if ( + isinstance(data, Path) + or data.strip().endswith(".config") + or data.strip().endswith(".yaml") + ): + return TokensConfig.parse_config_file(data) + + if data.strip().startswith("{"): + return TokensConfig.parse_json(data) + + if data.count("=") > 1: + return TokensConfig.parse_key_value_pairs(data) + + raise ValueError( + f"Unsupported data format. 
Expected JSON or key-value pairs, got {data}" + ) + + @staticmethod + def parse_json(data: str) -> "TokensConfig": + config_dict = json.loads(data.strip()) + + return TokensConfig(**config_dict) + + @staticmethod + def parse_key_value_pairs(data: str) -> "TokensConfig": + config_dict = {} + items = data.strip().split(",") + for item in items: + key, value = item.split("=") + config_dict[key.strip()] = ( + int(value.strip()) if value.strip().isnumeric() else value.strip() + ) + + return TokensConfig(**config_dict) # type: ignore[arg-type] + + @staticmethod + def parse_config_file(data: Union[str, Path]) -> "TokensConfig": + with Path(data).open("r") as file: + config_dict = yaml.safe_load(file) + + return TokensConfig(**config_dict) + def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) @@ -132,19 +200,13 @@ def process_dataset( data: Union[str, Path], output_path: Union[str, Path], processor: Union[str, Path, PreTrainedTokenizerBase], + prompt_tokens: Union[str, Path], + output_tokens: Union[str, Path], processor_args: Optional[dict[str, Any]] = None, data_args: Optional[dict[str, Any]] = None, short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, pad_char: Optional[str] = None, concat_delimiter: Optional[str] = None, - prompt_tokens_average: int = 10, - prompt_tokens_stdev: Optional[int] = None, - prompt_tokens_min: Optional[int] = None, - prompt_tokens_max: Optional[int] = None, - output_tokens_average: int = 10, - output_tokens_stdev: Optional[int] = None, - output_tokens_min: Optional[int] = None, - output_tokens_max: Optional[int] = None, push_to_hub: bool = False, hub_dataset_id: Optional[str] = None, random_seed: int = 42, @@ -168,22 +230,25 @@ def process_dataset( "output_tokens_count_column", "output_tokens_count" ) + prompt_tokens_cfg = TokensConfig.parse_str(prompt_tokens) + output_tokens_cfg = TokensConfig.parse_str(output_tokens) + prompt_token_sampler = iter( IntegerRangeSampler( - average=prompt_tokens_average, - variance=prompt_tokens_stdev, - min_value=prompt_tokens_min, - max_value=prompt_tokens_max, + average=prompt_tokens_cfg.average, + variance=prompt_tokens_cfg.stdev, + min_value=prompt_tokens_cfg.min, + max_value=prompt_tokens_cfg.max, random_seed=random_seed, ) ) output_token_sampler = iter( IntegerRangeSampler( - average=output_tokens_average, - variance=output_tokens_stdev, - min_value=output_tokens_min, - max_value=output_tokens_max, + average=output_tokens_cfg.average, + variance=output_tokens_cfg.stdev, + min_value=output_tokens_cfg.min, + max_value=output_tokens_cfg.max, random_seed=random_seed, ) ) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 401a9e23..38e5bcf9 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -56,9 +56,11 @@ def test_strategy_handler_called( mock_dataset_class.from_list.return_value = mock_dataset_obj process_dataset( - "input", - "output_dir/data.json", - tokenizer_mock, + data="input", + output_path="output_dir/data.json", + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", short_prompt_strategy=ShortPromptStrategy.IGNORE, ) @@ -135,7 +137,13 @@ def test_process_dataset_non_empty( mock_dataset_class.from_list.return_value = mock_dataset_obj output_path = "output_dir/data.json" - process_dataset("input", output_path, tokenizer_mock) + process_dataset( + 
data="input", + output_path=output_path, + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", + ) mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() @@ -168,7 +176,13 @@ def test_process_dataset_empty_after_processing( mock_check_processor.return_value = tokenizer_mock mock_sampler.side_effect = lambda **kwargs: [10] - process_dataset("input", "output_dir/data.json", tokenizer_mock) + process_dataset( + data="input", + output_path="output_dir/data.json", + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", + ) mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() @@ -197,9 +211,11 @@ def test_process_dataset_push_to_hub_called( mock_dataset_class.from_list.return_value = mock_dataset_obj process_dataset( - "input", - "output_dir/data.json", - tokenizer_mock, + data="input", + output_path="output_dir/data.json", + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", push_to_hub=True, hub_dataset_id="id123", ) @@ -227,7 +243,14 @@ def test_process_dataset_push_to_hub_not_called( mock_dataset_obj = MagicMock(spec=Dataset) mock_dataset_class.from_list.return_value = mock_dataset_obj - process_dataset("input", "output_dir/data.json", tokenizer_mock, push_to_hub=False) + process_dataset( + data="input", + output_path="output_dir/data.json", + processor=tokenizer_mock, + prompt_tokens="average=10,min=1", + output_tokens="average=10,min=1", + push_to_hub=False, + ) mock_push.assert_not_called() From 40c11182702c8ffb17cbfe4431e2c6b063ae9cf1 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 16:04:03 +0300 Subject: [PATCH 11/18] Added pytest mark to UTs --- tests/unit/preprocess/test_dataset.py | 46 +++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 38e5bcf9..8e9ddd58 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -33,7 +33,7 @@ def tokenizer_mock(): ) return tokenizer - +@pytest.mark.smoke @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.Dataset") @@ -68,19 +68,19 @@ def test_strategy_handler_called( mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() - +@pytest.mark.sanity def test_handle_ignore_strategy_too_short(tokenizer_mock): result = handle_ignore_strategy("short", 10, tokenizer_mock) assert result is None tokenizer_mock.encode.assert_called_with("short") - +@pytest.mark.sanity def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): result = handle_ignore_strategy("long prompt", 5, tokenizer_mock) assert result == "long prompt" tokenizer_mock.encode.assert_called_with("long prompt") - +@pytest.mark.sanity def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): dataset_iter = iter([{"prompt": "longer"}]) result = handle_concatenate_strategy( @@ -88,7 +88,7 @@ def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): ) assert result == "short\nlonger" - +@pytest.mark.sanity def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): dataset_iter: Iterator = iter([]) result = handle_concatenate_strategy( @@ -96,23 +96,23 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): ) assert result is None - 
+@pytest.mark.sanity def test_handle_pad_strategy(tokenizer_mock): result = handle_pad_strategy("short", 10, tokenizer_mock, "p") assert result == "shortppppp" - +@pytest.mark.sanity def test_handle_error_strategy_valid_prompt(tokenizer_mock): result = handle_error_strategy("valid prompt", 5, tokenizer_mock) assert result == "valid prompt" tokenizer_mock.encode.assert_called_with("valid prompt") - +@pytest.mark.sanity def test_handle_error_strategy_too_short_prompt(tokenizer_mock): with pytest.raises(PromptTooShortError): handle_error_strategy("short", 10, tokenizer_mock) - +@pytest.mark.smoke @patch("guidellm.preprocess.dataset.save_dataset_to_file") @patch("guidellm.preprocess.dataset.Dataset") @patch("guidellm.preprocess.dataset.guidellm_load_dataset") @@ -159,7 +159,7 @@ def test_process_dataset_non_empty( assert "output_tokens_count" in item assert len(tokenizer_mock.encode(item["prompt"])) <= 3 - +@pytest.mark.sanity @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") @@ -188,7 +188,7 @@ def test_process_dataset_empty_after_processing( mock_check_processor.assert_called_once() mock_dataset_class.from_list.assert_not_called() - +@pytest.mark.smoke @patch(f"{process_dataset.__module__}.push_dataset_to_hub") @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @@ -221,7 +221,7 @@ def test_process_dataset_push_to_hub_called( ) mock_push.assert_called_once_with("id123", mock_dataset_obj) - +@pytest.mark.sanity @patch(f"{process_dataset.__module__}.push_dataset_to_hub") @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @@ -253,14 +253,14 @@ def test_process_dataset_push_to_hub_not_called( ) mock_push.assert_not_called() - +@pytest.mark.regression def test_push_dataset_to_hub_success(): os.environ["HF_TOKEN"] = "token" mock_dataset = MagicMock(spec=Dataset) push_dataset_to_hub("dataset_id", mock_dataset) mock_dataset.push_to_hub.assert_called_once_with("dataset_id", token="token") - +@pytest.mark.regression def test_push_dataset_to_hub_error_no_env(): if "HF_TOKEN" in os.environ: del os.environ["HF_TOKEN"] @@ -268,14 +268,14 @@ def test_push_dataset_to_hub_error_no_env(): with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub("dataset_id", mock_dataset) - +@pytest.mark.regression def test_push_dataset_to_hub_error_no_id(): os.environ["HF_TOKEN"] = "token" mock_dataset = MagicMock(spec=Dataset) with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub(None, mock_dataset) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_csv(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -284,7 +284,7 @@ def test_save_dataset_to_file_csv(mock_mkdir): mock_dataset.to_csv.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_csv_capitalized(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -293,7 +293,7 @@ def test_save_dataset_to_file_csv_capitalized(mock_mkdir): mock_dataset.to_csv.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_json(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -302,7 
+302,7 @@ def test_save_dataset_to_file_json(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_json_capitalized(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -311,7 +311,7 @@ def test_save_dataset_to_file_json_capitalized(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_jsonl(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -320,7 +320,7 @@ def test_save_dataset_to_file_jsonl(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -329,7 +329,7 @@ def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_parquet(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) @@ -338,7 +338,7 @@ def test_save_dataset_to_file_parquet(mock_mkdir): mock_dataset.to_parquet.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - +@pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_unsupported_type(mock_mkdir): mock_dataset = MagicMock(spec=Dataset) From f61b7f07238c16071362b1a82130272ee007e6d8 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 16:08:03 +0300 Subject: [PATCH 12/18] Added docs --- src/guidellm/preprocess/dataset.py | 154 ++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 34 deletions(-) diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index b2a45e16..f2853910 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -34,11 +34,20 @@ class ShortPromptStrategy(str, Enum): def handle_ignore_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: + """ + Ignores prompts that are shorter than the required minimum token length. + + :param current_prompt: The input prompt string. + :param min_prompt_tokens: Minimum required token count. + :param tokenizer: Tokenizer used to count tokens. + :return: The prompt if it meets the length, otherwise None. + """ + if len(tokenizer.encode(current_prompt)) < min_prompt_tokens: logger.warning("Prompt too short, ignoring") return None @@ -46,14 +55,26 @@ def handle_ignore_strategy( def handle_concatenate_strategy( - current_prompt: str, - min_prompt_tokens: int, - dataset_iterator: Iterator[dict[str, Any]], - prompt_column: str, - tokenizer: PreTrainedTokenizerBase, - concat_delimiter: str, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + dataset_iterator: Iterator[dict[str, Any]], + prompt_column: str, + tokenizer: PreTrainedTokenizerBase, + concat_delimiter: str, + **_kwargs, ) -> Optional[str]: + """ + Concatenates prompts until the minimum token requirement is met. 
+ + :param current_prompt: The initial prompt. + :param min_prompt_tokens: Target minimum token length. + :param dataset_iterator: Iterator to fetch more prompts. + :param prompt_column: Column key for prompt extraction. + :param tokenizer: Tokenizer used to count tokens. + :param concat_delimiter: Delimiter to use between prompts. + :return: Concatenated prompt or None if not enough data. + """ + tokens_len = len(tokenizer.encode(current_prompt)) while tokens_len < min_prompt_tokens: try: @@ -69,23 +90,43 @@ def handle_concatenate_strategy( def handle_pad_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - pad_char: str, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + pad_char: str, + **_kwargs, ) -> str: + """ + Pads the prompt with a character until it reaches the minimum token length. + + :param current_prompt: The input prompt. + :param min_prompt_tokens: Desired minimum token count. + :param tokenizer: Tokenizer used to count tokens. + :param pad_char: Character used for padding. + :return: Padded prompt string. + """ + while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: current_prompt += pad_char return current_prompt def handle_error_strategy( - current_prompt: str, - min_prompt_tokens: int, - tokenizer: PreTrainedTokenizerBase, - **_kwargs, + current_prompt: str, + min_prompt_tokens: int, + tokenizer: PreTrainedTokenizerBase, + **_kwargs, ) -> Optional[str]: + """ + Raises an error if the prompt is too short. + + :param current_prompt: The input prompt. + :param min_prompt_tokens: Required token count. + :param tokenizer: Tokenizer used to count tokens. + :return: The input prompt if valid. + :raises PromptTooShortError: If the prompt is too short. + """ + prompt_len = len(tokenizer.encode(current_prompt)) if prompt_len < min_prompt_tokens: raise PromptTooShortError( @@ -126,6 +167,17 @@ class TokensConfig(BaseModel): @staticmethod def parse_str(data: Union[str, Path]) -> "TokensConfig": + """ + Parses a string or path into a TokensConfig object. Supports: + - JSON string + - key=value pairs + - file path to .yaml/.config + + :param data: String or path containing configuration. + :return: Parsed TokensConfig instance. + :raises ValueError: If the format is not recognized. + """ + if ( isinstance(data, Path) or data.strip().endswith(".config") @@ -169,6 +221,13 @@ def parse_config_file(data: Union[str, Path]) -> "TokensConfig": return TokensConfig(**config_dict) def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: + """ + Saves a HuggingFace Dataset to file in a supported format. + + :param dataset: Dataset to save. + :param output_path: Output file path (.json, .jsonl, .csv, .parquet). + :raises ValueError: If the file extension is not supported. 
+ """ output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) suffix = output_path.suffix.lower() @@ -197,20 +256,39 @@ def _validate_output_suffix(output_path: Union[str, Path]) -> None: def process_dataset( - data: Union[str, Path], - output_path: Union[str, Path], - processor: Union[str, Path, PreTrainedTokenizerBase], - prompt_tokens: Union[str, Path], - output_tokens: Union[str, Path], - processor_args: Optional[dict[str, Any]] = None, - data_args: Optional[dict[str, Any]] = None, - short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, - pad_char: Optional[str] = None, - concat_delimiter: Optional[str] = None, - push_to_hub: bool = False, - hub_dataset_id: Optional[str] = None, - random_seed: int = 42, + data: Union[str, Path], + output_path: Union[str, Path], + processor: Union[str, Path, PreTrainedTokenizerBase], + prompt_tokens: Union[str, Path], + output_tokens: Union[str, Path], + processor_args: Optional[dict[str, Any]] = None, + data_args: Optional[dict[str, Any]] = None, + short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE, + pad_char: Optional[str] = None, + concat_delimiter: Optional[str] = None, + push_to_hub: bool = False, + hub_dataset_id: Optional[str] = None, + random_seed: int = 42, ) -> None: + """ + Main method to process and save a dataset with sampled prompt/output token counts. + + :param data: Path or identifier for dataset input. + :param output_path: File path to save the processed dataset. + :param processor: Tokenizer object or its config. + :param prompt_tokens: Prompt token config string or file. + :param output_tokens: Output token config string or file. + :param processor_args: Optional processor arguments. + :param data_args: Optional data loading arguments. + :param short_prompt_strategy: Strategy for handling short prompts. + :param pad_char: Character used when padding short prompts. + :param concat_delimiter: Delimiter for concatenation strategy. + :param push_to_hub: Whether to push to Hugging Face Hub. + :param hub_dataset_id: Dataset ID on Hugging Face Hub. + :param random_seed: Seed for random sampling. + :raises ValueError: If output path is invalid or pushing conditions unmet. + """ + _validate_output_suffix(output_path) logger.info( f"Starting dataset conversion | Input: {data} | " @@ -300,8 +378,16 @@ def process_dataset( def push_dataset_to_hub( - hub_dataset_id: Optional[str], processed_dataset: Dataset, + hub_dataset_id: Optional[str], processed_dataset: Dataset, ) -> None: + """ + Pushes the processed dataset to Hugging Face Hub using HF_TOKEN. + + :param hub_dataset_id: Identifier on the Hub to push to. + :param processed_dataset: HuggingFace Dataset object. + :raises ValueError: If hub_dataset_id or HF_TOKEN is not available. 
+ """ + hf_token = os.environ.get("HF_TOKEN") if not hub_dataset_id or not hf_token: raise ValueError( From b6146c921bc0b95ace0a4d1080fccee409d9f483 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 17:59:31 +0300 Subject: [PATCH 13/18] Ran tox -e style --- pyproject.toml | 3 +- src/guidellm/__main__.py | 4 +- src/guidellm/benchmark/benchmark.py | 5 +- src/guidellm/preprocess/dataset.py | 7 ++- tests/unit/preprocess/test_dataset.py | 79 +++++++++++++++++---------- 5 files changed, 61 insertions(+), 37 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9f86d993..a78b1fc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,8 @@ name = "guidellm" description = "Guidance platform for deploying and managing large language models." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.9.0,<4.0" -license = { text = "Apache-2.0" } +license = "Apache-2.0" +license-files = ["LICENSE"] authors = [ { name = "Red Hat" } ] keywords = [ "ai", diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index aac0efa4..66c3daa4 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -281,6 +281,7 @@ def benchmark( ) ) + def decode_escaped_str(_ctx, _param, value): """ Click auto adds characters. For example, when using --pad-char "\n", @@ -294,6 +295,7 @@ def decode_escaped_str(_ctx, _param, value): except Exception as e: raise click.BadParameter(f"Could not decode escape sequences: {e}") from e + @cli.command( help=( "Print out the available configuration settings that can be set " @@ -373,7 +375,7 @@ def preprocess(): help=( "The delimiter to use when concatenating prompts that are too short." " Used when strategy is 'concatenate'." - ) + ), ) @click.option( "--prompt-tokens", diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 9f683f8e..1e2a5f4b 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -815,10 +815,7 @@ def from_stats( req.first_token_time or req.start_time for req in total_with_output_first ], - iter_counts=[ - req.output_tokens - for req in total_with_output_first - ], + iter_counts=[req.output_tokens for req in total_with_output_first], first_iter_counts=[ req.prompt_tokens for req in total_with_output_first ], diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index f2853910..601aab31 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -220,6 +220,7 @@ def parse_config_file(data: Union[str, Path]) -> "TokensConfig": return TokensConfig(**config_dict) + def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: """ Saves a HuggingFace Dataset to file in a supported format. @@ -291,8 +292,7 @@ def process_dataset( _validate_output_suffix(output_path) logger.info( - f"Starting dataset conversion | Input: {data} | " - f"Output directory: {output_path}" + f"Starting dataset conversion | Input: {data} | Output directory: {output_path}" ) dataset, column_mappings = guidellm_load_dataset( @@ -378,7 +378,8 @@ def process_dataset( def push_dataset_to_hub( - hub_dataset_id: Optional[str], processed_dataset: Dataset, + hub_dataset_id: Optional[str], + processed_dataset: Dataset, ) -> None: """ Pushes the processed dataset to Hugging Face Hub using HF_TOKEN. 
diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 8e9ddd58..49a8e417 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -33,17 +33,18 @@ def tokenizer_mock(): ) return tokenizer + @pytest.mark.smoke @patch(f"{process_dataset.__module__}.guidellm_load_dataset") @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_strategy_handler_called( - mock_sampler, - mock_dataset_class, - mock_check_processor, - mock_load_dataset, - tokenizer_mock, + mock_sampler, + mock_dataset_class, + mock_check_processor, + mock_load_dataset, + tokenizer_mock, ): mock_handler = MagicMock(return_value="processed_prompt") with patch.dict(STRATEGY_HANDLERS, {ShortPromptStrategy.IGNORE: mock_handler}): @@ -68,18 +69,21 @@ def test_strategy_handler_called( mock_load_dataset.assert_called_once() mock_check_processor.assert_called_once() + @pytest.mark.sanity def test_handle_ignore_strategy_too_short(tokenizer_mock): result = handle_ignore_strategy("short", 10, tokenizer_mock) assert result is None tokenizer_mock.encode.assert_called_with("short") + @pytest.mark.sanity def test_handle_ignore_strategy_sufficient_length(tokenizer_mock): result = handle_ignore_strategy("long prompt", 5, tokenizer_mock) assert result == "long prompt" tokenizer_mock.encode.assert_called_with("long prompt") + @pytest.mark.sanity def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): dataset_iter = iter([{"prompt": "longer"}]) @@ -88,6 +92,7 @@ def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock): ) assert result == "short\nlonger" + @pytest.mark.sanity def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): dataset_iter: Iterator = iter([]) @@ -96,22 +101,26 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): ) assert result is None + @pytest.mark.sanity def test_handle_pad_strategy(tokenizer_mock): result = handle_pad_strategy("short", 10, tokenizer_mock, "p") assert result == "shortppppp" + @pytest.mark.sanity def test_handle_error_strategy_valid_prompt(tokenizer_mock): result = handle_error_strategy("valid prompt", 5, tokenizer_mock) assert result == "valid prompt" tokenizer_mock.encode.assert_called_with("valid prompt") + @pytest.mark.sanity def test_handle_error_strategy_too_short_prompt(tokenizer_mock): with pytest.raises(PromptTooShortError): handle_error_strategy("short", 10, tokenizer_mock) + @pytest.mark.smoke @patch("guidellm.preprocess.dataset.save_dataset_to_file") @patch("guidellm.preprocess.dataset.Dataset") @@ -119,12 +128,12 @@ def test_handle_error_strategy_too_short_prompt(tokenizer_mock): @patch("guidellm.preprocess.dataset.check_load_processor") @patch("guidellm.preprocess.dataset.IntegerRangeSampler") def test_process_dataset_non_empty( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_save_to_file, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_save_to_file, + tokenizer_mock, ): from guidellm.preprocess.dataset import process_dataset @@ -159,17 +168,18 @@ def test_process_dataset_non_empty( assert "output_tokens_count" in item assert len(tokenizer_mock.encode(item["prompt"])) <= 3 + @pytest.mark.sanity @patch(f"{process_dataset.__module__}.Dataset") @patch(f"{process_dataset.__module__}.guidellm_load_dataset") 
@patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_empty_after_processing( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + tokenizer_mock, ): mock_dataset = [{"prompt": ""}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -188,6 +198,7 @@ def test_process_dataset_empty_after_processing( mock_check_processor.assert_called_once() mock_dataset_class.from_list.assert_not_called() + @pytest.mark.smoke @patch(f"{process_dataset.__module__}.push_dataset_to_hub") @patch(f"{process_dataset.__module__}.Dataset") @@ -195,12 +206,12 @@ def test_process_dataset_empty_after_processing( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_called( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_push, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -221,6 +232,7 @@ def test_process_dataset_push_to_hub_called( ) mock_push.assert_called_once_with("id123", mock_dataset_obj) + @pytest.mark.sanity @patch(f"{process_dataset.__module__}.push_dataset_to_hub") @patch(f"{process_dataset.__module__}.Dataset") @@ -228,12 +240,12 @@ def test_process_dataset_push_to_hub_called( @patch(f"{process_dataset.__module__}.check_load_processor") @patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_push_to_hub_not_called( - mock_sampler, - mock_check_processor, - mock_load_dataset, - mock_dataset_class, - mock_push, - tokenizer_mock, + mock_sampler, + mock_check_processor, + mock_load_dataset, + mock_dataset_class, + mock_push, + tokenizer_mock, ): mock_dataset = [{"prompt": "abc"}] mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"}) @@ -253,6 +265,7 @@ def test_process_dataset_push_to_hub_not_called( ) mock_push.assert_not_called() + @pytest.mark.regression def test_push_dataset_to_hub_success(): os.environ["HF_TOKEN"] = "token" @@ -260,6 +273,7 @@ def test_push_dataset_to_hub_success(): push_dataset_to_hub("dataset_id", mock_dataset) mock_dataset.push_to_hub.assert_called_once_with("dataset_id", token="token") + @pytest.mark.regression def test_push_dataset_to_hub_error_no_env(): if "HF_TOKEN" in os.environ: @@ -268,6 +282,7 @@ def test_push_dataset_to_hub_error_no_env(): with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub("dataset_id", mock_dataset) + @pytest.mark.regression def test_push_dataset_to_hub_error_no_id(): os.environ["HF_TOKEN"] = "token" @@ -275,6 +290,7 @@ def test_push_dataset_to_hub_error_no_id(): with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub(None, mock_dataset) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_csv(mock_mkdir): @@ -284,6 +300,7 @@ def test_save_dataset_to_file_csv(mock_mkdir): mock_dataset.to_csv.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_csv_capitalized(mock_mkdir): @@ -293,6 
+310,7 @@ def test_save_dataset_to_file_csv_capitalized(mock_mkdir): mock_dataset.to_csv.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_json(mock_mkdir): @@ -302,6 +320,7 @@ def test_save_dataset_to_file_json(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_json_capitalized(mock_mkdir): @@ -311,6 +330,7 @@ def test_save_dataset_to_file_json_capitalized(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_jsonl(mock_mkdir): @@ -320,6 +340,7 @@ def test_save_dataset_to_file_jsonl(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): @@ -329,6 +350,7 @@ def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): mock_dataset.to_json.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_parquet(mock_mkdir): @@ -338,6 +360,7 @@ def test_save_dataset_to_file_parquet(mock_mkdir): mock_dataset.to_parquet.assert_called_once_with(output_path) mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @pytest.mark.regression @patch.object(Path, "mkdir") def test_save_dataset_to_file_unsupported_type(mock_mkdir): From f3a3cd78f0bfd166b5c6525a10c6195baf2190b6 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 18:00:50 +0300 Subject: [PATCH 14/18] Fixed help for preprocess dataset subcommand --- src/guidellm/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 66c3daa4..7dc06835 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -313,9 +313,9 @@ def preprocess(): @preprocess.command( help=( - "Convert a dataset to have specific prompt and output token sizes.\n\n" - "INPUT_DATA: Path to the input dataset or dataset ID.\n" - "OUTPUT_PATH: Path to save the converted dataset, including file suffix. " + "Convert a dataset to have specific prompt and output token sizes.\n" + "DATA: Path to the input dataset or dataset ID.\n" + "OUTPUT_PATH: Path to save the converted dataset, including file suffix." 
) ) @click.argument( From 448d609ad884934b704f6cb57e4e99c381c23ff0 Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Sun, 1 Jun 2025 18:02:02 +0300 Subject: [PATCH 15/18] Fixed help for preprocess dataset subcommand --- src/guidellm/benchmark/benchmark.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 1e2a5f4b..9f683f8e 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -815,7 +815,10 @@ def from_stats( req.first_token_time or req.start_time for req in total_with_output_first ], - iter_counts=[req.output_tokens for req in total_with_output_first], + iter_counts=[ + req.output_tokens + for req in total_with_output_first + ], first_iter_counts=[ req.prompt_tokens for req in total_with_output_first ], From c48472a113f0e4cdb4b177e5bea840f4e54e331c Mon Sep 17 00:00:00 2001 From: TomerGarber Date: Thu, 5 Jun 2025 12:48:22 +0300 Subject: [PATCH 16/18] Fixed CR comments --- src/guidellm/preprocess/dataset.py | 53 ++++----------- src/guidellm/utils/__init__.py | 6 ++ src/guidellm/utils/hf_datasets.py | 36 +++++++++++ tests/unit/preprocess/test_dataset.py | 92 ++------------------------- tests/unit/utils/__init__.py | 0 tests/unit/utils/test_hf_datasets.py | 87 +++++++++++++++++++++++++ 6 files changed, 149 insertions(+), 125 deletions(-) create mode 100644 src/guidellm/utils/hf_datasets.py create mode 100644 tests/unit/utils/__init__.py create mode 100644 tests/unit/utils/test_hf_datasets.py diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index 601aab31..c6957863 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -13,13 +13,7 @@ from guidellm.dataset import load_dataset as guidellm_load_dataset from guidellm.utils import IntegerRangeSampler, check_load_processor - -SUPPORTED_TYPES = { - ".json", - ".jsonl", - ".csv", - ".parquet", -} +from guidellm.utils.hf_datasets import SUPPORTED_TYPES, save_dataset_to_file class PromptTooShortError(Exception): @@ -94,6 +88,7 @@ def handle_pad_strategy( min_prompt_tokens: int, tokenizer: PreTrainedTokenizerBase, pad_char: str, + pad_multiplier: int = 2, **_kwargs, ) -> str: """ @@ -103,13 +98,18 @@ def handle_pad_strategy( :param min_prompt_tokens: Desired minimum token count. :param tokenizer: Tokenizer used to count tokens. :param pad_char: Character used for padding. + :param pad_multiplier: Multiplier for padding character length. :return: Padded prompt string. """ - while len(tokenizer.encode(current_prompt)) < min_prompt_tokens: - current_prompt += pad_char - return current_prompt - + tokens = tokenizer.encode(current_prompt) + pad_count = 1 + prompt = current_prompt + while len(tokens) < min_prompt_tokens: + prompt += pad_char * pad_count + tokens = tokenizer.encode(prompt) + pad_count *= pad_multiplier + return prompt def handle_error_strategy( current_prompt: str, @@ -221,31 +221,6 @@ def parse_config_file(data: Union[str, Path]) -> "TokensConfig": return TokensConfig(**config_dict) -def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: - """ - Saves a HuggingFace Dataset to file in a supported format. - - :param dataset: Dataset to save. - :param output_path: Output file path (.json, .jsonl, .csv, .parquet). - :raises ValueError: If the file extension is not supported. 
- """ - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - suffix = output_path.suffix.lower() - - if suffix == ".csv": - dataset.to_csv(output_path) - elif suffix in {".json", ".jsonl"}: - dataset.to_json(output_path) - elif suffix == ".parquet": - dataset.to_parquet(output_path) - else: - raise ValueError( - f"Unsupported file suffix '{suffix}' in output_path'{output_path}'." - f" Only {SUPPORTED_TYPES} are supported." - ) - - def _validate_output_suffix(output_path: Union[str, Path]) -> None: output_path = Path(output_path) suffix = output_path.suffix.lower() @@ -351,8 +326,8 @@ def process_dataset( if prompt_text is None: continue - if len(tokenizer.encode(prompt_text)) > target_prompt_len: - tokens = tokenizer.encode(prompt_text) + tokens = tokenizer.encode(prompt_text) + if len(tokens) > target_prompt_len: prompt_text = tokenizer.decode(tokens[:target_prompt_len]) processed_prompt = prompt_row.copy() @@ -370,7 +345,7 @@ def process_dataset( processed_dataset = Dataset.from_list(processed_prompts) save_dataset_to_file(processed_dataset, output_path) - logger.info(f"Conversion complete. Dataset saved to: {output_path}") + logger.info(f"Conversion completed. Dataset saved to: {output_path}") if push_to_hub: push_dataset_to_hub(hub_dataset_id, processed_dataset) diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 753bef02..11751c0d 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,4 +1,8 @@ from .colors import Colors +from .hf_datasets import ( + save_dataset_to_file, + SUPPORTED_TYPES, +) from .hf_transformers import ( check_load_processor, ) @@ -22,6 +26,8 @@ "filter_text", "is_puncutation", "load_text", + "save_dataset_to_file", "split_text", "split_text_list_by_length", + "SUPPORTED_TYPES", ] diff --git a/src/guidellm/utils/hf_datasets.py b/src/guidellm/utils/hf_datasets.py new file mode 100644 index 00000000..73e55ebc --- /dev/null +++ b/src/guidellm/utils/hf_datasets.py @@ -0,0 +1,36 @@ +from pathlib import Path +from typing import Union + +from datasets import Dataset + +SUPPORTED_TYPES = { + ".json", + ".jsonl", + ".csv", + ".parquet", +} + + +def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None: + """ + Saves a HuggingFace Dataset to file in a supported format. + + :param dataset: Dataset to save. + :param output_path: Output file path (.json, .jsonl, .csv, .parquet). + :raises ValueError: If the file extension is not supported. + """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + suffix = output_path.suffix.lower() + + if suffix == ".csv": + dataset.to_csv(output_path) + elif suffix in {".json", ".jsonl"}: + dataset.to_json(output_path) + elif suffix == ".parquet": + dataset.to_parquet(output_path) + else: + raise ValueError( + f"Unsupported file suffix '{suffix}' in output_path'{output_path}'." + f" Only {SUPPORTED_TYPES} are supported." 
+ ) diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index 49a8e417..b3878565 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch @@ -20,7 +19,6 @@ handle_pad_strategy, process_dataset, push_dataset_to_hub, - save_dataset_to_file, ) @@ -105,7 +103,7 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock): @pytest.mark.sanity def test_handle_pad_strategy(tokenizer_mock): result = handle_pad_strategy("short", 10, tokenizer_mock, "p") - assert result == "shortppppp" + assert result.startswith("shortppppp") @pytest.mark.sanity @@ -122,11 +120,11 @@ def test_handle_error_strategy_too_short_prompt(tokenizer_mock): @pytest.mark.smoke -@patch("guidellm.preprocess.dataset.save_dataset_to_file") -@patch("guidellm.preprocess.dataset.Dataset") -@patch("guidellm.preprocess.dataset.guidellm_load_dataset") -@patch("guidellm.preprocess.dataset.check_load_processor") -@patch("guidellm.preprocess.dataset.IntegerRangeSampler") +@patch(f"{process_dataset.__module__}.save_dataset_to_file") +@patch(f"{process_dataset.__module__}.Dataset") +@patch(f"{process_dataset.__module__}.guidellm_load_dataset") +@patch(f"{process_dataset.__module__}.check_load_processor") +@patch(f"{process_dataset.__module__}.IntegerRangeSampler") def test_process_dataset_non_empty( mock_sampler, mock_check_processor, @@ -291,81 +289,3 @@ def test_push_dataset_to_hub_error_no_id(): push_dataset_to_hub(None, mock_dataset) -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_csv(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.csv") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_csv.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_csv_capitalized(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.CSV") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_csv.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_json(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.json") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_json_capitalized(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.JSON") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_jsonl(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.jsonl") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, 
"mkdir") -def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.JSONL") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_json.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_parquet(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.parquet") - save_dataset_to_file(mock_dataset, output_path) - mock_dataset.to_parquet.assert_called_once_with(output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - -@pytest.mark.regression -@patch.object(Path, "mkdir") -def test_save_dataset_to_file_unsupported_type(mock_mkdir): - mock_dataset = MagicMock(spec=Dataset) - output_path = Path("some/path/output.txt") - with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): - save_dataset_to_file(mock_dataset, output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) diff --git a/tests/unit/utils/__init__.py b/tests/unit/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/utils/test_hf_datasets.py b/tests/unit/utils/test_hf_datasets.py new file mode 100644 index 00000000..a59f6a13 --- /dev/null +++ b/tests/unit/utils/test_hf_datasets.py @@ -0,0 +1,87 @@ +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest +from datasets import Dataset + +from guidellm.utils import save_dataset_to_file + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_csv(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.csv") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_csv.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_csv_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.CSV") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_csv.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_json(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.json") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_json_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.JSON") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_jsonl(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.jsonl") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def 
test_save_dataset_to_file_jsonl_capitalized(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.JSONL") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_json.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_parquet(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.parquet") + save_dataset_to_file(mock_dataset, output_path) + mock_dataset.to_parquet.assert_called_once_with(output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + + +@pytest.mark.regression +@patch.object(Path, "mkdir") +def test_save_dataset_to_file_unsupported_type(mock_mkdir): + mock_dataset = MagicMock(spec=Dataset) + output_path = Path("some/path/output.txt") + with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): + save_dataset_to_file(mock_dataset, output_path) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) \ No newline at end of file From 0cc3ffefdcb18155179c6bd9c87d12267b3cc9fe Mon Sep 17 00:00:00 2001 From: TomerG711 Date: Thu, 5 Jun 2025 12:54:39 +0300 Subject: [PATCH 17/18] Linters --- src/guidellm/benchmark/benchmark.py | 5 +---- src/guidellm/preprocess/dataset.py | 1 + src/guidellm/utils/__init__.py | 4 ++-- tests/unit/preprocess/test_dataset.py | 2 -- tests/unit/utils/test_hf_datasets.py | 4 ++-- 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 9f683f8e..1e2a5f4b 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -815,10 +815,7 @@ def from_stats( req.first_token_time or req.start_time for req in total_with_output_first ], - iter_counts=[ - req.output_tokens - for req in total_with_output_first - ], + iter_counts=[req.output_tokens for req in total_with_output_first], first_iter_counts=[ req.prompt_tokens for req in total_with_output_first ], diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index c6957863..a94b8a14 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -111,6 +111,7 @@ def handle_pad_strategy( pad_count *= pad_multiplier return prompt + def handle_error_strategy( current_prompt: str, min_prompt_tokens: int, diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 11751c0d..399c021d 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,7 +1,7 @@ from .colors import Colors from .hf_datasets import ( - save_dataset_to_file, SUPPORTED_TYPES, + save_dataset_to_file, ) from .hf_transformers import ( check_load_processor, @@ -18,6 +18,7 @@ ) __all__ = [ + "SUPPORTED_TYPES", "Colors", "EndlessTextCreator", "IntegerRangeSampler", @@ -29,5 +30,4 @@ "save_dataset_to_file", "split_text", "split_text_list_by_length", - "SUPPORTED_TYPES", ] diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py index b3878565..2a2a8293 100644 --- a/tests/unit/preprocess/test_dataset.py +++ b/tests/unit/preprocess/test_dataset.py @@ -287,5 +287,3 @@ def test_push_dataset_to_hub_error_no_id(): mock_dataset = MagicMock(spec=Dataset) with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"): push_dataset_to_hub(None, mock_dataset) - - diff --git a/tests/unit/utils/test_hf_datasets.py 
b/tests/unit/utils/test_hf_datasets.py index a59f6a13..bbd41c4f 100644 --- a/tests/unit/utils/test_hf_datasets.py +++ b/tests/unit/utils/test_hf_datasets.py @@ -1,5 +1,5 @@ from pathlib import Path -from unittest.mock import patch, MagicMock +from unittest.mock import MagicMock, patch import pytest from datasets import Dataset @@ -84,4 +84,4 @@ def test_save_dataset_to_file_unsupported_type(mock_mkdir): output_path = Path("some/path/output.txt") with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"): save_dataset_to_file(mock_dataset, output_path) - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) \ No newline at end of file + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) From c0cd1c9936d6e503b9fddffafabf4380c73f98b7 Mon Sep 17 00:00:00 2001 From: TomerG711 Date: Thu, 5 Jun 2025 12:55:58 +0300 Subject: [PATCH 18/18] Linters --- src/guidellm/benchmark/benchmark.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 1e2a5f4b..9f683f8e 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -815,7 +815,10 @@ def from_stats( req.first_token_time or req.start_time for req in total_with_output_first ], - iter_counts=[req.output_tokens for req in total_with_output_first], + iter_counts=[ + req.output_tokens + for req in total_with_output_first + ], first_iter_counts=[ req.prompt_tokens for req in total_with_output_first ],
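
Taken together, the final padding logic appends the pad character in geometrically growing chunks, so the tokenizer is re-invoked only a logarithmic number of times instead of once per added character. The sketch below reproduces that loop with a toy whitespace tokenizer purely for illustration; the toy tokenizer class, the pad string " p", and the numbers are assumptions of this example, not code from the patches above.

class ToyWhitespaceTokenizer:
    """Stand-in for PreTrainedTokenizerBase: one token per whitespace-separated word."""

    def encode(self, text):
        return text.split()


def pad_prompt(prompt, min_tokens, tokenizer, pad_chunk=" p", multiplier=2):
    # Mirrors the geometric-growth loop of handle_pad_strategy: append the pad
    # pad_count times, re-encode, then multiply pad_count (multiplier defaults to 2).
    tokens = tokenizer.encode(prompt)
    pad_count = 1
    padded = prompt
    while len(tokens) < min_tokens:
        padded += pad_chunk * pad_count
        tokens = tokenizer.encode(padded)
        pad_count *= multiplier
    return padded


if __name__ == "__main__":
    tokenizer = ToyWhitespaceTokenizer()
    padded = pad_prompt("short prompt", 10, tokenizer)
    print(len(tokenizer.encode(padded)))  # at least 10 once the loop exits

In the real function the pad character is user-supplied (per decode_escaped_str's docstring, the CLI decodes escape sequences such as "\n" passed to --pad-char), and pad_multiplier defaults to 2.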