Skip to content

Feat/add preprocess dataset #162

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Jun 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c700d9d
Implemented preprocessing for datasets
TomerG711 May 15, 2025
91027e1
Implemented preprocessing for datasets
TomerG711 May 15, 2025
e02ec8a
Reverted irrelevant formatting
TomerG711 May 15, 2025
fc4bfb2
Removed redundant comments
TomerG711 May 15, 2025
2bf436c
Added support for saving in specific file format
TomerG711 May 18, 2025
64a91de
Fixed code styling
TomerG711 May 18, 2025
b5a4877
Improved error message
TomerG711 May 18, 2025
4d624a2
Fixed code styling
TomerG711 May 20, 2025
8e737b6
Merge branch 'main' into fead/add-preprocess-dataset
TomerG711 May 20, 2025
829366a
Merge branch 'main' into fead/add-preprocess-dataset
markurtz May 27, 2025
6f275d7
Merge branch 'main' into fead/add-preprocess-dataset
markurtz May 29, 2025
cf99526
Merge branch 'main' into fead/add-preprocess-dataset
TomerG711 Jun 1, 2025
e2ca919
Fixed CR comments
TomerG711 Jun 1, 2025
4af81d9
Merge remote-tracking branch 'origin/fead/add-preprocess-dataset' int…
TomerG711 Jun 1, 2025
a9f4fa6
Fixed UTs
TomerG711 Jun 1, 2025
40c1118
Added pytest mark to UTs
TomerG711 Jun 1, 2025
f61b7f0
Added docs
TomerG711 Jun 1, 2025
b6146c9
Ran tox -e style
TomerG711 Jun 1, 2025
f3a3cd7
Fixed help for preprocess dataset subcommand
TomerG711 Jun 1, 2025
448d609
Fixed help for preprocess dataset subcommand
TomerG711 Jun 1, 2025
c48472a
Fixed CR comments
TomerG711 Jun 5, 2025
0cc3ffe
Linters
TomerG711 Jun 5, 2025
c0cd1c9
Linters
TomerG711 Jun 5, 2025
06f19a0
Merge branch 'main' into fead/add-preprocess-dataset
markurtz Jun 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions src/guidellm/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import codecs
import json
from pathlib import Path
from typing import get_args
Expand All @@ -8,6 +9,7 @@
from guidellm.backend import BackendType
from guidellm.benchmark import ProfileType, benchmark_generative_text
from guidellm.config import print_config
from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset
from guidellm.scheduler import StrategyType

STRATEGY_PROFILE_CHOICES = set(
Expand Down Expand Up @@ -280,6 +282,20 @@ def benchmark(
)


def decode_escaped_str(_ctx, _param, value):
    """
    Click option callback that decodes backslash escape sequences.

    Click passes option values through literally, so a user-supplied
    --pad-char "\\n" arrives as the two characters backslash + "n".
    This callback decodes such sequences into the actual characters.

    :param _ctx: Click context (unused).
    :param _param: Click parameter (unused).
    :param value: Raw option string, or None if the option was not given.
    :return: The decoded string, or None if ``value`` is None.
    :raises click.BadParameter: If the value contains malformed escape
        sequences.
    """
    if value is None:
        return None
    try:
        # "unicode_escape" interprets sequences like \n, \t, \x41, \u00e9.
        # NOTE(review): this codec round-trips non-ASCII text via latin-1,
        # which can mangle multi-byte UTF-8 input — assumes pad chars are
        # ASCII or escape-encoded; confirm if non-ASCII pads are expected.
        return codecs.decode(value, "unicode_escape")
    except ValueError as e:
        # Narrowed from `except Exception`: codecs.decode signals malformed
        # escapes via UnicodeDecodeError/ValueError; a broad catch would
        # also mask genuine programming errors (e.g. TypeError) as
        # user-input errors.
        raise click.BadParameter(f"Could not decode escape sequences: {e}") from e


@cli.command(
help=(
"Print out the available configuration settings that can be set "
Expand All @@ -290,5 +306,139 @@ def config():
print_config()


@cli.group(help="General preprocessing tools and utilities.")
def preprocess():
    """Parent command group for preprocessing subcommands (e.g. ``dataset``)."""
    # Intentionally empty: click dispatches directly to the subcommands
    # registered on this group.
    pass


@preprocess.command(
    help=(
        "Convert a dataset to have specific prompt and output token sizes.\n"
        "DATA: Path to the input dataset or dataset ID.\n"
        "OUTPUT_PATH: Path to save the converted dataset, including file suffix."
    )
)
@click.argument(
    "data",
    type=str,
    required=True,
)
@click.argument(
    "output_path",
    type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True),
    required=True,
)
@click.option(
    "--processor",
    type=str,
    required=True,
    help=(
        "The processor or tokenizer to use to calculate token counts for statistics "
        "and synthetic data generation."
    ),
)
@click.option(
    "--processor-args",
    default=None,
    callback=parse_json,
    help=(
        "A JSON string containing any arguments to pass to the processor constructor "
        "as a dict with **kwargs."
    ),
)
@click.option(
    "--data-args",
    callback=parse_json,
    help=(
        "A JSON string containing any arguments to pass to the dataset creation "
        "as a dict with **kwargs."
    ),
)
@click.option(
    "--short-prompt-strategy",
    type=click.Choice([s.value for s in ShortPromptStrategy]),
    default=ShortPromptStrategy.IGNORE.value,
    show_default=True,
    help="Strategy to handle prompts shorter than the target length. ",
)
@click.option(
    "--pad-char",
    type=str,
    default="",
    callback=decode_escaped_str,
    help="The token to pad short prompts with when using the 'pad' strategy.",
)
@click.option(
    "--concat-delimiter",
    type=str,
    default="",
    help=(
        "The delimiter to use when concatenating prompts that are too short."
        " Used when strategy is 'concatenate'."
    ),
)
@click.option(
    "--prompt-tokens",
    type=str,
    default=None,
    help="Prompt tokens config (JSON, YAML file or key=value string)",
)
@click.option(
    "--output-tokens",
    type=str,
    default=None,
    help="Output tokens config (JSON, YAML file or key=value string)",
)
@click.option(
    "--push-to-hub",
    is_flag=True,
    help="Set this flag to push the converted dataset to the Hugging Face Hub.",
)
@click.option(
    "--hub-dataset-id",
    type=str,
    default=None,
    help="The Hugging Face Hub dataset ID to push to. "
    "Required if --push-to-hub is used.",
)
@click.option(
    "--random-seed",
    type=int,
    default=42,
    show_default=True,
    help="Random seed for prompt token sampling and output tokens sampling.",
)
def dataset(
    data,
    output_path,
    processor,
    processor_args,
    data_args,
    short_prompt_strategy,
    pad_char,
    concat_delimiter,
    prompt_tokens,
    output_tokens,
    push_to_hub,
    hub_dataset_id,
    random_seed,
):
    """
    `preprocess dataset` CLI subcommand.

    Thin wrapper that forwards every parsed CLI argument/option to
    ``guidellm.preprocess.dataset.process_dataset``, which performs the
    actual conversion (token sizing, short-prompt handling, optional
    Hugging Face Hub upload).

    NOTE(review): validation that --hub-dataset-id is present when
    --push-to-hub is set is presumably done inside process_dataset —
    confirm; it is not enforced by the click options here.
    """
    process_dataset(
        data=data,
        output_path=output_path,
        processor=processor,
        prompt_tokens=prompt_tokens,
        output_tokens=output_tokens,
        processor_args=processor_args,
        data_args=data_args,
        short_prompt_strategy=short_prompt_strategy,
        pad_char=pad_char,
        concat_delimiter=concat_delimiter,
        push_to_hub=push_to_hub,
        hub_dataset_id=hub_dataset_id,
        random_seed=random_seed,
    )


if __name__ == "__main__":
cli()
3 changes: 3 additions & 0 deletions src/guidellm/preprocess/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .dataset import ShortPromptStrategy, process_dataset

__all__ = ["ShortPromptStrategy", "process_dataset"]
Loading