From 646233d228511614635e5618ce6291cf0f05db75 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Tue, 9 Dec 2025 06:08:14 +0000 Subject: [PATCH 1/6] [QEff.finetuning] Adding config_manager and its test cases. Signed-off-by: Tanisha Chawada --- .../experimental/core/config_manager.py | 648 ++++++++++++++++++ .../experimental/tests/test_config.yaml | 117 ++++ .../experimental/tests/test_config_manager.py | 50 ++ 3 files changed, 815 insertions(+) create mode 100644 QEfficient/finetune/experimental/tests/test_config.yaml create mode 100644 QEfficient/finetune/experimental/tests/test_config_manager.py diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index d647b73a6..60ed4d4b6 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -4,3 +4,651 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +""" +Configuration manager for handling all training configurations. +Provides centralized configuration loading, validation, and management. 
+""" + +import json +import os +import sys +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any, Dict, Optional, Union + +import yaml +from transformers.hf_argparser import HfArgumentParser + +from QEfficient.finetune.experimental.core.component_registry import registry + + +@dataclass +class OptimizerConfig: + """Configuration for optimizers.""" + + optimizer_name: str = field( + default="adamw", + metadata={"help": "The name of the optimizer to use."}, + ) + lr: float = field( + default=5e-5, + metadata={"help": "The initial learning rate for the optimizer."}, + ) + weight_decay: float = field( + default=0.01, + metadata={"help": "The weight decay to apply (if any)."}, + ) + + +@dataclass +class SchedulerConfig: + """Configuration for learning rate schedulers.""" + + scheduler_name: str = field( + default="cosine", + metadata={"help": "The name of the scheduler to use (e.g., 'linear', 'cosine')."}, + ) + warmup_steps: int = field( + default=100, + metadata={ + "help": "Number of steps for the warmup phase. If provided " + "value is within [0-1) range then it will be interpreted as " + "ratio of total training steps for the warmup phase." 
+ }, + ) + + +@dataclass +class DatasetConfig: + """Configuration for datasets.""" + + tokenizer_name: str = field( + default="HuggingFaceTB/SmolLM-135M", + metadata={"help": "The name or path of the tokenizer to use."}, + ) + dataset_type: str = field( + default="seq_completion", + metadata={"help": "The type of dataset (e.g., 'seq_completion')."}, + ) + dataset_name: str = field( + default="knkarthick/samsum", + metadata={"help": "The name or path of the dataset."}, + ) + dataset_subset: str = field( + default="default", + metadata={"help": "The subset of the dataset to use, if applicable."}, + ) + train_split: str = field( + default="train", + metadata={"help": "The name of the training split."}, + ) + test_split: str = field( + default="test", + metadata={"help": "The name of the test/validation split."}, + ) + max_seq_length: int = field( + default=512, + metadata={"help": "The maximum sequence length for tokenization."}, + ) + split_ratio: float = field( + default=0.8, + metadata={"help": "Ratio for train/test split, used when only train_split is provided."}, + ) + input_columns: list[str] = field( + default_factory=lambda: ["text"], + metadata={"help": "List of column names containing input text."}, + ) + target_column: Optional[str] = field( + default=None, + metadata={"help": "Name of the column containing target labels (if applicable)."}, + ) + train_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during training."}, + ) + eval_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during evaluation."}, + ) + num_workers: int = field( + default=4, + metadata={"help": "Number of workers for dataset processing."}, + ) + collate_fn: str = field( + default="dynamic_padding", + metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."}, + ) + group_by_length: bool = field( + default=True, + metadata={"help": "Whether to group samples by length to minimize padding."}, + ) + 
length_column_name: str = field( + default="input_ids", + metadata={"help": "The column name containing the length of the input sequences."}, + ) + dataloader_pin_memory: bool = field( + default=True, + metadata={"help": "Whether to pin GPU memory for dataloaders."}, + ) + dataloader_persistent_workers: bool = field( + default=True, + metadata={"help": "Whether to keep dataloader workers alive across epochs."}, + ) + dataloader_prefetch_factor: int = field( + default=1, + metadata={"help": "Number of samples loaded in advance by each worker."}, + ) + dataloader_drop_last: bool = field( + default=False, + metadata={"help": "Whether to drop the last incomplete batch."}, + ) + dataloader_num_workers: int = field( + default=1, + metadata={"help": "Number of workers for the DataLoader."}, + ) + + +@dataclass +class PeftConfig: + """Configuration for PEFT (Parameter-Efficient Fine-Tuning) methods.""" + + lora_r: int = field( + default=8, + metadata={"help": "Lora attention dimension."}, + ) + lora_alpha: int = field( + default=16, + metadata={"help": "Lora alpha."}, + ) + lora_dropout: float = field( + default=0.1, + metadata={"help": "The dropout probability for Lora layers."}, + ) + target_modules: list[str] = field( + default_factory=lambda: ["q_proj", "v_proj"], + metadata={"help": "The modules to apply Lora to."}, + ) + bias: str = field( + default="none", + metadata={"help": "Bias type for Lora ('none', 'all', 'lora_only')."}, + ) + task_type: str = field( + default="CAUSAL_LM", + metadata={"help": "The task type for PEFT (e.g., 'CAUSAL_LM', 'SEQ_2_SEQ_LM')."}, + ) + peft_type: str = field( + default="LORA", + metadata={"help": "The PEFT method to use (e.g., 'LORA', 'IA3')."}, + ) + + +@dataclass +class ModelConfig: + """Configuration for models.""" + + model_name: str = field( + default="HuggingFaceTB/SmolLM-135M", + metadata={"help": "The name or path of the pretrained model."}, + ) + model_type: str = field( + default="hf", + metadata={"help": "The type of model 
('hf' for Hugging Face, 'custom' for custom models)."}, + ) + auto_class_name: str = field( + default="AutoModelForCausalLM", + metadata={"help": "The AutoClass name to load the model (e.g., 'AutoModelForCausalLM')."}, + ) + load_in_4bit: bool = field( + default=False, + metadata={"help": "Whether to load the model in 4-bit quantization."}, + ) + use_peft: bool = field( + default=True, + metadata={"help": "Whether to use PEFT (Parameter-Efficient Fine-Tuning)."}, + ) + peft_config: Optional[PeftConfig] = field( + default_factory=PeftConfig, + metadata={"help": "Configuration for PEFT."}, + ) + use_cache: bool = field( + default=False, + metadata={"help": "Whether to use the past key/values in the model for faster decoding."}, + ) + attn_implementation: str = field( + default="sdpa", + metadata={"help": "The attention implementation to use (e.g., 'sdpa', 'eager')."}, + ) + device_map: Optional[str] = field( + default=None, + metadata={"help": "The device map to use for model distribution (e.g., 'auto')."}, + ) + + +@dataclass +class CallbackConfig: + """Configuration for callbacks.""" + + callbacks: Dict[str, Dict[str, Any]] = field( + default_factory=dict, + metadata={"help": "Dictionary of callback configurations, keyed by callback name."}, + ) + + +@dataclass +class GradientCheckpointingKwargs: + """Arguments for gradient checkpointing.""" + + preserve_rng_state: bool = field( + default=True, + metadata={"help": "Whether to preserve the RNG state when checkpointing."}, + ) + use_reenrant: bool = field( + default=False, + metadata={"help": "Whether to use reentrant gradient checkpointing."}, + ) + + +@dataclass +class DdpConfig: + """Arguments for Distributed Data Parallel (DDP) training.""" + + ddp_backend: str = field( + default="qccl", + metadata={"help": "The DDP backend to use (e.g., 'nccl', 'gloo', 'qccl')."}, + ) + ddp_find_unused_parameters: bool = field( + default=True, + metadata={"help": "Whether to find unused parameters in DDP."}, + ) + 
ddp_bucket_cap_mb: Optional[int] = field( + default=25, + metadata={"help": "The bucket size in MB for DDP communication."}, + ) + ddp_broadcast_buffers: bool = field( + default=True, + metadata={"help": "Whether to broadcast buffers in DDP."}, + ) + ddp_timeout: int = field( + default=1800, + metadata={"help": "Timeout for DDP operations in seconds."}, + ) + + +@dataclass +class TrainingConfig: + """Configuration for training.""" + + type: str = field( + default="sft", + metadata={"help": "The type of training (e.g., 'sft' for Supervised Fine-Tuning)."}, + ) + output_dir: str = field( + default="./training_results", + metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, + ) + overwrite_output_dir: bool = field( + default=False, + metadata={"help": "Whether to overwrite the output directory."}, + ) + seed: int = field( + default=42, + metadata={"help": "Random seed for reproducibility."}, + ) + + do_eval: bool = field( + default=True, + metadata={"help": "Whether to run evaluation during training."}, + ) + eval_strategy: str = field( + default="epoch", + metadata={"help": "The evaluation strategy to use ('no', 'steps', 'epoch')."}, + ) + eval_steps: int = field( + default=100, + metadata={"help": "Number of update steps between two evaluations."}, + ) + + per_device_train_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during training."}, + ) + per_device_eval_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during evaluation."}, + ) + gradient_accumulation_steps: int = field( + default=1, + metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, + ) + num_train_epochs: int = field( + default=1, + metadata={"help": "Total number of training epochs to perform."}, + ) + max_steps: int = field( + default=-1, + metadata={"help": "If > 0: set total number of training steps to perform."}, + ) + + log_level: str 
= field( + default="info", + metadata={"help": "Set the verbosity level of the logs ('debug', 'info', 'warning', 'error')."}, + ) + log_on_each_node: bool = field( + default=True, + metadata={"help": "Whether to log on each node in a distributed setup."}, + ) + logging_strategy: str = field( + default="steps", + metadata={"help": "The logging strategy to use ('no', 'steps', 'epoch')."}, + ) + logging_steps: int = field( + default=10, + metadata={"help": "Number of update steps between two loggings."}, + ) + + save_strategy: str = field( + default="epoch", + metadata={"help": "The checkpoint save strategy to use ('no', 'steps', 'epoch')."}, + ) + save_steps: int = field( + default=100, + metadata={"help": "Number of update steps between two checkpoints (if save_strategy is 'steps')."}, + ) + save_total_limit: int = field( + default=5, + metadata={"help": "Limit the total amount of checkpoints. Deletes older checkpoints to stay within limit."}, + ) + metric_for_best_model: str = field( + default="eval_loss", + metadata={"help": "The metric to use to compare two models ('eval_loss', etc.)."}, + ) + + dtype: str = field( + default="fp16", + metadata={"help": "The data type to use for training (e.g., 'fp16', 'bf16')."}, + ) + + gradient_checkpointing: bool = field( + default=False, + metadata={"help": "Whether to use gradient checkpointing."}, + ) + gradient_checkpointing_kwargs: Optional[GradientCheckpointingKwargs] = field( + default_factory=GradientCheckpointingKwargs, + metadata={"help": "Arguments for gradient checkpointing."}, + ) + + torch_compile: bool = field( + default=True, + metadata={"help": "Whether to compile the model with `torch.compile`."}, + ) + include_tokens_per_second: bool = field( + default=True, + metadata={"help": "Whether to include tokens per second in logs."}, + ) + include_num_input_tokens_seen: bool = field( + default=True, + metadata={"help": "Whether to include the number of input tokens seen in logs."}, + ) + 
average_tokens_across_devices: bool = field( + default=True, + metadata={"help": "Whether to average tokens across devices in distributed training."}, + ) + + disable_tqdm: Optional[bool] = field( + default=None, + metadata={"help": "Whether to disable the tqdm progress bar."}, + ) + fsdp_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "FSDP configuration dictionary."}, + ) + deepspeed_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "DeepSpeed configuration dictionary."}, + ) + accelerator_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "Accelerate configuration dictionary."}, + ) + ddp_config: Optional[DdpConfig] = field( + default_factory=DdpConfig, + metadata={"help": "DDP configuration dictionary."}, + ) + use_cpu: Optional[bool] = field( + default=None, + metadata={"help": "Whether to explicitly run training on CPU."}, + ) + resume_from_checkpoint: Optional[str] = field( + default=None, + metadata={"help": "Path to a checkpoint to resume training from."}, + ) + restore_callback_states_from_checkpoint: Optional[bool] = field( + default=None, + metadata={"help": "Whether to restore callback states from checkpoint."}, + ) + + +@dataclass +class MasterConfig: + """Main training configuration.""" + + model: ModelConfig = field(default_factory=ModelConfig, metadata={"help": "Configuration for the model."}) + + dataset: DatasetConfig = field(default_factory=DatasetConfig, metadata={"help": "Configuration for the dataset."}) + + optimizers: OptimizerConfig = field( + default_factory=OptimizerConfig, metadata={"help": "Configuration for optimizers."} + ) + + scheduler: SchedulerConfig = field( + default_factory=SchedulerConfig, metadata={"help": "Configuration for the learning rate scheduler."} + ) + + callbacks: CallbackConfig = field(default_factory=CallbackConfig, metadata={"help": "Configuration for callbacks."}) + + training: TrainingConfig = field( + 
default_factory=TrainingConfig, metadata={"help": "Configuration for training parameters."} + ) + + extra_params: Dict[str, Any] = field( + default_factory=dict, metadata={"help": "Additional top-level parameters not explicitly defined."} + ) + + +def parse_arguments(config_path: Optional[str] = None) -> MasterConfig: + """Create argument parser for the new finetuning interface.""" + parser = HfArgumentParser(MasterConfig) + + if config_path: + config_path = os.path.abspath(config_path) + if not os.path.exists(config_path): + raise FileNotFoundError(f"Config file not found: {config_path}") + if not (config_path.endswith(".yaml") or config_path.endswith(".yml")): + raise ValueError(f"Expected a .yaml/.yml file, got: {config_path}") + + try: + (master_config,) = parser.parse_yaml_file(yaml_file=config_path) + return master_config + except Exception as e: + raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") + + if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. 
+ master_config = parser.parse_yaml_file(yaml_file=os.path.abspath(sys.argv[1]))[0] + else: + master_config = parser.parse_args_into_dataclasses() + + return master_config + + +class ConfigManager: + """Manages configuration loading, validation, and updates.""" + + def __init__(self, config: MasterConfig): + """ + Initialize ConfigManager with either: + - Path to config file (str or Path) + - Configuration dictionary + - None (creates empty config) + """ + self.config = config + + def load_config(self, config_path: Union[str, Path]) -> None: + """Load configuration from file.""" + config_path = Path(config_path) + + if not config_path.exists(): + raise FileNotFoundError(f"Configuration file not found: {config_path}") + + if config_path.suffix.lower() in [".yaml", ".yml"]: + with open(config_path, "r") as f: + config_dict = yaml.safe_load(f) + elif config_path.suffix.lower() == ".json": + with open(config_path, "r") as f: + config_dict = json.load(f) + else: + raise ValueError(f"Unsupported configuration file format: {config_path.suffix}") + + self.update_config(config_dict) + + def update_config(self, config_dict: Dict[str, Any]) -> None: + """Update configuration with dictionary values.""" + for key, value in config_dict.items(): + if hasattr(self.config, key): + if isinstance(value, dict) and hasattr(getattr(self.config, key), "__dataclass_fields__"): + # Special handling for callbacks + if key in ["callbacks", "optimizers", "loss_functions"]: + nested_config = getattr(self.config, key) + for component_name, component_dict in value.items(): + if isinstance(component_dict, dict): + getattr(nested_config, key)[component_name] = component_dict + else: + getattr(nested_config, "extra_params")[component_name] = nested_config.extra_params[ + component_name + ] = component_dict + else: + # Update nested dataclass + nested_config = getattr(self.config, key) + for nested_key, nested_value in value.items(): + if hasattr(nested_config, nested_key): + 
setattr(getattr(self.config, key), nested_key, nested_value) + elif hasattr(nested_config, "extra_params"): + getattr(getattr(self.config, key), "extra_params")[nested_key] = nested_value + else: + setattr(self.config, key, value) + else: + # Store unknown parameters in extra_params + self.config.extra_params[key] = value + + def save_config(self, output_path: Union[str, Path]) -> None: + """Save current configuration to file.""" + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + config_dict = self.config + + if output_path.suffix.lower() in [".yaml", ".yml"]: + with open(output_path, "w") as f: + yaml.dump(config_dict, f, default_flow_style=False, indent=2) + elif output_path.suffix.lower() == ".json": + with open(output_path, "w") as f: + json.dump(config_dict, f, indent=2) + else: + raise ValueError(f"Unsupported output file format: {output_path.suffix}") + + def validate_config(self) -> None: + """Validate configuration parameters.""" + errors = [] + + # Validate model configuration + if not self.config.model.model_name: + errors.append("Model name is required") + + # Validate dataset configuration + if not self.config.dataset.dataset_name: + errors.append("Dataset name is required") + + # Validate training parameters + if self.config.dataset.train_batch_size <= 0: + errors.append("Train batch size must be positive") + + if self.config.dataset.eval_batch_size <= 0: + errors.append("Validation batch size must be positive") + + if self.config.training.num_train_epochs <= 0: + errors.append("Number of epochs must be positive") + + if self.config.training.gradient_accumulation_steps <= 0: + errors.append("Gradient accumulation steps must be positive") + + # Validate device configuration + valid_devices = ["cpu", "cuda", "qaic"] + if self.config.training.device not in valid_devices: + errors.append(f"Device must be one of {valid_devices}") + + if errors: + raise ValueError("Configuration validation failed:\n" + 
"\n".join(f"- {error}" for error in errors)) + + def get_callback_config(self) -> Dict[str, Any]: + """Get callback configuration as dictionary.""" + return self.config.callbacks + + def get_optimizer_config(self) -> Dict[str, Any]: + """Get optimizer configuration as dictionary.""" + return self.config.optimizers + + def get_training_config(self) -> Dict[str, Any]: + """Get training configuration as dictionary.""" + return self.config.training + + def get_scheduler_config(self) -> Dict[str, Any]: + """Get scheduler configuration as dictionary.""" + return self.config.scheduler + + def get_dataset_config(self) -> Dict[str, Any]: + """Get dataset configuration as dictionary.""" + return self.config.dataset + + def get_model_config(self) -> Dict[str, Any]: + """Get model configuration as dictionary.""" + return self.config.model + + def to_dict(self) -> Dict[str, Any]: + """Convert configuration to dictionary.""" + return asdict(self.config) + + def __getattr__(self, name: str) -> Any: + """Allow direct access to config attributes.""" + if hasattr(self.config, name): + return getattr(self.config, name) + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + +def create_trainer_config(name: str, **dependencies) -> tuple: + """ + Create trainer configuration based on registered trainer modules. 
+ + Args: + name: Name of the trainer type + **dependencies: Any dependencies needed to configure the trainer + + Returns: + tuple: (trainer_class, args_class, additional_kwargs) + """ + config = registry.get_trainer_module(name) + + # Process required kwargs based on available dependencies + additional_kwargs = {} + for kwarg, default in config["required_kwargs"].items(): + if kwarg in dependencies: + additional_kwargs[kwarg] = dependencies[kwarg] + elif default != "REQUIRED": + additional_kwargs[kwarg] = default + + # Check for missing required arguments + for kwarg, default in config["required_kwargs"].items(): + if kwarg not in additional_kwargs and default == "REQUIRED": + raise ValueError(f"Required argument '{kwarg}' not provided for trainer '{name}'") + + return config["trainer_cls"], config["args_cls"], additional_kwargs diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml new file mode 100644 index 000000000..59d388bd3 --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_config.yaml @@ -0,0 +1,117 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + load_in_4bit: false + use_peft: true + peft_config: + lora_r: 8 + lora_alpha: 16 + lora_dropout: 0.1 + target_modules: ["q_proj", "v_proj"] + bias: "none" # Options: none, all, lora_only + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. 
+ +# Dataset configuration +dataset: + tokenizer_name: "HuggingFaceTB/SmolLM-135M" + dataset_type: "seq_completion" + # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M" + dataset_name: "knkarthick/samsum" + train_split: "train" + max_seq_length: 512 + split_ratio: 0.8 # Ratio for train/test split, used when only train_split is provided + test_split: "test" + group_by_length: True + num_workers: 4 + pin_memory: True + persistent_workers: True + prefetch_factor: 1 + drop_last: False + +# Training configuration +training: + type: "sft" + output_dir: "./training_results" + overwrite_output_dir: False + seed: 42 + + do_eval: True + eval_strategy: "epoch" + eval_steps: 100 + + per_device_train_batch_size: 1 + per_device_eval_batch_size: 1 + gradient_accumulation_steps: 1 + num_train_epochs: 1 + max_steps: -1 + + log_level: "info" + log_on_each_node: True + logging_strategy: "steps" + logging_steps: 10 + + save_strategy: "epoch" + save_steps: 100 # If 'save_strategy' is 'steps' then it will be used. 
+ save_total_limit: 5 + metric_for_best_model: "eval_loss" + + dtype: "fp16" + completion_only_loss: True + report_to: "trackio" + + ddp_config: + ddp_backend: "qccl" + ddp_find_unused_parameters: False + ddp_bucket_cap_mb: 25 + ddp_broadcast_buffers: null + ddp_timeout: 1800 + + # Uncomment below to explicitly run on CPU + use_cpu: False + + gradient_checkpointing: False + gradient_checkpointing_kwargs: + preserve_rng_state : True + use_reenrant: False + + torch_compile: True + include_tokens_per_second: True + include_num_input_tokens_seen: True + average_tokens_across_devices: True + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 5e-5 + weight_decay: 0.01 + + +# “linear” → transformers.get_linear_schedule_with_warmup +# “cosine” → transformers.get_cosine_schedule_with_warmup +# “cosine_with_restarts” -->transformers.get_cosine_with_hard_restarts_schedule_with_warmup +# “polynomial” → transformers.get_polynomial_decay_schedule_with_warmup +# “constant” → transformers.get_constant_schedule +# “constant_with_warmup” → transformers.get_constant_schedule_with_warmup +# “inverse_sqrt” → transformers.get_inverse_sqrt_schedule + +scheduler: + scheduler_name: "cosine" + warmup_steps: 100 # warmup_steps or warmup_ratio + warmup_ratio: 0.1 + +callbacks: + early_stopping: + early_stopping_patience: 3 + early_stopping_threshold: 0.001 + tensorboard: + diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py new file mode 100644 index 000000000..10105a33e --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_config_manager.py @@ -0,0 +1,50 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +from pathlib import Path + +import pytest + +from QEfficient.finetune.experimental.core.config_manager import ConfigManager, parse_arguments + + +@pytest.fixture +def config_path() -> Path: + here = Path(__file__).resolve().parent + return (here / "test_config.yaml").resolve() + + +# git commit -s -m "[QEff.finetuning] Adding config_manager and its test cases." + + +def test_config(config_path): + # parse the yaml file + master_config = parse_arguments(config_path) + config_manager = ConfigManager(master_config) + # Test that the config manager is initialized correctly + assert isinstance(config_manager, ConfigManager) + + # Test that all required fields are present + missing = [ + a + for a in ("model", "dataset", "optimizers", "scheduler", "callbacks", "training") + if not hasattr(config_manager, a) + ] + assert not missing, f"Missing attributes: {missing}" + trainer_config = config_manager.get_training_config() + assert (hasattr(trainer_config, attr) for attr in ("output_dir", "train_batch_size", "num_epochs")) + dataset_config = config_manager.get_dataset_config() + assert (hasattr(dataset_config, attr) for attr in ("dataset_type", "dataset_name", "tokenizer_name")) + model_config = config_manager.get_model_config() + assert (hasattr(model_config, attr) for attr in ("model_type", "model_name", "use_peft")) + scheduler_config = config_manager.get_scheduler_config() + assert (hasattr(scheduler_config, attr) for attr in ("scheduler_name")) + callback_config = config_manager.get_callback_config() + assert (hasattr(callback_config, attr) for attr in ("earlystopping")) + optimizer_config = config_manager.get_optimizer_config() + assert (hasattr(optimizer_config, attr) for attr in ("optimizer_name", "lr")) From 848c911328bf6d3fb8078ef225e433ac5fb3eccc Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Tue, 9 Dec 2025 07:31:48 +0000 
Subject: [PATCH 2/6] [QEff.finetuning] Adding config_manager and its test cases. Signed-off-by: Tanisha Chawada --- QEfficient/finetune/experimental/tests/test_config_manager.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py index 10105a33e..b3b9b0b24 100644 --- a/QEfficient/finetune/experimental/tests/test_config_manager.py +++ b/QEfficient/finetune/experimental/tests/test_config_manager.py @@ -19,9 +19,6 @@ def config_path() -> Path: return (here / "test_config.yaml").resolve() -# git commit -s -m "[QEff.finetuning] Adding config_manager and its test cases." - - def test_config(config_path): # parse the yaml file master_config = parse_arguments(config_path) From 1607c43738a9f4a6e4973214df82a10aa4afec9d Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Thu, 11 Dec 2025 07:26:49 +0000 Subject: [PATCH 3/6] [QEff.finetuning] Adding config_manager and its test_cases. 
Signed-off-by: Tanisha Chawada --- .../experimental/core/config_manager.py | 233 +++++++++++++----- .../experimental/core/utils/profiler_utils.py | 88 ------- .../experimental/tests/test_config.yaml | 33 +-- .../experimental/tests/test_config_manager.py | 25 +- 4 files changed, 196 insertions(+), 183 deletions(-) diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index 60ed4d4b6..b28c2e1e3 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -11,10 +11,9 @@ import json import os -import sys -from dataclasses import asdict, dataclass, field +from dataclasses import asdict, dataclass, field, fields, is_dataclass from pathlib import Path -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union import yaml from transformers.hf_argparser import HfArgumentParser @@ -257,7 +256,7 @@ class DdpConfig: metadata={"help": "The DDP backend to use (e.g., 'nccl', 'gloo', 'qccl')."}, ) ddp_find_unused_parameters: bool = field( - default=True, + default=False, metadata={"help": "Whether to find unused parameters in DDP."}, ) ddp_bucket_cap_mb: Optional[int] = field( @@ -294,7 +293,10 @@ class TrainingConfig: default=42, metadata={"help": "Random seed for reproducibility."}, ) - + device: str = field( + default="qaic", + metadata={"help": "The device to use for training ('cuda', 'cpu', etc.)."}, + ) do_eval: bool = field( default=True, metadata={"help": "Whether to run evaluation during training."}, @@ -307,7 +309,6 @@ class TrainingConfig: default=100, metadata={"help": "Number of update steps between two evaluations."}, ) - per_device_train_batch_size: int = field( default=1, metadata={"help": "Batch size per device during training."}, @@ -381,10 +382,6 @@ class TrainingConfig: default=True, metadata={"help": "Whether to compile the model with `torch.compile`."}, ) - 
include_tokens_per_second: bool = field( - default=True, - metadata={"help": "Whether to include tokens per second in logs."}, - ) include_num_input_tokens_seen: bool = field( default=True, metadata={"help": "Whether to include the number of input tokens seen in logs."}, @@ -426,6 +423,14 @@ class TrainingConfig: default=None, metadata={"help": "Whether to restore callback states from checkpoint."}, ) + report_to: Optional[List[str]] = field( + default=None, + metadata={"help": "The list of integrations to report the results and logs to."}, + ) + completion_only_loss: Optional[bool] = field( + default=False, + metadata={"help": "Whether to compute loss only on completion tokens."}, + ) @dataclass @@ -455,7 +460,7 @@ class MasterConfig: ) -def parse_arguments(config_path: Optional[str] = None) -> MasterConfig: +def parse_arguments(config_path: Optional[str] = None, args: Optional[List[str]] = None) -> MasterConfig: """Create argument parser for the new finetuning interface.""" parser = HfArgumentParser(MasterConfig) @@ -472,12 +477,15 @@ def parse_arguments(config_path: Optional[str] = None) -> MasterConfig: except Exception as e: raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") - if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- master_config = parser.parse_yaml_file(yaml_file=os.path.abspath(sys.argv[1]))[0] + args = [] if args is None else args + # If a single positional YAML file was passed via args, parse it as YAML + if len(args) == 1 and (args[0].endswith(".yaml") or args[0].endswith(".yml")): + yaml_path = os.path.abspath(args[0]) + (master_config,) = parser.parse_yaml_file(yaml_file=yaml_path) else: - master_config = parser.parse_args_into_dataclasses() + (master_config,) = parser.parse_args_into_dataclasses(args=args) + master_config = asdict(master_config) + master_config = MasterConfig(**master_config) return master_config @@ -512,34 +520,58 @@ def load_config(self, config_path: Union[str, Path]) -> None: self.update_config(config_dict) + def _ensure_extra_params(self, obj) -> Dict[str, Any]: + """Ensure obj.extra_params exists and is a dict; return it.""" + ep = getattr(obj, "extra_params", None) + if ep is None: + setattr(obj, "extra_params", {}) + ep = obj.extra_params + if not isinstance(ep, dict): + raise TypeError("extra_params must be a dict.") + return ep + + def _stash_top_level_extra(self, section: str, nested_key: str, value: Any) -> None: + """Store unknown nested values under MasterConfig.extra_params['section.nested_key'].""" + ep = self._ensure_extra_params(self.config) + ep[f"{section}.{nested_key}"] = value + def update_config(self, config_dict: Dict[str, Any]) -> None: """Update configuration with dictionary values.""" + + SPECIAL_KEYS = {"callbacks"} + for key, value in config_dict.items(): if hasattr(self.config, key): - if isinstance(value, dict) and hasattr(getattr(self.config, key), "__dataclass_fields__"): - # Special handling for callbacks - if key in ["callbacks", "optimizers", "loss_functions"]: - nested_config = getattr(self.config, key) - for component_name, component_dict in value.items(): - if isinstance(component_dict, dict): - getattr(nested_config, key)[component_name] = component_dict - else: - getattr(nested_config, 
"extra_params")[component_name] = nested_config.extra_params[ - component_name - ] = component_dict + target = getattr(self.config, key) + + # Special handling for callbacks (dict inside CallbackConfig) + if key in SPECIAL_KEYS and isinstance(value, dict): + if is_dataclass(target) and hasattr(target, "callbacks") and isinstance(target.callbacks, dict): + for component_name, component_cfg in value.items(): + target.callbacks[component_name] = component_cfg + elif isinstance(target, dict): + target.update(value) else: - # Update nested dataclass - nested_config = getattr(self.config, key) - for nested_key, nested_value in value.items(): - if hasattr(nested_config, nested_key): - setattr(getattr(self.config, key), nested_key, nested_value) - elif hasattr(nested_config, "extra_params"): - getattr(getattr(self.config, key), "extra_params")[nested_key] = nested_value - else: - setattr(self.config, key, value) + self._stash_top_level_extra(key, "__all__", value) + continue + + if isinstance(value, dict) and is_dataclass(target): + known = {f.name for f in fields(target)} + for nested_key, nested_value in value.items(): + if nested_key in known: + setattr(target, nested_key, nested_value) + else: + self._stash_top_level_extra(key, nested_key, nested_value) + continue + + if isinstance(value, dict) and isinstance(target, dict): + target.update(value) + continue + setattr(self.config, key, value) + else: - # Store unknown parameters in extra_params - self.config.extra_params[key] = value + ep = self._ensure_extra_params(self.config) + ep[key] = value def save_config(self, output_path: Union[str, Path]) -> None: """Save current configuration to file.""" @@ -557,38 +589,105 @@ def save_config(self, output_path: Union[str, Path]) -> None: else: raise ValueError(f"Unsupported output file format: {output_path.suffix}") - def validate_config(self) -> None: - """Validate configuration parameters.""" - errors = [] - - # Validate model configuration - if not 
self.config.model.model_name: - errors.append("Model name is required") - - # Validate dataset configuration - if not self.config.dataset.dataset_name: - errors.append("Dataset name is required") - - # Validate training parameters - if self.config.dataset.train_batch_size <= 0: - errors.append("Train batch size must be positive") - - if self.config.dataset.eval_batch_size <= 0: - errors.append("Validation batch size must be positive") + def _push(self, errs: List[str], cond: bool, msg: str) -> None: + """Append msg to errs if cond is True.""" + if cond: + errs.append(msg) - if self.config.training.num_train_epochs <= 0: - errors.append("Number of epochs must be positive") - - if self.config.training.gradient_accumulation_steps <= 0: - errors.append("Gradient accumulation steps must be positive") - - # Validate device configuration + def validate_config(self) -> None: + """ + Validate configuration parameters for MasterConfig. + """ + errors: List[str] = [] + + cfg = self.config + model = getattr(cfg, "model", {}) + dataset = getattr(cfg, "dataset", {}) + training = getattr(cfg, "training", {}) + + # ---------- Model ---------- + self._push(errors, not model.get("model_name"), "model.model_name is required.") + + # PEFT validation + if model.get("use_peft"): + pc = model.get("peft_config", {}) + self._push(errors, not isinstance(pc, dict), "model.peft_config must be a dict when use_peft=True.") + if isinstance(pc, dict): + self._push( + errors, + not isinstance(pc.get("lora_r", 0), int) or pc.get("lora_r", 0) <= 0, + "model.peft_config.lora_r must be a positive integer.", + ) + self._push( + errors, + not isinstance(pc.get("lora_alpha", 0), int) or pc.get("lora_alpha", 0) <= 0, + "model.peft_config.lora_alpha must be a positive integer.", + ) + self._push( + errors, + not (0.0 <= float(pc.get("lora_dropout", 0.0)) < 1.0), + "model.peft_config.lora_dropout must be in [0,1).", + ) + + # ---------- Dataset ---------- + self._push(errors, not 
dataset.get("dataset_name"), "dataset.dataset_name is required.") + self._push(errors, not dataset.get("tokenizer_name"), "dataset.tokenizer_name is required.") + self._push(errors, dataset.get("max_seq_length", 0) <= 0, "dataset.max_seq_length must be positive.") + + # ---------- Training ---------- + # Batch sizes + self._push( + errors, + training.get("per_device_train_batch_size", 0) <= 0, + "training.per_device_train_batch_size must be positive.", + ) + self._push( + errors, + training.get("per_device_eval_batch_size", 0) <= 0, + "training.per_device_eval_batch_size must be positive.", + ) + + # Epochs / steps + n_epochs = training.get("num_train_epochs", 0) + max_steps = training.get("max_steps", -1) + self._push( + errors, + n_epochs <= 0 and max_steps <= 0, + "Either training.num_train_epochs > 0 or training.max_steps > 0 must be set.", + ) + + # Gradient accumulation + self._push( + errors, + training.get("gradient_accumulation_steps", 0) <= 0, + "training.gradient_accumulation_steps must be positive.", + ) + + # Logging / saving configs + self._push(errors, training.get("logging_steps", 0) < 0, "training.logging_steps must be >= 0.") + self._push(errors, training.get("save_total_limit", 0) < 0, "training.save_total_limit must be >= 0.") + + # Device valid_devices = ["cpu", "cuda", "qaic"] - if self.config.training.device not in valid_devices: - errors.append(f"Device must be one of {valid_devices}") - + training_device = training.get("device", None) + if training_device not in valid_devices: + self._push(errors, training_device not in valid_devices, f"training.device must be one of {valid_devices}.") + + # DDP config + ddp = training.get("ddp_config", {}) + if isinstance(ddp, dict): + backend = ddp.get("ddp_backend") + # Accept qccl for Qualcomm, nccl for CUDA, gloo for CPU + self._push( + errors, + backend not in {"qccl", "nccl", "gloo", None}, + "training.ddp_config.ddp_backend must be one of {'qccl','nccl','gloo'} or omitted.", + ) + + # ---------- 
Final ---------- if errors: - raise ValueError("Configuration validation failed:\n" + "\n".join(f"- {error}" for error in errors)) + # Join messages with bullet points for readability + raise ValueError("Configuration validation failed:\n- " + "\n- ".join(errors)) def get_callback_config(self) -> Dict[str, Any]: """Get callback configuration as dictionary.""" diff --git a/QEfficient/finetune/experimental/core/utils/profiler_utils.py b/QEfficient/finetune/experimental/core/utils/profiler_utils.py index e24508e83..d647b73a6 100644 --- a/QEfficient/finetune/experimental/core/utils/profiler_utils.py +++ b/QEfficient/finetune/experimental/core/utils/profiler_utils.py @@ -4,91 +4,3 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- - - -from contextlib import nullcontext -from typing import ContextManager - -import torch - - -def get_op_verifier_ctx( - use_op_by_op_verifier: bool, - device_type: str, - dump_dir: str, - step: int, - ref_device: str = "cpu", - ref_dtype: torch.dtype = torch.float32, - atol: float = 1e-1, - rtol: float = 1e-5, - use_ref_output_on_mismatch: bool = True, -) -> ContextManager: - """Get the op-by-op verifier context manager when op-by-op verification is - enabled. It helps in debuging operator related issues by matching the - operator execution on qaic v/s cpu. This is meant only for qaic backend. - - Args: - use_op_by_op_verifier (bool): Boolean flag to enable op-by-op verifier. - device_type (str): Device on which the model is being executed. - dump_dir (str): Directory to dump the op-by-op verification results. - step (int): Step number for which the op-by-op verification is to be performed. - ref_device (str, optional): Device to use as reference for verification. - Defaults to "cpu". - ref_dtype (torch.dtype, optional): Data type to use as reference - datatype for verification. Defaults to torch.float32. 
- atol (float, optional): Absolute tolerance to match the results. Defaults to 1e-1. - rtol (float, optional): Relative tolerance to match the results. Defaults to 1e-5. - use_ref_output_on_mismatch (bool, optional): If an operator has a - mismatch with respect to the reference device, use the reference - device outputs and continue rest of the verification. Defaults to True. - - Returns: - ContextManager: Instance of context manager used to verify the operators. - """ - if (not use_op_by_op_verifier) or ("qaic" in device_type): - return nullcontext() - - # Lazily imported qaic_debug when it is actually needed. - import torch_qaic.debug as qaic_debug - - filter_config = qaic_debug.DispatchFilterConfig.default(device_type) - dump_dir = dump_dir + "/mismatches/step_" + str(step) - return qaic_debug.OpByOpVerifierMode( - ref_device=ref_device, - ref_dtype=ref_dtype, - atol=atol, - rtol=rtol, - use_ref_output_on_mismatch=use_ref_output_on_mismatch, - filter_config=filter_config, - dump_root_dir=dump_dir, - ) - - -def init_qaic_profiling(use_profiler: bool, device_type: str) -> None: - """Initialize the qaic profiling tool. Note: The profiler is only works - for qaic backend. - - Args: - use_profiler (bool): Boolean flag to enable profiler. - device_type (str): Device on which the model is being executed. - """ - if (use_profiler) and ("qaic" in device_type): - # Lazily imported qaic's qaic_profile when it is actually needed. - import torch_qaic.profile as qaic_profile - - qaic_profile.start_profiling(device_type, 1) - - -def stop_qaic_profiling(use_profiler: bool, device_type: str) -> None: - """Stop the qaic profiling tool. Note: The profiler is only works - for qaic backend. - - Args: - use_profiler (bool): Boolean flag to enable profiler. - device_type (str): Device on which the model is being executed. - """ - if (use_profiler) and ("qaic" in device_type): - # Lazily imported qaic's qaic_profile when it is actually needed. 
- import torch_qaic.profile as qaic_profile - - qaic_profile.stop_profiling(device_type) diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml index 59d388bd3..e97e99d58 100644 --- a/QEfficient/finetune/experimental/tests/test_config.yaml +++ b/QEfficient/finetune/experimental/tests/test_config.yaml @@ -5,9 +5,9 @@ # # ----------------------------------------------------------------------------- -# Model configuration +# model configuration model: - model_type: "hf" # Hugging Face model + model_type: "hf" auto_class_name: "AutoModelForCausalLM" model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name load_in_4bit: false @@ -17,9 +17,9 @@ model: lora_alpha: 16 lora_dropout: 0.1 target_modules: ["q_proj", "v_proj"] - bias: "none" # Options: none, all, lora_only - task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. - peft_type: "LORA" # Options: LORA, IA3, etc. + bias: "none" + task_type: "CAUSAL_LM" + peft_type: "LORA" # Dataset configuration dataset: @@ -33,10 +33,10 @@ dataset: test_split: "test" group_by_length: True num_workers: 4 - pin_memory: True - persistent_workers: True - prefetch_factor: 1 - drop_last: False + dataloader_pin_memory: True + dataloader_persistent_workers: True + dataloader_prefetch_factor: 1 + dataloader_drop_last: False # Training configuration training: @@ -44,7 +44,7 @@ training: output_dir: "./training_results" overwrite_output_dir: False seed: 42 - + device: "qaic" do_eval: True eval_strategy: "epoch" eval_steps: 100 @@ -61,7 +61,6 @@ training: logging_steps: 10 save_strategy: "epoch" - save_steps: 100 # If 'save_strategy' is 'steps' then it will be used. 
save_total_limit: 5 metric_for_best_model: "eval_loss" @@ -76,7 +75,6 @@ training: ddp_broadcast_buffers: null ddp_timeout: 1800 - # Uncomment below to explicitly run on CPU use_cpu: False gradient_checkpointing: False @@ -85,7 +83,6 @@ training: use_reenrant: False torch_compile: True - include_tokens_per_second: True include_num_input_tokens_seen: True average_tokens_across_devices: True @@ -95,19 +92,9 @@ optimizers: lr: 5e-5 weight_decay: 0.01 - -# “linear” → transformers.get_linear_schedule_with_warmup -# “cosine” → transformers.get_cosine_schedule_with_warmup -# “cosine_with_restarts” -->transformers.get_cosine_with_hard_restarts_schedule_with_warmup -# “polynomial” → transformers.get_polynomial_decay_schedule_with_warmup -# “constant” → transformers.get_constant_schedule -# “constant_with_warmup” → transformers.get_constant_schedule_with_warmup -# “inverse_sqrt” → transformers.get_inverse_sqrt_schedule - scheduler: scheduler_name: "cosine" warmup_steps: 100 # warmup_steps or warmup_ratio - warmup_ratio: 0.1 callbacks: early_stopping: diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py index b3b9b0b24..fd2abfd48 100644 --- a/QEfficient/finetune/experimental/tests/test_config_manager.py +++ b/QEfficient/finetune/experimental/tests/test_config_manager.py @@ -20,11 +20,14 @@ def config_path() -> Path: def test_config(config_path): - # parse the yaml file - master_config = parse_arguments(config_path) + master_config = parse_arguments(args=[]) config_manager = ConfigManager(master_config) - # Test that the config manager is initialized correctly assert isinstance(config_manager, ConfigManager) + config_manager.load_config(config_path) + try: + config_manager.validate_config() + except Exception as e: + pytest.fail(f"Config validation failed with error: {e}") # Test that all required fields are present missing = [ @@ -34,14 +37,26 @@ def test_config(config_path): ] assert not 
missing, f"Missing attributes: {missing}" trainer_config = config_manager.get_training_config() - assert (hasattr(trainer_config, attr) for attr in ("output_dir", "train_batch_size", "num_epochs")) + assert trainer_config is not None + assert isinstance(trainer_config, dict) + assert (hasattr(trainer_config, attr) for attr in ("output_dir", "train_batch_size", "num_epochs", "ddp_config")) dataset_config = config_manager.get_dataset_config() + assert dataset_config is not None + assert isinstance(dataset_config, dict) assert (hasattr(dataset_config, attr) for attr in ("dataset_type", "dataset_name", "tokenizer_name")) model_config = config_manager.get_model_config() - assert (hasattr(model_config, attr) for attr in ("model_type", "model_name", "use_peft")) + assert model_config is not None + assert isinstance(model_config, dict) + assert (hasattr(model_config, attr) for attr in ("model_type", "model_name", "use_peft", "peft_config")) scheduler_config = config_manager.get_scheduler_config() + assert scheduler_config is not None + assert isinstance(scheduler_config, dict) assert (hasattr(scheduler_config, attr) for attr in ("scheduler_name")) callback_config = config_manager.get_callback_config() + assert callback_config is not None + assert isinstance(callback_config, dict) assert (hasattr(callback_config, attr) for attr in ("earlystopping")) optimizer_config = config_manager.get_optimizer_config() + assert optimizer_config is not None + assert isinstance(optimizer_config, dict) assert (hasattr(optimizer_config, attr) for attr in ("optimizer_name", "lr")) From 9fecf682184123826f3a89c342897f236658e06e Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 15 Dec 2025 07:39:55 +0000 Subject: [PATCH 4/6] Adding profiler_utils.py Signed-off-by: Tanisha Chawada --- .../experimental/core/utils/profiler_utils.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/QEfficient/finetune/experimental/core/utils/profiler_utils.py 
b/QEfficient/finetune/experimental/core/utils/profiler_utils.py index d647b73a6..e24508e83 100644 --- a/QEfficient/finetune/experimental/core/utils/profiler_utils.py +++ b/QEfficient/finetune/experimental/core/utils/profiler_utils.py @@ -4,3 +4,91 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + + +from contextlib import nullcontext +from typing import ContextManager + +import torch + + +def get_op_verifier_ctx( + use_op_by_op_verifier: bool, + device_type: str, + dump_dir: str, + step: int, + ref_device: str = "cpu", + ref_dtype: torch.dtype = torch.float32, + atol: float = 1e-1, + rtol: float = 1e-5, + use_ref_output_on_mismatch: bool = True, +) -> ContextManager: + """Get the op-by-op verifier context manager when op-by-op verification is + enabled. It helps in debuging operator related issues by matching the + operator execution on qaic v/s cpu. This is meant only for qaic backend. + + Args: + use_op_by_op_verifier (bool): Boolean flag to enable op-by-op verifier. + device_type (str): Device on which the model is being executed. + dump_dir (str): Directory to dump the op-by-op verification results. + step (int): Step number for which the op-by-op verification is to be performed. + ref_device (str, optional): Device to use as reference for verification. + Defaults to "cpu". + ref_dtype (torch.dtype, optional): Data type to use as reference + datatype for verification. Defaults to torch.float32. + atol (float, optional): Absolute tolerance to match the results. Defaults to 1e-1. + rtol (float, optional): Relative tolerance to match the results. Defaults to 1e-5. + use_ref_output_on_mismatch (bool, optional): If an operator has a + mismatch with respect to the reference device, use the reference + device outputs and continue rest of the verification. Defaults to True. + + Returns: + ContextManager: Instance of context manager used to verify the operators. 
+ """ + if (not use_op_by_op_verifier) or ("qaic" in device_type): + return nullcontext() + + # Lazily imported qaic_debug when it is actually needed. + import torch_qaic.debug as qaic_debug + + filter_config = qaic_debug.DispatchFilterConfig.default(device_type) + dump_dir = dump_dir + "/mismatches/step_" + str(step) + return qaic_debug.OpByOpVerifierMode( + ref_device=ref_device, + ref_dtype=ref_dtype, + atol=atol, + rtol=rtol, + use_ref_output_on_mismatch=use_ref_output_on_mismatch, + filter_config=filter_config, + dump_root_dir=dump_dir, + ) + + +def init_qaic_profiling(use_profiler: bool, device_type: str) -> None: + """Initialize the qaic profiling tool. Note: The profiler is only works + for qaic backend. + + Args: + use_profiler (bool): Boolean flag to enable profiler. + device_type (str): Device on which the model is being executed. + """ + if (use_profiler) and ("qaic" in device_type): + # Lazily imported qaic's qaic_profile when it is actually needed. + import torch_qaic.profile as qaic_profile + + qaic_profile.start_profiling(device_type, 1) + + +def stop_qaic_profiling(use_profiler: bool, device_type: str) -> None: + """Stop the qaic profiling tool. Note: The profiler is only works + for qaic backend. + + Args: + use_profiler (bool): Boolean flag to enable profiler. + device_type (str): Device on which the model is being executed. + """ + if (use_profiler) and ("qaic" in device_type): + # Lazily imported qaic's qaic_profile when it is actually needed. 
+ import torch_qaic.profile as qaic_profile + + qaic_profile.stop_profiling(device_type) From 16b7718c6fb6a5e2d1d9379bb00c3d40d6529624 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Tue, 23 Dec 2025 12:20:48 +0000 Subject: [PATCH 5/6] Adding sample config and readme Signed-off-by: Tanisha Chawada --- .../finetune/experimental/configs/README.md | 175 ++++++++++++++++++ .../experimental/configs/default_config.yaml | 104 +++++++++++ .../experimental/configs/sample_config.yaml | 0 3 files changed, 279 insertions(+) create mode 100644 QEfficient/finetune/experimental/configs/README.md create mode 100644 QEfficient/finetune/experimental/configs/default_config.yaml delete mode 100644 QEfficient/finetune/experimental/configs/sample_config.yaml diff --git a/QEfficient/finetune/experimental/configs/README.md b/QEfficient/finetune/experimental/configs/README.md new file mode 100644 index 000000000..97f0d6ea2 --- /dev/null +++ b/QEfficient/finetune/experimental/configs/README.md @@ -0,0 +1,175 @@ +--- +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +--- +*** + +# Training Configuration with LoRA Finetuning + +## Overview + +This configuration file defines the setup for fine-tuning a Hugging Face causal language model using **LoRA (Low-Rank Adaptation)** and **PEFT (Parameter-Efficient Fine-Tuning)** techniques. It also includes dataset, training, optimizer, and scheduler settings. + +*** +### 1. Model Configuration + +Model-related parameters for loading and fine-tuning. + +* **model\_type**: `hf` → Type of model (`hf` for Hugging Face, `custom` for custom models). +* **auto\_class\_name**: `AutoModelForCausalLM` → AutoClass used to load the model. +* **model\_name**: `HuggingFaceTB/SmolLM-135M` → Pretrained model to fine-tune. +* **load\_in\_4bit**: `false` → If `true`, loads model in 4-bit quantization for memory efficiency. +* **use\_peft**: `true` → Enables PEFT for parameter-efficient fine-tuning. 
+* **peft\_config**: Defines LoRA parameters when `use_peft` is `true`: + * `lora_r`: Rank for LoRA adapters. + * `lora_alpha`: Scaling factor for LoRA updates. + * `lora_dropout`: Dropout applied to LoRA layers. + * `target_modules`: Modules to apply LoRA (e.g., `q_proj`, `v_proj`). + * `bias`: Bias handling (`none`, `all`, `lora_only`). + * `task_type`: `CAUSAL_LM` → Task type (e.g., `CAUSAL_LM`, `SEQ_2_SEQ_LM`). + * `peft_type`: `LORA` → Fine-tuning method (e.g., `LORA`, `IA3`). + +*** + + +### 2. Dataset Configuration + +This section defines parameters for dataset handling during fine-tuning with Hugging Face models. It covers dataset type, splits, prompt formatting, and DataLoader settings. + +* **tokenizer\_name**: Matches model name. +* **dataset\_type**: `seq_completion` → Used for sequence continuation tasks, where the model predicts the next tokens given an input text (e.g., summarization, text generation). +* **dataset\_name**: Dataset name for training. +* **train\_split/test\_split**: Defines splits. +* **split\_ratio**: Ratio for splitting the dataset into train/test sets; used only when just the train split is provided. +* **prompt\_func**: Python function to format prompts. +* **completion\_template**: `{output}` → string pattern that tells the fine-tuning pipeline which part of the dataset should be treated as the target output (completion) for the model to learn. + +### Example Dataset Configs + +### **1. Alpaca (yahma/alpaca-cleaned)** + +```yaml +dataset: + tokenizer_name: "meta-llama/Llama-3.2-1B" + dataset_type: "seq_completion" + dataset_name: "yahma/alpaca-cleaned" + train_split: "train" + test_split: "test" + max_seq_length: 512 + prompt_func: "alpaca_func:create_alpaca_prompt" + completion_template: "{output}" + +``` + +*** + +### **2. 
Samsum (knkarthick/samsum)** + +```yaml +dataset: + tokenizer_name: "meta-llama/Llama-3.2-1B" + dataset_type: "seq_completion" + dataset_name: "knkarthick/samsum" + train_split: "train" + test_split: "test" + prompt_func: "samsum_func:create_samsum_prompt" + completion_template: "{summary}" + +``` + +*** +### **3. gsm8k (openai/gsm8k)** + +```yaml +dataset: + tokenizer_name: "meta-llama/Llama-3.2-1B" + dataset_type: "seq_completion" + dataset_name: "openai/gsm8k" + train_split: "train" + test_split: "test" + prompt_func: "gsm8k_func:create_gsm8k_prompt" + completion_template: "{answer}" + +``` + + *** + +*** +### **4. grammar (grammar_dataset)** + +```yaml +dataset: + tokenizer_name: "meta-llama/Llama-3.2-1B" + dataset_type: "seq_completion" + dataset_name: "grammar" + train_split: "train" + split_ratio: 0.8 + prompt_func: "gsm8k_func:create_grammar_prompt" + completion_template: "{target}" +``` + + *** +### Prompt Function Examples + +```python +# Alpaca +def create_alpaca_prompt(example): + return f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n Response:\n" + +# Samsum +def create_samsum_prompt(example): + return f"Summarize the following conversation:\n\n{example['dialogue']}\n\nSummary:\n" + +#gsm8K +def create_gsm8k_prompt(example): + return f"Solve the following math problem step by step:\n\n{example['question']}\n\nAnswer:\n" + +#grammar +def create_grammar_prompt(example): + return f"Correct the grammar in the following sentence:\n\n{example['input']}\n\nCorrected:\n" + +``` + + +*** + +### 3. Training Configuration + +This section defines core parameters for fine-tuning and evaluation. + +* **type**: `sft` → Specifies training type; `sft` means Supervised Fine-Tuning. +* **output\_dir**: Directory where model checkpoints and logs are saved. +* **do\_eval**: Enables evaluation during training. +* **eval\_strategy**: `epoch` → When to run evaluation (e.g., per epoch or steps). 
+* **gradient\_accumulation\_steps**: Accumulate gradients over multiple steps to simulate larger batch size. +* **dtype**: `fp16` → Mixed precision for faster training and reduced memory usage. +* **gradient\_checkpointing**: Saves memory by recomputing activations during backward pass (slower but memory-efficient). +* **torch\_compile**: Wraps your model with torch.compile() (PyTorch 2.0+) to fuse ops, reduce Python overhead, and generate optimized kernels—often yielding speed-ups without code changes. +* **Optional distributed configs**: FSDP, DeepSpeed, or DDP for multi-QAIC or large-scale training. +* **resume_from_checkpoint**: Path to a checkpoint to resume training from. +* **disable_tqdm**: False by default; set to True to disable progress bar (if running in Notebook). + +*** + +### 4. Optimizer & Scheduler + +* **optimizer**: `adamw` – Optimizer for weight-decoupled regularization; options: `adamw`, `adam`, `sgd`. + * **lr**: Initial learning rate (e.g., `5e-5` for fine-tuning). + * **weight\_decay**: Regularization strength (commonly `0.01`). + +* **scheduler**: `cosine` – Learning rate decay strategy; options: `linear`, `cosine`, `cosine_with_restarts`, `polynomial`, `constant`, `constant_with_warmup`, `inverse_sqrt`. + * **warmup\_steps**: Number of steps or ratio (e.g., `100` steps or `0.05` for 5% of total steps). + * Stabilizes early training and improves convergence. + +*** + +### 5. Callbacks + +Callbacks allow custom actions during training, such as logging, early stopping, or hardware profiling. + +* **early\_stopping**: Stops training if no improvement in a monitored metric for a defined patience period. +* **tensorboard**: Enables logging of metrics and losses to TensorBoard for visualization. +* **QAICProfilerCallback**: Profiles QAIC devices over a specified training step range to monitor performance and resource usage. 
+* **QAICOpByOpVerifierCallback**: Verifies QAIC operations step-by-step during a specified training range for correctness and debugging. + +*** diff --git a/QEfficient/finetune/experimental/configs/default_config.yaml b/QEfficient/finetune/experimental/configs/default_config.yaml new file mode 100644 index 000000000..1d2e24b3a --- /dev/null +++ b/QEfficient/finetune/experimental/configs/default_config.yaml @@ -0,0 +1,104 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + use_peft: true + peft_config: + lora_r: 8 + lora_alpha: 16 + lora_dropout: 0.1 + target_modules: ["q_proj", "v_proj"] + bias: "none" # Options: none, all, lora_only + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. + +# Dataset configuration +dataset: + tokenizer_name: "HuggingFaceTB/SmolLM-135M" + dataset_type: "seq_completion" + # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M" + dataset_name: "knkarthick/samsum" + train_split: "train" + max_seq_length: 512 + split_ratio: 0.8 # Ratio for train/test split, used when only train_split is provided + test_split: "test" + group_by_length: True + num_workers: 4 + pin_memory: True + persistent_workers: True + prefetch_factor: 1 + drop_last: False + +# Training configuration +training: + type: "sft" + output_dir: "./training_results" + eval_strategy: "epoch" + # eval_steps: 100 # If 'eval_strategy' is 'steps' then it will be used. 
+ gradient_accumulation_steps: 1 + num_train_epochs: 1 + max_steps: -1 + log_level: "info" + log_on_each_node: True + logging_strategy: "steps" + logging_steps: 10 + save_strategy: "epoch" + # save_steps: 100 # If 'save_strategy' is 'steps' then it will be used. + save_total_limit: 5 + metric_for_best_model: "eval_loss" + dtype: "fp16" + + # Uncomment if running in Notebook + # disable_tqdm: True + + # Uncomment below fsdp block to enable FSDP training + # fsdp: "full_shard" + # fsdp_config: "./configs/accelerate/fsdp_config.yaml" + # fsdp_config: "./configs/accelerate/fsdp_tp_parallelism_config.yaml" + + # Uncomment below deepspeed block to enable DeepSpeed training + # deepspeed_config: "./configs/accelerate/deepSpeed_config.yaml" + + # Uncomment below DDP block to enable DDP training and configure DDP params + # ddp_config: + # ddp_backend: "qccl" + # ddp_find_unused_parameters: False + # ddp_bucket_cap_mb: 25 + # ddp_broadcast_buffers: null + # ddp_timeout: 1800 + + # Uncomment and populate to resume training + # resume_from_checkpoint: "./abc" + # restore_callback_states_from_checkpoint: True + + gradient_checkpointing: False + gradient_checkpointing_kwargs: + preserve_rng_state : True + use_reenrant: False + + torch_compile: True + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 5e-5 + weight_decay: 0.01 + +scheduler: + scheduler_name: "cosine" + warmup_steps: 100 # warmup_steps or warmup_ratio + warmup_ratio: 0.1 + +callbacks: + early_stopping: + early_stopping_patience: 3 + early_stopping_threshold: 0.001 + tensorboard: diff --git a/QEfficient/finetune/experimental/configs/sample_config.yaml b/QEfficient/finetune/experimental/configs/sample_config.yaml deleted file mode 100644 index e69de29bb..000000000 From 39ce4bf998863d9a428522cfdca93d2a91a8c2da Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Tue, 23 Dec 2025 17:56:40 +0530 Subject: [PATCH 6/6] Delete QEfficient/finetune/experimental/tests/test_config.yaml Signed-off-by: 
Tanisha Chawada --- .../experimental/tests/test_config.yaml | 104 ------------------ 1 file changed, 104 deletions(-) delete mode 100644 QEfficient/finetune/experimental/tests/test_config.yaml diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml deleted file mode 100644 index e97e99d58..000000000 --- a/QEfficient/finetune/experimental/tests/test_config.yaml +++ /dev/null @@ -1,104 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -# model configuration -model: - model_type: "hf" - auto_class_name: "AutoModelForCausalLM" - model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name - load_in_4bit: false - use_peft: true - peft_config: - lora_r: 8 - lora_alpha: 16 - lora_dropout: 0.1 - target_modules: ["q_proj", "v_proj"] - bias: "none" - task_type: "CAUSAL_LM" - peft_type: "LORA" - -# Dataset configuration -dataset: - tokenizer_name: "HuggingFaceTB/SmolLM-135M" - dataset_type: "seq_completion" - # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M" - dataset_name: "knkarthick/samsum" - train_split: "train" - max_seq_length: 512 - split_ratio: 0.8 # Ratio for train/test split, used when only train_split is provided - test_split: "test" - group_by_length: True - num_workers: 4 - dataloader_pin_memory: True - dataloader_persistent_workers: True - dataloader_prefetch_factor: 1 - dataloader_drop_last: False - -# Training configuration -training: - type: "sft" - output_dir: "./training_results" - overwrite_output_dir: False - seed: 42 - device: "qaic" - do_eval: True - eval_strategy: "epoch" - eval_steps: 100 - - per_device_train_batch_size: 1 - per_device_eval_batch_size: 1 - gradient_accumulation_steps: 1 - num_train_epochs: 1 - max_steps: -1 - - 
log_level: "info" - log_on_each_node: True - logging_strategy: "steps" - logging_steps: 10 - - save_strategy: "epoch" - save_total_limit: 5 - metric_for_best_model: "eval_loss" - - dtype: "fp16" - completion_only_loss: True - report_to: "trackio" - - ddp_config: - ddp_backend: "qccl" - ddp_find_unused_parameters: False - ddp_bucket_cap_mb: 25 - ddp_broadcast_buffers: null - ddp_timeout: 1800 - - use_cpu: False - - gradient_checkpointing: False - gradient_checkpointing_kwargs: - preserve_rng_state : True - use_reenrant: False - - torch_compile: True - include_num_input_tokens_seen: True - average_tokens_across_devices: True - -# Optimizer configuration -optimizers: - optimizer_name: "adamw" - lr: 5e-5 - weight_decay: 0.01 - -scheduler: - scheduler_name: "cosine" - warmup_steps: 100 # warmup_steps or warmup_ratio - -callbacks: - early_stopping: - early_stopping_patience: 3 - early_stopping_threshold: 0.001 - tensorboard: -