From 646233d228511614635e5618ce6291cf0f05db75 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada <tchawada@qti.qualcomm.com>
Date: Tue, 9 Dec 2025 06:08:14 +0000
Subject: [PATCH 1/6] [QEff.finetuning] Adding config_manager and its test
 cases.

Signed-off-by: Tanisha Chawada <tchawada@qti.qualcomm.com>
---
 .../experimental/core/config_manager.py       | 648 ++++++++++++++++++
 .../experimental/tests/test_config.yaml       | 117 ++++
 .../experimental/tests/test_config_manager.py |  50 ++
 3 files changed, 815 insertions(+)
 create mode 100644 QEfficient/finetune/experimental/tests/test_config.yaml
 create mode 100644 QEfficient/finetune/experimental/tests/test_config_manager.py

diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index d647b73a6..60ed4d4b6 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -4,3 +4,651 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+"""
+Configuration manager for handling all training configurations.
+Provides centralized configuration loading, validation, and management.
+"""
+
+import json
+import os
+import sys
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import yaml
+from transformers.hf_argparser import HfArgumentParser
+
+from QEfficient.finetune.experimental.core.component_registry import registry
+
+
+@dataclass
+class OptimizerConfig:
+    """Optimizer selection and hyperparameters: optimizer name, learning rate and weight decay."""
+
+    optimizer_name: str = field(
+        default="adamw",
+        metadata={"help": "The name of the optimizer to use."},
+    )
+    lr: float = field(
+        default=5e-5,
+        metadata={"help": "The initial learning rate for the optimizer."},
+    )
+    weight_decay: float = field(
+        default=0.01,
+        metadata={"help": "The weight decay to apply (if any)."},
+    )
+
+
+@dataclass
+class SchedulerConfig:
+    """Scheduler name and warmup. NOTE(review): `warmup_steps` is an int, yet its help allows a [0-1) ratio."""
+
+    scheduler_name: str = field(
+        default="cosine",
+        metadata={"help": "The name of the scheduler to use (e.g., 'linear', 'cosine')."},
+    )
+    warmup_steps: int = field(
+        default=100,
+        metadata={
+            "help": "Number of steps for the warmup phase. If provided "
+            "value is within [0-1) range then it will be interpreted as "
+            "ratio of total training steps for the warmup phase."
+        },
+    )
+
+
+@dataclass
+class DatasetConfig:
+    """Dataset source, tokenizer, split, batching and DataLoader settings."""
+
+    tokenizer_name: str = field(
+        default="HuggingFaceTB/SmolLM-135M",
+        metadata={"help": "The name or path of the tokenizer to use."},
+    )
+    dataset_type: str = field(
+        default="seq_completion",
+        metadata={"help": "The type of dataset (e.g., 'seq_completion')."},
+    )
+    dataset_name: str = field(
+        default="knkarthick/samsum",
+        metadata={"help": "The name or path of the dataset."},
+    )
+    dataset_subset: str = field(
+        default="default",
+        metadata={"help": "The subset of the dataset to use, if applicable."},
+    )
+    train_split: str = field(
+        default="train",
+        metadata={"help": "The name of the training split."},
+    )
+    test_split: str = field(
+        default="test",
+        metadata={"help": "The name of the test/validation split."},
+    )
+    max_seq_length: int = field(
+        default=512,
+        metadata={"help": "The maximum sequence length for tokenization."},
+    )
+    split_ratio: float = field(
+        default=0.8,
+        metadata={"help": "Ratio for train/test split, used when only train_split is provided."},
+    )
+    input_columns: list[str] = field(
+        default_factory=lambda: ["text"],
+        metadata={"help": "List of column names containing input text."},
+    )
+    target_column: Optional[str] = field(
+        default=None,
+        metadata={"help": "Name of the column containing target labels (if applicable)."},
+    )
+    train_batch_size: int = field(
+        default=1,
+        metadata={"help": "Batch size per device during training."},
+    )
+    eval_batch_size: int = field(
+        default=1,
+        metadata={"help": "Batch size per device during evaluation."},
+    )
+    num_workers: int = field(
+        default=4,
+        metadata={"help": "Number of workers for dataset processing."},
+    )
+    collate_fn: str = field(
+        default="dynamic_padding",
+        metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."},
+    )
+    group_by_length: bool = field(
+        default=True,
+        metadata={"help": "Whether to group samples by length to minimize padding."},
+    )
+    length_column_name: str = field(
+        default="input_ids",
+        metadata={"help": "The column name containing the length of the input sequences."},
+    )
+    dataloader_pin_memory: bool = field(
+        default=True,
+        metadata={"help": "Whether to pin GPU memory for dataloaders."},
+    )
+    dataloader_persistent_workers: bool = field(
+        default=True,
+        metadata={"help": "Whether to keep dataloader workers alive across epochs."},
+    )
+    dataloader_prefetch_factor: int = field(
+        default=1,
+        metadata={"help": "Number of samples loaded in advance by each worker."},
+    )
+    dataloader_drop_last: bool = field(
+        default=False,
+        metadata={"help": "Whether to drop the last incomplete batch."},
+    )
+    dataloader_num_workers: int = field(
+        default=1,
+        metadata={"help": "Number of workers for the DataLoader."},
+    )
+
+
+@dataclass
+class PeftConfig:
+    """PEFT settings: LoRA rank, alpha, dropout, target modules and bias, plus task type and PEFT method."""
+
+    lora_r: int = field(
+        default=8,
+        metadata={"help": "Lora attention dimension."},
+    )
+    lora_alpha: int = field(
+        default=16,
+        metadata={"help": "Lora alpha."},
+    )
+    lora_dropout: float = field(
+        default=0.1,
+        metadata={"help": "The dropout probability for Lora layers."},
+    )
+    target_modules: list[str] = field(
+        default_factory=lambda: ["q_proj", "v_proj"],
+        metadata={"help": "The modules to apply Lora to."},
+    )
+    bias: str = field(
+        default="none",
+        metadata={"help": "Bias type for Lora ('none', 'all', 'lora_only')."},
+    )
+    task_type: str = field(
+        default="CAUSAL_LM",
+        metadata={"help": "The task type for PEFT (e.g., 'CAUSAL_LM', 'SEQ_2_SEQ_LM')."},
+    )
+    peft_type: str = field(
+        default="LORA",
+        metadata={"help": "The PEFT method to use (e.g., 'LORA', 'IA3')."},
+    )
+
+
+@dataclass
+class ModelConfig:
+    """Model loading settings: checkpoint, auto class, 4-bit loading, PEFT, KV cache, attention, device map."""
+
+    model_name: str = field(
+        default="HuggingFaceTB/SmolLM-135M",
+        metadata={"help": "The name or path of the pretrained model."},
+    )
+    model_type: str = field(
+        default="hf",
+        metadata={"help": "The type of model ('hf' for Hugging Face, 'custom' for custom models)."},
+    )
+    auto_class_name: str = field(
+        default="AutoModelForCausalLM",
+        metadata={"help": "The AutoClass name to load the model (e.g., 'AutoModelForCausalLM')."},
+    )
+    load_in_4bit: bool = field(
+        default=False,
+        metadata={"help": "Whether to load the model in 4-bit quantization."},
+    )
+    use_peft: bool = field(
+        default=True,
+        metadata={"help": "Whether to use PEFT (Parameter-Efficient Fine-Tuning)."},
+    )
+    peft_config: Optional[PeftConfig] = field(
+        default_factory=PeftConfig,
+        metadata={"help": "Configuration for PEFT."},
+    )
+    use_cache: bool = field(
+        default=False,
+        metadata={"help": "Whether to use the past key/values in the model for faster decoding."},
+    )
+    attn_implementation: str = field(
+        default="sdpa",
+        metadata={"help": "The attention implementation to use (e.g., 'sdpa', 'eager')."},
+    )
+    device_map: Optional[str] = field(
+        default=None,
+        metadata={"help": "The device map to use for model distribution (e.g., 'auto')."},
+    )
+
+
+@dataclass
+class CallbackConfig:
+    """Callback settings: a mapping from callback name to that callback's configuration dictionary."""
+
+    callbacks: Dict[str, Dict[str, Any]] = field(
+        default_factory=dict,
+        metadata={"help": "Dictionary of callback configurations, keyed by callback name."},
+    )
+
+
+@dataclass
+class GradientCheckpointingKwargs:
+    """Arguments for gradient checkpointing. NOTE(review): `use_reenrant` looks like a typo of `use_reentrant`."""
+
+    preserve_rng_state: bool = field(
+        default=True,
+        metadata={"help": "Whether to preserve the RNG state when checkpointing."},
+    )
+    use_reenrant: bool = field(
+        default=False,
+        metadata={"help": "Whether to use reentrant gradient checkpointing."},
+    )
+
+
+@dataclass
+class DdpConfig:
+    """DDP settings: backend, unused-parameter detection, bucket size, buffer broadcast and timeout."""
+
+    ddp_backend: str = field(
+        default="qccl",
+        metadata={"help": "The DDP backend to use (e.g., 'nccl', 'gloo', 'qccl')."},
+    )
+    ddp_find_unused_parameters: bool = field(
+        default=True,
+        metadata={"help": "Whether to find unused parameters in DDP."},
+    )
+    ddp_bucket_cap_mb: Optional[int] = field(
+        default=25,
+        metadata={"help": "The bucket size in MB for DDP communication."},
+    )
+    ddp_broadcast_buffers: bool = field(
+        default=True,
+        metadata={"help": "Whether to broadcast buffers in DDP."},
+    )
+    ddp_timeout: int = field(
+        default=1800,
+        metadata={"help": "Timeout for DDP operations in seconds."},
+    )
+
+
+@dataclass
+class TrainingConfig:
+    """Training-run settings. NOTE(review): validate_config reads `device`, which is not declared here."""
+
+    type: str = field(
+        default="sft",
+        metadata={"help": "The type of training (e.g., 'sft' for Supervised Fine-Tuning)."},
+    )
+    output_dir: str = field(
+        default="./training_results",
+        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+    )
+    overwrite_output_dir: bool = field(
+        default=False,
+        metadata={"help": "Whether to overwrite the output directory."},
+    )
+    seed: int = field(
+        default=42,
+        metadata={"help": "Random seed for reproducibility."},
+    )
+
+    do_eval: bool = field(
+        default=True,
+        metadata={"help": "Whether to run evaluation during training."},
+    )
+    eval_strategy: str = field(
+        default="epoch",
+        metadata={"help": "The evaluation strategy to use ('no', 'steps', 'epoch')."},
+    )
+    eval_steps: int = field(
+        default=100,
+        metadata={"help": "Number of update steps between two evaluations."},
+    )
+
+    per_device_train_batch_size: int = field(
+        default=1,
+        metadata={"help": "Batch size per device during training."},
+    )
+    per_device_eval_batch_size: int = field(
+        default=1,
+        metadata={"help": "Batch size per device during evaluation."},
+    )
+    gradient_accumulation_steps: int = field(
+        default=1,
+        metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
+    )
+    num_train_epochs: int = field(
+        default=1,
+        metadata={"help": "Total number of training epochs to perform."},
+    )
+    max_steps: int = field(
+        default=-1,
+        metadata={"help": "If > 0: set total number of training steps to perform."},
+    )
+
+    log_level: str = field(
+        default="info",
+        metadata={"help": "Set the verbosity level of the logs ('debug', 'info', 'warning', 'error')."},
+    )
+    log_on_each_node: bool = field(
+        default=True,
+        metadata={"help": "Whether to log on each node in a distributed setup."},
+    )
+    logging_strategy: str = field(
+        default="steps",
+        metadata={"help": "The logging strategy to use ('no', 'steps', 'epoch')."},
+    )
+    logging_steps: int = field(
+        default=10,
+        metadata={"help": "Number of update steps between two loggings."},
+    )
+
+    save_strategy: str = field(
+        default="epoch",
+        metadata={"help": "The checkpoint save strategy to use ('no', 'steps', 'epoch')."},
+    )
+    save_steps: int = field(
+        default=100,
+        metadata={"help": "Number of update steps between two checkpoints (if save_strategy is 'steps')."},
+    )
+    save_total_limit: int = field(
+        default=5,
+        metadata={"help": "Limit the total amount of checkpoints. Deletes older checkpoints to stay within limit."},
+    )
+    metric_for_best_model: str = field(
+        default="eval_loss",
+        metadata={"help": "The metric to use to compare two models ('eval_loss', etc.)."},
+    )
+
+    dtype: str = field(
+        default="fp16",
+        metadata={"help": "The data type to use for training (e.g., 'fp16', 'bf16')."},
+    )
+
+    gradient_checkpointing: bool = field(
+        default=False,
+        metadata={"help": "Whether to use gradient checkpointing."},
+    )
+    gradient_checkpointing_kwargs: Optional[GradientCheckpointingKwargs] = field(
+        default_factory=GradientCheckpointingKwargs,
+        metadata={"help": "Arguments for gradient checkpointing."},
+    )
+
+    torch_compile: bool = field(
+        default=True,
+        metadata={"help": "Whether to compile the model with `torch.compile`."},
+    )
+    include_tokens_per_second: bool = field(
+        default=True,
+        metadata={"help": "Whether to include tokens per second in logs."},
+    )
+    include_num_input_tokens_seen: bool = field(
+        default=True,
+        metadata={"help": "Whether to include the number of input tokens seen in logs."},
+    )
+    average_tokens_across_devices: bool = field(
+        default=True,
+        metadata={"help": "Whether to average tokens across devices in distributed training."},
+    )
+
+    disable_tqdm: Optional[bool] = field(
+        default=None,
+        metadata={"help": "Whether to disable the tqdm progress bar."},
+    )
+    fsdp_config: Optional[Dict[str, Any]] = field(
+        default=None,
+        metadata={"help": "FSDP configuration dictionary."},
+    )
+    deepspeed_config: Optional[Dict[str, Any]] = field(
+        default=None,
+        metadata={"help": "DeepSpeed configuration dictionary."},
+    )
+    accelerator_config: Optional[Dict[str, Any]] = field(
+        default=None,
+        metadata={"help": "Accelerate configuration dictionary."},
+    )
+    ddp_config: Optional[DdpConfig] = field(
+        default_factory=DdpConfig,
+        metadata={"help": "DDP configuration dictionary."},
+    )
+    use_cpu: Optional[bool] = field(
+        default=None,
+        metadata={"help": "Whether to explicitly run training on CPU."},
+    )
+    resume_from_checkpoint: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to a checkpoint to resume training from."},
+    )
+    restore_callback_states_from_checkpoint: Optional[bool] = field(
+        default=None,
+        metadata={"help": "Whether to restore callback states from checkpoint."},
+    )
+
+
+@dataclass
+class MasterConfig:
+    """Top-level configuration grouping the model, dataset, optimizer, scheduler, callback and training sections."""
+
+    model: ModelConfig = field(default_factory=ModelConfig, metadata={"help": "Configuration for the model."})
+
+    dataset: DatasetConfig = field(default_factory=DatasetConfig, metadata={"help": "Configuration for the dataset."})
+
+    optimizers: OptimizerConfig = field(
+        default_factory=OptimizerConfig, metadata={"help": "Configuration for optimizers."}
+    )
+
+    scheduler: SchedulerConfig = field(
+        default_factory=SchedulerConfig, metadata={"help": "Configuration for the learning rate scheduler."}
+    )
+
+    callbacks: CallbackConfig = field(default_factory=CallbackConfig, metadata={"help": "Configuration for callbacks."})
+
+    training: TrainingConfig = field(
+        default_factory=TrainingConfig, metadata={"help": "Configuration for training parameters."}
+    )
+
+    extra_params: Dict[str, Any] = field(
+        default_factory=dict, metadata={"help": "Additional top-level parameters not explicitly defined."}
+    )
+
+
+def parse_arguments(config_path: Optional[Union[str, Path]] = None) -> MasterConfig:
+    """Build a MasterConfig from `config_path`, from a lone YAML path on the CLI, or from CLI flags."""
+    parser = HfArgumentParser(MasterConfig)
+
+    if config_path:
+        # abspath accepts str or Path and always returns a str, so the suffix check below is safe.
+        config_path = os.path.abspath(config_path)
+        if not os.path.exists(config_path):
+            raise FileNotFoundError(f"Config file not found: {config_path}")
+        if not config_path.endswith((".yaml", ".yml")):
+            raise ValueError(f"Expected a .yaml/.yml file, got: {config_path}")
+        try:
+            (master_config,) = parser.parse_yaml_file(yaml_file=config_path)
+        except Exception as e:
+            # Chain the cause so the parser's own traceback is not lost.
+            raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") from e
+        return master_config
+
+    if len(sys.argv) == 2 and sys.argv[1].endswith((".yaml", ".yml")):
+        # A single CLI argument ending in .yaml/.yml is taken as the path to the config file.
+        return parser.parse_yaml_file(yaml_file=os.path.abspath(sys.argv[1]))[0]
+
+    # parse_args_into_dataclasses returns a tuple (one entry per dataclass type), not the dataclass itself.
+    master_config = parser.parse_args_into_dataclasses()[0]
+    return master_config
+
+
+class ConfigManager:
+    """Wraps a MasterConfig and offers loading, merging, saving, validation and section accessors."""
+
+    def __init__(self, config: MasterConfig):
+        """
+        Initialize the manager around an already-built configuration.
+
+        Args:
+            config: The MasterConfig instance this manager reads and mutates.
+        """
+        self.config = config
+
+    def load_config(self, config_path: Union[str, Path]) -> None:
+        """Merge the YAML or JSON file at `config_path` into the current config via `update_config`."""
+        config_path = Path(config_path)
+        if not config_path.exists():
+            raise FileNotFoundError(f"Configuration file not found: {config_path}")
+
+        suffix = config_path.suffix.lower()
+        if suffix in (".yaml", ".yml"):
+            loader = yaml.safe_load
+        elif suffix == ".json":
+            loader = json.load
+        else:
+            raise ValueError(f"Unsupported configuration file format: {config_path.suffix}")
+
+        with open(config_path, "r") as f:
+            # An empty document loads as None; treat it as "no overrides" instead of crashing.
+            self.update_config(loader(f) or {})
+
+    def update_config(self, config_dict: Dict[str, Any]) -> None:
+        """
+        Merge `config_dict` into the current configuration in place.
+
+        Dicts aimed at dataclass sections are merged recursively; undeclared keys go to `extra_params`.
+        """
+        unknown = self._merge(self.config, config_dict)
+        self.config.extra_params.update(unknown)
+
+    @staticmethod
+    def _merge(target: Any, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Apply `values` onto the dataclass `target`; return the keys that `target` does not declare."""
+        unknown: Dict[str, Any] = {}
+        for key, value in values.items():
+            if not hasattr(target, key):
+                unknown[key] = value
+                continue
+            current = getattr(target, key)
+            if not (isinstance(value, dict) and hasattr(current, "__dataclass_fields__")):
+                setattr(target, key, value)
+            elif isinstance(current, CallbackConfig):
+                # Callbacks are keyed by name; a bare `name:` (None) means "enabled with no settings".
+                current.callbacks.update({name: ({} if cfg is None else cfg) for name, cfg in value.items()})
+            else:
+                nested_unknown = ConfigManager._merge(current, value)
+                if nested_unknown:
+                    unknown[key] = nested_unknown
+        return unknown
+
+    def save_config(self, output_path: Union[str, Path]) -> None:
+        """Write the current configuration to `output_path` as YAML (.yaml/.yml) or JSON (.json)."""
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        # Dump plain dicts: json cannot serialize a dataclass and yaml would emit python-object tags.
+        config_dict = asdict(self.config)
+
+        if output_path.suffix.lower() in [".yaml", ".yml"]:
+            with open(output_path, "w") as f:
+                yaml.dump(config_dict, f, default_flow_style=False, indent=2)
+        elif output_path.suffix.lower() == ".json":
+            with open(output_path, "w") as f:
+                json.dump(config_dict, f, indent=2)
+        else:
+            raise ValueError(f"Unsupported output file format: {output_path.suffix}")
+
+    def validate_config(self) -> None:
+        """
+        Check required names and positive numeric settings.
+
+        Raises:
+            ValueError: If any check fails; the message lists every failure, one per line.
+        """
+        model = self.config.model
+        dataset = self.config.dataset
+        training = self.config.training
+
+        # (failed?, message) pairs in reporting order; every check is evaluated unconditionally.
+        checks = [
+            (not model.model_name, "Model name is required"),
+            (not dataset.dataset_name, "Dataset name is required"),
+            # Batch sizes are read from the dataset section, not from `training.per_device_*`.
+            (dataset.train_batch_size <= 0, "Train batch size must be positive"),
+            (dataset.eval_batch_size <= 0, "Validation batch size must be positive"),
+            (training.num_train_epochs <= 0, "Number of epochs must be positive"),
+            (training.gradient_accumulation_steps <= 0, "Gradient accumulation steps must be positive"),
+        ]
+        errors = [message for failed, message in checks if failed]
+
+        # TrainingConfig declares no `device` field, so only validate a device that has actually been
+        # set; reading `training.device` unconditionally raised AttributeError on every call.
+        valid_devices = ["cpu", "cuda", "qaic"]
+        device = getattr(training, "device", None)
+        if device is not None and device not in valid_devices:
+            errors.append(f"Device must be one of {valid_devices}")
+
+        if errors:
+            raise ValueError("Configuration validation failed:\n" + "\n".join(f"- {error}" for error in errors))
+
+    def get_callback_config(self) -> CallbackConfig:
+        """Return the callbacks section of the configuration."""
+        return self.config.callbacks
+
+    def get_optimizer_config(self) -> OptimizerConfig:
+        """Return the optimizers section of the configuration."""
+        return self.config.optimizers
+
+    def get_training_config(self) -> TrainingConfig:
+        """Return the training section of the configuration."""
+        return self.config.training
+
+    def get_scheduler_config(self) -> SchedulerConfig:
+        """Return the scheduler section of the configuration."""
+        return self.config.scheduler
+
+    def get_dataset_config(self) -> DatasetConfig:
+        """Return the dataset section of the configuration."""
+        return self.config.dataset
+
+    def get_model_config(self) -> ModelConfig:
+        """Return the model section of the configuration."""
+        return self.config.model
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the whole configuration as a nested plain dictionary (via `dataclasses.asdict`)."""
+        return asdict(self.config)
+
+    def __getattr__(self, name: str) -> Any:
+        """Delegate unknown attributes to the wrapped config; `config` itself is never delegated."""
+        if name != "config" and hasattr(self.config, name):  # guard: unset `config` would recurse forever
+            return getattr(self.config, name)
+        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
+
+def create_trainer_config(name: str, **dependencies) -> tuple:
+    """
+    Resolve the registered trainer module `name` into its classes and keyword arguments.
+
+    Args:
+        name: Name under which the trainer module is registered.
+        **dependencies: Values available to satisfy the trainer's required kwargs.
+
+    Returns:
+        tuple: (trainer_class, args_class, additional_kwargs)
+
+    Raises:
+        ValueError: If a kwarg whose default is "REQUIRED" is absent from `dependencies`.
+    """
+    trainer_spec = registry.get_trainer_module(name)
+
+    resolved_kwargs = {}
+    for kwarg_name, fallback in trainer_spec["required_kwargs"].items():
+        if kwarg_name in dependencies:
+            # A supplied dependency always wins over the registered default.
+            resolved_kwargs[kwarg_name] = dependencies[kwarg_name]
+        elif fallback == "REQUIRED":
+            # "REQUIRED" marks a kwarg that has no usable default.
+            raise ValueError(f"Required argument '{kwarg_name}' not provided for trainer '{name}'")
+        else:
+            resolved_kwargs[kwarg_name] = fallback
+    return trainer_spec["trainer_cls"], trainer_spec["args_cls"], resolved_kwargs
diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml
new file mode 100644
index 000000000..59d388bd3
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_config.yaml
@@ -0,0 +1,117 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+# Model configuration
+model:
+  model_type: "hf"  # Hugging Face model
+  auto_class_name: "AutoModelForCausalLM"
+  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
+  load_in_4bit: false
+  use_peft: true
+  peft_config:
+    lora_r: 8
+    lora_alpha: 16
+    lora_dropout: 0.1
+    target_modules: ["q_proj", "v_proj"]
+    bias: "none"  # Options: none, all, lora_only
+    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
+    peft_type: "LORA"  # Options: LORA, IA3, etc.
+
+# Dataset configuration
+dataset:
+  tokenizer_name: "HuggingFaceTB/SmolLM-135M"
+  dataset_type: "seq_completion"
+  # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M"
+  dataset_name: "knkarthick/samsum"
+  train_split: "train"
+  max_seq_length: 512
+  split_ratio: 0.8  # Ratio for train/test split, used when only train_split is provided
+  test_split: "test"
+  group_by_length: True
+  num_workers: 4
+  dataloader_pin_memory: True  # dataloader_* keys match the DatasetConfig field names
+  dataloader_persistent_workers: True
+  dataloader_prefetch_factor: 1
+  dataloader_drop_last: False
+
+# Training configuration
+training:
+  type: "sft"
+  output_dir: "./training_results"
+  overwrite_output_dir: False
+  seed: 42
+
+  do_eval: True
+  eval_strategy: "epoch"
+  eval_steps: 100
+
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  gradient_accumulation_steps: 1
+  num_train_epochs: 1
+  max_steps: -1
+
+  log_level: "info"
+  log_on_each_node: True
+  logging_strategy: "steps"
+  logging_steps: 10
+
+  save_strategy: "epoch"
+  save_steps: 100   # If 'save_strategy' is 'steps' then it will be used.
+  save_total_limit: 5
+  metric_for_best_model: "eval_loss"
+
+  dtype: "fp16"
+  completion_only_loss: True
+  report_to: "trackio"
+
+  ddp_config:
+    ddp_backend: "qccl"
+    ddp_find_unused_parameters: False
+    ddp_bucket_cap_mb: 25
+    ddp_broadcast_buffers: null
+    ddp_timeout: 1800
+
+  # Set to True to explicitly run training on CPU
+  use_cpu: False
+
+  gradient_checkpointing: False
+  gradient_checkpointing_kwargs:
+    preserve_rng_state : True
+    use_reenrant: False
+
+  torch_compile: True
+  include_tokens_per_second: True
+  include_num_input_tokens_seen: True
+  average_tokens_across_devices: True
+
+# Optimizer configuration
+optimizers:
+  optimizer_name: "adamw"
+  lr: 5e-5
+  weight_decay: 0.01
+
+
+# “linear” → transformers.get_linear_schedule_with_warmup
+# “cosine” → transformers.get_cosine_schedule_with_warmup
+# “cosine_with_restarts” -->transformers.get_cosine_with_hard_restarts_schedule_with_warmup
+# “polynomial” → transformers.get_polynomial_decay_schedule_with_warmup
+# “constant” → transformers.get_constant_schedule
+# “constant_with_warmup” → transformers.get_constant_schedule_with_warmup
+# “inverse_sqrt” → transformers.get_inverse_sqrt_schedule
+
+scheduler:
+  scheduler_name: "cosine"
+  warmup_steps: 100   # warmup_steps or warmup_ratio
+  warmup_ratio: 0.1
+
+callbacks:
+  early_stopping:
+    early_stopping_patience: 3
+    early_stopping_threshold: 0.001
+  tensorboard:
+
diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py
new file mode 100644
index 000000000..10105a33e
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_config_manager.py
@@ -0,0 +1,50 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+from pathlib import Path
+
+import pytest
+
+from QEfficient.finetune.experimental.core.config_manager import ConfigManager, parse_arguments
+
+
+@pytest.fixture
+def config_path() -> Path:
+    here = Path(__file__).resolve().parent
+    return (here / "test_config.yaml").resolve()
+
+
+# git commit -s -m "[QEff.finetuning] Adding config_manager and its test cases."
+
+
+def test_config(config_path):
+    # parse the yaml file
+    master_config = parse_arguments(config_path)
+    config_manager = ConfigManager(master_config)
+    # Test that the config manager is initialized correctly
+    assert isinstance(config_manager, ConfigManager)
+
+    def missing_from(section, names):
+        # Sections may be dataclasses or plain mappings, so accept either attribute or key presence.
+        return [n for n in names if not (n in section if isinstance(section, dict) else hasattr(section, n))]
+
+    # Test that all required fields are present
+    missing = missing_from(config_manager, ("model", "dataset", "optimizers", "scheduler", "callbacks", "training"))
+    assert not missing, f"Missing attributes: {missing}"
+
+    # Asserting on a bare generator is always true, so collect the missing names and assert on the list.
+    expected = [
+        (config_manager.get_training_config(), ("output_dir", "per_device_train_batch_size", "num_train_epochs")),
+        (config_manager.get_dataset_config(), ("dataset_type", "dataset_name", "tokenizer_name")),
+        (config_manager.get_model_config(), ("model_type", "model_name", "use_peft")),
+        (config_manager.get_scheduler_config(), ("scheduler_name",)),
+        (config_manager.get_optimizer_config(), ("optimizer_name", "lr")),
+    ]
+    for section, names in expected:
+        assert not missing_from(section, names), f"Missing fields: {missing_from(section, names)}"
+    assert config_manager.get_callback_config() is not None

From 848c911328bf6d3fb8078ef225e433ac5fb3eccc Mon Sep 17 00:00:00 2001
From: Tanisha Chawada <tchawada@qti.qualcomm.com>
Date: Tue, 9 Dec 2025 07:31:48 +0000
Subject: [PATCH 2/6] [QEff.finetuning] Adding config_manager and its test
 cases.

Signed-off-by: Tanisha Chawada <tchawada@qti.qualcomm.com>
---
 QEfficient/finetune/experimental/tests/test_config_manager.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py
index 10105a33e..b3b9b0b24 100644
--- a/QEfficient/finetune/experimental/tests/test_config_manager.py
+++ b/QEfficient/finetune/experimental/tests/test_config_manager.py
@@ -19,9 +19,6 @@ def config_path() -> Path:
     return (here / "test_config.yaml").resolve()
 
 
-# git commit -s -m "[QEff.finetuning] Adding config_manager and its test cases."
-
-
 def test_config(config_path):
     # parse the yaml file
     master_config = parse_arguments(config_path)

From 1607c43738a9f4a6e4973214df82a10aa4afec9d Mon Sep 17 00:00:00 2001
From: Tanisha Chawada <tchawada@qti.qualcomm.com>
Date: Thu, 11 Dec 2025 07:26:49 +0000
Subject: [PATCH 3/6] [QEff.finetuning] Adding config_manager and its
 test_cases.

Signed-off-by: Tanisha Chawada <tchawada@qti.qualcomm.com>
---
 .../experimental/core/config_manager.py       | 233 +++++++++++++-----
 .../experimental/core/utils/profiler_utils.py |  88 -------
 .../experimental/tests/test_config.yaml       |  33 +--
 .../experimental/tests/test_config_manager.py |  25 +-
 4 files changed, 196 insertions(+), 183 deletions(-)

diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index 60ed4d4b6..b28c2e1e3 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -11,10 +11,9 @@
 
 import json
 import os
-import sys
-from dataclasses import asdict, dataclass, field
+from dataclasses import asdict, dataclass, field, fields, is_dataclass
 from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import yaml
 from transformers.hf_argparser import HfArgumentParser
@@ -257,7 +256,7 @@ class DdpConfig:
         metadata={"help": "The DDP backend to use (e.g., 'nccl', 'gloo', 'qccl')."},
     )
     ddp_find_unused_parameters: bool = field(
-        default=True,
+        default=False,
         metadata={"help": "Whether to find unused parameters in DDP."},
     )
     ddp_bucket_cap_mb: Optional[int] = field(
@@ -294,7 +293,10 @@ class TrainingConfig:
         default=42,
         metadata={"help": "Random seed for reproducibility."},
     )
-
+    device: str = field(
+        default="qaic",
+        metadata={"help": "The device to use for training ('cuda', 'cpu', etc.)."},
+    )
     do_eval: bool = field(
         default=True,
         metadata={"help": "Whether to run evaluation during training."},
@@ -307,7 +309,6 @@ class TrainingConfig:
         default=100,
         metadata={"help": "Number of update steps between two evaluations."},
     )
-
     per_device_train_batch_size: int = field(
         default=1,
         metadata={"help": "Batch size per device during training."},
@@ -381,10 +382,6 @@ class TrainingConfig:
         default=True,
         metadata={"help": "Whether to compile the model with `torch.compile`."},
     )
-    include_tokens_per_second: bool = field(
-        default=True,
-        metadata={"help": "Whether to include tokens per second in logs."},
-    )
     include_num_input_tokens_seen: bool = field(
         default=True,
         metadata={"help": "Whether to include the number of input tokens seen in logs."},
@@ -426,6 +423,14 @@ class TrainingConfig:
         default=None,
         metadata={"help": "Whether to restore callback states from checkpoint."},
     )
+    report_to: Optional[List[str]] = field(
+        default=None,
+        metadata={"help": "The list of integrations to report the results and logs to."},
+    )
+    completion_only_loss: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether to compute loss only on completion tokens."},
+    )
 
 
 @dataclass
@@ -455,7 +460,7 @@ class MasterConfig:
     )
 
 
-def parse_arguments(config_path: Optional[str] = None) -> MasterConfig:
+def parse_arguments(config_path: Optional[str] = None, args: Optional[List[str]] = None) -> MasterConfig:
     """Create argument parser for the new finetuning interface."""
     parser = HfArgumentParser(MasterConfig)
 
@@ -472,12 +477,15 @@ def parse_arguments(config_path: Optional[str] = None) -> MasterConfig:
         except Exception as e:
             raise ValueError(f"Failed to parse YAML config '{config_path}': {e}")
 
-    if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"):
-        # If we pass only one argument to the script and it's the path to a json file,
-        # let's parse it to get our arguments.
-        master_config = parser.parse_yaml_file(yaml_file=os.path.abspath(sys.argv[1]))[0]
+    args = [] if args is None else args
+    # If a single positional YAML file was passed via args, parse it as YAML
+    if len(args) == 1 and (args[0].endswith(".yaml") or args[0].endswith(".yml")):
+        yaml_path = os.path.abspath(args[0])
+        (master_config,) = parser.parse_yaml_file(yaml_file=yaml_path)
     else:
-        master_config = parser.parse_args_into_dataclasses()
+        (master_config,) = parser.parse_args_into_dataclasses(args=args)
+        master_config = asdict(master_config)
+        master_config = MasterConfig(**master_config)
 
     return master_config
 
@@ -512,34 +520,58 @@ def load_config(self, config_path: Union[str, Path]) -> None:
 
         self.update_config(config_dict)
 
+    def _ensure_extra_params(self, obj) -> Dict[str, Any]:
+        """Ensure obj.extra_params exists and is a dict; return it."""
+        ep = getattr(obj, "extra_params", None)
+        if ep is None:
+            setattr(obj, "extra_params", {})
+            ep = obj.extra_params
+        if not isinstance(ep, dict):
+            raise TypeError("extra_params must be a dict.")
+        return ep
+
+    def _stash_top_level_extra(self, section: str, nested_key: str, value: Any) -> None:
+        """Store unknown nested values under MasterConfig.extra_params['section.nested_key']."""
+        ep = self._ensure_extra_params(self.config)
+        ep[f"{section}.{nested_key}"] = value
+
     def update_config(self, config_dict: Dict[str, Any]) -> None:
         """Update configuration with dictionary values."""
+
+        SPECIAL_KEYS = {"callbacks"}
+
         for key, value in config_dict.items():
             if hasattr(self.config, key):
-                if isinstance(value, dict) and hasattr(getattr(self.config, key), "__dataclass_fields__"):
-                    # Special handling for callbacks
-                    if key in ["callbacks", "optimizers", "loss_functions"]:
-                        nested_config = getattr(self.config, key)
-                        for component_name, component_dict in value.items():
-                            if isinstance(component_dict, dict):
-                                getattr(nested_config, key)[component_name] = component_dict
-                            else:
-                                getattr(nested_config, "extra_params")[component_name] = nested_config.extra_params[
-                                    component_name
-                                ] = component_dict
+                target = getattr(self.config, key)
+
+                # Special handling for callbacks (dict inside CallbackConfig)
+                if key in SPECIAL_KEYS and isinstance(value, dict):
+                    if is_dataclass(target) and hasattr(target, "callbacks") and isinstance(target.callbacks, dict):
+                        for component_name, component_cfg in value.items():
+                            target.callbacks[component_name] = component_cfg
+                    elif isinstance(target, dict):
+                        target.update(value)
                     else:
-                        # Update nested dataclass
-                        nested_config = getattr(self.config, key)
-                        for nested_key, nested_value in value.items():
-                            if hasattr(nested_config, nested_key):
-                                setattr(getattr(self.config, key), nested_key, nested_value)
-                            elif hasattr(nested_config, "extra_params"):
-                                getattr(getattr(self.config, key), "extra_params")[nested_key] = nested_value
-                else:
-                    setattr(self.config, key, value)
+                        self._stash_top_level_extra(key, "__all__", value)
+                    continue
+
+                if isinstance(value, dict) and is_dataclass(target):
+                    known = {f.name for f in fields(target)}
+                    for nested_key, nested_value in value.items():
+                        if nested_key in known:
+                            setattr(target, nested_key, nested_value)
+                        else:
+                            self._stash_top_level_extra(key, nested_key, nested_value)
+                    continue
+
+                if isinstance(value, dict) and isinstance(target, dict):
+                    target.update(value)
+                    continue
+                setattr(self.config, key, value)
+
             else:
-                # Store unknown parameters in extra_params
-                self.config.extra_params[key] = value
+                ep = self._ensure_extra_params(self.config)
+                ep[key] = value
 
     def save_config(self, output_path: Union[str, Path]) -> None:
         """Save current configuration to file."""
@@ -557,38 +589,105 @@ def save_config(self, output_path: Union[str, Path]) -> None:
         else:
             raise ValueError(f"Unsupported output file format: {output_path.suffix}")
 
-    def validate_config(self) -> None:
-        """Validate configuration parameters."""
-        errors = []
-
-        # Validate model configuration
-        if not self.config.model.model_name:
-            errors.append("Model name is required")
-
-        # Validate dataset configuration
-        if not self.config.dataset.dataset_name:
-            errors.append("Dataset name is required")
-
-        # Validate training parameters
-        if self.config.dataset.train_batch_size <= 0:
-            errors.append("Train batch size must be positive")
-
-        if self.config.dataset.eval_batch_size <= 0:
-            errors.append("Validation batch size must be positive")
+    def _push(self, errs: List[str], cond: bool, msg: str) -> None:
+        """Append msg to errs if cond is True."""
+        if cond:
+            errs.append(msg)
 
-        if self.config.training.num_train_epochs <= 0:
-            errors.append("Number of epochs must be positive")
-
-        if self.config.training.gradient_accumulation_steps <= 0:
-            errors.append("Gradient accumulation steps must be positive")
-
-        # Validate device configuration
+    def validate_config(self) -> None:
+        """
+        Validate configuration parameters for MasterConfig.
+        """
+        errors: List[str] = []
+
+        cfg = self.config
+        model = getattr(cfg, "model", {})
+        dataset = getattr(cfg, "dataset", {})
+        training = getattr(cfg, "training", {})
+
+        # ---------- Model ----------
+        self._push(errors, not model.get("model_name"), "model.model_name is required.")
+
+        # PEFT validation
+        if model.get("use_peft"):
+            pc = model.get("peft_config", {})
+            self._push(errors, not isinstance(pc, dict), "model.peft_config must be a dict when use_peft=True.")
+            if isinstance(pc, dict):
+                self._push(
+                    errors,
+                    not isinstance(pc.get("lora_r", 0), int) or pc.get("lora_r", 0) <= 0,
+                    "model.peft_config.lora_r must be a positive integer.",
+                )
+                self._push(
+                    errors,
+                    not isinstance(pc.get("lora_alpha", 0), int) or pc.get("lora_alpha", 0) <= 0,
+                    "model.peft_config.lora_alpha must be a positive integer.",
+                )
+                self._push(
+                    errors,
+                    not (0.0 <= float(pc.get("lora_dropout", 0.0)) < 1.0),
+                    "model.peft_config.lora_dropout must be in [0,1).",
+                )
+
+        # ---------- Dataset ----------
+        self._push(errors, not dataset.get("dataset_name"), "dataset.dataset_name is required.")
+        self._push(errors, not dataset.get("tokenizer_name"), "dataset.tokenizer_name is required.")
+        self._push(errors, dataset.get("max_seq_length", 0) <= 0, "dataset.max_seq_length must be positive.")
+
+        # ---------- Training ----------
+        # Batch sizes
+        self._push(
+            errors,
+            training.get("per_device_train_batch_size", 0) <= 0,
+            "training.per_device_train_batch_size must be positive.",
+        )
+        self._push(
+            errors,
+            training.get("per_device_eval_batch_size", 0) <= 0,
+            "training.per_device_eval_batch_size must be positive.",
+        )
+
+        # Epochs / steps
+        n_epochs = training.get("num_train_epochs", 0)
+        max_steps = training.get("max_steps", -1)
+        self._push(
+            errors,
+            n_epochs <= 0 and max_steps <= 0,
+            "Either training.num_train_epochs > 0 or training.max_steps > 0 must be set.",
+        )
+
+        # Gradient accumulation
+        self._push(
+            errors,
+            training.get("gradient_accumulation_steps", 0) <= 0,
+            "training.gradient_accumulation_steps must be positive.",
+        )
+
+        # Logging / saving configs
+        self._push(errors, training.get("logging_steps", 0) < 0, "training.logging_steps must be >= 0.")
+        self._push(errors, training.get("save_total_limit", 0) < 0, "training.save_total_limit must be >= 0.")
+
+        # Device
         valid_devices = ["cpu", "cuda", "qaic"]
-        if self.config.training.device not in valid_devices:
-            errors.append(f"Device must be one of {valid_devices}")
-
+        training_device = training.get("device", None)
+        if training_device not in valid_devices:
+            self._push(errors, training_device not in valid_devices, f"training.device must be one of {valid_devices}.")
+
+        # DDP config
+        ddp = training.get("ddp_config", {})
+        if isinstance(ddp, dict):
+            backend = ddp.get("ddp_backend")
+            # Accept qccl for Qualcomm, nccl for CUDA, gloo for CPU
+            self._push(
+                errors,
+                backend not in {"qccl", "nccl", "gloo", None},
+                "training.ddp_config.ddp_backend must be one of {'qccl','nccl','gloo'} or omitted.",
+            )
+
+        # ---------- Final ----------
         if errors:
-            raise ValueError("Configuration validation failed:\n" + "\n".join(f"- {error}" for error in errors))
+            # Join messages with bullet points for readability
+            raise ValueError("Configuration validation failed:\n- " + "\n- ".join(errors))
 
     def get_callback_config(self) -> Dict[str, Any]:
         """Get callback configuration as dictionary."""
diff --git a/QEfficient/finetune/experimental/core/utils/profiler_utils.py b/QEfficient/finetune/experimental/core/utils/profiler_utils.py
index e24508e83..d647b73a6 100644
--- a/QEfficient/finetune/experimental/core/utils/profiler_utils.py
+++ b/QEfficient/finetune/experimental/core/utils/profiler_utils.py
@@ -4,91 +4,3 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
-
-
-from contextlib import nullcontext
-from typing import ContextManager
-
-import torch
-
-
-def get_op_verifier_ctx(
-    use_op_by_op_verifier: bool,
-    device_type: str,
-    dump_dir: str,
-    step: int,
-    ref_device: str = "cpu",
-    ref_dtype: torch.dtype = torch.float32,
-    atol: float = 1e-1,
-    rtol: float = 1e-5,
-    use_ref_output_on_mismatch: bool = True,
-) -> ContextManager:
-    """Get the op-by-op verifier context manager when op-by-op verification is
-    enabled. It helps in debuging operator related issues by matching the
-    operator execution on qaic v/s cpu. This is meant only for qaic backend.
-
-    Args:
-        use_op_by_op_verifier (bool): Boolean flag to enable op-by-op verifier.
-        device_type (str): Device on which the model is being executed.
-        dump_dir (str): Directory to dump the op-by-op verification results.
-        step (int): Step number for which the op-by-op verification is to be performed.
-        ref_device (str, optional): Device to use as reference for verification.
-            Defaults to "cpu".
-        ref_dtype (torch.dtype, optional): Data type to use as reference
-            datatype for verification. Defaults to torch.float32.
-        atol (float, optional): Absolute tolerance to match the results. Defaults to 1e-1.
-        rtol (float, optional): Relative tolerance to match the results. Defaults to 1e-5.
-        use_ref_output_on_mismatch (bool, optional): If an operator has a
-            mismatch with respect to the reference device, use the reference
-            device outputs and continue rest of the verification. Defaults to True.
-
-    Returns:
-        ContextManager: Instance of context manager used to verify the operators.
-    """
-    if (not use_op_by_op_verifier) or ("qaic" in device_type):
-        return nullcontext()
-
-    # Lazily imported qaic_debug when it is actually needed.
-    import torch_qaic.debug as qaic_debug
-
-    filter_config = qaic_debug.DispatchFilterConfig.default(device_type)
-    dump_dir = dump_dir + "/mismatches/step_" + str(step)
-    return qaic_debug.OpByOpVerifierMode(
-        ref_device=ref_device,
-        ref_dtype=ref_dtype,
-        atol=atol,
-        rtol=rtol,
-        use_ref_output_on_mismatch=use_ref_output_on_mismatch,
-        filter_config=filter_config,
-        dump_root_dir=dump_dir,
-    )
-
-
-def init_qaic_profiling(use_profiler: bool, device_type: str) -> None:
-    """Initialize the qaic profiling tool. Note: The profiler is only works
-    for qaic backend.
-
-    Args:
-        use_profiler (bool): Boolean flag to enable profiler.
-        device_type (str): Device on which the model is being executed.
-    """
-    if (use_profiler) and ("qaic" in device_type):
-        # Lazily imported qaic's qaic_profile when it is actually needed.
-        import torch_qaic.profile as qaic_profile
-
-        qaic_profile.start_profiling(device_type, 1)
-
-
-def stop_qaic_profiling(use_profiler: bool, device_type: str) -> None:
-    """Stop the qaic profiling tool. Note: The profiler is only works
-    for qaic backend.
-
-    Args:
-        use_profiler (bool): Boolean flag to enable profiler.
-        device_type (str): Device on which the model is being executed.
-    """
-    if (use_profiler) and ("qaic" in device_type):
-        # Lazily imported qaic's qaic_profile when it is actually needed.
-        import torch_qaic.profile as qaic_profile
-
-        qaic_profile.stop_profiling(device_type)
diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml
index 59d388bd3..e97e99d58 100644
--- a/QEfficient/finetune/experimental/tests/test_config.yaml
+++ b/QEfficient/finetune/experimental/tests/test_config.yaml
@@ -5,9 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
-# Model configuration
+# model configuration
 model:
-  model_type: "hf"  # Hugging Face model
+  model_type: "hf"  
   auto_class_name: "AutoModelForCausalLM"
   model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
   load_in_4bit: false
@@ -17,9 +17,9 @@ model:
     lora_alpha: 16
     lora_dropout: 0.1
     target_modules: ["q_proj", "v_proj"]
-    bias: "none"  # Options: none, all, lora_only
-    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
-    peft_type: "LORA"  # Options: LORA, IA3, etc.
+    bias: "none" 
+    task_type: "CAUSAL_LM" 
+    peft_type: "LORA" 
 
 # Dataset configuration
 dataset:
@@ -33,10 +33,10 @@ dataset:
   test_split: "test"
   group_by_length: True
   num_workers: 4
-  pin_memory: True
-  persistent_workers: True
-  prefetch_factor: 1
-  drop_last: False
+  dataloader_pin_memory: True
+  dataloader_persistent_workers: True
+  dataloader_prefetch_factor: 1
+  dataloader_drop_last: False
 
 # Training configuration
 training:
@@ -44,7 +44,7 @@ training:
   output_dir: "./training_results"
   overwrite_output_dir: False
   seed: 42
-
+  device: "qaic"
   do_eval: True
   eval_strategy: "epoch"
   eval_steps: 100
@@ -61,7 +61,6 @@ training:
   logging_steps: 10
 
   save_strategy: "epoch"
-  save_steps: 100   # If 'save_strategy' is 'steps' then it will be used.
   save_total_limit: 5
   metric_for_best_model: "eval_loss"
 
@@ -76,7 +75,6 @@ training:
     ddp_broadcast_buffers: null
     ddp_timeout: 1800
 
-  # Uncomment below to explicitly run on CPU
   use_cpu: False
 
   gradient_checkpointing: False
@@ -85,7 +83,6 @@ training:
     use_reenrant: False
 
   torch_compile: True
-  include_tokens_per_second: True
   include_num_input_tokens_seen: True
   average_tokens_across_devices: True
 
@@ -95,19 +92,9 @@ optimizers:
   lr: 5e-5
   weight_decay: 0.01
 
-
-# “linear” → transformers.get_linear_schedule_with_warmup
-# “cosine” → transformers.get_cosine_schedule_with_warmup
-# “cosine_with_restarts” -->transformers.get_cosine_with_hard_restarts_schedule_with_warmup
-# “polynomial” → transformers.get_polynomial_decay_schedule_with_warmup
-# “constant” → transformers.get_constant_schedule
-# “constant_with_warmup” → transformers.get_constant_schedule_with_warmup
-# “inverse_sqrt” → transformers.get_inverse_sqrt_schedule
-
 scheduler:
   scheduler_name: "cosine"
   warmup_steps: 100   # warmup_steps or warmup_ratio
-  warmup_ratio: 0.1
 
 callbacks:
   early_stopping:
diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py
index b3b9b0b24..fd2abfd48 100644
--- a/QEfficient/finetune/experimental/tests/test_config_manager.py
+++ b/QEfficient/finetune/experimental/tests/test_config_manager.py
@@ -20,11 +20,14 @@ def config_path() -> Path:
 
 
 def test_config(config_path):
-    # parse the yaml file
-    master_config = parse_arguments(config_path)
+    master_config = parse_arguments(args=[])
     config_manager = ConfigManager(master_config)
-    # Test that the config manager is initialized correctly
     assert isinstance(config_manager, ConfigManager)
+    config_manager.load_config(config_path)
+    try:
+        config_manager.validate_config()
+    except Exception as e:
+        pytest.fail(f"Config validation failed with error: {e}")
 
     # Test that all required fields are present
     missing = [
@@ -34,14 +37,26 @@ def test_config(config_path):
     ]
     assert not missing, f"Missing attributes: {missing}"
     trainer_config = config_manager.get_training_config()
-    assert (hasattr(trainer_config, attr) for attr in ("output_dir", "train_batch_size", "num_epochs"))
+    assert trainer_config is not None
+    assert isinstance(trainer_config, dict)
+    assert (hasattr(trainer_config, attr) for attr in ("output_dir", "train_batch_size", "num_epochs", "ddp_config"))
     dataset_config = config_manager.get_dataset_config()
+    assert dataset_config is not None
+    assert isinstance(dataset_config, dict)
     assert (hasattr(dataset_config, attr) for attr in ("dataset_type", "dataset_name", "tokenizer_name"))
     model_config = config_manager.get_model_config()
-    assert (hasattr(model_config, attr) for attr in ("model_type", "model_name", "use_peft"))
+    assert model_config is not None
+    assert isinstance(model_config, dict)
+    assert (hasattr(model_config, attr) for attr in ("model_type", "model_name", "use_peft", "peft_config"))
     scheduler_config = config_manager.get_scheduler_config()
+    assert scheduler_config is not None
+    assert isinstance(scheduler_config, dict)
     assert (hasattr(scheduler_config, attr) for attr in ("scheduler_name"))
     callback_config = config_manager.get_callback_config()
+    assert callback_config is not None
+    assert isinstance(callback_config, dict)
     assert (hasattr(callback_config, attr) for attr in ("earlystopping"))
     optimizer_config = config_manager.get_optimizer_config()
+    assert optimizer_config is not None
+    assert isinstance(optimizer_config, dict)
     assert (hasattr(optimizer_config, attr) for attr in ("optimizer_name", "lr"))

From 9fecf682184123826f3a89c342897f236658e06e Mon Sep 17 00:00:00 2001
From: Tanisha Chawada <tchawada@qti.qualcomm.com>
Date: Mon, 15 Dec 2025 07:39:55 +0000
Subject: [PATCH 4/6] Adding profiler_utils.py

Signed-off-by: Tanisha Chawada <tchawada@qti.qualcomm.com>
---
 .../experimental/core/utils/profiler_utils.py | 88 +++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/QEfficient/finetune/experimental/core/utils/profiler_utils.py b/QEfficient/finetune/experimental/core/utils/profiler_utils.py
index d647b73a6..e24508e83 100644
--- a/QEfficient/finetune/experimental/core/utils/profiler_utils.py
+++ b/QEfficient/finetune/experimental/core/utils/profiler_utils.py
@@ -4,3 +4,91 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+
+from contextlib import nullcontext
+from typing import ContextManager
+
+import torch
+
+
+def get_op_verifier_ctx(
+    use_op_by_op_verifier: bool,
+    device_type: str,
+    dump_dir: str,
+    step: int,
+    ref_device: str = "cpu",
+    ref_dtype: torch.dtype = torch.float32,
+    atol: float = 1e-1,
+    rtol: float = 1e-5,
+    use_ref_output_on_mismatch: bool = True,
+) -> ContextManager:
+    """Get the op-by-op verifier context manager when op-by-op verification is
+    enabled. It helps in debugging operator related issues by matching the
+    operator execution on qaic v/s cpu. This is meant only for qaic backend.
+
+    Args:
+        use_op_by_op_verifier (bool): Boolean flag to enable op-by-op verifier.
+        device_type (str): Device on which the model is being executed.
+        dump_dir (str): Directory to dump the op-by-op verification results.
+        step (int): Step number for which the op-by-op verification is to be performed.
+        ref_device (str, optional): Device to use as reference for verification.
+            Defaults to "cpu".
+        ref_dtype (torch.dtype, optional): Data type to use as reference
+            datatype for verification. Defaults to torch.float32.
+        atol (float, optional): Absolute tolerance to match the results. Defaults to 1e-1.
+        rtol (float, optional): Relative tolerance to match the results. Defaults to 1e-5.
+        use_ref_output_on_mismatch (bool, optional): If an operator has a
+            mismatch with respect to the reference device, use the reference
+            device outputs and continue rest of the verification. Defaults to True.
+
+    Returns:
+        ContextManager: The verifier context, or a no-op ``nullcontext`` when disabled or not on qaic.
+    """
+    if (not use_op_by_op_verifier) or ("qaic" not in device_type):
+        return nullcontext()
+
+    # Imported lazily: torch_qaic is only required once the qaic verifier is actually used.
+    import torch_qaic.debug as qaic_debug
+
+    filter_config = qaic_debug.DispatchFilterConfig.default(device_type)
+    dump_dir = dump_dir + "/mismatches/step_" + str(step)
+    return qaic_debug.OpByOpVerifierMode(
+        ref_device=ref_device,
+        ref_dtype=ref_dtype,
+        atol=atol,
+        rtol=rtol,
+        use_ref_output_on_mismatch=use_ref_output_on_mismatch,
+        filter_config=filter_config,
+        dump_root_dir=dump_dir,
+    )
+
+
+def init_qaic_profiling(use_profiler: bool, device_type: str) -> None:
+    """Initialize the qaic profiling tool. Note: The profiler only works
+    for qaic backend.
+
+    Args:
+        use_profiler (bool): Boolean flag to enable profiler.
+        device_type (str): Device on which the model is being executed.
+    """
+    if (use_profiler) and ("qaic" in device_type):
+        # Lazily imported qaic's qaic_profile when it is actually needed.
+        import torch_qaic.profile as qaic_profile
+
+        qaic_profile.start_profiling(device_type, 1)
+
+
+def stop_qaic_profiling(use_profiler: bool, device_type: str) -> None:
+    """Stop the qaic profiling tool. Note: The profiler only works
+    for qaic backend.
+
+    Args:
+        use_profiler (bool): Boolean flag to enable profiler.
+        device_type (str): Device on which the model is being executed.
+    """
+    if (use_profiler) and ("qaic" in device_type):
+        # Lazily imported qaic's qaic_profile when it is actually needed.
+        import torch_qaic.profile as qaic_profile
+
+        qaic_profile.stop_profiling(device_type)

From 16b7718c6fb6a5e2d1d9379bb00c3d40d6529624 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada <tchawada@qti.qualcomm.com>
Date: Tue, 23 Dec 2025 12:20:48 +0000
Subject: [PATCH 5/6] Adding sample config and readme

Signed-off-by: Tanisha Chawada <tchawada@qti.qualcomm.com>
---
 .../finetune/experimental/configs/README.md   | 175 ++++++++++++++++++
 .../experimental/configs/default_config.yaml  | 104 +++++++++++
 .../experimental/configs/sample_config.yaml   |   0
 3 files changed, 279 insertions(+)
 create mode 100644 QEfficient/finetune/experimental/configs/README.md
 create mode 100644 QEfficient/finetune/experimental/configs/default_config.yaml
 delete mode 100644 QEfficient/finetune/experimental/configs/sample_config.yaml

diff --git a/QEfficient/finetune/experimental/configs/README.md b/QEfficient/finetune/experimental/configs/README.md
new file mode 100644
index 000000000..97f0d6ea2
--- /dev/null
+++ b/QEfficient/finetune/experimental/configs/README.md
@@ -0,0 +1,175 @@
+---
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+---
+***
+
+# Training Configuration with LoRA Finetuning
+
+## Overview
+
+This configuration file defines the setup for fine-tuning a Hugging Face causal language model using **LoRA (Low-Rank Adaptation)** and **PEFT (Parameter-Efficient Fine-Tuning)** techniques. It also includes dataset, training, optimizer, and scheduler settings.
+
+***
+### 1. Model Configuration
+
+Model-related parameters for loading and fine-tuning.
+
+*   **model\_type**: `hf` → Type of model (`hf` for Hugging Face, `custom` for custom models).
+*   **auto\_class\_name**: `AutoModelForCausalLM` → AutoClass used to load the model.
+*   **model\_name**: `HuggingFaceTB/SmolLM-135M` → Pretrained model to fine-tune.
+*   **load\_in\_4bit**: `false` → If `true`, loads model in 4-bit quantization for memory efficiency.
+*   **use\_peft**: `true` → Enables PEFT for parameter-efficient fine-tuning.
+*   **peft\_config**: Defines LoRA parameters when `use_peft` is `true`:
+    *   `lora_r`: Rank for LoRA adapters.
+    *   `lora_alpha`: Scaling factor for LoRA updates.
+    *   `lora_dropout`: Dropout applied to LoRA layers.
+    *   `target_modules`: Modules to apply LoRA (e.g., `q_proj`, `v_proj`).
+    *   `bias`: Bias handling (`none`, `all`, `lora_only`).
+    *   `task_type`: `CAUSAL_LM` → Task type (e.g., `CAUSAL_LM`, `SEQ_2_SEQ_LM`).
+    *   `peft_type`: `LORA` → Fine-tuning method (e.g., `LORA`, `IA3`).
+
+***
+
+
+### 2. Dataset Configuration
+
+This section defines parameters for dataset handling during fine-tuning with Hugging Face models. It covers dataset type, splits, prompt formatting, and DataLoader settings.
+
+*   **tokenizer\_name**: Matches model name.
+*   **dataset\_type**: `seq_completion` → Used for sequence continuation tasks, where the model predicts the next tokens given an input text (e.g., summarization, text generation).
+*   **dataset\_name**: Dataset name for training.
+*   **train\_split/test\_split**: Defines splits.
+*   **split\_ratio**: Ratio for splitting the dataset into train/test sets; used only when just the train split is provided.
+*   **prompt\_func**: Python function to format prompts.
+*   **completion\_template**: `{output}` → string pattern that tells the fine-tuning pipeline which part of the dataset should be treated as the target output (completion) for the model to learn.
+
+### Example Dataset Configs
+
+### **1. Alpaca (yahma/alpaca-cleaned)**
+
+```yaml
+dataset:
+  tokenizer_name: "meta-llama/Llama-3.2-1B"
+  dataset_type: "seq_completion"
+  dataset_name: "yahma/alpaca-cleaned"
+  train_split: "train"
+  test_split: "test"
+  max_seq_length: 512
+  prompt_func: "alpaca_func:create_alpaca_prompt"
+  completion_template: "{output}"
+
+```
+
+***
+
+### **2. Samsum (knkarthick/samsum)**
+
+```yaml
+dataset:
+  tokenizer_name: "meta-llama/Llama-3.2-1B"
+  dataset_type: "seq_completion"
+  dataset_name: "knkarthick/samsum"
+  train_split: "train"
+  test_split: "test"
+  prompt_func: "samsum_func:create_samsum_prompt"
+  completion_template: "{summary}"
+
+```
+
+***
+### **3. gsm8k (openai/gsm8k)**
+
+```yaml
+dataset:
+  tokenizer_name: "meta-llama/Llama-3.2-1B"
+  dataset_type: "seq_completion"
+  dataset_name: "openai/gsm8k"
+  train_split: "train"
+  test_split: "test"
+  prompt_func: "gsm8k_func:create_gsm8k_prompt"
+  completion_template: "{answer}"
+
+```
+
+ ***
+
+***
+### **4. grammar (grammar_dataset)**
+
+```yaml
+dataset:
+  tokenizer_name: "meta-llama/Llama-3.2-1B"
+  dataset_type: "seq_completion"
+  dataset_name: "grammar"
+  train_split: "train"
+  split_ratio: 0.8
+  prompt_func: "grammar_func:create_grammar_prompt"
+  completion_template: "{target}"
+```
+
+ *** 
+### Prompt Function Examples
+
+```python
+# Alpaca
+def create_alpaca_prompt(example):
+    return f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n"
+
+# Samsum
+def create_samsum_prompt(example):
+    return f"Summarize the following conversation:\n\n{example['dialogue']}\n\nSummary:\n"
+
+# GSM8K
+def create_gsm8k_prompt(example):
+    return f"Solve the following math problem step by step:\n\n{example['question']}\n\nAnswer:\n"
+
+# Grammar
+def create_grammar_prompt(example):
+    return f"Correct the grammar in the following sentence:\n\n{example['input']}\n\nCorrected:\n"
+  
+```
+
+
+***
+
+### 3. Training Configuration
+
+This section defines core parameters for fine-tuning and evaluation.
+
+*   **type**: `sft` → Specifies training type; `sft` means Supervised Fine-Tuning.
+*   **output\_dir**: Directory where model checkpoints and logs are saved.
+*   **do\_eval**: Enables evaluation during training.
+*   **eval\_strategy**: `epoch` → When to run evaluation (e.g., per epoch or steps).
+*   **gradient\_accumulation\_steps**: Accumulate gradients over multiple steps to simulate larger batch size.
+*   **dtype**: `fp16` → Mixed precision for faster training and reduced memory usage.
+*   **gradient\_checkpointing**: Saves memory by recomputing activations during backward pass (slower but memory-efficient).
+*   **torch\_compile**: Wraps your model with torch.compile() (PyTorch 2.0+) to fuse ops, reduce Python overhead, and generate optimized kernels—often yielding speed-ups without code changes.
+*   **Optional distributed configs**: FSDP, DeepSpeed, or DDP for multi-QAIC or large-scale training.
+*   **resume\_from\_checkpoint**: Path to a checkpoint to resume training from.
+*   **disable\_tqdm**: `False` by default; set to `True` to disable the progress bar (e.g., when running in a notebook).
+
+***
+
+### 4. Optimizer & Scheduler
+
+*   **optimizer**: `adamw` – Optimizer for weight-decoupled regularization; options: `adamw`, `adam`, `sgd`.
+    *   **lr**: Initial learning rate (e.g., `5e-5` for fine-tuning).
+    *   **weight\_decay**: Regularization strength (commonly `0.01`).
+
+*   **scheduler**: `cosine` – Learning rate decay strategy; options: `linear`, `cosine`, `cosine_with_restarts`, `polynomial`, `constant`, `constant_with_warmup`, `inverse_sqrt`.
+    *   **warmup\_steps**: Number of steps or ratio (e.g., `100` steps or `0.05` for 5% of total steps).
+    *   Stabilizes early training and improves convergence.
+
+***
+
+### 5. Callbacks
+
+Callbacks allow custom actions during training, such as logging, early stopping, or hardware profiling.
+
+*   **early\_stopping**: Stops training if no improvement in a monitored metric for a defined patience period.
+*   **tensorboard**: Enables logging of metrics and losses to TensorBoard for visualization.
+*   **QAICProfilerCallback**: Profiles QAIC devices over a specified training step range to monitor performance and resource usage.
+*   **QAICOpByOpVerifierCallback**: Verifies QAIC operations step-by-step during a specified training range for correctness and debugging.
+
+***
diff --git a/QEfficient/finetune/experimental/configs/default_config.yaml b/QEfficient/finetune/experimental/configs/default_config.yaml
new file mode 100644
index 000000000..1d2e24b3a
--- /dev/null
+++ b/QEfficient/finetune/experimental/configs/default_config.yaml
@@ -0,0 +1,104 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+# Model configuration
+model:
+  model_type: "hf"  # Hugging Face model
+  auto_class_name: "AutoModelForCausalLM"
+  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
+  use_peft: true
+  peft_config:
+    lora_r: 8
+    lora_alpha: 16
+    lora_dropout: 0.1
+    target_modules: ["q_proj", "v_proj"]
+    bias: "none"  # Options: none, all, lora_only
+    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
+    peft_type: "LORA"  # Options: LORA, IA3, etc.
+
+# Dataset configuration
+dataset:
+  tokenizer_name: "HuggingFaceTB/SmolLM-135M"
+  dataset_type: "seq_completion"
+  # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M"
+  dataset_name: "knkarthick/samsum"
+  train_split: "train"
+  max_seq_length: 512
+  split_ratio: 0.8  # Ratio for train/test split, used when only train_split is provided
+  test_split: "test"
+  group_by_length: True
+  num_workers: 4
+  pin_memory: True
+  persistent_workers: True
+  prefetch_factor: 1
+  drop_last: False
+
+# Training configuration
+training:
+  type: "sft"
+  output_dir: "./training_results"
+  eval_strategy: "epoch"
+  # eval_steps: 100 # If 'eval_strategy' is 'steps' then it will be used.
+  gradient_accumulation_steps: 1
+  num_train_epochs: 1
+  max_steps: -1
+  log_level: "info"
+  log_on_each_node: True
+  logging_strategy: "steps"
+  logging_steps: 10
+  save_strategy: "epoch"
+  # save_steps: 100   # If 'save_strategy' is 'steps' then it will be used.
+  save_total_limit: 5
+  metric_for_best_model: "eval_loss"
+  dtype: "fp16"
+
+  # Uncomment if running in Notebook
+  # disable_tqdm: True
+
+  # Uncomment below fsdp block to enable FSDP training
+  # fsdp: "full_shard"
+  # fsdp_config: "./configs/accelerate/fsdp_config.yaml"
+  # fsdp_config: "./configs/accelerate/fsdp_tp_parallelism_config.yaml"
+
+  # Uncomment below deepspeed block to enable DeepSpeed training
+  # deepspeed_config: "./configs/accelerate/deepSpeed_config.yaml"
+
+  # Uncomment below DDP block to enable DDP training and configure DDP params
+  # ddp_config:
+  #   ddp_backend: "qccl"
+  #   ddp_find_unused_parameters: False
+  #   ddp_bucket_cap_mb: 25
+  #   ddp_broadcast_buffers: null
+  #   ddp_timeout: 1800
+
+  # Uncomment and populate to resume training
+  # resume_from_checkpoint: "./abc"
+  # restore_callback_states_from_checkpoint: True
+
+  gradient_checkpointing: False
+  gradient_checkpointing_kwargs:
+    preserve_rng_state : True
+    use_reentrant: False
+
+  torch_compile: True
+
+# Optimizer configuration
+optimizers:
+  optimizer_name: "adamw"
+  lr: 5e-5
+  weight_decay: 0.01
+
+scheduler:
+  scheduler_name: "cosine"
+  warmup_steps: 100   # warmup_steps or warmup_ratio
+  warmup_ratio: 0.1
+
+callbacks:
+  early_stopping:
+    early_stopping_patience: 3
+    early_stopping_threshold: 0.001
+  tensorboard:
diff --git a/QEfficient/finetune/experimental/configs/sample_config.yaml b/QEfficient/finetune/experimental/configs/sample_config.yaml
deleted file mode 100644
index e69de29bb..000000000

From 39ce4bf998863d9a428522cfdca93d2a91a8c2da Mon Sep 17 00:00:00 2001
From: Tanisha Chawada <tchawada@qti.qualcomm.com>
Date: Tue, 23 Dec 2025 17:56:40 +0530
Subject: [PATCH 6/6] Delete
 QEfficient/finetune/experimental/tests/test_config.yaml

Signed-off-by: Tanisha Chawada <tchawada@qti.qualcomm.com>
---
 .../experimental/tests/test_config.yaml       | 104 ------------------
 1 file changed, 104 deletions(-)
 delete mode 100644 QEfficient/finetune/experimental/tests/test_config.yaml

diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml
deleted file mode 100644
index e97e99d58..000000000
--- a/QEfficient/finetune/experimental/tests/test_config.yaml
+++ /dev/null
@@ -1,104 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-
-# model configuration
-model:
-  model_type: "hf"  
-  auto_class_name: "AutoModelForCausalLM"
-  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
-  load_in_4bit: false
-  use_peft: true
-  peft_config:
-    lora_r: 8
-    lora_alpha: 16
-    lora_dropout: 0.1
-    target_modules: ["q_proj", "v_proj"]
-    bias: "none" 
-    task_type: "CAUSAL_LM" 
-    peft_type: "LORA" 
-
-# Dataset configuration
-dataset:
-  tokenizer_name: "HuggingFaceTB/SmolLM-135M"
-  dataset_type: "seq_completion"
-  # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M"
-  dataset_name: "knkarthick/samsum"
-  train_split: "train"
-  max_seq_length: 512
-  split_ratio: 0.8  # Ratio for train/test split, used when only train_split is provided
-  test_split: "test"
-  group_by_length: True
-  num_workers: 4
-  dataloader_pin_memory: True
-  dataloader_persistent_workers: True
-  dataloader_prefetch_factor: 1
-  dataloader_drop_last: False
-
-# Training configuration
-training:
-  type: "sft"
-  output_dir: "./training_results"
-  overwrite_output_dir: False
-  seed: 42
-  device: "qaic"
-  do_eval: True
-  eval_strategy: "epoch"
-  eval_steps: 100
-
-  per_device_train_batch_size: 1
-  per_device_eval_batch_size: 1
-  gradient_accumulation_steps: 1
-  num_train_epochs: 1
-  max_steps: -1
-
-  log_level: "info"
-  log_on_each_node: True
-  logging_strategy: "steps"
-  logging_steps: 10
-
-  save_strategy: "epoch"
-  save_total_limit: 5
-  metric_for_best_model: "eval_loss"
-
-  dtype: "fp16"
-  completion_only_loss: True
-  report_to: "trackio"
-
-  ddp_config:
-    ddp_backend: "qccl"
-    ddp_find_unused_parameters: False
-    ddp_bucket_cap_mb: 25
-    ddp_broadcast_buffers: null
-    ddp_timeout: 1800
-
-  use_cpu: False
-
-  gradient_checkpointing: False
-  gradient_checkpointing_kwargs:
-    preserve_rng_state : True
-    use_reenrant: False
-
-  torch_compile: True
-  include_num_input_tokens_seen: True
-  average_tokens_across_devices: True
-
-# Optimizer configuration
-optimizers:
-  optimizer_name: "adamw"
-  lr: 5e-5
-  weight_decay: 0.01
-
-scheduler:
-  scheduler_name: "cosine"
-  warmup_steps: 100   # warmup_steps or warmup_ratio
-
-callbacks:
-  early_stopping:
-    early_stopping_patience: 3
-    early_stopping_threshold: 0.001
-  tensorboard:
-