
Commit 582e5f3

committed
Updated Jenkinsfile to run finetuning tests and dump results in a separate file. Addressed review comments.
Signed-off-by: Meet Patel <[email protected]>
1 parent: f3c34a1

File tree

QEfficient/cloud/finetune.py
QEfficient/finetune/configs/training.py
scripts/Jenkinsfile
tests/finetune/test_finetune.py

4 files changed: +39 -7 lines changed


QEfficient/cloud/finetune.py

Lines changed: 5 additions & 5 deletions
@@ -47,11 +47,11 @@
 warnings.filterwarnings("ignore")


-def setup_distributed_training(config: TrainConfig) -> None:
+def setup_distributed_training(train_config: TrainConfig) -> None:
     """Initialize distributed training environment if enabled.

     Args:
-        config (TrainConfig): Training configuration object.
+        train_config (TrainConfig): Training configuration object.

     Notes:
         - If distributed data parallel (DDP) is disabled, this function does nothing.
@@ -61,14 +61,14 @@ def setup_distributed_training(config: TrainConfig) -> None:
     Raises:
         AssertionError: If device is CPU or includes an index with DDP enabled.
     """
-    if not config.enable_ddp:
+    if not train_config.enable_ddp:
         return

-    torch_device = torch.device(config.device)
+    torch_device = torch.device(train_config.device)
     assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
     assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"

-    dist.init_process_group(backend=config.dist_backend)
+    dist.init_process_group(backend=train_config.dist_backend)
     # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
     getattr(torch, torch_device.type).set_device(dist.get_rank())

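For orientation, a minimal usage sketch (not code from this commit): it combines the renamed setup_distributed_training(train_config) with a standard torch DistributedDataParallel wrap. The import paths mirror the file paths in this diff; the build_ddp_model helper, the DDP wrapping, and the example field values are illustrative assumptions.

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

from QEfficient.cloud.finetune import setup_distributed_training   # file edited above
from QEfficient.finetune.configs.training import TrainConfig       # file edited below


def build_ddp_model(model: torch.nn.Module, train_config: TrainConfig) -> torch.nn.Module:
    # No-op unless train_config.enable_ddp; otherwise this initializes the process
    # group and binds the current process to the device index equal to its rank.
    setup_distributed_training(train_config)
    if not train_config.enable_ddp:
        return model
    # The device index is already selected, so a bare device type is enough here.
    return DDP(model.to(train_config.device))


# Illustrative call, assuming a torchrun-style launch that sets the rank/world-size
# environment variables expected by torch.distributed:
# ddp_model = build_ddp_model(model, TrainConfig(enable_ddp=True, device="cuda", dist_backend="nccl"))
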
QEfficient/finetune/configs/training.py

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@ class TrainConfig:
         batch_size_training (int): Batch size for training (default: 1).
         context_length (Optional[int]): Maximum sequence length for inputs (default: None).
         gradient_accumulation_steps (int): Steps for gradient accumulation (default: 4).
+        gradient_checkpointing (bool): Enable gradient checkpointing to save memory at the cost of speed (default: False).
         num_epochs (int): Number of training epochs (default: 1).
         max_train_step (int): Maximum training steps (default: 0, unlimited if 0).
         max_eval_step (int): Maximum evaluation steps (default: 0, unlimited if 0).
@@ -32,6 +33,7 @@ class TrainConfig:
         use_autocast (bool): Use autocast for mixed precision (default: True).
         val_batch_size (int): Batch size for validation (default: 1).
         dataset (str): Dataset name for training (default: "samsum_dataset").
+        task_type (str): Task to fine-tune for. Options: "generation" and "seq_classification" (default: "generation").
         peft_method (str): Parameter-efficient fine-tuning method (default: "lora").
         use_peft (bool): Whether to use PEFT (default: True).
         from_peft_checkpoint (str): Path to PEFT checkpoint (default: "").

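A short sketch of the two newly documented fields in use; it assumes TrainConfig is a dataclass that accepts these keyword arguments with the defaults stated in the docstring above.

from QEfficient.finetune.configs.training import TrainConfig

# Turn on gradient checkpointing (saves activation memory at some speed cost) and
# fine-tune for sequence classification instead of the default generation task.
train_config = TrainConfig(
    gradient_checkpointing=True,      # documented default: False
    task_type="seq_classification",   # documented options: "generation", "seq_classification"
)
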
scripts/Jenkinsfile

Lines changed: 17 additions & 0 deletions
@@ -65,6 +65,23 @@ pipeline {
                     }
                 }
             }
+            stage('Run Non-CLI QAIC Finetuning Tests') {
+                steps {
+                    timeout(time: 200, unit: 'MINUTES') {
+                        sh '''
+                        sudo docker exec ${BUILD_TAG} bash -c "
+                        cd /efficient-transformers &&
+                        . preflight_qeff/bin/activate &&
+                        mkdir -p $PWD/Non_cli_qaic_finetuning &&
+                        export TOKENIZERS_PARALLELISM=false &&
+                        export QEFF_HOME=$PWD/Non_cli_qaic_finetuning &&
+                        pytest tests -m '(not cli) and (on_qaic) and (not qnn) and (finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log3.xml &&
+                        junitparser merge tests/tests_log3.xml tests/tests_log.xml &&
+                        deactivate"
+                        '''
+                    }
+                }
+            }
         }
     }
     stage('CLI Tests') {

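For context, a sketch (not part of this commit) of the marker combination a test needs in order to be collected by this stage's -m expression, '(not cli) and (on_qaic) and (not qnn) and (finetune)'. The marker names are taken from the pytest command above; the test body is a placeholder.

import pytest


@pytest.mark.on_qaic
@pytest.mark.finetune
def test_selected_by_finetuning_stage():
    # Selected: carries on_qaic and finetune, and is not marked cli or qnn.
    assert True
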
tests/finetune/test_finetune.py

Lines changed: 15 additions & 2 deletions
@@ -23,7 +23,19 @@ def clean_up(path):
     shutil.rmtree(path)


-configs = [pytest.param("meta-llama/Llama-3.2-1B", 10, 20, 1, None, True, True, "qaic", id="llama_config")]
+configs = [
+    pytest.param(
+        "meta-llama/Llama-3.2-1B",  # model_name
+        10,  # max_eval_step
+        20,  # max_train_step
+        1,  # intermediate_step_save
+        None,  # context_length
+        True,  # run_validation
+        True,  # use_peft
+        "qaic",  # device
+        id="llama_config",  # config name
+    )
+]


 @pytest.mark.on_qaic
@@ -105,7 +117,8 @@ def test_finetune(
     args, kwargs = update_config_spy.call_args_list[0]
     train_config = args[0]
     assert max_train_step >= train_config.gradient_accumulation_steps, (
-        "Total training step should be more than 4 which is gradient accumulation steps."
+        "Total training steps should be at least "
+        f"{train_config.gradient_accumulation_steps}, the number of gradient accumulation steps."
     )

     saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/adapter_model.safetensors")

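Finally, a sketch of how the reworked configs list is assumed to feed the parametrized test. The argument names come from the per-value comments above; the actual decorator and test body in tests/finetune/test_finetune.py may differ, and the configs list is duplicated here only so the sketch is self-contained.

import pytest

# Same shape as the configs list in the diff above.
configs = [
    pytest.param("meta-llama/Llama-3.2-1B", 10, 20, 1, None, True, True, "qaic", id="llama_config"),
]


@pytest.mark.parametrize(
    "model_name,max_eval_step,max_train_step,intermediate_step_save,"
    "context_length,run_validation,use_peft,device",
    configs,
)
def test_config_shape(model_name, max_eval_step, max_train_step, intermediate_step_save,
                      context_length, run_validation, use_peft, device):
    # Minimal sanity check only; the real test_finetune drives an end-to-end finetuning run
    # and asserts max_train_step >= train_config.gradient_accumulation_steps as shown above.
    assert max_train_step >= 1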