diff --git a/3.test_cases/torchtune/.gitignore b/3.test_cases/torchtune/.gitignore
new file mode 100644
index 00000000..3d8a0fd0
--- /dev/null
+++ b/3.test_cases/torchtune/.gitignore
@@ -0,0 +1,6 @@
+checkpoints
+models
+miniconda3
+pt_torchtune
+torchtune
+Miniconda3-latest-Linux-x86_64.sh
diff --git a/3.test_cases/torchtune/0.create_conda_env.sh b/3.test_cases/torchtune/0.create_conda_env.sh
new file mode 100755
index 00000000..1ae3af3d
--- /dev/null
+++ b/3.test_cases/torchtune/0.create_conda_env.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -ex
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+chmod +x Miniconda3-latest-Linux-x86_64.sh
+./Miniconda3-latest-Linux-x86_64.sh -b -f -p ./miniconda3
+
+source ./miniconda3/bin/activate
+
+conda create -y -p ./pt_torchtune python=3.10
+
+source activate ./pt_torchtune/
+
+# Install AWS PyTorch, see https://aws-pytorch-doc.com/
+# conda install -y pytorch=2.2.0 torchvision torchaudio torchtriton=2.2.0 pytorch-cuda=12.1 transformers datasets --strict-channel-priority --override-channels -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c nvidia -c conda-forge
+conda install -y pytorch torchvision torchaudio pytorch-cuda=12.1 transformers datasets -c pytorch -c nvidia
+
+git clone https://github.com/pytorch/torchtune.git
+pip install -e ./torchtune
+
+# Create checkpoint dir
+mkdir checkpoints
diff --git a/3.test_cases/torchtune/1.download_hf_model.sh b/3.test_cases/torchtune/1.download_hf_model.sh
new file mode 100755
index 00000000..6520e4d6
--- /dev/null
+++ b/3.test_cases/torchtune/1.download_hf_model.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+# set -ex;
+
+# Default value for HF_MODEL
+DEFAULT_HF_MODEL="meta-llama/Llama-2-7b"
+read -p "Please enter Hugging Face model ($DEFAULT_HF_MODEL): " HF_MODEL
+if [ -z "$HF_MODEL" ]; then
+    HF_MODEL="$DEFAULT_HF_MODEL"
+fi
+
+read -p "Please enter your Hugging Face access token: " HF_TOKEN
+
+mkdir -p models/${HF_MODEL}
+
+tune download \
+    ${HF_MODEL} \
+    --output-dir models/${HF_MODEL} \
+    --hf-token ${HF_TOKEN}
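A typical first-time setup with the two scripts above looks like the following sketch (the prompts are taken from 1.download_hf_model.sh itself; the token value is a placeholder you supply):

    bash 0.create_conda_env.sh
    bash 1.download_hf_model.sh
    # Please enter Hugging Face model (meta-llama/Llama-2-7b):  <press Enter for the default>
    # Please enter your Hugging Face access token:              <your token>

The downloaded weights and tokenizer land in models/meta-llama/Llama-2-7b/, which is the path the training config below expects.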
diff --git a/3.test_cases/torchtune/2.full_finetune_distributed.sbatch b/3.test_cases/torchtune/2.full_finetune_distributed.sbatch
new file mode 100644
index 00000000..daba4e2d
--- /dev/null
+++ b/3.test_cases/torchtune/2.full_finetune_distributed.sbatch
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --nodes=1 # number of nodes to use
+#SBATCH --job-name=full_ft # name of your job
+#SBATCH --exclusive # job has exclusive use of the resource, no sharing
+
+set -ex;
+
+###########################
+###### User Variables #####
+###########################
+
+GPUS_PER_NODE=4 # 4 for G5.12x, 8 for P4/P5
+
+###########################
+## Environment Variables ##
+###########################
+
+## Plenty of EFA-level variables
+## Comment out for non-EFA instances (G4dn, P3)
+## For G5.12x, comment out RDMA and fork-safe
+## For G4dn and other G5 sizes, comment out all of them
+# export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+# export FI_EFA_FORK_SAFE=1
+export FI_LOG_LEVEL=1
+export FI_PROVIDER=efa
+export NCCL_DEBUG=INFO
+## Switching SYNC_MEMOPS to zero can boost throughput with FSDP
+## Disables CU_POINTER_ATTRIBUTE_SYNC_MEMOPS
+## Reduces memory synchronizations
+## https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html
+export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
+
+###########################
+####### Torch Dist #######
+###########################
+
+declare -a TORCHRUN_ARGS=(
+    --nproc_per_node=$GPUS_PER_NODE \
+    --nnodes=$SLURM_JOB_NUM_NODES \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=$(hostname) \
+)
+
+export TORCHTUNE=./pt_torchtune/bin/tune
+export TRAIN_CONFIG=./llama2_7B_full.yaml
+
+srun -l ${TORCHTUNE} run "${TORCHRUN_ARGS[@]}" full_finetune_distributed --config ${TRAIN_CONFIG}
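Once the environment and model are in place, the job is submitted from the same directory (a sketch; assumes ./pt_torchtune/bin/tune exists from step 0):

    sbatch 2.full_finetune_distributed.sbatch

By default Slurm writes the job output to slurm-<jobid>.out in the submission directory. Set GPUS_PER_NODE to match the instance type (4 for G5.12x, 8 for P4/P5) before submitting.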
diff --git a/3.test_cases/torchtune/llama2_7B_full.yaml b/3.test_cases/torchtune/llama2_7B_full.yaml
new file mode 100644
index 00000000..3909a7dd
--- /dev/null
+++ b/3.test_cases/torchtune/llama2_7B_full.yaml
@@ -0,0 +1,77 @@
+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Llama2 7B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download meta-llama/Llama-2-7b \
+#   --hf-token <HF_TOKEN> \
+#   --output-dir models/meta-llama/Llama-2-7b
+#
+# To launch on 4 devices, run the following command from root:
+#   tune run --nproc_per_node 4 full_finetune_distributed \
+#   --config llama2/7B_full
+#
+# You can add specific overrides through the command line. For example,
+# to override the checkpointer directory while launching training
+# you can run:
+#   tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
+#   --config llama2/7B_full \
+#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# Single-device full finetuning requires more memory optimizations; it's
+# best to use 7B_full_single_device.yaml for those cases.
+
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.llama2.llama2_tokenizer
+  path: models/meta-llama/Llama-2-7b/tokenizer.model
+
+# Dataset
+dataset:
+  _component_: torchtune.datasets.alpaca_dataset
+  train_on_input: True
+seed: null
+shuffle: True
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama2.llama2_7b
+
+checkpointer:
+  _component_: torchtune.utils.FullModelMetaCheckpointer
+  checkpoint_dir: models/meta-llama/Llama-2-7b
+  checkpoint_files: [consolidated.00.pth]
+  recipe_checkpoint: null
+  output_dir: models/meta-llama/Llama-2-7b
+  model_type: LLAMA2
+resume_from_checkpoint: False
+
+# Fine-tuning arguments
+batch_size: 2
+epochs: 3
+optimizer:
+  _component_: torch.optim.AdamW
+  lr: 2e-5
+loss:
+  _component_: torch.nn.CrossEntropyLoss
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1
+
+
+# Training env
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True
+
+# Reduced precision
+dtype: bf16
+
+# Logging
+metric_logger:
+  _component_: torchtune.utils.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/alpaca-llama2-finetune
+log_every_n_steps: null
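As the header comments in llama2_7B_full.yaml note, any config field can also be overridden from the command line with key=value syntax. A minimal sketch with illustrative values (run inside the pt_torchtune environment):

    tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
        --config llama2_7B_full.yaml \
        batch_size=4 gradient_accumulation_steps=2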