
Commit f315700

SW publisher authored and Jenkins committed
Megatron-DeepSpeed-fork content for 1.17.0
Signed-off-by: SW publisher <[email protected]>
1 parent 7eb36a1 commit f315700

93 files changed: +6491 additions, −1334 deletions


README.md

Lines changed: 285 additions & 541 deletions
Large diffs are not rendered by default.

examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh

Lines changed: 26 additions & 11 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 DIR=`pwd`
 ###############################################################################
@@ -119,8 +121,14 @@ MP_SIZE=1
 ## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
 ## to 1 and use the "--no-pipeline-parallel" arg.
 PP_SIZE=1
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    NUM_GPUS_PERNODE=${NUM_GPUS}
+else
+    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
 ###############################################################################
 ### MoE configs
@@ -172,6 +180,7 @@ LOG_INTERVAL=10
 EVAL_ITERS=10
 EVAL_INTERVAL=100
 SAVE_INTERVAL=10000
+EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}
 
 ## Standard deviation for weight initialization
 ## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
@@ -241,13 +250,17 @@ if [ "${USE_INTERNAL_DATA}" = "true" ]; then
     0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
     0.01359 ${ARX} 0.01588 ${GIT}"
 else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
+    #VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
+    #MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
     # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
     # For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-    DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
+    #DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
     # For cluster Azure-WestUS3-A100
     # DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
+    BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+    DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
 fi
 ###############################################################################
 data_options=" \
@@ -284,6 +297,7 @@ megatron_options=" \
     --min-lr ${MIN_LR} \
     --lr-decay-style cosine \
     --split 98,2,0 \
+    --exit-interval ${EXIT_INTERVAL} \
     --log-interval ${LOG_INTERVAL} \
     --eval-interval ${EVAL_INTERVAL} \
     --eval-iters ${EVAL_ITERS} \
@@ -299,11 +313,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir ${TENSORBOARD_DIR}"
 
 if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
 megatron_options="${megatron_options} \
-    --checkpoint-activations"
+    --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi
 
 if [[ $EP_SIZE -gt 1 ]]; then
@@ -329,12 +344,12 @@ sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
     | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
     | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
     | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-    > ${config_json}
+    > ${config_json}
 
 deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --pipeline-model-parallel-size ${PP_SIZE}"
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --pipeline-model-parallel-size ${PP_SIZE}"
 
 # Currently MoE is not compatible with pipeline parallel
 if [[ $EP_SIZE -gt 1 ]]; then
@@ -369,4 +384,4 @@ fi
 run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
 echo ${run_cmd}
 eval ${run_cmd}
-set +x
+set +x
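
The device-count change above is the core of the HPU enablement in this commit and recurs in the other scripts below. As a reading aid, here is a minimal standalone sketch of the same detection logic; the `if ! nvidia-smi` test is an equivalent form of the script's `nvidia-smi || count_GPU=0` guard, the assumption is that a node carries either NVIDIA GPUs or Habana Gaudi devices (not both), and the `-2` adjustment on the `ds_ssh` output simply mirrors the original script rather than a documented contract.

    #!/bin/bash
    # Standalone sketch of the accelerator detection used in the patched scripts.
    if ! nvidia-smi > /dev/null 2>&1; then
        # No working NVIDIA driver: count Gaudi devices reported by lspci.
        NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
        NUM_GPUS_PERNODE=${NUM_GPUS}
    else
        # CUDA path, unchanged from upstream: ds_ssh fans nvidia-smi out to every
        # node in the DeepSpeed hostfile; the -2 follows the original script.
        NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
        NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    fi
    NUM_NODE=$(( NUM_GPUS / NUM_GPUS_PERNODE ))
    echo "total=${NUM_GPUS} per_node=${NUM_GPUS_PERNODE} nodes=${NUM_NODE}"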

examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh

Lines changed: 22 additions & 7 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 DIR=`pwd`
 ###############################################################################
@@ -123,8 +125,14 @@ NO_PP="true"
 ZERO_STAGE=0
 
 ## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    NUM_GPUS_PERNODE=${NUM_GPUS}
+else
+    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
 DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
 ###############################################################################
@@ -143,6 +151,7 @@ LOG_INTERVAL=10
 EVAL_ITERS=10
 EVAL_INTERVAL=100
 SAVE_INTERVAL=1000
+EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}
 
 ## Standard deviation for weight initialization. Usually larger model needs
 ## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
@@ -175,13 +184,17 @@ mkdir -p ${LOG_PATH}
 mkdir -p ${TENSORBOARD_PATH}
 mkdir -p ${CHECKPOINT_PATH}
 
-VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
+#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
+#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
 # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
 # For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
+#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
 # For cluster Azure-WestUS3-A100
 # DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
+BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
 ###############################################################################
 data_options=" \
     --vocab-file ${VOCAB_PATH} \
@@ -211,6 +224,7 @@ megatron_options=" \
     --min-lr ${MIN_LR} \
     --lr-decay-style cosine \
     --split 98,2,0 \
+    --exit-interval ${EXIT_INTERVAL} \
     --log-interval ${LOG_INTERVAL} \
     --eval-interval ${EVAL_INTERVAL} \
     --eval-iters ${EVAL_ITERS} \
@@ -226,11 +240,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
    --tensorboard-dir ${TENSORBOARD_PATH}"
 
 if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
 megatron_options="${megatron_options} \
-    --checkpoint-activations"
+    --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi
 
 if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
@@ -306,4 +321,4 @@ fi
 run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
 echo ${run_cmd}
 eval ${run_cmd}
-set +x
+set +x
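
Both MoE example scripts now read their Habana-specific knobs through bash default expansion, so existing invocations keep working while `HL_*` environment variables override them. A small self-contained illustration of that expansion follows; the values set here are examples only, while the two defaults shown are the ones from the patched scripts.

    #!/bin/bash
    # ${VAR:-default} keeps the default unless the variable is set and non-empty.
    unset HL_EXIT_INTERVAL
    EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}
    echo "unset HL_EXIT_INTERVAL -> EXIT_INTERVAL=${EXIT_INTERVAL}"    # prints 0

    export HL_EXIT_INTERVAL=100
    EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}
    echo "HL_EXIT_INTERVAL=100  -> EXIT_INTERVAL=${EXIT_INTERVAL}"     # prints 100

    export HL_DATA_DIR_ROOT=/path/to/oscar-en    # example path, not a real dataset
    BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
    echo "BASE_DATA_PATH=${BASE_DATA_PATH}"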

examples_deepspeed/MoE/readme_evalharness.md

Lines changed: 1 addition & 1 deletion
@@ -165,4 +165,4 @@ Import location: Replace data at selected cell
 
 4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match
 
-5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->
+5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->

examples_deepspeed/run_deepspeed_example.sh

Lines changed: 13 additions & 7 deletions
@@ -1,8 +1,12 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 set -ex
 
-BASE_PATH=/vc_data/Megatron-LM/data
-DATA_PATH=${BASE_PATH}/indexed_datasets/megatron
+BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
+VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
 DS_CONFIG=ds_config.json
 
 TP=1
@@ -48,7 +52,7 @@ ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
 ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
 
 
-deepspeed pretrain_gpt.py \
+deepspeed ../pretrain_gpt.py \
    --tensor-model-parallel-size $TP \
    --pipeline-model-parallel-size $PP \
    --num-layers $NLAYERS \
@@ -67,8 +71,8 @@ deepspeed pretrain_gpt.py \
    --eval-iters 40 \
    --eval-interval 1000 \
    --data-path $DATA_PATH \
-   --vocab-file $BASE_PATH/gpt2-vocab.json \
-   --merge-file $BASE_PATH/gpt2-merges.txt \
+   --vocab-file $VOCAB_PATH \
+   --merge-file $MERGE_PATH \
    --save-interval 1000 \
    --split 98,2,0 \
    --clip-grad 1.0 \
@@ -78,7 +82,9 @@ deepspeed pretrain_gpt.py \
    --init-method-std 0.006 \
    --fp16 \
    --checkpoint-activations \
+   --recompute-granularity=full \
+   --recompute-method=uniform \
+   --no-gradient-accumulation-fusion \
    --tensorboard-dir $OUTPUT_DIR \
    $ds_args \
-   --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
-
+   --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
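
Because the launch line now points at `../pretrain_gpt.py`, this example is presumably meant to be started from inside `examples_deepspeed/`. A hedged invocation sketch; the checkout directory and dataset root are placeholders, and the dataset root is expected to contain gpt2-vocab.json, gpt2-merges.txt and meg-gpt2_text_document.{bin,idx}.

    # Placeholder paths; adjust to your checkout and preprocessed dataset.
    cd /path/to/Megatron-DeepSpeed-fork/examples_deepspeed
    HL_DATA_DIR_ROOT=/path/to/oscar-en bash run_deepspeed_example.sh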

examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh

Lines changed: 35 additions & 19 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 dir=`pwd`
 ###############################################################################
@@ -147,8 +149,14 @@ no_pp="true"
 zero_stage=1
 
 ## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    num_gpus=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    num_gpus_pernode=${num_gpus}
+else
+    num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
 
 ## Data parallel size.
@@ -187,21 +195,28 @@ host="${HOSTNAME}"
 seed=1234
 num_workers=0
 
-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+USE_INTERNAL_DATA="false"
+if [ "${USE_INTERNAL_DATA}" = "true" ]; then
+    data_path="BookCorpusDataset_text_document"
+    if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
+        wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
+    fi
+    if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
+        wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
+    fi
+    vocab_path="gpt2-vocab.json"
+    if [ ! -f "$vocab_path" ]; then
+        wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
+    fi
+    merge_path="gpt2-merges.txt"
+    if [ ! -f "$merge_path" ]; then
+        wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+    fi
+else
+    BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+    data_path=${BASE_DATA_PATH}/meg-gpt2_text_document
+    vocab_path=${BASE_DATA_PATH}/gpt2-vocab.json
+    merge_path=${BASE_DATA_PATH}/gpt2-merges.txt
 fi
 
 prescale_grad="true"
@@ -282,11 +297,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir ${tensorboard_path}"
 
 if [ "${activation_checkpoint}" = "true" ]; then
 megatron_options="${megatron_options} \
-    --checkpoint-activations"
+    --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi
 
 if [ "${log_optimizer_state}" = "true" ]; then
@@ -338,4 +354,4 @@ if [[ $iteration -gt 0 ]]; then
     ds_ssh "echo $iteration_2 > $iteration_file_2"
 fi
 
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
+deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
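
The internal-data branch above keeps the original download-if-missing pattern for the BookCorpus files and GPT-2 vocab/merges. If the same behavior is wanted outside the script, a small helper along these lines would do; `fetch_if_missing` is a hypothetical name, not something the repository defines, and the URLs are the ones already used in the diff.

    #!/bin/bash
    # Hypothetical helper mirroring the "download only if absent" pattern above.
    fetch_if_missing() {
        local file="$1" url="$2"
        if [ ! -f "${file}" ]; then
            wget -O "${file}" "${url}"
        fi
    }

    fetch_if_missing gpt2-vocab.json https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
    fetch_if_missing gpt2-merges.txt https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
    fetch_if_missing BookCorpusDataset_text_document.bin https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
    fetch_if_missing BookCorpusDataset_text_document.idx https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx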
