-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark_runner.sh
More file actions
58 lines (47 loc) · 1.61 KB
/
benchmark_runner.sh
File metadata and controls
58 lines (47 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env bash
#
# Benchmark sweep driver.
#
# Runs train.py through the DeepSpeed launcher for every combination of
# ZeRO stage (0-3) and precision (fp32, fp16), three repeats per combination,
# with a background GPU monitor running alongside each repeat.
#
# Usage:
#   benchmark_runner.sh [NUM_GPUS] [MODEL_NAME] [N_SAMPLES] [BATCH_SIZE] \
#                       [EPOCHS] [SEQ_LEN]
#   Defaults: 1 gpt2 2000 32 50 128
#
# Files referenced relative to the current directory:
#   ./monitor_gpu.sh                       invoked with the CSV path to write
#   train.py                               training entry point
#   experiments/ds_stage<N>_<prec>.json    DeepSpeed config per combination
#
# Output layout:
#   results/<exp>/run<k>/gpu_log.csv       path handed to monitor_gpu.sh
#   results/<exp>/run<k>/step_log.csv      path exported as STEP_LOG
#                                          (presumably consumed by train.py;
#                                          verify against train.py)
#
# A failing deepspeed run is reported but does not abort the sweep.
set -euo pipefail

# Define general configs
NUM_GPUS=${1:-1}
MODEL_NAME=${2:-gpt2}
N_SAMPLES=${3:-2000}
BATCH_SIZE=${4:-32}
EPOCHS=${5:-50}
SEQ_LEN=${6:-128}

# Define arrays of configs
declare -a ZERO_STAGES=("0" "1" "2" "3")
declare -a PRECISIONS=("fp32" "fp16")

# PID of the currently running GPU monitor; empty when none is active.
MON_PID=""

# Stop and reap the active GPU monitor, if any. Safe to call repeatedly.
stop_monitor() {
  if [[ -z "${MON_PID}" ]]; then
    return 0
  fi
  if ! kill "${MON_PID}" 2>/dev/null; then
    echo "warning: GPU monitor PID ${MON_PID} had already exited" >&2
  fi
  # Reap the monitor so it has fully exited before the next run starts.
  # wait returns the job's (signal) status, which must not trip `set -e`.
  wait "${MON_PID}" 2>/dev/null || true
  MON_PID=""
}

# Background jobs of a non-interactive shell ignore SIGINT, so without these
# traps an interrupted sweep would leave the monitor running forever.
trap stop_monitor EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

mkdir -p results

for zs in "${ZERO_STAGES[@]}"; do
  for prec in "${PRECISIONS[@]}"; do
    EXP="zs${zs}_${prec}_gpus${NUM_GPUS}_bs${BATCH_SIZE}_sl${SEQ_LEN}"
    OUTDIR="results/${EXP}"
    mkdir -p "${OUTDIR}"

    # Choose config file
    DS_CONFIG="experiments/ds_stage${zs}_${prec}.json"

    # Run 3 repeats
    for run in 1 2 3; do
      RUN_DIR="${OUTDIR}/run${run}"
      mkdir -p "${RUN_DIR}"

      # Start nvidia-smi monitor
      ./monitor_gpu.sh "${RUN_DIR}/gpu_log.csv" &
      MON_PID=$!
      echo "Started GPU monitor PID ${MON_PID} -> ${RUN_DIR}/gpu_log.csv"

      # Set STEP_LOG env so train.py writes there
      export STEP_LOG="${RUN_DIR}/step_log.csv"

      echo "Running experiment ${EXP} run${run} config ${DS_CONFIG}"

      # Run DeepSpeed. The command is built as an array so that every
      # argument is passed as exactly one word, whatever it contains.
      ds_cmd=(
        deepspeed --num_gpus "${NUM_GPUS}" train.py
        --model_name "${MODEL_NAME}"
        --n_samples "${N_SAMPLES}"
        --batch_size "${BATCH_SIZE}"
        --epochs "${EPOCHS}"
        --seq_length "${SEQ_LEN}"
        --deepspeed_config "${DS_CONFIG}"
      )
      # Best-effort: record the failure and carry on with the sweep.
      "${ds_cmd[@]}" \
        || echo "deepspeed failed for ${EXP} run${run}" >&2

      # Terminate monitor
      stop_monitor
      # Fixed pause between runs, kept from the original script.
      sleep 2
      echo "Saved logs to ${RUN_DIR}"
    done
  done
done