-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.sh
More file actions
executable file
·45 lines (42 loc) · 1.22 KB
/
train.sh
File metadata and controls
executable file
·45 lines (42 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# --- NCCL / torch.distributed environment -----------------------------------
export NCCL_BLOCKING_WAIT=1        # Set this variable to use the NCCL backend
export TORCH_NCCL_BLOCKING_WAIT=1  # PyTorch >= 2.2 renamed the variable above; export both.
export NCCL_IB_DISABLE=1           # Skip InfiniBand transport (plain TCP/sockets).
export NCCL_DEBUG=INFO
export NCCL_P2P_DISABLE=1 # direct access between GPUs? using NVLink or PCI.
# See https://github.com/NVIDIA/nccl/issues/631
#export TORCH_DISTRIBUTED_DEBUG=DETAIL
export TORCH_DISTRIBUTED_DEBUG=OFF

# Build the DeepSpeed JSON config up front. Doing this inline as an unquoted
# $(...) would silently pass an empty --deepspeed argument if ds_config.py
# failed; here a failure aborts the launch with a clear message instead.
ds_config=$(python ds_config.py \
	--en_param_offload False \
	--en_act_ackpt False \
	--en_sparse_attn False \
) || { echo "error: ds_config.py failed, aborting launch" >&2; exit 1; }

#--data_file ./data/finetune-pairs.json \
deepspeed \
	--include=localhost:0,1,2,3 \
	--master_port 8921 \
	train.py \
	\
	--model_name_or_path approach0/mathy-vicuna-13B-FFT \
	--data_file approach0/MATH-picky-test \
	--debug_single_layer False \
	--dryrun False \
	--ctx_length 1500 \
	--datamap_nprocs 10 \
	--use_flash_att2 True \
	--load_8bit False \
	--num_train_epochs 2 \
	\
	--output_dir ./output \
	--save_strategy "steps" \
	--save_steps 10 \
	--save_total_limit 2 \
	--logging_steps 1 \
	--report_to "tensorboard" \
	\
	--per_device_train_batch_size 1 \
	--gradient_accumulation_steps 1 \
	--max_grad_norm 1.0 \
	--learning_rate 2e-5 \
	--warmup_steps 10 \
	--fp16 False \
	--bf16 True \
	--deepspeed "$ds_config"