diff --git a/pretrain/installers/v4-megatron-abci/qsub_setup.sh b/pretrain/installers/v4-megatron-abci/qsub_setup.sh
index 621c87c9..1317cffb 100755
--- a/pretrain/installers/v4-megatron-abci/qsub_setup.sh
+++ b/pretrain/installers/v4-megatron-abci/qsub_setup.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #PBS -P gcg51557
-#PBS -q R10415
+#PBS -q R9920251000
 #PBS -v RTYPE=rt_HF
 #PBS -l select=1
 #PBS -l walltime=01:00:00
@@ -46,6 +46,7 @@ source ${SCRIPT_DIR}/src/install_pytorch.sh
 source ${SCRIPT_DIR}/src/install_requirements.sh
 source ${SCRIPT_DIR}/src/install_apex.sh
 source ${SCRIPT_DIR}/src/install_flash_attention.sh
+# source ${SCRIPT_DIR}/src/install_flash_attention3.sh
 source ${SCRIPT_DIR}/src/install_transformer_engine.sh
 source ${SCRIPT_DIR}/src/install_megatron_lm.sh
 source ${SCRIPT_DIR}/src/install_tokenizer.sh
diff --git a/pretrain/installers/v4-megatron-abci/scripts/environment.sh b/pretrain/installers/v4-megatron-abci/scripts/environment.sh
index de8fd69f..c620e478 100644
--- a/pretrain/installers/v4-megatron-abci/scripts/environment.sh
+++ b/pretrain/installers/v4-megatron-abci/scripts/environment.sh
@@ -18,9 +18,13 @@
 export PRETRAIN_TORCH_VERSION=2.6.0
 export PRETRAIN_TORCHVISION_VERSION=0.21.0
 export PRETRAIN_APEX_COMMIT=312acb44f9fe05cab8c67bba6daa0e64d3737863
 export PRETRAIN_FLASH_ATTENTION_VERSION=2.5.8
-export PRETRAIN_TRANSFORMER_ENGINE_VERSION=1.13.0
+# export PRETRAIN_FLASH_ATTENTION_VERSION=3.0.0b1
+# export PRETRAIN_FLASH_ATTENTION_COMMIT=0e79d71175346c7151f49ab6287084a052bc9613
+# export PRETRAIN_TRANSFORMER_ENGINE_VERSION=1.13.0
+export PRETRAIN_TRANSFORMER_ENGINE_VERSION=1.9
 export PRETRAIN_MEGATRON_TAG=v4
+# export PRETRAIN_MEGATRON_TAG=v4-old
 # Ensure the appropriate Huggingface tokenizer is included
 # https://github.com/llm-jp/scripts/pull/12#discussion_r1708415209
 export PRETRAIN_TOKENIZER_TAG=v3.0b2
@@ -31,4 +35,4 @@
 module load cudnn/${PRETRAIN_CUDNN_VERSION}/${PRETRAIN_CUDNN_VERSION_WITH_PATCH}
 module load hpcx/${PRETRAIN_HPCX_VERSION}
 module load nccl/${PRETRAIN_NCCL_VERSION}/${PRETRAIN_NCCL_VERSION_WITH_PATCH}
-export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
\ No newline at end of file
+export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
diff --git a/pretrain/installers/v4-megatron-abci/src/install_flash_attention3.sh b/pretrain/installers/v4-megatron-abci/src/install_flash_attention3.sh
new file mode 100644
index 00000000..b1674490
--- /dev/null
+++ b/pretrain/installers/v4-megatron-abci/src/install_flash_attention3.sh
@@ -0,0 +1,21 @@
+# Ref: https://github.com/llm-jp/scripts/blob/exp/tokenizer_test/experiments/v4-hq_tokenizer_test/installer/install_megatron.sh
+
+echo "Installing FlashAttention ${PRETRAIN_FLASH_ATTENTION_VERSION} (commit ${PRETRAIN_FLASH_ATTENTION_COMMIT})"
+source "${TARGET_DIR}/venv/bin/activate"
+pushd "${TARGET_DIR}/src"
+
+git clone https://github.com/Dao-AILab/flash-attention.git
+pushd flash-attention
+git checkout "${PRETRAIN_FLASH_ATTENTION_COMMIT}"
+pushd hopper # cd hopper/
+export TORCH_CUDA_ARCH_LIST="90"
+python setup.py install
+
+python_path=`python -c "import site; print(site.getsitepackages()[0])"`
+mkdir -p $python_path/flash_attn_3
+wget -P $python_path/flash_attn_3 https://raw.githubusercontent.com/Dao-AILab/flash-attention/${PRETRAIN_FLASH_ATTENTION_COMMIT}/hopper/flash_attn_interface.py
+
+popd # flash-attention/hopper
+popd # flash-attention
+popd # ${TARGET_DIR}/src
+deactivate
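
Post-install check (a minimal sketch, not part of the patch): it assumes qsub_setup.sh was run with the install_flash_attention3.sh line uncommented and that TARGET_DIR points at the resulting install directory; it only verifies that the hopper interface file was copied into site-packages/flash_attn_3 the way the new installer script does.

# Hypothetical verification snippet; paths follow the installer above.
source "${TARGET_DIR}/venv/bin/activate"
python_path=$(python -c "import site; print(site.getsitepackages()[0])")
ls "${python_path}/flash_attn_3/flash_attn_interface.py"
deactivate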