 # The purpose is just as an integration test, not to actually train models in any meaningful way.
 # For that reason, most of these set epochs = 1 and --dry-run.
 #
-# Optionally specify a comma separated list of examples to run.
-# can be run as:
-# ./run_python_examples.sh "install_deps,run_all,clean"
-# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
-# Expects pytorch, torchvision to be installed.
+# Optionally specify a comma-separated list of examples to run. Can be run as:
+# * To run all examples:
+#   ./run_distributed_examples.sh
+# * To run a specific example:
+#   ./run_distributed_examples.sh "distributed/tensor_parallelism,distributed/ddp"
+#
+# To test examples on a CUDA accelerator, run as:
+#   USE_CUDA=True ./run_distributed_examples.sh
+#
+# The script requires uv to be installed. When executed, it will install the prerequisites
+# from `requirements.txt` for each example. If run within an activated virtual environment
+# (uv venv, python -m venv, conda), this might reinstall some of the packages. To change the
+# pip installation index or to pass additional pip install options, run as:
+#   PIP_INSTALL_ARGS="--pre -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html" \
+#     ./run_distributed_examples.sh
+#
+# To force the script to create a virtual environment for each example, run as:
+#   VIRTUAL_ENV=".venv" ./run_distributed_examples.sh
+# The script will remove the environments it creates in a teardown step after each example.
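+#
+# For instance (an illustration combining the options documented above, not a separately
+# tested invocation): run only the ddp example on CUDA inside a throwaway environment:
+#   USE_CUDA=True VIRTUAL_ENV=".venv" ./run_distributed_examples.sh "distributed/ddp"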
 
 BASE_DIR="$(pwd)/$(dirname $0)"
 source $BASE_DIR/utils.sh
 
-USE_CUDA=$(python -c "import torch; print(torch.cuda.is_available())")
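+# Default to CPU runs so the suite also works on machines without CUDA; the old
+# torch.cuda.is_available() probe required torch to be importable before anything ran.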
+USE_CUDA=${USE_CUDA:-False}
 case $USE_CUDA in
   "True")
     echo "using cuda"
@@ -30,21 +44,19 @@ case $USE_CUDA in
     ;;
 esac
 
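+# One function per example directory; run_all below dispatches through `run` (sourced from
+# utils.sh), which presumably maps a path such as distributed/ddp to the matching function
+# distributed_ddp after installing that example's requirements.txt. A sketch of such a
+# helper follows the example functions below.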
-function distributed() {
-  start
-  bash tensor_parallelism/run_example.sh tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed"
-  bash tensor_parallelism/run_example.sh tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed"
-  bash tensor_parallelism/run_example.sh tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed"
-  python ddp/main.py || error "ddp example failed"
+function distributed_tensor_parallelism() {
+  uv run bash run_example.sh tensor_parallel_example.py || error "tensor parallel example failed"
+  uv run bash run_example.sh sequence_parallel_example.py || error "sequence parallel example failed"
+  uv run bash run_example.sh fsdp_tp_example.py || error "2D parallel example failed"
 }
 
-function clean() {
-  cd $BASE_DIR
-  echo "running clean to remove cruft"
+function distributed_ddp() {
+  uv run main.py || error "ddp example failed"
 }
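+# For orientation, a minimal sketch of what a `run` helper in utils.sh might look like
+# (an assumption for illustration; the real helper lives in utils.sh and may differ):
+#
+#   function run() {
+#     local example=$1                # e.g. "distributed/ddp"
+#     local fn=${example//\//_}       # -> "distributed_ddp"
+#     pushd "$BASE_DIR/$example" || { error "missing example: $example"; return; }
+#     uv pip install -r requirements.txt $PIP_INSTALL_ARGS  # per-example prerequisites
+#     $fn                             # invoke the matching function defined above
+#     popd
+#   }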
 
 function run_all() {
-  distributed
+  run distributed/tensor_parallelism
+  run distributed/ddp
 }
 
 # by default, run all examples
@@ -54,7 +66,7 @@
   for i in $(echo $EXAMPLES | sed "s/,/ /g")
   do
     echo "Starting $i"
-    $i
+    run $i
     echo "Finished $i, status $?"
   done
 fi
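+# e.g. EXAMPLES="distributed/ddp" yields a single iteration that prints "Starting
+# distributed/ddp", calls `run distributed/ddp`, and reports the exit status.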