Skip to content

Commit d1c48aa

Browse files
dmwu authored and facebook-github-bot committed
Add build_test file for RCCL (#23)
Summary: As in title. Also deletes the setup_rccl.sh copy on GitHub that the previous diff missed. Differential Revision: D84125652. Pulled By: sudharssun.
1 parent 3b65b52 commit d1c48aa

File tree

2 files changed

+83
-1
lines changed

2 files changed

+83
-1
lines changed
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
# Build RCCL
#
# Builds torchcomms with only the RCCL backend enabled on a ROCm runner,
# smoke-tests the resulting install, and then runs the RCCL integration tests.
# Runs on every push to main and on every pull request.
name: Build RCCL

on:
  push:
    branches:
      - main
  pull_request:

permissions:
  id-token: write
  contents: read

jobs:
  build:
    strategy:
      # Keep other matrix entries running if one configuration fails.
      fail-fast: false
      matrix:
        include:
          - name: RCCL
            runs-on: amd-mi350-runner
            # The script below creates its conda env from matrix.python-version;
            # without this key the expression expands to an empty string and
            # conda receives a bare `python=`.
            # NOTE(review): 3.10 is an assumption - confirm the intended version.
            python-version: "3.10"
            gpu-arch-type: "rocm"
            gpu-arch-version: "7.0"
            # Arguments handed to `pip install` to fetch the PyTorch build.
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
            # NOTE(review): docker-image is declared here but never forwarded to
            # the reusable workflow under `with:`; confirm whether it should be.
            docker-image: pytorch/manylinux2_28-builder:rocm7.0
            # "latest" installs whatever cmake conda resolves; any other value
            # is treated as an exact version pin.
            cmake-version: "latest"
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 180
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      script: |
        set -ex

        # use faster libmamba solver
        conda config --set solver libmamba

        # TODO: remove dependency on fbwhoami
        echo "DEVICE_NAME=$(hostname)" > /etc/fbwhoami
        cat /etc/fbwhoami

        conda create -n venv "python=${{ matrix.python-version }}" -y
        conda activate venv
        python -m pip install --upgrade pip
        conda install conda-forge::libopenssl-static conda-forge::rsync -y

        # Intentionally unquoted: torch-spec holds several pip arguments.
        pip install ${{ matrix.torch-spec }}

        if [ "${{ matrix.cmake-version }}" = "latest" ]; then
          conda install -y cmake
        else
          # Pin cmake to the exact version requested by the matrix entry.
          conda install -y "cmake==${{ matrix.cmake-version }}"
        fi

        pip install -r requirements.txt

        # Build against the ROCm installation that ships in the image.
        export BUILD_RCCL_ONLY=1
        export ROCM_HOME=/opt/rocm
        export RCCL_INCLUDE=$ROCM_HOME/include/rccl
        export USE_SYSTEM_LIBS=1
        export USE_RCCL=1
        ./build_rccl.sh
        pip install numpy
        USE_NCCL=0 USE_NCCLX=0 USE_GLOO=0 USE_RCCL=1 pip install -v .

        # Verify installation
        python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
        python -c "import torch; print(f'HIP available: {torch.cuda.is_available()}')"
        python -c "import torchcomms; print('TorchComms imported successfully')"

        # Test RCCL backend availability.
        # Best-effort probe: a failure is reported in the log but does not fail
        # the job. torch must be imported here; without it the probe raised
        # NameError, which the except clause turned into a bogus
        # "backend test failed" message on every run.
        # NOTE(review): ROCm builds of PyTorch normally address GPUs as 'cuda';
        # confirm that 'hip' is the device string torchcomms expects.
        python - <<'PY'
        import torch
        import torchcomms

        try:
            comm = torchcomms.new_comm('rccl', torch.device('hip'), 'test_comm')
            print('RCCL backend available')
        except Exception as e:
            print(f'RCCL backend test failed: {e}')
        PY

        # Run integration tests
        echo "Running RCCL integration tests..."
        comms/torchcomms/scripts/run_tests_integration_rccl_py.sh

build_rccl.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,5 @@ else
7575
export RCCL_HOME=$ROCM_HOME/lib
7676
fi
7777

78-
popd || exit 1
78+
popd || true
7979
pip install numpy

0 commit comments

Comments
 (0)