From 08d23ef5638d1bb4cefef4766835a4b9034dab63 Mon Sep 17 00:00:00 2001
From: Dingming Wu
Date: Tue, 4 Nov 2025 14:44:52 -0800
Subject: [PATCH 1/2] Add build_test file for RCCL (#23)

Summary:
Pull Request resolved: https://github.com/meta-pytorch/torchcomms/pull/23

Pull Request resolved: https://github.com/meta-pytorch/torchcomms/pull/18

As in title. Also delete the missed setup_rccl.sh copy in github (previous diff)

Pull Request resolved: https://github.com/meta-pytorch/torchcomms/pull/267

Differential Revision: D84125652

Pulled By: sudharssun
---
Review edits: define matrix.python-version (it was referenced but never set),
install torch from matrix.torch-spec instead of a duplicated literal, import
torch in the RCCL smoke test, move the misplaced cmake comment, add comments.
Hunk line counts and the diffstat are updated to match; the blob ids on the
index lines predate these edits.

 .github/workflows/build_test_rccl.yaml | 89 ++++++++++++++++++++++++++
 build_rccl.sh                          |  2 +-
 2 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/build_test_rccl.yaml

diff --git a/.github/workflows/build_test_rccl.yaml b/.github/workflows/build_test_rccl.yaml
new file mode 100644
index 0000000..73f8ae3
--- /dev/null
+++ b/.github/workflows/build_test_rccl.yaml
@@ -0,0 +1,89 @@
+# CI for the RCCL backend: builds TorchComms with USE_RCCL=1 on a ROCm runner,
+# runs import smoke checks, then runs the RCCL integration test script.
+name: Build RCCL
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: RCCL
+            runs-on: amd-mi350-runner
+            gpu-arch-type: "rocm"
+            gpu-arch-version: "7.0"
+            # Used by "conda create" below; an undefined matrix key expands to "". NOTE(review): confirm 3.10 is the intended version.
+            python-version: "3.10"
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
+            docker-image: pytorch/manylinux2_28-builder:rocm7.0
+            cmake-version: "latest"
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      timeout: 180
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      script: |
+        set -ex
+        # use faster libmamba solver
+        conda config --set solver libmamba
+
+        # TODO: remove dependency on fbwhoami
+        echo "DEVICE_NAME=$(hostname)" > /etc/fbwhoami
+        cat /etc/fbwhoami
+        export PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
+        conda create -n venv python=${{ matrix.python-version }} -y
+        conda activate venv
+        python -m pip install --upgrade pip
+        conda install conda-forge::libopenssl-static conda-forge::rsync -y
+        conda install conda-forge::glog=0.4.0 conda-forge::gflags conda-forge::fmt -y
+
+        pip install ${{ matrix.torch-spec }}
+
+        if [ "${{ matrix.cmake-version }}" = "latest" ]; then
+          conda install -y cmake
+        else # pin the cmake version requested by the matrix entry
+          conda install -y cmake==${{ matrix.cmake-version }}
+        fi
+
+        pip install -r requirements.txt
+
+        export BUILD_RCCL_ONLY=1
+        export ROCM_HOME=/opt/rocm
+        export RCCL_INCLUDE=$ROCM_HOME/include/rccl
+        export USE_SYSTEM_LIBS=1
+        export USE_RCCL=1
+        ./build_rccl.sh
+        pip install numpy
+        USE_NCCL=0 USE_NCCLX=0 USE_GLOO=0 USE_RCCL=1 pip install --no-build-isolation -v -e .
+
+        # Verify installation
+        python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
+        python -c "import torch; print(f'HIP available: {torch.cuda.is_available()}')"
+        python -c "import torchcomms; print('TorchComms imported successfully')"
+
+        # Test RCCL backend availability (best effort: a failure is printed and does not fail the job)
+        # NOTE(review): ROCm builds of PyTorch normally address GPUs as 'cuda'; confirm that 'hip' is what torchcomms expects.
+        python -c "
+        import torch
+        import torchcomms
+        try:
+            comm = torchcomms.new_comm('rccl', torch.device('hip'), 'test_comm')
+            print('RCCL backend available')
+        except Exception as e:
+            print(f'RCCL backend test failed: {e}')
+        "
+
+        # Run integration tests
+        echo "Running RCCL integration tests..."
+        comms/torchcomms/scripts/run_tests_integration_rccl_py.sh
diff --git a/build_rccl.sh b/build_rccl.sh
index 31f00f3..cd31d25 100755
--- a/build_rccl.sh
+++ b/build_rccl.sh
@@ -75,5 +75,5 @@ else
   export RCCL_HOME=$ROCM_HOME/lib
 fi
 
-popd || exit 1
+popd || true
 pip install numpy

From 1c29f5637c61ff243cfeb5d46b524b98989ee237 Mon Sep 17 00:00:00 2001
From: atalman
Date: Tue, 4 Nov 2025 15:25:03 -0800
Subject: [PATCH 2/2] test

---
Review edits: hunk offset and counts follow the reviewed 1/2, and one comment
line is added; the blob ids on the index line predate these edits.

 .github/workflows/build_test_rccl.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/build_test_rccl.yaml b/.github/workflows/build_test_rccl.yaml
index 73f8ae3..3c11272 100644
--- a/.github/workflows/build_test_rccl.yaml
+++ b/.github/workflows/build_test_rccl.yaml
@@ -42,6 +42,9 @@ jobs:
         echo "DEVICE_NAME=$(hostname)" > /etc/fbwhoami
         cat /etc/fbwhoami
         export PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
+        # Pin CC/CXX to the compilers in the gcc-toolset-11 directory that was just put first on PATH.
+        export CC=/opt/rh/gcc-toolset-11/root/usr/bin/cc
+        export CXX=/opt/rh/gcc-toolset-11/root/usr/bin/c++
         conda create -n venv python=${{ matrix.python-version }} -y
         conda activate venv
         python -m pip install --upgrade pip