diff --git a/.github/workflows/building-conda.yml b/.github/workflows/building-conda.yml deleted file mode 100644 index a74e873e..00000000 --- a/.github/workflows/building-conda.yml +++ /dev/null @@ -1,85 +0,0 @@ -name: Building Conda - -on: [workflow_dispatch] - -jobs: - - conda-build: - runs-on: ${{ matrix.os }} - - strategy: - fail-fast: false - matrix: - os: [windows-latest] #[ubuntu-18.04, macos-10.15, windows-latest] - python-version: [3.6, 3.7, 3.8, 3.9] - torch-version: [1.9.0, 1.10.0] - cuda-version: ['cu113'] #['cpu', 'cu102', 'cu111', 'cu113'] - exclude: - - torch-version: 1.9.0 - cuda-version: 'cu113' - - torch-version: 1.10.0 - cuda-version: 'cu111' - - os: macos-10.15 - cuda-version: 'cu102' - - os: macos-10.15 - cuda-version: 'cu111' - - os: macos-10.15 - cuda-version: 'cu113' - - steps: - - uses: actions/checkout@v2 - - name: Set up Conda for Python ${{ matrix.python-version }} - uses: conda-incubator/setup-miniconda@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Free up disk space - if: ${{ runner.os == 'Linux' }} - run: | - sudo rm -rf /usr/share/dotnet - - - name: Install Conda packages - run: | - conda install conda-build conda-verify --yes - shell: - bash -l {0} - - - name: Install CUDA ${{ matrix.cuda-version }} - if: ${{ matrix.cuda-version != 'cpu' }} - run: | - bash .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}.sh - shell: - bash - - - name: Build Conda package for CPU - if: ${{ matrix.cuda-version == 'cpu' }} - run: | - FORCE_CUDA=0 TORCH_CUDA_ARCH_LIST=0 ./conda/pytorch-scatter/build_conda.sh ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }} - shell: - bash -l {0} - - - name: Build Conda package for GPU - if: ${{ matrix.cuda-version != 'cpu' }} - run: | - source .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}-env.sh - ./conda/pytorch-scatter/build_conda.sh ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }} - 
shell: - bash -l {0} - - - name: Publish Conda package on organization channel - run: | - conda install anaconda-client --yes - anaconda upload --force --label main $HOME/conda-bld/*/*.tar.bz2 - env: - ANACONDA_API_TOKEN: ${{ secrets.PYG_ANACONDA_TOKEN }} - shell: - bash -l {0} - - - name: Publish Conda package on personal channel - run: | - conda install anaconda-client --yes - anaconda upload --force --label main $HOME/conda-bld/*/*.tar.bz2 - env: - ANACONDA_API_TOKEN: ${{ secrets.RUSTY1S_ANACONDA_TOKEN }} - shell: - bash -l {0} diff --git a/.github/workflows/building.yml b/.github/workflows/building.yml index 7aca6bc8..78ff5580 100644 --- a/.github/workflows/building.yml +++ b/.github/workflows/building.yml @@ -10,21 +10,25 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-18.04, macos-10.15, windows-latest] - python-version: [3.6, 3.7, 3.8, 3.9] - torch-version: [1.9.0, 1.10.0] - cuda-version: ['cpu', 'cu102', 'cu111', 'cu113'] + os: [ubuntu-20.04, macos-14, windows-2019, ubuntu-22.04-arm] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] + torch-version: [2.6.0] # [2.5.0] + cuda-version: ['cpu', 'cu118', 'cu121', 'cu124', 'cu126'] exclude: - - torch-version: 1.9.0 - cuda-version: 'cu113' - - torch-version: 1.10.0 - cuda-version: 'cu111' - - os: macos-10.15 - cuda-version: 'cu102' - - os: macos-10.15 - cuda-version: 'cu111' - - os: macos-10.15 - cuda-version: 'cu113' + - torch-version: 2.5.0 + python-version: '3.13' + - torch-version: 2.5.0 + cuda-version: 'cu126' + - torch-version: 2.6.0 + cuda-version: 'cu121' + - os: macos-14 + cuda-version: 'cu118' + - os: macos-14 + cuda-version: 'cu121' + - os: macos-14 + cuda-version: 'cu124' + - os: macos-14 + cuda-version: 'cu126' steps: - uses: actions/checkout@v2 @@ -33,10 +37,13 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Free up disk space - if: ${{ runner.os == 'Linux' }} + - name: Upgrade pip run: | - sudo rm -rf /usr/share/dotnet + pip install --upgrade setuptools + + - 
name: Free Disk Space (Ubuntu) + if: ${{ runner.os == 'Linux' }} + uses: jlumbroso/free-disk-space@main - name: Install CUDA ${{ matrix.cuda-version }} if: ${{ matrix.cuda-version != 'cpu' }} @@ -45,15 +52,34 @@ jobs: - name: Install PyTorch ${{ matrix.torch-version }}+${{ matrix.cuda-version }} run: | - pip install numpy typing-extensions dataclasses - pip install --no-index --no-cache-dir torch==${{ matrix.torch-version}} -f https://download.pytorch.org/whl/${{ matrix.cuda-version }}/torch_stable.html + pip install torch==${{ matrix.torch-version }} --extra-index-url https://download.pytorch.org/whl/${{ matrix.cuda-version }} python -c "import torch; print('PyTorch:', torch.__version__)" python -c "import torch; print('CUDA:', torch.version.cuda)" + - name: Patch PyTorch static constexpr on Windows + if: ${{ runner.os == 'Windows' }} + run: | + Torch_DIR=`python -c 'import os; import torch; print(os.path.dirname(torch.__file__))'` + sed -i '31,38c\ + TORCH_API void lazy_init_num_threads();' ${Torch_DIR}/include/ATen/Parallel.h + shell: bash + + - name: Set version + if: ${{ runner.os != 'macOS' }} + run: | + VERSION=`sed -n "s/^__version__ = '\(.*\)'/\1/p" torch_scatter/__init__.py` + TORCH_VERSION=`echo "pt${{ matrix.torch-version }}" | sed "s/..$//" | sed "s/\.//g"` + CUDA_VERSION=`echo ${{ matrix.cuda-version }}` + echo "New version name: $VERSION+$TORCH_VERSION$CUDA_VERSION" + sed -i "s/$VERSION/$VERSION+$TORCH_VERSION$CUDA_VERSION/" setup.py + sed -i "s/$VERSION/$VERSION+$TORCH_VERSION$CUDA_VERSION/" torch_scatter/__init__.py + shell: + bash + - name: Install main package for CPU if: ${{ matrix.cuda-version == 'cpu' }} run: | - FORCE_ONLY_CPU=1 pip install -e . + FORCE_ONLY_CPU=1 python setup.py develop shell: bash @@ -61,10 +87,14 @@ jobs: if: ${{ matrix.cuda-version != 'cpu' }} run: | source .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}-env.sh - pip install -e . 
+ python setup.py develop shell: bash + - name: Test installation + run: | + python -c "import torch_scatter; print('torch-scatter:', torch_scatter.__version__)" + - name: Build wheel run: | pip install wheel diff --git a/.github/workflows/cuda/cu115-Linux-env.sh b/.github/workflows/cuda/cu115-Linux-env.sh new file mode 100644 index 00000000..1c148a22 --- /dev/null +++ b/.github/workflows/cuda/cu115-Linux-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/usr/local/cuda-11.5 +LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} +PATH=${CUDA_HOME}/bin:${PATH} + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" diff --git a/.github/workflows/cuda/cu115-Linux.sh b/.github/workflows/cuda/cu115-Linux.sh new file mode 100755 index 00000000..02bcb4d1 --- /dev/null +++ b/.github/workflows/cuda/cu115-Linux.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OS=ubuntu1804 + +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin +sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 +wget -nv https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb +sudo dpkg -i cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb +sudo apt-key add /var/cuda-repo-${OS}-11-5-local/7fa2af80.pub + +sudo apt-get -qq update +sudo apt install cuda-nvcc-11-5 cuda-libraries-dev-11-5 +sudo apt clean + +rm -f cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb diff --git a/.github/workflows/cuda/cu115-Windows-env.sh b/.github/workflows/cuda/cu115-Windows-env.sh new file mode 100644 index 00000000..3a662fb8 --- /dev/null +++ b/.github/workflows/cuda/cu115-Windows-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 +PATH=${CUDA_HOME}/bin:$PATH +PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ 
Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="6.0+PTX" diff --git a/.github/workflows/cuda/cu115-Windows.sh b/.github/workflows/cuda/cu115-Windows.sh new file mode 100755 index 00000000..db2559c4 --- /dev/null +++ b/.github/workflows/cuda/cu115-Windows.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# TODO We currently use CUDA 11.3 to build CUDA 11.5 Windows wheels + +# Install NVIDIA drivers, see: +# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + +export CUDA_SHORT=11.3 +export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers +export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe + +# Install CUDA: +curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" +echo "" +echo "Installing from ${CUDA_FILE}..." +PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" +echo "Done!" 
+rm -f "${CUDA_FILE}" diff --git a/.github/workflows/cuda/cu116-Linux-env.sh b/.github/workflows/cuda/cu116-Linux-env.sh new file mode 100644 index 00000000..fe318942 --- /dev/null +++ b/.github/workflows/cuda/cu116-Linux-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/usr/local/cuda-11.6 +LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} +PATH=${CUDA_HOME}/bin:${PATH} + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" diff --git a/.github/workflows/cuda/cu116-Linux.sh b/.github/workflows/cuda/cu116-Linux.sh new file mode 100755 index 00000000..28cda58d --- /dev/null +++ b/.github/workflows/cuda/cu116-Linux.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OS=ubuntu1804 + +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin +sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 +wget -nv https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb +sudo dpkg -i cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb +sudo apt-key add /var/cuda-repo-${OS}-11-6-local/7fa2af80.pub + +sudo apt-get -qq update +sudo apt install cuda-nvcc-11-6 cuda-libraries-dev-11-6 +sudo apt clean + +rm -f cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb diff --git a/.github/workflows/cuda/cu116-Windows-env.sh b/.github/workflows/cuda/cu116-Windows-env.sh new file mode 100644 index 00000000..3a662fb8 --- /dev/null +++ b/.github/workflows/cuda/cu116-Windows-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 +PATH=${CUDA_HOME}/bin:$PATH +PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="6.0+PTX" diff --git a/.github/workflows/cuda/cu116-Windows.sh b/.github/workflows/cuda/cu116-Windows.sh new file 
mode 100755 index 00000000..af329138 --- /dev/null +++ b/.github/workflows/cuda/cu116-Windows.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# TODO We currently use CUDA 11.3 to build CUDA 11.6 Windows wheels + +# Install NVIDIA drivers, see: +# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + +export CUDA_SHORT=11.3 +export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers +export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe + +# Install CUDA: +curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" +echo "" +echo "Installing from ${CUDA_FILE}..." +PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" +echo "Done!" 
+rm -f "${CUDA_FILE}" diff --git a/.github/workflows/cuda/cu117-Linux-env.sh b/.github/workflows/cuda/cu117-Linux-env.sh new file mode 100644 index 00000000..b14f0744 --- /dev/null +++ b/.github/workflows/cuda/cu117-Linux-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/usr/local/cuda-11.7 +LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} +PATH=${CUDA_HOME}/bin:${PATH} + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" diff --git a/.github/workflows/cuda/cu117-Linux.sh b/.github/workflows/cuda/cu117-Linux.sh new file mode 100755 index 00000000..d521965b --- /dev/null +++ b/.github/workflows/cuda/cu117-Linux.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OS=ubuntu2004 + +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin +sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 +wget -nv https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb +sudo dpkg -i cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb +sudo cp /var/cuda-repo-${OS}-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/ + +sudo apt-get -qq update +sudo apt install cuda-nvcc-11-7 cuda-libraries-dev-11-7 +sudo apt clean + +rm -f cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb diff --git a/.github/workflows/cuda/cu117-Windows-env.sh b/.github/workflows/cuda/cu117-Windows-env.sh new file mode 100644 index 00000000..3a662fb8 --- /dev/null +++ b/.github/workflows/cuda/cu117-Windows-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 +PATH=${CUDA_HOME}/bin:$PATH +PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="6.0+PTX" diff --git a/.github/workflows/cuda/cu117-Windows.sh 
b/.github/workflows/cuda/cu117-Windows.sh new file mode 100755 index 00000000..b11a9a03 --- /dev/null +++ b/.github/workflows/cuda/cu117-Windows.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# TODO We currently use CUDA 11.3 to build CUDA 11.7 Windows wheels + +# Install NVIDIA drivers, see: +# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + +export CUDA_SHORT=11.3 +export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers +export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe + +# Install CUDA: +curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" +echo "" +echo "Installing from ${CUDA_FILE}..." +PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" +echo "Done!" 
+rm -f "${CUDA_FILE}" diff --git a/.github/workflows/cuda/cu118-Linux-env.sh b/.github/workflows/cuda/cu118-Linux-env.sh new file mode 100644 index 00000000..18d97b8a --- /dev/null +++ b/.github/workflows/cuda/cu118-Linux-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/usr/local/cuda-11.8 +LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} +PATH=${CUDA_HOME}/bin:${PATH} + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" diff --git a/.github/workflows/cuda/cu118-Linux.sh b/.github/workflows/cuda/cu118-Linux.sh new file mode 100755 index 00000000..46b66e7d --- /dev/null +++ b/.github/workflows/cuda/cu118-Linux.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OS=ubuntu2004 + +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin +sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 +wget -nv https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb +sudo dpkg -i cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb +sudo cp /var/cuda-repo-${OS}-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/ + +sudo apt-get -qq update +sudo apt install cuda-nvcc-11-8 cuda-libraries-dev-11-8 +sudo apt clean + +rm -f cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb diff --git a/.github/workflows/cuda/cu118-Windows-env.sh b/.github/workflows/cuda/cu118-Windows-env.sh new file mode 100644 index 00000000..d0ff04b6 --- /dev/null +++ b/.github/workflows/cuda/cu118-Windows-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.8 +PATH=${CUDA_HOME}/bin:$PATH +PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="6.0+PTX" diff --git a/.github/workflows/cuda/cu118-Windows.sh 
b/.github/workflows/cuda/cu118-Windows.sh new file mode 100755 index 00000000..b82a5a9b --- /dev/null +++ b/.github/workflows/cuda/cu118-Windows.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Install NVIDIA drivers, see: +# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + +export CUDA_SHORT=11.8 +export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers +export CUDA_FILE=cuda_${CUDA_SHORT}.0_522.06_windows.exe + +# Install CUDA: +curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" +echo "" +echo "Installing from ${CUDA_FILE}..." +PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" +echo "Done!" 
+rm -f "${CUDA_FILE}" diff --git a/.github/workflows/cuda/cu121-Linux-env.sh b/.github/workflows/cuda/cu121-Linux-env.sh new file mode 100644 index 00000000..b15ae52d --- /dev/null +++ b/.github/workflows/cuda/cu121-Linux-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/usr/local/cuda-12.1 +LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} +PATH=${CUDA_HOME}/bin:${PATH} + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0" diff --git a/.github/workflows/cuda/cu121-Linux.sh b/.github/workflows/cuda/cu121-Linux.sh new file mode 100755 index 00000000..dd8db12e --- /dev/null +++ b/.github/workflows/cuda/cu121-Linux.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OS=ubuntu2004 + +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin +sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 +wget -nv https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda-repo-${OS}-12-1-local_12.1.1-530.30.02-1_amd64.deb +sudo dpkg -i cuda-repo-${OS}-12-1-local_12.1.1-530.30.02-1_amd64.deb +sudo cp /var/cuda-repo-${OS}-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/ + +sudo apt-get -qq update +sudo apt install cuda-nvcc-12-1 cuda-libraries-dev-12-1 +sudo apt clean + +rm -f cuda-repo-${OS}-12-1-local_12.1.1-530.30.02-1_amd64.deb diff --git a/.github/workflows/cuda/cu121-Windows-env.sh b/.github/workflows/cuda/cu121-Windows-env.sh new file mode 100644 index 00000000..c55a0356 --- /dev/null +++ b/.github/workflows/cuda/cu121-Windows-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v12.1 +PATH=${CUDA_HOME}/bin:$PATH +PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="6.0+PTX" diff --git a/.github/workflows/cuda/cu121-Windows.sh 
b/.github/workflows/cuda/cu121-Windows.sh new file mode 100755 index 00000000..7df067fb --- /dev/null +++ b/.github/workflows/cuda/cu121-Windows.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Install NVIDIA drivers, see: +# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + +export CUDA_SHORT=12.1 +export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.1/local_installers +export CUDA_FILE=cuda_${CUDA_SHORT}.1_531.14_windows.exe + +# Install CUDA: +curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" +echo "" +echo "Installing from ${CUDA_FILE}..." +PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" +echo "Done!" 
+rm -f "${CUDA_FILE}" diff --git a/.github/workflows/cuda/cu124-Linux-env.sh b/.github/workflows/cuda/cu124-Linux-env.sh new file mode 100644 index 00000000..19133e1a --- /dev/null +++ b/.github/workflows/cuda/cu124-Linux-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/usr/local/cuda-12.4 +LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} +PATH=${CUDA_HOME}/bin:${PATH} + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0" diff --git a/.github/workflows/cuda/cu124-Linux.sh b/.github/workflows/cuda/cu124-Linux.sh new file mode 100755 index 00000000..f337d5f5 --- /dev/null +++ b/.github/workflows/cuda/cu124-Linux.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OS=ubuntu2004 + +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin +sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 +wget -nv https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-${OS}-12-4-local_12.4.1-550.54.15-1_amd64.deb +sudo dpkg -i cuda-repo-${OS}-12-4-local_12.4.1-550.54.15-1_amd64.deb +sudo cp /var/cuda-repo-${OS}-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ + +sudo apt-get -qq update +sudo apt install cuda-nvcc-12-4 cuda-libraries-dev-12-4 +sudo apt clean + +rm -f cuda-repo-${OS}-12-4-local_12.4.1-550.54.15-1_amd64.deb diff --git a/.github/workflows/cuda/cu124-Windows-env.sh b/.github/workflows/cuda/cu124-Windows-env.sh new file mode 100644 index 00000000..c47fdbbb --- /dev/null +++ b/.github/workflows/cuda/cu124-Windows-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v12.4 +PATH=${CUDA_HOME}/bin:$PATH +PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="6.0+PTX" diff --git a/.github/workflows/cuda/cu124-Windows.sh 
b/.github/workflows/cuda/cu124-Windows.sh new file mode 100755 index 00000000..b85aeb69 --- /dev/null +++ b/.github/workflows/cuda/cu124-Windows.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Install NVIDIA drivers, see: +# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + +export CUDA_SHORT=12.4 +export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.1/local_installers +export CUDA_FILE=cuda_${CUDA_SHORT}.1_551.78_windows.exe + +# Install CUDA: +curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" +echo "" +echo "Installing from ${CUDA_FILE}..." +PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" +echo "Done!" 
+rm -f "${CUDA_FILE}" diff --git a/.github/workflows/cuda/cu126-Linux-env.sh b/.github/workflows/cuda/cu126-Linux-env.sh new file mode 100644 index 00000000..bad961ef --- /dev/null +++ b/.github/workflows/cuda/cu126-Linux-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/usr/local/cuda-12.6 +LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} +PATH=${CUDA_HOME}/bin:${PATH} + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0" diff --git a/.github/workflows/cuda/cu126-Linux.sh b/.github/workflows/cuda/cu126-Linux.sh new file mode 100755 index 00000000..99759545 --- /dev/null +++ b/.github/workflows/cuda/cu126-Linux.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OS=ubuntu2004 + +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin +sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 +wget -nv https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda-repo-${OS}-12-6-local_12.6.0-560.28.03-1_amd64.deb +sudo dpkg -i cuda-repo-${OS}-12-6-local_12.6.0-560.28.03-1_amd64.deb +sudo cp /var/cuda-repo-${OS}-12-6-local/cuda-*-keyring.gpg /usr/share/keyrings/ + +sudo apt-get -qq update +sudo apt install cuda-nvcc-12-6 cuda-libraries-dev-12-6 +sudo apt clean + +rm -f cuda-repo-${OS}-12-6-local_12.6.0-560.28.03-1_amd64.deb diff --git a/.github/workflows/cuda/cu126-Windows-env.sh b/.github/workflows/cuda/cu126-Windows-env.sh new file mode 100644 index 00000000..c68771bd --- /dev/null +++ b/.github/workflows/cuda/cu126-Windows-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v12.6 +PATH=${CUDA_HOME}/bin:$PATH +PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH + +export FORCE_CUDA=1 +export TORCH_CUDA_ARCH_LIST="6.0+PTX" diff --git a/.github/workflows/cuda/cu126-Windows.sh 
b/.github/workflows/cuda/cu126-Windows.sh new file mode 100755 index 00000000..193d014b --- /dev/null +++ b/.github/workflows/cuda/cu126-Windows.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Install NVIDIA drivers, see: +# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + +export CUDA_SHORT=12.6 +export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers +export CUDA_FILE=cuda_${CUDA_SHORT}.0_560.76_windows.exe + +# Install CUDA: +curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" +echo "" +echo "Installing from ${CUDA_FILE}..." +PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" +echo "Done!" 
+rm -f "${CUDA_FILE}" diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 001145fd..78734f7c 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -1,23 +1,23 @@ name: Linting -on: [push, pull_request] +on: + push: + branches: + - master + pull_request: jobs: flake8: - runs-on: ${{ matrix.os }} - - strategy: - matrix: - os: [ubuntu-latest] - python-version: [3.6] + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} + + - name: Set up Python uses: actions/setup-python@v2 with: - python-version: ${{ matrix.python-version }} + python-version: 3.8 - name: Install dependencies run: | diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index df8ceb07..1f22fd88 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -1,6 +1,10 @@ name: Testing -on: [push, pull_request] +on: + push: + branches: + - master + pull_request: jobs: @@ -8,10 +12,11 @@ jobs: runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: - os: [ubuntu-latest, windows-latest] - python-version: [3.6] - torch-version: [1.9.0, 1.10.0] + os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm] + python-version: [3.9] + torch-version: [2.5.0, 2.6.0] steps: - uses: actions/checkout@v2 @@ -22,24 +27,27 @@ jobs: - name: Install PyTorch ${{ matrix.torch-version }} run: | - pip install torch==${{ matrix.torch-version}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==${{ matrix.torch-version }} --extra-index-url https://download.pytorch.org/whl/cpu - - name: Install main package + - name: Patch PyTorch static constexpr on Windows + if: ${{ runner.os == 'Windows' }} run: | - pip install -e .[test] + Torch_DIR=`python -c 'import os; import torch; print(os.path.dirname(torch.__file__))'` + sed -i '31,38c\ + TORCH_API void lazy_init_num_threads();' ${Torch_DIR}/include/ATen/Parallel.h + shell: bash - - name: Run test-suite 
+ - name: Install main package run: | - python setup.py test + python setup.py develop - - name: Generate coverage report - if: success() + - name: Run test-suite run: | - pip install coverage - coverage xml + pip install pytest pytest-cov + pytest --cov --cov-report=xml - - name: Upload coverage report to codecov - uses: codecov/codecov-action@v1 + - name: Upload coverage + uses: codecov/codecov-action@v4 if: success() with: - file: coverage.xml + fail_ci_if_error: false diff --git a/.gitignore b/.gitignore index b9233c5b..d334126f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ dist/ *.aux *.log *.pdf +*.hip +*_hip.cpp +hip diff --git a/CMakeLists.txt b/CMakeLists.txt index b180c461..e0a5e11f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,11 @@ cmake_minimum_required(VERSION 3.0) project(torchscatter) -set(CMAKE_CXX_STANDARD 14) -set(TORCHSCATTER_VERSION 2.0.9) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(TORCHSCATTER_VERSION 2.1.2) option(WITH_CUDA "Enable CUDA support" OFF) +option(WITH_PYTHON "Link to Python when building" ON) if(WITH_CUDA) enable_language(CUDA) @@ -12,21 +14,27 @@ if(WITH_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") endif() -find_package(Python3 COMPONENTS Development) +if (WITH_PYTHON) + add_definitions(-DWITH_PYTHON) + find_package(Python3 COMPONENTS Development) +endif() find_package(Torch REQUIRED) -file(GLOB HEADERS csrc/scatter.h) +file(GLOB HEADERS csrc/*.h) file(GLOB OPERATOR_SOURCES csrc/cpu/*.h csrc/cpu/*.cpp csrc/*.cpp) if(WITH_CUDA) file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} csrc/cuda/*.h csrc/cuda/*.cu) endif() add_library(${PROJECT_NAME} SHARED ${OPERATOR_SOURCES}) -target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} Python3::Python) +target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES}) +if (WITH_PYTHON) + target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python) +endif() set_target_properties(${PROJECT_NAME} PROPERTIES 
EXPORT_NAME TorchScatter) target_include_directories(${PROJECT_NAME} INTERFACE - $ + "$" $) include(GNUInstallDirs) diff --git a/README.md b/README.md index 527e1b00..59e33b92 100644 --- a/README.md +++ b/README.md @@ -41,52 +41,46 @@ All included operations are broadcastable, work on varying data types, are imple ## Installation -### Anaconda - -**Update:** You can now install `pytorch-scatter` via [Anaconda](https://anaconda.org/pyg/pytorch-scatter) for all major OS/PyTorch/CUDA combinations 🤗 -Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run - -``` -conda install pytorch-scatter -c pyg -``` - ### Binaries -We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl). +We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl). -#### PyTorch 1.10.0 +#### PyTorch 2.6 -To install the binaries for PyTorch 1.10.0, simply run +To install the binaries for PyTorch 2.6.0, simply run ``` -pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html +pip install torch-scatter -f https://data.pyg.org/whl/torch-2.6.0+${CUDA}.html ``` -where `${CUDA}` should be replaced by either `cpu`, `cu102`, or `cu113` depending on your PyTorch installation. +where `${CUDA}` should be replaced by either `cpu`, `cu118`, `cu124`, or `cu126` depending on your PyTorch installation. 
+ +| | `cpu` | `cu118` | `cu124` | `cu126` | +|-------------|-------|---------|---------|---------| +| **Linux** | ✅ | ✅ | ✅ | ✅ | +| **Windows** | ✅ | ✅ | ✅ | ✅ | +| **macOS** | ✅ | | | | -| | `cpu` | `cu102` | `cu113` | -|-------------|-------|---------|---------| -| **Linux** | ✅ | ✅ | ✅ | -| **Windows** | ✅ | ✅ | ✅ | -| **macOS** | ✅ | | | -#### PyTorch 1.9.0/1.9.1 +#### PyTorch 2.5 -To install the binaries for PyTorch 1.9.0 and 1.9.1, simply run +To install the binaries for PyTorch 2.5.0/2.5.1, simply run ``` -pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+${CUDA}.html +pip install torch-scatter -f https://data.pyg.org/whl/torch-2.5.0+${CUDA}.html ``` -where `${CUDA}` should be replaced by either `cpu`, `cu102`, or `cu111` depending on your PyTorch installation. +where `${CUDA}` should be replaced by either `cpu`, `cu118`, `cu121`, or `cu124` depending on your PyTorch installation. -| | `cpu` | `cu102` | `cu111` | -|-------------|-------|---------|---------| -| **Linux** | ✅ | ✅ | ✅ | -| **Windows** | ✅ | ✅ | ✅ | -| **macOS** | ✅ | | | +| | `cpu` | `cu118` | `cu121` | `cu124` | +|-------------|-------|---------|---------|---------| +| **Linux** | ✅ | ✅ | ✅ | ✅ | +| **Windows** | ✅ | ✅ | ✅ | ✅ | +| **macOS** | ✅ | | | | -**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1 and PyTorch 1.8.0/1.8.1 (following the same procedure). +**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1, PyTorch 1.9.0, PyTorch 1.10.0/1.10.1/1.10.2, PyTorch 1.11.0, PyTorch 1.12.0/1.12.1, PyTorch 1.13.0/1.13.1, PyTorch 2.0.0/2.0.1, PyTorch 2.1.0/2.1.1/2.1.2, PyTorch 2.2.0/2.2.1/2.2.2, PyTorch 2.3.0/2.3.1, and PyTorch 2.4.0/2.4.1 (following the same procedure). 
+For older versions, you need to explicitly specify the latest supported version number or install via `pip install --no-index` in order to prevent a manual installation from source. +You can look up the latest supported version number [here](https://data.pyg.org/whl). ### From source @@ -141,18 +135,19 @@ tensor([[5, 5, 3, 4, 0, 1] ## Running tests ``` -python setup.py test +pytest ``` ## C++ API `torch-scatter` also offers a C++ API that contains C++ equivalent of python models. +For this, we need to add `TorchLib` to the `-DCMAKE_PREFIX_PATH` (run `import torch; print(torch.utils.cmake_prefix_path)` to obtain it). ``` mkdir build cd build -# Add -DWITH_CUDA=on support for the CUDA if needed -cmake .. +# Add -DWITH_CUDA=on support for CUDA support +cmake -DCMAKE_PREFIX_PATH="..." .. make make install ``` diff --git a/conda/pytorch-scatter/README.md b/conda/pytorch-scatter/README.md deleted file mode 100644 index f7fe27e8..00000000 --- a/conda/pytorch-scatter/README.md +++ /dev/null @@ -1,3 +0,0 @@ -``` -./build_conda.sh 3.9 1.9.0 cu111 # python, pytorch and cuda version -``` diff --git a/conda/pytorch-scatter/build_conda.sh b/conda/pytorch-scatter/build_conda.sh deleted file mode 100755 index 00f5123c..00000000 --- a/conda/pytorch-scatter/build_conda.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -export PYTHON_VERSION=$1 -export TORCH_VERSION=$2 -export CUDA_VERSION=$3 - -export CONDA_PYTORCH_CONSTRAINT="pytorch==${TORCH_VERSION%.*}.*" - -if [ "${CUDA_VERSION}" = "cpu" ]; then - export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]" -else - case $CUDA_VERSION in - cu113) - export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.3.*" - ;; - cu111) - export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.1.*" - ;; - cu102) - export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.2.*" - ;; - cu101) - export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.1.*" - ;; - *) - echo "Unrecognized CUDA_VERSION=$CUDA_VERSION" - exit 1 - ;; - esac -fi - -echo "PyTorch 
$TORCH_VERSION+$CUDA_VERSION" -echo "- $CONDA_PYTORCH_CONSTRAINT" -echo "- $CONDA_CUDATOOLKIT_CONSTRAINT" - -conda build . -c nvidia -c pytorch -c default -c conda-forge --output-folder "$HOME/conda-bld" diff --git a/conda/pytorch-scatter/meta.yaml b/conda/pytorch-scatter/meta.yaml deleted file mode 100644 index 0b24e406..00000000 --- a/conda/pytorch-scatter/meta.yaml +++ /dev/null @@ -1,37 +0,0 @@ -package: - name: pytorch-scatter - version: 2.0.9 - -source: - path: ../.. - -requirements: - build: - - {{ compiler('c') }} # [win] - - host: - - pip - - python {{ environ.get('PYTHON_VERSION') }} - - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} - - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }} - - run: - - python {{ environ.get('PYTHON_VERSION') }} - - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} - - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }} - -build: - string: py{{ environ.get('PYTHON_VERSION').replace('.', '') }}_torch_{{ environ['TORCH_VERSION'] }}_{{ environ['CUDA_VERSION'] }} - script: pip install . 
- script_env: - - FORCE_CUDA - - TORCH_CUDA_ARCH_LIST - -test: - imports: - - torch_scatter - -about: - home: https://github.com/rusty1s/pytorch_scatter - license: MIT - summary: PyTorch Extension Library of Optimized Scatter Operations diff --git a/csrc/cpu/index_info.h b/csrc/cpu/index_info.h index 9709a1de..5e9ed0b4 100644 --- a/csrc/cpu/index_info.h +++ b/csrc/cpu/index_info.h @@ -1,6 +1,6 @@ #pragma once -#include +#include "../extensions.h" #define MAX_TENSORINFO_DIMS 25 diff --git a/csrc/cpu/scatter_cpu.cpp b/csrc/cpu/scatter_cpu.cpp index 77a43969..64b8ac3b 100644 --- a/csrc/cpu/scatter_cpu.cpp +++ b/csrc/cpu/scatter_cpu.cpp @@ -4,10 +4,10 @@ #include "reducer.h" #include "utils.h" -std::tuple> +std::tuple> scatter_cpu(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size, std::string reduce) { + std::optional optional_out, + std::optional dim_size, std::string reduce) { CHECK_CPU(src); CHECK_CPU(index); if (optional_out.has_value()) @@ -36,7 +36,7 @@ scatter_cpu(torch::Tensor src, torch::Tensor index, int64_t dim, out = torch::empty(sizes, src.options()); } - torch::optional arg_out = torch::nullopt; + std::optional arg_out = std::nullopt; int64_t *arg_out_data = nullptr; if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) { arg_out = torch::full_like(out, src.size(dim), index.options()); @@ -57,7 +57,7 @@ scatter_cpu(torch::Tensor src, torch::Tensor index, int64_t dim, auto N = out.size(dim); auto index_info = getTensorInfo(index); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "scatter_cpu", [&] { auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); diff --git a/csrc/cpu/scatter_cpu.h b/csrc/cpu/scatter_cpu.h index 25122e70..eb279c50 100644 --- a/csrc/cpu/scatter_cpu.h +++ b/csrc/cpu/scatter_cpu.h @@ -1,8 +1,8 @@ #pragma once -#include 
+#include "../extensions.h" -std::tuple> +std::tuple> scatter_cpu(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size, std::string reduce); + std::optional optional_out, + std::optional dim_size, std::string reduce); diff --git a/csrc/cpu/segment_coo_cpu.cpp b/csrc/cpu/segment_coo_cpu.cpp index 8497301f..64f975dd 100644 --- a/csrc/cpu/segment_coo_cpu.cpp +++ b/csrc/cpu/segment_coo_cpu.cpp @@ -3,11 +3,12 @@ #include "index_info.h" #include "reducer.h" #include "utils.h" +#include -std::tuple> +std::tuple> segment_coo_cpu(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size, std::string reduce) { + std::optional optional_out, + std::optional dim_size, std::string reduce) { CHECK_CPU(src); CHECK_CPU(index); if (optional_out.has_value()) @@ -44,7 +45,7 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index, out = torch::empty(sizes, src.options()); } - torch::optional arg_out = torch::nullopt; + std::optional arg_out = std::nullopt; int64_t *arg_out_data = nullptr; if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) { arg_out = torch::full_like(out, src.size(dim), index.options()); @@ -69,12 +70,13 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index, auto index_info = getTensorInfo(index); auto stride = index_info.strides[index_info.dims - 1]; std::vector args(K); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "segment_coo_cpu", [&] { + using opmath_t = at::opmath_type; auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); scalar_t *count_data = nullptr; - std::vector vals(K); + std::vector vals(K); int64_t idx, next_idx, row_start; AT_DISPATCH_REDUCTION_TYPES(reduce, [&] { if (!optional_out.has_value()) @@ -87,19 +89,19 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index, idx = 
index_info.data[offset]; for (auto k = 0; k < K; k++) - vals[k] = out_data[b * N * K + k]; + vals[k] = static_cast(out_data[b * N * K + k]); row_start = 0; for (auto e = 0; e < E; e++) { for (auto k = 0; k < K; k++) - Reducer::update( - &vals[k], src_data[b * E * K + e * K + k], &args[k], e); + Reducer::update( + &vals[k], static_cast(src_data[b * E * K + e * K + k]), &args[k], e); if (e == E - 1) { for (auto k = 0; k < K; k++) Reducer::write( - out_data + b * N * K + idx * K + k, vals[k], + out_data + b * N * K + idx * K + k, static_cast(vals[k]), arg_out_data + b * N * K + idx * K + k, args[k], e + 1 - row_start); if (REDUCE == MEAN) @@ -111,11 +113,11 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index, if (idx != next_idx) { for (auto k = 0; k < K; k++) { Reducer::write( - out_data + b * N * K + idx * K + k, vals[k], + out_data + b * N * K + idx * K + k, static_cast(vals[k]), arg_out_data + b * N * K + idx * K + k, args[k], e + 1 - row_start); - vals[k] = out_data[b * N * K + next_idx * K + k]; + vals[k] = static_cast(out_data[b * N * K + next_idx * K + k]); } if (REDUCE == MEAN) count_data[b * N + idx] = (scalar_t)(e + 1 - row_start); @@ -139,7 +141,7 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index, } torch::Tensor gather_coo_cpu(torch::Tensor src, torch::Tensor index, - torch::optional optional_out) { + std::optional optional_out) { CHECK_CPU(src); CHECK_CPU(index); if (optional_out.has_value()) @@ -178,7 +180,7 @@ torch::Tensor gather_coo_cpu(torch::Tensor src, torch::Tensor index, auto index_info = getTensorInfo(index); auto stride = index_info.strides[index_info.dims - 1]; - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "gather_coo_cpu", [&] { auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); diff --git a/csrc/cpu/segment_coo_cpu.h b/csrc/cpu/segment_coo_cpu.h index feb7a827..e0501209 100644 --- 
a/csrc/cpu/segment_coo_cpu.h +++ b/csrc/cpu/segment_coo_cpu.h @@ -1,11 +1,11 @@ #pragma once -#include +#include "../extensions.h" -std::tuple> +std::tuple> segment_coo_cpu(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size, std::string reduce); + std::optional optional_out, + std::optional dim_size, std::string reduce); torch::Tensor gather_coo_cpu(torch::Tensor src, torch::Tensor index, - torch::optional optional_out); + std::optional optional_out); diff --git a/csrc/cpu/segment_csr_cpu.cpp b/csrc/cpu/segment_csr_cpu.cpp index a826192c..18be4023 100644 --- a/csrc/cpu/segment_csr_cpu.cpp +++ b/csrc/cpu/segment_csr_cpu.cpp @@ -3,10 +3,11 @@ #include "index_info.h" #include "reducer.h" #include "utils.h" +#include -std::tuple> +std::tuple> segment_csr_cpu(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out, + std::optional optional_out, std::string reduce) { CHECK_CPU(src); CHECK_CPU(indptr); @@ -37,7 +38,7 @@ segment_csr_cpu(torch::Tensor src, torch::Tensor indptr, out = torch::empty(sizes, src.options()); } - torch::optional arg_out = torch::nullopt; + std::optional arg_out = std::nullopt; int64_t *arg_out_data = nullptr; if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) { arg_out = torch::full(out.sizes(), src.size(dim), indptr.options()); @@ -57,11 +58,12 @@ segment_csr_cpu(torch::Tensor src, torch::Tensor indptr, auto indptr_info = getTensorInfo(indptr); auto stride = indptr_info.strides[indptr_info.dims - 1]; std::vector args(K); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "segment_csr_cpu", [&] { + using opmath_t = at::opmath_type; auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); - std::vector vals(K); + std::vector vals(K); int64_t row_start, row_end; AT_DISPATCH_REDUCTION_TYPES(reduce, [&] { for (auto n = 0; n < N; n++) { @@ 
-71,15 +73,15 @@ segment_csr_cpu(torch::Tensor src, torch::Tensor indptr, offset = (n / (indptr.size(-1) - 1)) * E * K; for (auto k = 0; k < K; k++) - vals[k] = Reducer::init(); + vals[k] = Reducer::init(); for (auto e = row_start; e < row_end; e++) for (auto k = 0; k < K; k++) - Reducer::update( - &vals[k], src_data[offset + e * K + k], &args[k], e); + Reducer::update( + &vals[k], static_cast(src_data[offset + e * K + k]), &args[k], e); for (auto k = 0; k < K; k++) - Reducer::write(out_data + n * K + k, vals[k], + Reducer::write(out_data + n * K + k, static_cast(vals[k]), arg_out_data + n * K + k, args[k], row_end - row_start); } @@ -90,7 +92,7 @@ segment_csr_cpu(torch::Tensor src, torch::Tensor indptr, } torch::Tensor gather_csr_cpu(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out) { + std::optional optional_out) { CHECK_CPU(src); CHECK_CPU(indptr); if (optional_out.has_value()) @@ -135,7 +137,7 @@ torch::Tensor gather_csr_cpu(torch::Tensor src, torch::Tensor indptr, auto indptr_info = getTensorInfo(indptr); auto stride = indptr_info.strides[indptr_info.dims - 1]; - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "gather_csr_cpu", [&] { auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); diff --git a/csrc/cpu/segment_csr_cpu.h b/csrc/cpu/segment_csr_cpu.h index b93d450b..7b4da0a2 100644 --- a/csrc/cpu/segment_csr_cpu.h +++ b/csrc/cpu/segment_csr_cpu.h @@ -1,11 +1,11 @@ #pragma once -#include +#include "../extensions.h" -std::tuple> +std::tuple> segment_csr_cpu(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out, + std::optional optional_out, std::string reduce); torch::Tensor gather_csr_cpu(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out); + std::optional optional_out); diff --git a/csrc/cpu/utils.h b/csrc/cpu/utils.h index 40dfb344..66ae38bf 100644 --- 
a/csrc/cpu/utils.h +++ b/csrc/cpu/utils.h @@ -1,6 +1,6 @@ #pragma once -#include +#include "../extensions.h" #define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor") #define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch") diff --git a/csrc/cuda/atomics.cuh b/csrc/cuda/atomics.cuh index 8a7c4724..16b24fc0 100644 --- a/csrc/cuda/atomics.cuh +++ b/csrc/cuda/atomics.cuh @@ -68,8 +68,8 @@ \ template struct Atomic##NAME##DecimalImpl; \ \ - template struct Atomic##NAME##DecimalImpl { \ - inline __device__ void operator()(scalar *address, scalar val) { \ + template <> struct Atomic##NAME##DecimalImpl { \ + inline __device__ void operator()(at::Half *address, at::Half val) { \ unsigned int *address_as_ui = \ (unsigned int *)((char *)address - ((size_t)address & 2)); \ unsigned int old = *address_as_ui; \ @@ -87,6 +87,25 @@ } \ }; \ \ + template <> struct Atomic##NAME##DecimalImpl { \ + inline __device__ void operator()(at::BFloat16 *address, at::BFloat16 val){\ + unsigned int *address_as_ui = \ + (unsigned int *)((char *)address - ((size_t)address & 2)); \ + unsigned int old = *address_as_ui; \ + unsigned int assumed; \ + \ + do { \ + assumed = old; \ + at::BFloat16 hsum; \ + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); \ + hsum = OP(hsum, val); \ + old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) \ + : (old & 0xffff0000) | hsum.x; \ + old = atomicCAS(address_as_ui, assumed, old); \ + } while (assumed != old); \ + } \ + }; \ + \ template struct Atomic##NAME##DecimalImpl { \ inline __device__ void operator()(scalar *address, scalar val) { \ int *address_as_i = (int *)address; \ @@ -135,7 +154,7 @@ static inline __device__ void atomAdd(int32_t *address, int32_t val) { static inline __device__ void atomAdd(int64_t *address, int64_t val) { AtomicAddIntegerImpl()(address, val); } -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || CUDA_VERSION < 10000) +#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || CUDA_VERSION < 10000)) static inline __device__ void atomAdd(at::Half *address, at::Half val) { AtomicAddDecimalImpl()(address, val); } @@ -156,6 +175,9 @@ static inline __device__ void atomAdd(double *address, double val) { atomicAdd(address, val); } #endif +static inline __device__ void atomAdd(at::BFloat16 *address, at::BFloat16 val) { + AtomicAddDecimalImpl()(address, val); +} #define OP(X, Y) Y *X ATOMIC(Mul) @@ -184,6 +206,9 @@ static inline __device__ void atomMul(at::Half *address, at::Half val) { static inline __device__ void atomMul(double *address, double val) { AtomicMulDecimalImpl()(address, val); } +static inline __device__ void atomMul(at::BFloat16 *address, at::BFloat16 val) { + AtomicMulDecimalImpl()(address, val); +} #define OP(X, Y) Y / X ATOMIC(Div) @@ -212,6 +237,9 @@ static inline __device__ void atomDiv(float *address, float val) { static inline __device__ void atomDiv(double *address, double val) { AtomicDivDecimalImpl()(address, val); } +static inline __device__ void atomDiv(at::BFloat16 *address, at::BFloat16 val) { + AtomicDivDecimalImpl()(address, val); +} #define OP(X, Y) max(Y, X) ATOMIC(Max) @@ -240,6 +268,9 @@ static inline __device__ void atomMax(float *address, float val) { static inline __device__ void atomMax(double *address, double val) { 
AtomicMaxDecimalImpl()(address, val); } +static inline __device__ void atomMax(at::BFloat16 *address, at::BFloat16 val) { + AtomicMaxDecimalImpl()(address, val); +} #define OP(X, Y) min(Y, X) ATOMIC(Min) @@ -268,3 +299,6 @@ static inline __device__ void atomMin(float *address, float val) { static inline __device__ void atomMin(double *address, double val) { AtomicMinDecimalImpl()(address, val); } +static inline __device__ void atomMin(at::BFloat16 *address, at::BFloat16 val) { + AtomicMinDecimalImpl()(address, val); +} diff --git a/csrc/cuda/reducer.cuh b/csrc/cuda/reducer.cuh index 8c851d20..577f4286 100644 --- a/csrc/cuda/reducer.cuh +++ b/csrc/cuda/reducer.cuh @@ -16,27 +16,27 @@ const std::map reduce2REDUCE = { [&] { \ switch (reduce2REDUCE.at(reduce)) { \ case SUM: { \ - const ReductionType REDUCE = SUM; \ + static constexpr ReductionType REDUCE = SUM; \ return __VA_ARGS__(); \ } \ case MEAN: { \ - const ReductionType REDUCE = MEAN; \ + static constexpr ReductionType REDUCE = MEAN; \ return __VA_ARGS__(); \ } \ case MUL: { \ - const ReductionType REDUCE = MUL; \ + static constexpr ReductionType REDUCE = MUL; \ return __VA_ARGS__(); \ } \ case DIV: { \ - const ReductionType REDUCE = DIV; \ + static constexpr ReductionType REDUCE = DIV; \ return __VA_ARGS__(); \ } \ case MIN: { \ - const ReductionType REDUCE = MIN; \ + static constexpr ReductionType REDUCE = MIN; \ return __VA_ARGS__(); \ } \ case MAX: { \ - const ReductionType REDUCE = MAX; \ + static constexpr ReductionType REDUCE = MAX; \ return __VA_ARGS__(); \ } \ } \ diff --git a/csrc/cuda/scatter_cuda.cu b/csrc/cuda/scatter_cuda.cu index bcf5d7eb..83db7ccd 100644 --- a/csrc/cuda/scatter_cuda.cu +++ b/csrc/cuda/scatter_cuda.cu @@ -7,7 +7,7 @@ #include "reducer.cuh" #include "utils.cuh" -#define THREADS 1024 +#define THREADS 256 #define BLOCKS(N) (N + THREADS - 1) / THREADS template @@ -55,15 +55,15 @@ scatter_arg_kernel(const scalar_t *src_data, } } -std::tuple> +std::tuple> scatter_cuda(torch::Tensor src, 
torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size, std::string reduce) { + std::optional optional_out, + std::optional dim_size, std::string reduce) { CHECK_CUDA(src); CHECK_CUDA(index); if (optional_out.has_value()) CHECK_CUDA(optional_out.value()); - cudaSetDevice(src.get_device()); + c10::cuda::MaybeSetDevice(src.get_device()); CHECK_INPUT(src.dim() == index.dim()); for (auto i = 0; i < index.dim() - 1; i++) @@ -89,7 +89,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim, out = torch::empty(sizes, src.options()); } - torch::optional arg_out = torch::nullopt; + std::optional arg_out = std::nullopt; int64_t *arg_out_data = nullptr; if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) { arg_out = torch::full_like(out, src.size(dim), index.options()); @@ -111,7 +111,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim, auto index_info = at::cuda::detail::getTensorInfo(index); auto stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] { auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); diff --git a/csrc/cuda/scatter_cuda.h b/csrc/cuda/scatter_cuda.h index 95c80642..a0496793 100644 --- a/csrc/cuda/scatter_cuda.h +++ b/csrc/cuda/scatter_cuda.h @@ -1,8 +1,8 @@ #pragma once -#include +#include "../extensions.h" -std::tuple> +std::tuple> scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size, std::string reduce); + std::optional optional_out, + std::optional dim_size, std::string reduce); diff --git a/csrc/cuda/segment_coo_cuda.cu b/csrc/cuda/segment_coo_cuda.cu index ad30f972..29610dfc 100644 --- a/csrc/cuda/segment_coo_cuda.cu +++ b/csrc/cuda/segment_coo_cuda.cu @@ -36,8 +36,8 @@ segment_coo_kernel(const scalar_t 
*src_data, #pragma unroll for (int i = 1; i < 32; i *= 2) { // Parallel reduction inside a single warp. - tmp = __shfl_up_sync(FULL_MASK, val, i); - next_idx = __shfl_up_sync(FULL_MASK, idx, i); + tmp = SHFL_UP_SYNC(FULL_MASK, val, i); + next_idx = SHFL_UP_SYNC(FULL_MASK, idx, i); if (lane_idx >= i && row_idx / D == (row_idx - i) / D) { assert(idx >= next_idx); if (idx == next_idx) @@ -45,7 +45,7 @@ segment_coo_kernel(const scalar_t *src_data, } } - next_idx = __shfl_down_sync(FULL_MASK, idx, 1); + next_idx = SHFL_DOWN_SYNC(FULL_MASK, idx, 1); if (lane_idx == 32 - 1 || row_idx / D != (row_idx + 1) / D || idx != next_idx) Reducer::atomic_write(out_data + out_idx, val); @@ -149,15 +149,15 @@ __global__ void segment_coo_arg_broadcast_kernel( } } -std::tuple> +std::tuple> segment_coo_cuda(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size, std::string reduce) { + std::optional optional_out, + std::optional dim_size, std::string reduce) { CHECK_CUDA(src); CHECK_CUDA(index); if (optional_out.has_value()) CHECK_CUDA(optional_out.value()); - cudaSetDevice(src.get_device()); + c10::cuda::MaybeSetDevice(src.get_device()); CHECK_INPUT(src.dim() >= index.dim()); @@ -191,7 +191,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index, out = torch::zeros(sizes, src.options()); } - torch::optional arg_out = torch::nullopt; + std::optional arg_out = std::nullopt; int64_t *arg_out_data = nullptr; if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) { arg_out = torch::full_like(out, src.size(dim), index.options()); @@ -214,7 +214,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index, auto index_info = at::cuda::detail::getTensorInfo(index); auto stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] { auto src_data = src.data_ptr(); 
auto out_data = out.data_ptr(); @@ -325,12 +325,12 @@ __global__ void gather_coo_broadcast_kernel( } torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index, - torch::optional optional_out) { + std::optional optional_out) { CHECK_CUDA(src); CHECK_CUDA(index); if (optional_out.has_value()) CHECK_CUDA(optional_out.value()); - cudaSetDevice(src.get_device()); + c10::cuda::MaybeSetDevice(src.get_device()); CHECK_INPUT(src.dim() >= index.dim()); @@ -365,7 +365,7 @@ torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index, auto index_info = at::cuda::detail::getTensorInfo(index); auto stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] { auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); diff --git a/csrc/cuda/segment_coo_cuda.h b/csrc/cuda/segment_coo_cuda.h index 68154775..93f2aee6 100644 --- a/csrc/cuda/segment_coo_cuda.h +++ b/csrc/cuda/segment_coo_cuda.h @@ -1,11 +1,11 @@ #pragma once -#include +#include "../extensions.h" -std::tuple> +std::tuple> segment_coo_cuda(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size, std::string reduce); + std::optional optional_out, + std::optional dim_size, std::string reduce); torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index, - torch::optional optional_out); + std::optional optional_out); diff --git a/csrc/cuda/segment_csr_cuda.cu b/csrc/cuda/segment_csr_cuda.cu index d88f3250..7061ba69 100644 --- a/csrc/cuda/segment_csr_cuda.cu +++ b/csrc/cuda/segment_csr_cuda.cu @@ -46,9 +46,9 @@ segment_csr_kernel(const scalar_t *src_data, for (int i = TB / 2; i > 0; i /= 2) { // Parallel reduction inside a single warp. 
if (REDUCE == MIN || REDUCE == MAX) - arg_tmp = __shfl_down_sync(FULL_MASK, arg, i); + arg_tmp = SHFL_DOWN_SYNC(FULL_MASK, arg, i); Reducer::update( - &val, __shfl_down_sync(FULL_MASK, val, i), &arg, arg_tmp); + &val, SHFL_DOWN_SYNC(FULL_MASK, val, i), &arg, arg_tmp); } if (lane_idx == 0) { @@ -94,15 +94,15 @@ __global__ void segment_csr_broadcast_kernel( } } -std::tuple> +std::tuple> segment_csr_cuda(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out, + std::optional optional_out, std::string reduce) { CHECK_CUDA(src); CHECK_CUDA(indptr); if (optional_out.has_value()) CHECK_CUDA(optional_out.value()); - cudaSetDevice(src.get_device()); + c10::cuda::MaybeSetDevice(src.get_device()); CHECK_INPUT(src.dim() >= indptr.dim()); @@ -128,7 +128,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr, out = torch::empty(sizes, src.options()); } - torch::optional arg_out = torch::nullopt; + std::optional arg_out = std::nullopt; int64_t *arg_out_data = nullptr; if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) { arg_out = torch::full(out.sizes(), src.size(dim), indptr.options()); @@ -147,7 +147,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr, auto indptr_info = at::cuda::detail::getTensorInfo(indptr); auto stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] { auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); @@ -217,12 +217,12 @@ __global__ void gather_csr_broadcast_kernel( } torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out) { + std::optional optional_out) { CHECK_CUDA(src); CHECK_CUDA(indptr); if (optional_out.has_value()) CHECK_CUDA(optional_out.value()); - cudaSetDevice(src.get_device()); + c10::cuda::MaybeSetDevice(src.get_device()); CHECK_INPUT(src.dim() >= 
indptr.dim()); @@ -264,7 +264,7 @@ torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr, auto indptr_info = at::cuda::detail::getTensorInfo(indptr); auto stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] { auto src_data = src.data_ptr(); auto out_data = out.data_ptr(); diff --git a/csrc/cuda/segment_csr_cuda.h b/csrc/cuda/segment_csr_cuda.h index 5f8bd40e..8014e766 100644 --- a/csrc/cuda/segment_csr_cuda.h +++ b/csrc/cuda/segment_csr_cuda.h @@ -1,11 +1,11 @@ #pragma once -#include +#include "../extensions.h" -std::tuple> +std::tuple> segment_csr_cuda(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out, + std::optional optional_out, std::string reduce); torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out); + std::optional optional_out); diff --git a/csrc/cuda/utils.cuh b/csrc/cuda/utils.cuh index d08b0536..396b4fa1 100644 --- a/csrc/cuda/utils.cuh +++ b/csrc/cuda/utils.cuh @@ -1,6 +1,6 @@ #pragma once -#include +#include "../extensions.h" #define CHECK_CUDA(x) \ AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor") @@ -9,11 +9,32 @@ __device__ __inline__ at::Half __shfl_up_sync(const unsigned mask, const at::Half var, const unsigned int delta) { - return __shfl_up_sync(mask, (__half)var, delta); + return __shfl_up_sync(mask, var.operator __half(), delta); } __device__ __inline__ at::Half __shfl_down_sync(const unsigned mask, const at::Half var, const unsigned int delta) { - return __shfl_down_sync(mask, (__half)var, delta); + return __shfl_down_sync(mask, var.operator __half(), delta); } + +__device__ __inline__ at::Half __shfl_up(const at::Half var, + const unsigned int delta) { + return __shfl_up(var.operator __half(), delta); +} + +__device__ __inline__ at::Half __shfl_down(const at::Half var, + 
const unsigned int delta) { + return __shfl_down(var.operator __half(), delta); +} + +#ifdef USE_ROCM +__device__ __inline__ at::Half __ldg(const at::Half* ptr) { + return __ldg(reinterpret_cast(ptr)); +} +#define SHFL_UP_SYNC(mask, var, delta) __shfl_up(var, delta) +#define SHFL_DOWN_SYNC(mask, var, delta) __shfl_down(var, delta) +#else +#define SHFL_UP_SYNC __shfl_up_sync +#define SHFL_DOWN_SYNC __shfl_down_sync +#endif diff --git a/csrc/extensions.h b/csrc/extensions.h new file mode 100644 index 00000000..91c4df1a --- /dev/null +++ b/csrc/extensions.h @@ -0,0 +1,2 @@ +#include "macros.h" +#include diff --git a/csrc/macros.h b/csrc/macros.h new file mode 100644 index 00000000..d55e6236 --- /dev/null +++ b/csrc/macros.h @@ -0,0 +1,21 @@ +#pragma once + +#ifdef _WIN32 +#if defined(torchscatter_EXPORTS) +#define SCATTER_API __declspec(dllexport) +#else +#define SCATTER_API __declspec(dllimport) +#endif +#else +#define SCATTER_API +#endif + +#if (defined __cpp_inline_variables) || __cplusplus >= 201703L +#define SCATTER_INLINE_VARIABLE inline +#else +#ifdef _MSC_VER +#define SCATTER_INLINE_VARIABLE __declspec(selectany) +#else +#define SCATTER_INLINE_VARIABLE __attribute__((weak)) +#endif +#endif diff --git a/csrc/scatter.cpp b/csrc/scatter.cpp index 3a418ab3..0c986c00 100644 --- a/csrc/scatter.cpp +++ b/csrc/scatter.cpp @@ -1,7 +1,11 @@ +#ifdef WITH_PYTHON #include +#endif + #include #include "cpu/scatter_cpu.h" +#include "macros.h" #include "utils.h" #ifdef WITH_CUDA @@ -9,12 +13,14 @@ #endif #ifdef _WIN32 +#ifdef WITH_PYTHON #ifdef WITH_CUDA PyMODINIT_FUNC PyInit__scatter_cuda(void) { return NULL; } #else PyMODINIT_FUNC PyInit__scatter_cpu(void) { return NULL; } #endif #endif +#endif torch::Tensor broadcast(torch::Tensor src, torch::Tensor other, int64_t dim) { if (src.dim() == 1) @@ -26,10 +32,10 @@ torch::Tensor broadcast(torch::Tensor src, torch::Tensor other, int64_t dim) { return src; } -std::tuple> +std::tuple> scatter_fw(torch::Tensor src, torch::Tensor 
index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size, std::string reduce) { + std::optional optional_out, + std::optional dim_size, std::string reduce) { if (src.device().is_cuda()) { #ifdef WITH_CUDA return scatter_cuda(src, index, dim, optional_out, dim_size, reduce); @@ -49,8 +55,8 @@ class ScatterSum : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { dim = dim < 0 ? src.dim() + dim : dim; ctx->saved_data["dim"] = dim; ctx->saved_data["src_shape"] = src.sizes(); @@ -78,8 +84,8 @@ class ScatterMul : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { dim = dim < 0 ? src.dim() + dim : dim; ctx->saved_data["dim"] = dim; ctx->saved_data["src_shape"] = src.sizes(); @@ -110,8 +116,8 @@ class ScatterMean : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { dim = dim < 0 ? src.dim() + dim : dim; ctx->saved_data["dim"] = dim; ctx->saved_data["src_shape"] = src.sizes(); @@ -125,7 +131,7 @@ class ScatterMean : public torch::autograd::Function { auto ones = torch::ones(old_index.sizes(), src.options()); result = scatter_fw(ones, old_index, old_index.dim() <= dim ? 
old_index.dim() - 1 : dim, - torch::nullopt, out.size(dim), "sum"); + std::nullopt, out.size(dim), "sum"); auto count = std::get<0>(result); count.masked_fill_(count < 1, 1); count = broadcast(count, out, dim); @@ -158,8 +164,8 @@ class ScatterMin : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { dim = dim < 0 ? src.dim() + dim : dim; ctx->saved_data["dim"] = dim; ctx->saved_data["src_shape"] = src.sizes(); @@ -194,8 +200,8 @@ class ScatterMax : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { dim = dim < 0 ? src.dim() + dim : dim; ctx->saved_data["dim"] = dim; ctx->saved_data["src_shape"] = src.sizes(); @@ -226,36 +232,39 @@ class ScatterMax : public torch::autograd::Function { } }; -torch::Tensor scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { +SCATTER_API torch::Tensor +scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim, + std::optional optional_out, + std::optional dim_size) { return ScatterSum::apply(src, index, dim, optional_out, dim_size)[0]; } -torch::Tensor scatter_mul(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { +SCATTER_API torch::Tensor +scatter_mul(torch::Tensor src, torch::Tensor index, int64_t dim, + std::optional optional_out, + std::optional dim_size) { return ScatterMul::apply(src, index, dim, optional_out, dim_size)[0]; } -torch::Tensor scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { +SCATTER_API 
torch::Tensor +scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim, + std::optional optional_out, + std::optional dim_size) { return ScatterMean::apply(src, index, dim, optional_out, dim_size)[0]; } -std::tuple +SCATTER_API std::tuple scatter_min(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { auto result = ScatterMin::apply(src, index, dim, optional_out, dim_size); return std::make_tuple(result[0], result[1]); } -std::tuple +SCATTER_API std::tuple scatter_max(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { auto result = ScatterMax::apply(src, index, dim, optional_out, dim_size); return std::make_tuple(result[0], result[1]); } diff --git a/csrc/scatter.h b/csrc/scatter.h index 629c96f4..f5d9eaaf 100644 --- a/csrc/scatter.h +++ b/csrc/scatter.h @@ -1,61 +1,80 @@ #pragma once -#include +#include "extensions.h" -int64_t cuda_version(); +namespace scatter { +SCATTER_API int64_t cuda_version() noexcept; -torch::Tensor scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size); +namespace detail { +SCATTER_INLINE_VARIABLE int64_t _cuda_version = cuda_version(); +} // namespace detail +} // namespace scatter -torch::Tensor scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size); +SCATTER_API torch::Tensor +scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim, + std::optional optional_out, + std::optional dim_size); -std::tuple +SCATTER_API torch::Tensor +scatter_mul(torch::Tensor src, torch::Tensor index, int64_t dim, + std::optional optional_out, + std::optional dim_size); + +SCATTER_API torch::Tensor +scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim, + 
std::optional optional_out, + std::optional dim_size); + +SCATTER_API std::tuple scatter_min(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size); + std::optional optional_out, + std::optional dim_size); -std::tuple +SCATTER_API std::tuple scatter_max(torch::Tensor src, torch::Tensor index, int64_t dim, - torch::optional optional_out, - torch::optional dim_size); + std::optional optional_out, + std::optional dim_size); -torch::Tensor segment_sum_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size); +SCATTER_API torch::Tensor +segment_sum_coo(torch::Tensor src, torch::Tensor index, + std::optional optional_out, + std::optional dim_size); -torch::Tensor segment_mean_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size); +SCATTER_API torch::Tensor +segment_mean_coo(torch::Tensor src, torch::Tensor index, + std::optional optional_out, + std::optional dim_size); -std::tuple +SCATTER_API std::tuple segment_min_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size); + std::optional optional_out, + std::optional dim_size); -std::tuple +SCATTER_API std::tuple segment_max_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size); + std::optional optional_out, + std::optional dim_size); -torch::Tensor gather_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out); +SCATTER_API torch::Tensor +gather_coo(torch::Tensor src, torch::Tensor index, + std::optional optional_out); -torch::Tensor segment_sum_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out); +SCATTER_API torch::Tensor +segment_sum_csr(torch::Tensor src, torch::Tensor indptr, + std::optional optional_out); -torch::Tensor segment_mean_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out); +SCATTER_API 
torch::Tensor +segment_mean_csr(torch::Tensor src, torch::Tensor indptr, + std::optional optional_out); -std::tuple +SCATTER_API std::tuple segment_min_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out); + std::optional optional_out); -std::tuple +SCATTER_API std::tuple segment_max_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out); + std::optional optional_out); -torch::Tensor gather_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out); +SCATTER_API torch::Tensor +gather_csr(torch::Tensor src, torch::Tensor indptr, + std::optional optional_out); diff --git a/csrc/segment_coo.cpp b/csrc/segment_coo.cpp index 234f3ee4..e59877a8 100644 --- a/csrc/segment_coo.cpp +++ b/csrc/segment_coo.cpp @@ -1,7 +1,11 @@ +#ifdef WITH_PYTHON #include +#endif + #include #include "cpu/segment_coo_cpu.h" +#include "macros.h" #include "utils.h" #ifdef WITH_CUDA @@ -9,17 +13,19 @@ #endif #ifdef _WIN32 +#ifdef WITH_PYTHON #ifdef WITH_CUDA PyMODINIT_FUNC PyInit__segment_coo_cuda(void) { return NULL; } #else PyMODINIT_FUNC PyInit__segment_coo_cpu(void) { return NULL; } #endif #endif +#endif -std::tuple> +std::tuple> segment_coo_fw(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size, std::string reduce) { + std::optional optional_out, + std::optional dim_size, std::string reduce) { if (src.device().is_cuda()) { #ifdef WITH_CUDA return segment_coo_cuda(src, index, optional_out, dim_size, reduce); @@ -32,7 +38,7 @@ segment_coo_fw(torch::Tensor src, torch::Tensor index, } torch::Tensor gather_coo_fw(torch::Tensor src, torch::Tensor index, - torch::optional optional_out) { + std::optional optional_out) { if (src.device().is_cuda()) { #ifdef WITH_CUDA return gather_coo_cuda(src, index, optional_out); @@ -52,8 +58,8 @@ class SegmentSumCOO : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, - 
torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { ctx->saved_data["src_shape"] = src.sizes(); auto result = segment_coo_fw(src, index, optional_out, dim_size, "sum"); auto out = std::get<0>(result); @@ -78,8 +84,8 @@ class SegmentMeanCOO : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { ctx->saved_data["src_shape"] = src.sizes(); auto result = segment_coo_fw(src, index, optional_out, dim_size, "mean"); auto out = std::get<0>(result); @@ -98,7 +104,7 @@ class SegmentMeanCOO : public torch::autograd::Function { auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList()); auto grad_in = torch::empty(src_shape, grad_out.options()); gather_coo_fw(grad_out, index, grad_in); - count = gather_coo_fw(count, index, torch::nullopt); + count = gather_coo_fw(count, index, std::nullopt); for (auto i = 0; i < grad_out.dim() - index.dim(); i++) count = count.unsqueeze(-1); grad_in.true_divide_(count); @@ -110,8 +116,8 @@ class SegmentMinCOO : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { ctx->saved_data["src_shape"] = src.sizes(); auto result = segment_coo_fw(src, index, optional_out, dim_size, "min"); auto out = std::get<0>(result); @@ -142,8 +148,8 @@ class SegmentMaxCOO : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { ctx->saved_data["src_shape"] = src.sizes(); auto result = segment_coo_fw(src, index, optional_out, 
dim_size, "max"); auto out = std::get<0>(result); @@ -174,7 +180,7 @@ class GatherCOO : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable index, - torch::optional optional_out) { + std::optional optional_out) { ctx->saved_data["src_shape"] = src.sizes(); auto out = gather_coo_fw(src, index, optional_out); ctx->save_for_backward({index}); @@ -190,41 +196,44 @@ class GatherCOO : public torch::autograd::Function { auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList()); auto grad_in = torch::zeros(src_shape, grad_out.options()); - segment_coo_fw(grad_out, index, grad_in, torch::nullopt, "sum"); + segment_coo_fw(grad_out, index, grad_in, std::nullopt, "sum"); return {grad_in, Variable(), Variable()}; } }; -torch::Tensor segment_sum_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size) { +SCATTER_API torch::Tensor +segment_sum_coo(torch::Tensor src, torch::Tensor index, + std::optional optional_out, + std::optional dim_size) { return SegmentSumCOO::apply(src, index, optional_out, dim_size)[0]; } -torch::Tensor segment_mean_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size) { +SCATTER_API torch::Tensor +segment_mean_coo(torch::Tensor src, torch::Tensor index, + std::optional optional_out, + std::optional dim_size) { return SegmentMeanCOO::apply(src, index, optional_out, dim_size)[0]; } -std::tuple +SCATTER_API std::tuple segment_min_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, + std::optional dim_size) { auto result = SegmentMinCOO::apply(src, index, optional_out, dim_size); return std::make_tuple(result[0], result[1]); } -std::tuple +SCATTER_API std::tuple segment_max_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out, - torch::optional dim_size) { + std::optional optional_out, 
+ std::optional dim_size) { auto result = SegmentMaxCOO::apply(src, index, optional_out, dim_size); return std::make_tuple(result[0], result[1]); } -torch::Tensor gather_coo(torch::Tensor src, torch::Tensor index, - torch::optional optional_out) { +SCATTER_API torch::Tensor +gather_coo(torch::Tensor src, torch::Tensor index, + std::optional optional_out) { return GatherCOO::apply(src, index, optional_out)[0]; } diff --git a/csrc/segment_csr.cpp b/csrc/segment_csr.cpp index 4b2ad08c..5f913c21 100644 --- a/csrc/segment_csr.cpp +++ b/csrc/segment_csr.cpp @@ -1,7 +1,11 @@ +#ifdef WITH_PYTHON #include +#endif + #include #include "cpu/segment_csr_cpu.h" +#include "macros.h" #include "utils.h" #ifdef WITH_CUDA @@ -9,16 +13,18 @@ #endif #ifdef _WIN32 +#ifdef WITH_PYTHON #ifdef WITH_CUDA PyMODINIT_FUNC PyInit__segment_csr_cuda(void) { return NULL; } #else PyMODINIT_FUNC PyInit__segment_csr_cpu(void) { return NULL; } #endif #endif +#endif -std::tuple> +std::tuple> segment_csr_fw(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out, + std::optional optional_out, std::string reduce) { if (src.device().is_cuda()) { #ifdef WITH_CUDA @@ -32,7 +38,7 @@ segment_csr_fw(torch::Tensor src, torch::Tensor indptr, } torch::Tensor gather_csr_fw(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out) { + std::optional optional_out) { if (src.device().is_cuda()) { #ifdef WITH_CUDA return gather_csr_cuda(src, indptr, optional_out); @@ -52,7 +58,7 @@ class SegmentSumCSR : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable indptr, - torch::optional optional_out) { + std::optional optional_out) { ctx->saved_data["src_shape"] = src.sizes(); auto out = std::get<0>(segment_csr_fw(src, indptr, optional_out, "sum")); ctx->save_for_backward({indptr}); @@ -76,7 +82,7 @@ class SegmentMeanCSR : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, 
Variable indptr, - torch::optional optional_out) { + std::optional optional_out) { ctx->saved_data["src_shape"] = src.sizes(); auto out = std::get<0>(segment_csr_fw(src, indptr, optional_out, "mean")); ctx->save_for_backward({indptr}); @@ -96,7 +102,7 @@ class SegmentMeanCSR : public torch::autograd::Function { auto indptr1 = indptr.narrow(-1, 0, indptr.size(-1) - 1); auto indptr2 = indptr.narrow(-1, 1, indptr.size(-1) - 1); auto count = (indptr2 - indptr1).to(grad_in.options()); - count = gather_csr_fw(count, indptr, torch::nullopt); + count = gather_csr_fw(count, indptr, std::nullopt); for (auto i = 0; i < grad_out.dim() - indptr.dim(); i++) count = count.unsqueeze(-1); grad_in.true_divide_(count); @@ -109,7 +115,7 @@ class SegmentMinCSR : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable indptr, - torch::optional optional_out) { + std::optional optional_out) { ctx->saved_data["src_shape"] = src.sizes(); auto result = segment_csr_fw(src, indptr, optional_out, "min"); auto out = std::get<0>(result); @@ -140,7 +146,7 @@ class SegmentMaxCSR : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable indptr, - torch::optional optional_out) { + std::optional optional_out) { ctx->saved_data["src_shape"] = src.sizes(); auto result = segment_csr_fw(src, indptr, optional_out, "max"); auto out = std::get<0>(result); @@ -171,7 +177,7 @@ class GatherCSR : public torch::autograd::Function { public: static variable_list forward(AutogradContext *ctx, Variable src, Variable indptr, - torch::optional optional_out) { + std::optional optional_out) { ctx->saved_data["src_shape"] = src.sizes(); auto out = gather_csr_fw(src, indptr, optional_out); ctx->save_for_backward({indptr}); @@ -192,32 +198,35 @@ class GatherCSR : public torch::autograd::Function { } }; -torch::Tensor segment_sum_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out) 
{ +SCATTER_API torch::Tensor +segment_sum_csr(torch::Tensor src, torch::Tensor indptr, + std::optional optional_out) { return SegmentSumCSR::apply(src, indptr, optional_out)[0]; } -torch::Tensor segment_mean_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out) { +SCATTER_API torch::Tensor +segment_mean_csr(torch::Tensor src, torch::Tensor indptr, + std::optional optional_out) { return SegmentMeanCSR::apply(src, indptr, optional_out)[0]; } -std::tuple +SCATTER_API std::tuple segment_min_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out) { + std::optional optional_out) { auto result = SegmentMinCSR::apply(src, indptr, optional_out); return std::make_tuple(result[0], result[1]); } -std::tuple +SCATTER_API std::tuple segment_max_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out) { + std::optional optional_out) { auto result = SegmentMaxCSR::apply(src, indptr, optional_out); return std::make_tuple(result[0], result[1]); } -torch::Tensor gather_csr(torch::Tensor src, torch::Tensor indptr, - torch::optional optional_out) { +SCATTER_API torch::Tensor +gather_csr(torch::Tensor src, torch::Tensor indptr, + std::optional optional_out) { return GatherCSR::apply(src, indptr, optional_out)[0]; } diff --git a/csrc/version.cpp b/csrc/version.cpp index a003ea81..3efac185 100644 --- a/csrc/version.cpp +++ b/csrc/version.cpp @@ -1,25 +1,42 @@ +#ifdef WITH_PYTHON #include +#endif + #include +#include "scatter.h" +#include "macros.h" #ifdef WITH_CUDA +#ifdef USE_ROCM +#include +#else #include #endif +#endif #ifdef _WIN32 +#ifdef WITH_PYTHON #ifdef WITH_CUDA PyMODINIT_FUNC PyInit__version_cuda(void) { return NULL; } #else PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; } #endif #endif +#endif -int64_t cuda_version() { +namespace scatter { +SCATTER_API int64_t cuda_version() noexcept { #ifdef WITH_CUDA +#ifdef USE_ROCM + return HIP_VERSION; +#else return CUDA_VERSION; +#endif #else return -1; #endif } +} 
// namespace scatter -static auto registry = - torch::RegisterOperators().op("torch_scatter::cuda_version", &cuda_version); +static auto registry = torch::RegisterOperators().op( + "torch_scatter::cuda_version", [] { return scatter::cuda_version(); }); diff --git a/docs/requirements.txt b/docs/requirements.txt index 7d432994..abd2d7c7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,3 @@ -numpy -https://download.pytorch.org/whl/cpu/torch-1.5.0%2Bcpu-cp37-cp37m-linux_x86_64.whl +https://download.pytorch.org/whl/cpu/torch-1.11.0%2Bcpu-cp38-cp38-linux_x86_64.whl sphinx>=3 sphinx_rtd_theme diff --git a/docs/source/conf.py b/docs/source/conf.py index eccdb5f1..0db6a74a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,16 +1,15 @@ import datetime -import sphinx_rtd_theme import doctest + +import sphinx_rtd_theme import torch_scatter extensions = [ 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', ] source_suffix = '.rst' diff --git a/docs/source/functions/composite.rst b/docs/source/functions/composite.rst new file mode 100644 index 00000000..2aca0605 --- /dev/null +++ b/docs/source/functions/composite.rst @@ -0,0 +1,8 @@ +composite +========= + +.. currentmodule:: torch_scatter.composite + +.. automodule:: torch_scatter.composite + :members: + :undoc-members: diff --git a/docs/source/index.rst b/docs/source/index.rst index 51551643..3481d4a9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,10 +11,13 @@ All included operations are broadcastable, work on varying data types, are imple .. 
toctree:: :glob: - :maxdepth: 1 + :maxdepth: 0 :caption: Package reference - functions/* + functions/scatter + functions/segment_coo + functions/segment_csr + functions/composite Indices and tables ================== diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..dd14ceb1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "torch"] +build-backend = "setuptools.build_meta" diff --git a/readthedocs.yml b/readthedocs.yml index 2931804e..e4ca6eaa 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -4,7 +4,7 @@ build: image: latest python: - version: 3.7 + version: 3.8 system_packages: true install: - requirements: docs/requirements.txt diff --git a/setup.cfg b/setup.cfg index 93f7ab99..a13a3c32 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,8 +1,21 @@ [metadata] -description-file = README.md +long_description=file: README.md +long_description_content_type=text/markdown + +classifiers = + Development Status :: 5 - Production/Stable + License :: OSI Approved :: MIT License + Programming Language :: Python + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3 :: Only [aliases] test = pytest [tool:pytest] -addopts = --capture=no --cov +addopts = --capture=no diff --git a/setup.py b/setup.py index 2394709a..fba55fc8 100644 --- a/setup.py +++ b/setup.py @@ -1,17 +1,22 @@ -import os -import sys import glob +import os import os.path as osp -from itertools import product -from setuptools import setup, find_packages import platform +import sys +from itertools import product import torch +from setuptools import find_packages, setup from torch.__config__ import parallel_info -from torch.utils.cpp_extension import BuildExtension -from torch.utils.cpp_extension import CppExtension, 
CUDAExtension, CUDA_HOME +from torch.utils.cpp_extension import (CUDA_HOME, BuildExtension, CppExtension, + CUDAExtension) -WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None +__version__ = '2.1.2' +URL = 'https://github.com/rusty1s/pytorch_scatter' + +WITH_CUDA = False +if torch.cuda.is_available(): + WITH_CUDA = CUDA_HOME is not None or torch.version.hip suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu'] if os.getenv('FORCE_CUDA', '0') == '1': suffices = ['cuda', 'cpu'] @@ -21,6 +26,7 @@ suffices = ['cpu'] BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1' +WITH_SYMBOLS = os.getenv('WITH_SYMBOLS', '0') == '1' def get_extensions(): @@ -28,13 +34,20 @@ def get_extensions(): extensions_dir = osp.join('csrc') main_files = glob.glob(osp.join(extensions_dir, '*.cpp')) + # remove generated 'hip' files, in case of rebuilds + main_files = [path for path in main_files if 'hip' not in path] for main, suffix in product(main_files, suffices): - define_macros = [] - extra_compile_args = {'cxx': ['-O2']} + define_macros = [('WITH_PYTHON', None)] + undef_macros = [] + + if sys.platform == 'win32': + define_macros += [('torchscatter_EXPORTS', None)] + + extra_compile_args = {'cxx': ['-O3']} if not os.name == 'nt': # Not on Windows: extra_compile_args['cxx'] += ['-Wno-sign-compare'] - extra_link_args = ['-s'] + extra_link_args = [] if WITH_SYMBOLS else ['-s'] info = parallel_info() if ('backend: OpenMP' in info and 'OpenMP not found' not in info @@ -48,15 +61,24 @@ def get_extensions(): print('Compiling without OpenMP...') # Compile for mac arm64 - if (sys.platform == 'darwin' and platform.machine() == 'arm64'): - extra_compile_args['cxx'] += ['-arch', 'arm64'] - extra_link_args += ['-arch', 'arm64'] + if sys.platform == 'darwin': + extra_compile_args['cxx'] += ['-D_LIBCPP_DISABLE_AVAILABILITY'] + if platform.machine == 'arm64': + extra_compile_args['cxx'] += ['-arch', 'arm64'] + extra_link_args += ['-arch', 'arm64'] if suffix == 'cuda': define_macros += 
[('WITH_CUDA', None)] nvcc_flags = os.getenv('NVCC_FLAGS', '') nvcc_flags = [] if nvcc_flags == '' else nvcc_flags.split(' ') - nvcc_flags += ['--expt-relaxed-constexpr', '-O2'] + nvcc_flags += ['-O3'] + if torch.version.hip: + # USE_ROCM was added to later versions of PyTorch. + # Define here to support older PyTorch versions as well: + define_macros += [('USE_ROCM', None)] + undef_macros += ['__HIP_NO_HALF_CONVERSIONS__'] + else: + nvcc_flags += ['--expt-relaxed-constexpr'] extra_compile_args['nvcc'] = nvcc_flags name = main.split(os.sep)[-1][:-4] @@ -76,6 +98,7 @@ def get_extensions(): sources, include_dirs=[extensions_dir], define_macros=define_macros, + undef_macros=undef_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ) @@ -85,27 +108,36 @@ def get_extensions(): install_requires = [] -setup_requires = [] -tests_require = ['pytest', 'pytest-runner', 'pytest-cov'] + +test_requires = [ + 'pytest', + 'pytest-cov', +] + +# work-around hipify abs paths +include_package_data = True +if torch.cuda.is_available() and torch.version.hip: + include_package_data = False setup( name='torch_scatter', - version='2.0.9', + version=__version__, + description='PyTorch Extension Library of Optimized Scatter Operations', author='Matthias Fey', author_email='matthias.fey@tu-dortmund.de', - url='https://github.com/rusty1s/pytorch_scatter', - description='PyTorch Extension Library of Optimized Scatter Operations', + url=URL, + download_url=f'{URL}/archive/{__version__}.tar.gz', keywords=['pytorch', 'scatter', 'segment', 'gather'], - license='MIT', - python_requires='>=3.6', + python_requires='>=3.8', install_requires=install_requires, - setup_requires=setup_requires, - tests_require=tests_require, - extras_require={'test': tests_require}, + extras_require={ + 'test': test_requires, + }, ext_modules=get_extensions() if not BUILD_DOCS else [], cmdclass={ 'build_ext': BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False) }, 
packages=find_packages(), + include_package_data=include_package_data, ) diff --git a/test/__init__.py b/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/composite/test_logsumexp.py b/test/composite/test_logsumexp.py index 92844b73..a6b3d160 100644 --- a/test/composite/test_logsumexp.py +++ b/test/composite/test_logsumexp.py @@ -4,20 +4,37 @@ def test_logsumexp(): inputs = torch.tensor([ - 0.5, 0.5, 0.0, -2.1, 3.2, 7.0, -1.0, -100.0, - float('-inf'), - float('-inf'), 0.0 + 0.5, + 0.5, + 0.0, + -2.1, + 3.2, + 7.0, + -1.0, + -100.0, ]) inputs.requires_grad_() - index = torch.tensor([0, 0, 1, 1, 1, 2, 4, 4, 5, 6, 6]) - splits = [2, 3, 1, 0, 2, 1, 2] + index = torch.tensor([0, 0, 1, 1, 1, 2, 4, 4]) + splits = [2, 3, 1, 0, 2] outputs = scatter_logsumexp(inputs, index) for src, out in zip(inputs.split(splits), outputs.unbind()): - assert out.tolist() == torch.logsumexp(src, dim=0).tolist() + if src.numel() > 0: + assert out.tolist() == torch.logsumexp(src, dim=0).tolist() + else: + assert out.item() == 0.0 outputs.backward(torch.randn_like(outputs)) jit = torch.jit.script(scatter_logsumexp) assert jit(inputs, index).tolist() == outputs.tolist() + + +def test_logsumexp_out(): + src = torch.tensor([-1.0, -50.0]) + index = torch.tensor([0, 0]) + out = torch.tensor([-10.0, -10.0]) + + scatter_logsumexp(src=src, index=index, out=out) + assert out.allclose(torch.tensor([-0.9999, -10.0]), atol=1e-4) diff --git a/test/test_broadcasting.py b/test/test_broadcasting.py index cfb3593c..0b332e49 100644 --- a/test/test_broadcasting.py +++ b/test/test_broadcasting.py @@ -3,8 +3,7 @@ import pytest import torch from torch_scatter import scatter - -from .utils import reductions, devices +from torch_scatter.testing import devices, reductions @pytest.mark.parametrize('reduce,device', product(reductions, devices)) diff --git a/test/test_gather.py b/test/test_gather.py index 8d0d100f..0b40e5d4 100644 --- a/test/test_gather.py +++ b/test/test_gather.py @@ 
-3,9 +3,8 @@ import pytest import torch from torch.autograd import gradcheck -from torch_scatter import gather_csr, gather_coo - -from .utils import tensor, dtypes, devices +from torch_scatter import gather_coo, gather_csr +from torch_scatter.testing import devices, dtypes, tensor tests = [ { diff --git a/test/test_multi_gpu.py b/test/test_multi_gpu.py index cdaf893e..98ed38c4 100644 --- a/test/test_multi_gpu.py +++ b/test/test_multi_gpu.py @@ -3,8 +3,7 @@ import pytest import torch import torch_scatter - -from .utils import reductions, tensor, dtypes +from torch_scatter.testing import dtypes, reductions, tensor tests = [ { diff --git a/test/test_scatter.py b/test/test_scatter.py index e3b874a2..93257619 100644 --- a/test/test_scatter.py +++ b/test/test_scatter.py @@ -2,10 +2,9 @@ import pytest import torch -from torch.autograd import gradcheck import torch_scatter - -from .utils import reductions, tensor, dtypes, devices +from torch.autograd import gradcheck +from torch_scatter.testing import devices, dtypes, reductions, tensor reductions = reductions + ['mul'] @@ -13,7 +12,7 @@ { 'src': [1, 3, 2, 4, 5, 6], 'index': [0, 1, 0, 1, 1, 3], - 'dim': 0, + 'dim': -1, 'sum': [3, 12, 0, 6], 'add': [3, 12, 0, 6], 'mul': [2, 60, 1, 6], diff --git a/test/test_segment.py b/test/test_segment.py index 3e8996be..9adc49da 100644 --- a/test/test_segment.py +++ b/test/test_segment.py @@ -2,10 +2,9 @@ import pytest import torch -from torch.autograd import gradcheck import torch_scatter - -from .utils import reductions, tensor, dtypes, devices +from torch.autograd import gradcheck +from torch_scatter.testing import devices, dtypes, reductions, tensor tests = [ { diff --git a/test/test_zero_tensors.py b/test/test_zero_tensors.py index 60855427..f744eb56 100644 --- a/test/test_zero_tensors.py +++ b/test/test_zero_tensors.py @@ -2,10 +2,9 @@ import pytest import torch -from torch_scatter import scatter, segment_coo, gather_coo -from torch_scatter import segment_csr, gather_csr - -from 
.utils import reductions, tensor, grad_dtypes, devices +from torch_scatter import (gather_coo, gather_csr, scatter, segment_coo, + segment_csr) +from torch_scatter.testing import devices, grad_dtypes, reductions, tensor @pytest.mark.parametrize('reduce,dtype,device', diff --git a/torch_scatter/__init__.py b/torch_scatter/__init__.py index 8fff3818..d9388480 100644 --- a/torch_scatter/__init__.py +++ b/torch_scatter/__init__.py @@ -1,10 +1,10 @@ -import os import importlib +import os import os.path as osp import torch -__version__ = '2.0.9' +__version__ = '2.1.2' for library in ['_version', '_scatter', '_segment_csr', '_segment_coo']: cuda_spec = importlib.machinery.PathFinder().find_spec( @@ -28,18 +28,18 @@ torch.ops.torch_scatter.scatter_min = scatter_arg_placeholder torch.ops.torch_scatter.scatter_max = scatter_arg_placeholder - from .placeholder import segment_csr_placeholder - from .placeholder import segment_csr_arg_placeholder - from .placeholder import gather_csr_placeholder + from .placeholder import (gather_csr_placeholder, + segment_csr_arg_placeholder, + segment_csr_placeholder) torch.ops.torch_scatter.segment_sum_csr = segment_csr_placeholder torch.ops.torch_scatter.segment_mean_csr = segment_csr_placeholder torch.ops.torch_scatter.segment_min_csr = segment_csr_arg_placeholder torch.ops.torch_scatter.segment_max_csr = segment_csr_arg_placeholder torch.ops.torch_scatter.gather_csr = gather_csr_placeholder - from .placeholder import segment_coo_placeholder - from .placeholder import segment_coo_arg_placeholder - from .placeholder import gather_coo_placeholder + from .placeholder import (gather_coo_placeholder, + segment_coo_arg_placeholder, + segment_coo_placeholder) torch.ops.torch_scatter.segment_sum_coo = segment_coo_placeholder torch.ops.torch_scatter.segment_mean_coo = segment_coo_placeholder torch.ops.torch_scatter.segment_min_coo = segment_coo_arg_placeholder @@ -47,7 +47,9 @@ torch.ops.torch_scatter.gather_coo = gather_coo_placeholder 
cuda_version = torch.ops.torch_scatter.cuda_version() -if torch.cuda.is_available() and cuda_version != -1: # pragma: no cover +is_not_hip = torch.version.hip is None +is_cuda = torch.version.cuda is not None +if is_not_hip and is_cuda and cuda_version != -1: # pragma: no cover if cuda_version < 10000: major, minor = int(str(cuda_version)[0]), int(str(cuda_version)[2]) else: diff --git a/torch_scatter/composite/logsumexp.py b/torch_scatter/composite/logsumexp.py index 8fd47454..1d5ff9b6 100644 --- a/torch_scatter/composite/logsumexp.py +++ b/torch_scatter/composite/logsumexp.py @@ -1,15 +1,18 @@ from typing import Optional import torch -from torch_scatter import scatter_sum, scatter_max - +from torch_scatter import scatter_max, scatter_sum from torch_scatter.utils import broadcast -def scatter_logsumexp(src: torch.Tensor, index: torch.Tensor, dim: int = -1, - out: Optional[torch.Tensor] = None, - dim_size: Optional[int] = None, - eps: float = 1e-12) -> torch.Tensor: +def scatter_logsumexp( + src: torch.Tensor, + index: torch.Tensor, + dim: int = -1, + out: Optional[torch.Tensor] = None, + dim_size: Optional[int] = None, + eps: float = 1e-12, +) -> torch.Tensor: if not torch.is_floating_point(src): raise ValueError('`scatter_logsumexp` can only be computed over ' 'tensors with floating point data types.') @@ -24,17 +27,30 @@ def scatter_logsumexp(src: torch.Tensor, index: torch.Tensor, dim: int = -1, size = list(src.size()) size[dim] = dim_size - max_value_per_index = torch.full(size, float('-inf'), dtype=src.dtype, - device=src.device) + max_value_per_index = torch.full( + size, + fill_value=float('-inf'), + dtype=src.dtype, + device=src.device, + ) scatter_max(src, index, dim, max_value_per_index, dim_size=dim_size)[0] max_per_src_element = max_value_per_index.gather(dim, index) recentered_score = src - max_per_src_element recentered_score.masked_fill_(torch.isnan(recentered_score), float('-inf')) + orig_out: Optional[torch.Tensor] = None if out is not None: + 
orig_out = out.clone() out = out.sub_(max_value_per_index).exp_() sum_per_index = scatter_sum(recentered_score.exp_(), index, dim, out, dim_size) - return sum_per_index.add_(eps).log_().add_(max_value_per_index) + out = sum_per_index.add_(eps).log_().add_(max_value_per_index) + + if orig_out is None: + return out.nan_to_num_(neginf=0.0) + + mask = ~out.isfinite() + out[mask] = orig_out[mask] + return out diff --git a/torch_scatter/scatter.py b/torch_scatter/scatter.py index b994873d..d4d80503 100644 --- a/torch_scatter/scatter.py +++ b/torch_scatter/scatter.py @@ -5,7 +5,9 @@ from .utils import broadcast -def scatter_sum(src: torch.Tensor, index: torch.Tensor, dim: int = -1, +def scatter_sum(src: torch.Tensor, + index: torch.Tensor, + dim: int = -1, out: Optional[torch.Tensor] = None, dim_size: Optional[int] = None) -> torch.Tensor: index = broadcast(index, src, dim) @@ -23,19 +25,25 @@ def scatter_sum(src: torch.Tensor, index: torch.Tensor, dim: int = -1, return out.scatter_add_(dim, index, src) -def scatter_add(src: torch.Tensor, index: torch.Tensor, dim: int = -1, +def scatter_add(src: torch.Tensor, + index: torch.Tensor, + dim: int = -1, out: Optional[torch.Tensor] = None, dim_size: Optional[int] = None) -> torch.Tensor: return scatter_sum(src, index, dim, out, dim_size) -def scatter_mul(src: torch.Tensor, index: torch.Tensor, dim: int = -1, +def scatter_mul(src: torch.Tensor, + index: torch.Tensor, + dim: int = -1, out: Optional[torch.Tensor] = None, dim_size: Optional[int] = None) -> torch.Tensor: return torch.ops.torch_scatter.scatter_mul(src, index, dim, out, dim_size) -def scatter_mean(src: torch.Tensor, index: torch.Tensor, dim: int = -1, +def scatter_mean(src: torch.Tensor, + index: torch.Tensor, + dim: int = -1, out: Optional[torch.Tensor] = None, dim_size: Optional[int] = None) -> torch.Tensor: out = scatter_sum(src, index, dim, out, dim_size) @@ -59,21 +67,28 @@ def scatter_mean(src: torch.Tensor, index: torch.Tensor, dim: int = -1, def scatter_min( - 
src: torch.Tensor, index: torch.Tensor, dim: int = -1, + src: torch.Tensor, + index: torch.Tensor, + dim: int = -1, out: Optional[torch.Tensor] = None, dim_size: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]: return torch.ops.torch_scatter.scatter_min(src, index, dim, out, dim_size) def scatter_max( - src: torch.Tensor, index: torch.Tensor, dim: int = -1, + src: torch.Tensor, + index: torch.Tensor, + dim: int = -1, out: Optional[torch.Tensor] = None, dim_size: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]: return torch.ops.torch_scatter.scatter_max(src, index, dim, out, dim_size) -def scatter(src: torch.Tensor, index: torch.Tensor, dim: int = -1, - out: Optional[torch.Tensor] = None, dim_size: Optional[int] = None, +def scatter(src: torch.Tensor, + index: torch.Tensor, + dim: int = -1, + out: Optional[torch.Tensor] = None, + dim_size: Optional[int] = None, reduce: str = "sum") -> torch.Tensor: r""" | diff --git a/test/utils.py b/torch_scatter/testing.py similarity index 52% rename from test/utils.py rename to torch_scatter/testing.py index dcdb19c9..2407b8a0 100644 --- a/test/utils.py +++ b/torch_scatter/testing.py @@ -1,14 +1,19 @@ +from typing import Any + import torch reductions = ['sum', 'add', 'mean', 'min', 'max'] -dtypes = [torch.half, torch.float, torch.double, torch.int, torch.long] +dtypes = [ + torch.half, torch.bfloat16, torch.float, torch.double, torch.int, + torch.long +] grad_dtypes = [torch.float, torch.double] devices = [torch.device('cpu')] if torch.cuda.is_available(): - devices += [torch.device(f'cuda:{torch.cuda.current_device()}')] + devices += [torch.device('cuda:0')] -def tensor(x, dtype, device): +def tensor(x: Any, dtype: torch.dtype, device: torch.device): return None if x is None else torch.tensor(x, device=device).to(dtype)