diff --git a/.github/workflows/building-conda.yml b/.github/workflows/building-conda.yml
deleted file mode 100644
index a74e873e..00000000
--- a/.github/workflows/building-conda.yml
+++ /dev/null
@@ -1,85 +0,0 @@
-name: Building Conda
-
-on: [workflow_dispatch]
-
-jobs:
-
-  conda-build:
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [windows-latest] #[ubuntu-18.04, macos-10.15, windows-latest]
-        python-version: [3.6, 3.7, 3.8, 3.9]
-        torch-version: [1.9.0, 1.10.0]
-        cuda-version: ['cu113'] #['cpu', 'cu102', 'cu111', 'cu113']
-        exclude:
-          - torch-version: 1.9.0
-            cuda-version: 'cu113'
-          - torch-version: 1.10.0
-            cuda-version: 'cu111'
-          - os: macos-10.15
-            cuda-version: 'cu102'
-          - os: macos-10.15
-            cuda-version: 'cu111'
-          - os: macos-10.15
-            cuda-version: 'cu113'
-
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Conda for Python ${{ matrix.python-version }}
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Free up disk space
-        if: ${{ runner.os == 'Linux' }}
-        run: |
-          sudo rm -rf /usr/share/dotnet
-
-      - name: Install Conda packages
-        run: |
-          conda install conda-build conda-verify --yes
-        shell:
-          bash -l {0}
-
-      - name: Install CUDA ${{ matrix.cuda-version }}
-        if: ${{ matrix.cuda-version != 'cpu' }}
-        run: |
-          bash .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}.sh
-        shell:
-          bash
-
-      - name: Build Conda package for CPU
-        if: ${{ matrix.cuda-version == 'cpu' }}
-        run: |
-          FORCE_CUDA=0 TORCH_CUDA_ARCH_LIST=0 ./conda/pytorch-scatter/build_conda.sh ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
-        shell:
-          bash -l {0}
-
-      - name: Build Conda package for GPU
-        if: ${{ matrix.cuda-version != 'cpu' }}
-        run: |
-          source .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}-env.sh
-          ./conda/pytorch-scatter/build_conda.sh ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
-        shell:
-          bash -l {0}
-
-      - name: Publish Conda package on organization channel
-        run: |
-          conda install anaconda-client --yes
-          anaconda upload --force --label main $HOME/conda-bld/*/*.tar.bz2
-        env:
-          ANACONDA_API_TOKEN: ${{ secrets.PYG_ANACONDA_TOKEN }}
-        shell:
-          bash -l {0}
-
-      - name: Publish Conda package on personal channel
-        run: |
-          conda install anaconda-client --yes
-          anaconda upload --force --label main $HOME/conda-bld/*/*.tar.bz2
-        env:
-          ANACONDA_API_TOKEN: ${{ secrets.RUSTY1S_ANACONDA_TOKEN }}
-        shell:
-          bash -l {0}
diff --git a/.github/workflows/building.yml b/.github/workflows/building.yml
index 7aca6bc8..78ff5580 100644
--- a/.github/workflows/building.yml
+++ b/.github/workflows/building.yml
@@ -10,21 +10,34 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-18.04, macos-10.15, windows-latest]
-        python-version: [3.6, 3.7, 3.8, 3.9]
-        torch-version: [1.9.0, 1.10.0]
-        cuda-version: ['cpu', 'cu102', 'cu111', 'cu113']
+        os: [ubuntu-20.04, macos-14, windows-2019, ubuntu-22.04-arm]
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        torch-version: [2.6.0]  # [2.5.0]
+        cuda-version: ['cpu', 'cu118', 'cu121', 'cu124', 'cu126']
         exclude:
-          - torch-version: 1.9.0
-            cuda-version: 'cu113'
-          - torch-version: 1.10.0
-            cuda-version: 'cu111'
-          - os: macos-10.15
-            cuda-version: 'cu102'
-          - os: macos-10.15
-            cuda-version: 'cu111'
-          - os: macos-10.15
-            cuda-version: 'cu113'
+          - torch-version: 2.5.0
+            python-version: '3.13'
+          - torch-version: 2.5.0
+            cuda-version: 'cu126'
+          - torch-version: 2.6.0
+            cuda-version: 'cu121'
+          - os: macos-14
+            cuda-version: 'cu118'
+          - os: macos-14
+            cuda-version: 'cu121'
+          - os: macos-14
+            cuda-version: 'cu124'
+          - os: macos-14
+            cuda-version: 'cu126'
+          # CUDA install scripts are x86_64/amd64-only: build just the CPU variant on ARM.
+          - os: ubuntu-22.04-arm
+            cuda-version: 'cu118'
+          - os: ubuntu-22.04-arm
+            cuda-version: 'cu121'
+          - os: ubuntu-22.04-arm
+            cuda-version: 'cu124'
+          - os: ubuntu-22.04-arm
+            cuda-version: 'cu126'
 
     steps:
       - uses: actions/checkout@v2
@@ -33,10 +46,13 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Free up disk space
-        if: ${{ runner.os == 'Linux' }}
+      - name: Upgrade setuptools
         run: |
-          sudo rm -rf /usr/share/dotnet
+          pip install --upgrade setuptools
+
+      - name: Free Disk Space (Ubuntu)
+        if: ${{ runner.os == 'Linux' }}
+        uses: jlumbroso/free-disk-space@v1.3.1
 
       - name: Install CUDA ${{ matrix.cuda-version }}
         if: ${{ matrix.cuda-version != 'cpu' }}
@@ -45,15 +61,35 @@ jobs:
 
       - name: Install PyTorch ${{ matrix.torch-version }}+${{ matrix.cuda-version }}
         run: |
-          pip install numpy typing-extensions dataclasses
-          pip install --no-index --no-cache-dir torch==${{ matrix.torch-version}} -f https://download.pytorch.org/whl/${{ matrix.cuda-version }}/torch_stable.html
+          pip install torch==${{ matrix.torch-version }} --extra-index-url https://download.pytorch.org/whl/${{ matrix.cuda-version }}
           python -c "import torch; print('PyTorch:', torch.__version__)"
           python -c "import torch; print('CUDA:', torch.version.cuda)"
 
+      # Overwrites lines 31-38 of torch's ATen/Parallel.h; re-check the range when torch-version changes.
+      - name: Patch PyTorch static constexpr on Windows
+        if: ${{ runner.os == 'Windows' }}
+        run: |
+          Torch_DIR=`python -c 'import os; import torch; print(os.path.dirname(torch.__file__))'`
+          sed -i '31,38c\
+          TORCH_API void lazy_init_num_threads();' ${Torch_DIR}/include/ATen/Parallel.h
+        shell: bash
+
+      - name: Set version
+        if: ${{ runner.os != 'macOS' }}
+        run: |
+          VERSION=`sed -n "s/^__version__ = '\(.*\)'/\1/p" torch_scatter/__init__.py`
+          TORCH_VERSION=`echo "pt${{ matrix.torch-version }}" | sed "s/..$//" | sed "s/\.//g"`
+          CUDA_VERSION=`echo ${{ matrix.cuda-version }}`
+          echo "New version name: $VERSION+$TORCH_VERSION$CUDA_VERSION"
+          sed -i "s/$VERSION/$VERSION+$TORCH_VERSION$CUDA_VERSION/" setup.py
+          sed -i "s/$VERSION/$VERSION+$TORCH_VERSION$CUDA_VERSION/" torch_scatter/__init__.py
+        shell:
+          bash
+
       - name: Install main package for CPU
         if: ${{ matrix.cuda-version == 'cpu' }}
         run: |
-          FORCE_ONLY_CPU=1 pip install -e .
+          FORCE_ONLY_CPU=1 python setup.py develop
         shell:
           bash
 
@@ -61,10 +97,14 @@ jobs:
         if: ${{ matrix.cuda-version != 'cpu' }}
         run: |
           source .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}-env.sh
-          pip install -e .
+          python setup.py develop
         shell:
           bash
 
+      - name: Test installation
+        run: |
+          python -c "import torch_scatter; print('torch-scatter:', torch_scatter.__version__)"
+
       - name: Build wheel
         run: |
           pip install wheel
diff --git a/.github/workflows/cuda/cu115-Linux-env.sh b/.github/workflows/cuda/cu115-Linux-env.sh
new file mode 100644
index 00000000..1c148a22
--- /dev/null
+++ b/.github/workflows/cuda/cu115-Linux-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Build environment for CUDA 11.5 on Linux; meant to be sourced before building.
+
+# Exported so the build process launched after sourcing can see it; a plain
+# assignment would stay local to the sourcing shell.
+export CUDA_HOME=/usr/local/cuda-11.5
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
diff --git a/.github/workflows/cuda/cu115-Linux.sh b/.github/workflows/cuda/cu115-Linux.sh
new file mode 100755
index 00000000..02bcb4d1
--- /dev/null
+++ b/.github/workflows/cuda/cu115-Linux.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Installs nvcc and the CUDA 11.5 development libraries via NVIDIA's local apt
+# repository package.
+
+OS=ubuntu1804
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb
+sudo apt-key add /var/cuda-repo-${OS}-11-5-local/7fa2af80.pub
+
+sudo apt-get -qq update
+sudo apt install cuda-nvcc-11-5 cuda-libraries-dev-11-5
+sudo apt clean
+
+# Remove the downloaded installer package. It was saved by wget under its base
+# name in the working directory, so pass that name (not the URL) to rm.
+rm -f cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb
diff --git a/.github/workflows/cuda/cu115-Windows-env.sh b/.github/workflows/cuda/cu115-Windows-env.sh
new file mode 100644
index 00000000..3a662fb8
--- /dev/null
+++ b/.github/workflows/cuda/cu115-Windows-env.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Build environment for cu115 on Windows; meant to be sourced before building.
+# Prepends the CUDA v11.3 bin directory (the toolkit cu115-Windows.sh installs)
+# and the VS 2017 MSBuild directory to PATH, and sets FORCE_CUDA /
+# TORCH_CUDA_ARCH_LIST for the build.
+
+CUDA_HOME="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3"
+MSBUILD_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
+# MSBuild goes first, so it ends up ahead of the CUDA bin directory.
+PATH="${MSBUILD_DIR}:${CUDA_HOME}/bin:${PATH}"
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="6.0+PTX"
diff --git a/.github/workflows/cuda/cu115-Windows.sh b/.github/workflows/cuda/cu115-Windows.sh
new file mode 100755
index 00000000..db2559c4
--- /dev/null
+++ b/.github/workflows/cuda/cu115-Windows.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Prepares a Windows runner for cu115 builds: unpacks the NVIDIA driver DLLs
+# into System32, then runs the CUDA installer for a fixed list of toolkit
+# components.
+
+# TODO We currently use CUDA 11.3 to build CUDA 11.5 Windows wheels
+
+# Install NVIDIA drivers, see:
+# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
+DRIVER_ZIP="/tmp/gpu_driver_dlls.zip"
+curl -k -L --output "${DRIVER_ZIP}" "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download"
+7z x "${DRIVER_ZIP}" -o"/c/Windows/System32"
+
+export CUDA_SHORT=11.3
+export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
+export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe
+
+# Installer arguments: the -s flag followed by one "<component>_<version>"
+# entry per toolkit component.
+INSTALL_ARGS="-s"
+for component in nvcc cuobjdump nvprune cupti cublas_dev cudart cufft_dev curand_dev cusolver_dev cusparse_dev thrust npp_dev nvrtc_dev nvml_dev; do
+  INSTALL_ARGS="${INSTALL_ARGS} ${component}_${CUDA_SHORT}"
+done
+
+# Install CUDA:
+curl -k -L --output "${CUDA_FILE}" "${CUDA_URL}/${CUDA_FILE}"
+echo
+echo "Installing from ${CUDA_FILE}..."
+PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"${INSTALL_ARGS}\" -Wait -NoNewWindow"
+echo "Done!"
+rm -f "${CUDA_FILE}"
diff --git a/.github/workflows/cuda/cu116-Linux-env.sh b/.github/workflows/cuda/cu116-Linux-env.sh
new file mode 100644
index 00000000..fe318942
--- /dev/null
+++ b/.github/workflows/cuda/cu116-Linux-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Build environment for CUDA 11.6 on Linux; meant to be sourced before building.
+
+# Exported so the build process launched after sourcing can see it; a plain
+# assignment would stay local to the sourcing shell.
+export CUDA_HOME=/usr/local/cuda-11.6
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
diff --git a/.github/workflows/cuda/cu116-Linux.sh b/.github/workflows/cuda/cu116-Linux.sh
new file mode 100755
index 00000000..28cda58d
--- /dev/null
+++ b/.github/workflows/cuda/cu116-Linux.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Installs nvcc and the CUDA 11.6 development libraries via NVIDIA's local apt
+# repository package.
+
+OS=ubuntu1804
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
+sudo apt-key add /var/cuda-repo-${OS}-11-6-local/7fa2af80.pub
+
+sudo apt-get -qq update
+sudo apt install cuda-nvcc-11-6 cuda-libraries-dev-11-6
+sudo apt clean
+
+# Remove the downloaded installer package. It was saved by wget under its base
+# name in the working directory, so pass that name (not the URL) to rm.
+rm -f cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
diff --git a/.github/workflows/cuda/cu116-Windows-env.sh b/.github/workflows/cuda/cu116-Windows-env.sh
new file mode 100644
index 00000000..3a662fb8
--- /dev/null
+++ b/.github/workflows/cuda/cu116-Windows-env.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Build environment for cu116 on Windows; meant to be sourced before building.
+# Prepends the CUDA v11.3 bin directory (the toolkit cu116-Windows.sh installs)
+# and the VS 2017 MSBuild directory to PATH, and sets FORCE_CUDA /
+# TORCH_CUDA_ARCH_LIST for the build.
+
+CUDA_HOME="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3"
+MSBUILD_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
+# MSBuild goes first, so it ends up ahead of the CUDA bin directory.
+PATH="${MSBUILD_DIR}:${CUDA_HOME}/bin:${PATH}"
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="6.0+PTX"
diff --git a/.github/workflows/cuda/cu116-Windows.sh b/.github/workflows/cuda/cu116-Windows.sh
new file mode 100755
index 00000000..af329138
--- /dev/null
+++ b/.github/workflows/cuda/cu116-Windows.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Prepares a Windows runner for cu116 builds: unpacks the NVIDIA driver DLLs
+# into System32, then runs the CUDA installer for a fixed list of toolkit
+# components.
+
+# TODO We currently use CUDA 11.3 to build CUDA 11.6 Windows wheels
+
+# Install NVIDIA drivers, see:
+# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
+DRIVER_ZIP="/tmp/gpu_driver_dlls.zip"
+curl -k -L --output "${DRIVER_ZIP}" "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download"
+7z x "${DRIVER_ZIP}" -o"/c/Windows/System32"
+
+export CUDA_SHORT=11.3
+export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
+export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe
+
+# Installer arguments: the -s flag followed by one "<component>_<version>"
+# entry per toolkit component.
+INSTALL_ARGS="-s"
+for component in nvcc cuobjdump nvprune cupti cublas_dev cudart cufft_dev curand_dev cusolver_dev cusparse_dev thrust npp_dev nvrtc_dev nvml_dev; do
+  INSTALL_ARGS="${INSTALL_ARGS} ${component}_${CUDA_SHORT}"
+done
+
+# Install CUDA:
+curl -k -L --output "${CUDA_FILE}" "${CUDA_URL}/${CUDA_FILE}"
+echo
+echo "Installing from ${CUDA_FILE}..."
+PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"${INSTALL_ARGS}\" -Wait -NoNewWindow"
+echo "Done!"
+rm -f "${CUDA_FILE}"
diff --git a/.github/workflows/cuda/cu117-Linux-env.sh b/.github/workflows/cuda/cu117-Linux-env.sh
new file mode 100644
index 00000000..b14f0744
--- /dev/null
+++ b/.github/workflows/cuda/cu117-Linux-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Build environment for CUDA 11.7 on Linux; meant to be sourced before building.
+
+# Exported so the build process launched after sourcing can see it; a plain
+# assignment would stay local to the sourcing shell.
+export CUDA_HOME=/usr/local/cuda-11.7
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
diff --git a/.github/workflows/cuda/cu117-Linux.sh b/.github/workflows/cuda/cu117-Linux.sh
new file mode 100755
index 00000000..d521965b
--- /dev/null
+++ b/.github/workflows/cuda/cu117-Linux.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Installs nvcc and the CUDA 11.7 development libraries via NVIDIA's local apt
+# repository package.
+
+OS=ubuntu2004
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb
+sudo cp /var/cuda-repo-${OS}-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/
+
+sudo apt-get -qq update
+sudo apt install cuda-nvcc-11-7 cuda-libraries-dev-11-7
+sudo apt clean
+
+# Remove the downloaded installer package. It was saved by wget under its base
+# name in the working directory, so pass that name (not the URL) to rm.
+rm -f cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb
diff --git a/.github/workflows/cuda/cu117-Windows-env.sh b/.github/workflows/cuda/cu117-Windows-env.sh
new file mode 100644
index 00000000..3a662fb8
--- /dev/null
+++ b/.github/workflows/cuda/cu117-Windows-env.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Build environment for cu117 on Windows; meant to be sourced before building.
+# Prepends the CUDA v11.3 bin directory (the toolkit cu117-Windows.sh installs)
+# and the VS 2017 MSBuild directory to PATH, and sets FORCE_CUDA /
+# TORCH_CUDA_ARCH_LIST for the build.
+
+CUDA_HOME="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3"
+MSBUILD_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
+# MSBuild goes first, so it ends up ahead of the CUDA bin directory.
+PATH="${MSBUILD_DIR}:${CUDA_HOME}/bin:${PATH}"
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="6.0+PTX"
diff --git a/.github/workflows/cuda/cu117-Windows.sh b/.github/workflows/cuda/cu117-Windows.sh
new file mode 100755
index 00000000..b11a9a03
--- /dev/null
+++ b/.github/workflows/cuda/cu117-Windows.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Prepares a Windows runner for cu117 builds: unpacks the NVIDIA driver DLLs
+# into System32, then runs the CUDA installer for a fixed list of toolkit
+# components.
+
+# TODO We currently use CUDA 11.3 to build CUDA 11.7 Windows wheels
+
+# Install NVIDIA drivers, see:
+# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
+DRIVER_ZIP="/tmp/gpu_driver_dlls.zip"
+curl -k -L --output "${DRIVER_ZIP}" "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download"
+7z x "${DRIVER_ZIP}" -o"/c/Windows/System32"
+
+export CUDA_SHORT=11.3
+export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
+export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe
+
+# Installer arguments: the -s flag followed by one "<component>_<version>"
+# entry per toolkit component.
+INSTALL_ARGS="-s"
+for component in nvcc cuobjdump nvprune cupti cublas_dev cudart cufft_dev curand_dev cusolver_dev cusparse_dev thrust npp_dev nvrtc_dev nvml_dev; do
+  INSTALL_ARGS="${INSTALL_ARGS} ${component}_${CUDA_SHORT}"
+done
+
+# Install CUDA:
+curl -k -L --output "${CUDA_FILE}" "${CUDA_URL}/${CUDA_FILE}"
+echo
+echo "Installing from ${CUDA_FILE}..."
+PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"${INSTALL_ARGS}\" -Wait -NoNewWindow"
+echo "Done!"
+rm -f "${CUDA_FILE}"
diff --git a/.github/workflows/cuda/cu118-Linux-env.sh b/.github/workflows/cuda/cu118-Linux-env.sh
new file mode 100644
index 00000000..18d97b8a
--- /dev/null
+++ b/.github/workflows/cuda/cu118-Linux-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Build environment for CUDA 11.8 on Linux; meant to be sourced before building.
+
+# Exported so the build process launched after sourcing can see it; a plain
+# assignment would stay local to the sourcing shell.
+export CUDA_HOME=/usr/local/cuda-11.8
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
diff --git a/.github/workflows/cuda/cu118-Linux.sh b/.github/workflows/cuda/cu118-Linux.sh
new file mode 100755
index 00000000..46b66e7d
--- /dev/null
+++ b/.github/workflows/cuda/cu118-Linux.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Installs nvcc and the CUDA 11.8 development libraries via NVIDIA's local apt
+# repository package.
+
+OS=ubuntu2004
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
+sudo cp /var/cuda-repo-${OS}-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
+
+sudo apt-get -qq update
+sudo apt install cuda-nvcc-11-8 cuda-libraries-dev-11-8
+sudo apt clean
+
+# Remove the downloaded installer package. It was saved by wget under its base
+# name in the working directory, so pass that name (not the URL) to rm.
+rm -f cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
diff --git a/.github/workflows/cuda/cu118-Windows-env.sh b/.github/workflows/cuda/cu118-Windows-env.sh
new file mode 100644
index 00000000..d0ff04b6
--- /dev/null
+++ b/.github/workflows/cuda/cu118-Windows-env.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Build environment for cu118 on Windows; meant to be sourced before building.
+# Prepends the CUDA v11.8 bin directory and the VS 2017 MSBuild directory to
+# PATH, and sets FORCE_CUDA / TORCH_CUDA_ARCH_LIST for the build.
+
+CUDA_HOME="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8"
+MSBUILD_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
+# MSBuild goes first, so it ends up ahead of the CUDA bin directory.
+PATH="${MSBUILD_DIR}:${CUDA_HOME}/bin:${PATH}"
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="6.0+PTX"
diff --git a/.github/workflows/cuda/cu118-Windows.sh b/.github/workflows/cuda/cu118-Windows.sh
new file mode 100755
index 00000000..b82a5a9b
--- /dev/null
+++ b/.github/workflows/cuda/cu118-Windows.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Prepares a Windows runner for cu118 builds: unpacks the NVIDIA driver DLLs
+# into System32, then runs the CUDA installer for a fixed list of toolkit
+# components.
+
+# Install NVIDIA drivers, see:
+# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
+DRIVER_ZIP="/tmp/gpu_driver_dlls.zip"
+curl -k -L --output "${DRIVER_ZIP}" "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download"
+7z x "${DRIVER_ZIP}" -o"/c/Windows/System32"
+
+export CUDA_SHORT=11.8
+export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
+export CUDA_FILE=cuda_${CUDA_SHORT}.0_522.06_windows.exe
+
+# Installer arguments: the -s flag followed by one "<component>_<version>"
+# entry per toolkit component.
+INSTALL_ARGS="-s"
+for component in nvcc cuobjdump nvprune cupti cublas_dev cudart cufft_dev curand_dev cusolver_dev cusparse_dev thrust npp_dev nvrtc_dev nvml_dev; do
+  INSTALL_ARGS="${INSTALL_ARGS} ${component}_${CUDA_SHORT}"
+done
+
+# Install CUDA:
+curl -k -L --output "${CUDA_FILE}" "${CUDA_URL}/${CUDA_FILE}"
+echo
+echo "Installing from ${CUDA_FILE}..."
+PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"${INSTALL_ARGS}\" -Wait -NoNewWindow"
+echo "Done!"
+rm -f "${CUDA_FILE}"
diff --git a/.github/workflows/cuda/cu121-Linux-env.sh b/.github/workflows/cuda/cu121-Linux-env.sh
new file mode 100644
index 00000000..b15ae52d
--- /dev/null
+++ b/.github/workflows/cuda/cu121-Linux-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Build environment for CUDA 12.1 on Linux; meant to be sourced before building.
+
+# Exported so the build process launched after sourcing can see it; a plain
+# assignment would stay local to the sourcing shell.
+export CUDA_HOME=/usr/local/cuda-12.1
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0"
diff --git a/.github/workflows/cuda/cu121-Linux.sh b/.github/workflows/cuda/cu121-Linux.sh
new file mode 100755
index 00000000..dd8db12e
--- /dev/null
+++ b/.github/workflows/cuda/cu121-Linux.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Installs nvcc and the CUDA 12.1 development libraries via NVIDIA's local apt
+# repository package.
+
+OS=ubuntu2004
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda-repo-${OS}-12-1-local_12.1.1-530.30.02-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-12-1-local_12.1.1-530.30.02-1_amd64.deb
+sudo cp /var/cuda-repo-${OS}-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/
+
+sudo apt-get -qq update
+sudo apt install cuda-nvcc-12-1 cuda-libraries-dev-12-1
+sudo apt clean
+
+# Remove the downloaded installer package. It was saved by wget under its base
+# name in the working directory, so pass that name (not the URL) to rm.
+rm -f cuda-repo-${OS}-12-1-local_12.1.1-530.30.02-1_amd64.deb
diff --git a/.github/workflows/cuda/cu121-Windows-env.sh b/.github/workflows/cuda/cu121-Windows-env.sh
new file mode 100644
index 00000000..c55a0356
--- /dev/null
+++ b/.github/workflows/cuda/cu121-Windows-env.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Build environment for cu121 on Windows; meant to be sourced before building.
+# Prepends the CUDA v12.1 bin directory and the VS 2017 MSBuild directory to
+# PATH, and sets FORCE_CUDA / TORCH_CUDA_ARCH_LIST for the build.
+
+CUDA_HOME="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1"
+MSBUILD_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
+# MSBuild goes first, so it ends up ahead of the CUDA bin directory.
+PATH="${MSBUILD_DIR}:${CUDA_HOME}/bin:${PATH}"
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="6.0+PTX"
diff --git a/.github/workflows/cuda/cu121-Windows.sh b/.github/workflows/cuda/cu121-Windows.sh
new file mode 100755
index 00000000..7df067fb
--- /dev/null
+++ b/.github/workflows/cuda/cu121-Windows.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Prepares a Windows runner for cu121 builds: unpacks the NVIDIA driver DLLs
+# into System32, then runs the CUDA installer for a fixed list of toolkit
+# components.
+
+# Install NVIDIA drivers, see:
+# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
+DRIVER_ZIP="/tmp/gpu_driver_dlls.zip"
+curl -k -L --output "${DRIVER_ZIP}" "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download"
+7z x "${DRIVER_ZIP}" -o"/c/Windows/System32"
+
+export CUDA_SHORT=12.1
+export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.1/local_installers
+export CUDA_FILE=cuda_${CUDA_SHORT}.1_531.14_windows.exe
+
+# Installer arguments: the -s flag followed by one "<component>_<version>"
+# entry per toolkit component.
+INSTALL_ARGS="-s"
+for component in nvcc cuobjdump nvprune cupti cublas_dev cudart cufft_dev curand_dev cusolver_dev cusparse_dev thrust npp_dev nvrtc_dev nvml_dev; do
+  INSTALL_ARGS="${INSTALL_ARGS} ${component}_${CUDA_SHORT}"
+done
+
+# Install CUDA:
+curl -k -L --output "${CUDA_FILE}" "${CUDA_URL}/${CUDA_FILE}"
+echo
+echo "Installing from ${CUDA_FILE}..."
+PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"${INSTALL_ARGS}\" -Wait -NoNewWindow"
+echo "Done!"
+rm -f "${CUDA_FILE}"
diff --git a/.github/workflows/cuda/cu124-Linux-env.sh b/.github/workflows/cuda/cu124-Linux-env.sh
new file mode 100644
index 00000000..19133e1a
--- /dev/null
+++ b/.github/workflows/cuda/cu124-Linux-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Build environment for CUDA 12.4 on Linux; meant to be sourced before building.
+
+# Exported so the build process launched after sourcing can see it; a plain
+# assignment would stay local to the sourcing shell.
+export CUDA_HOME=/usr/local/cuda-12.4
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0"
diff --git a/.github/workflows/cuda/cu124-Linux.sh b/.github/workflows/cuda/cu124-Linux.sh
new file mode 100755
index 00000000..f337d5f5
--- /dev/null
+++ b/.github/workflows/cuda/cu124-Linux.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Installs nvcc and the CUDA 12.4 development libraries via NVIDIA's local apt
+# repository package.
+
+OS=ubuntu2004
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-${OS}-12-4-local_12.4.1-550.54.15-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-12-4-local_12.4.1-550.54.15-1_amd64.deb
+sudo cp /var/cuda-repo-${OS}-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/
+
+sudo apt-get -qq update
+sudo apt install cuda-nvcc-12-4 cuda-libraries-dev-12-4
+sudo apt clean
+
+# Remove the downloaded installer package. It was saved by wget under its base
+# name in the working directory, so pass that name (not the URL) to rm.
+rm -f cuda-repo-${OS}-12-4-local_12.4.1-550.54.15-1_amd64.deb
diff --git a/.github/workflows/cuda/cu124-Windows-env.sh b/.github/workflows/cuda/cu124-Windows-env.sh
new file mode 100644
index 00000000..c47fdbbb
--- /dev/null
+++ b/.github/workflows/cuda/cu124-Windows-env.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Build environment for cu124 on Windows; meant to be sourced before building.
+# Prepends the CUDA v12.4 bin directory and the VS 2017 MSBuild directory to
+# PATH, and sets FORCE_CUDA / TORCH_CUDA_ARCH_LIST for the build.
+
+CUDA_HOME="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.4"
+MSBUILD_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
+# MSBuild goes first, so it ends up ahead of the CUDA bin directory.
+PATH="${MSBUILD_DIR}:${CUDA_HOME}/bin:${PATH}"
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="6.0+PTX"
diff --git a/.github/workflows/cuda/cu124-Windows.sh b/.github/workflows/cuda/cu124-Windows.sh
new file mode 100755
index 00000000..b85aeb69
--- /dev/null
+++ b/.github/workflows/cuda/cu124-Windows.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Prepares a Windows runner for cu124 builds: unpacks the NVIDIA driver DLLs
+# into System32, then runs the CUDA installer for a fixed list of toolkit
+# components.
+
+# Install NVIDIA drivers, see:
+# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
+DRIVER_ZIP="/tmp/gpu_driver_dlls.zip"
+curl -k -L --output "${DRIVER_ZIP}" "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download"
+7z x "${DRIVER_ZIP}" -o"/c/Windows/System32"
+
+export CUDA_SHORT=12.4
+export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.1/local_installers
+export CUDA_FILE=cuda_${CUDA_SHORT}.1_551.78_windows.exe
+
+# Installer arguments: the -s flag followed by one "<component>_<version>"
+# entry per toolkit component.
+INSTALL_ARGS="-s"
+for component in nvcc cuobjdump nvprune cupti cublas_dev cudart cufft_dev curand_dev cusolver_dev cusparse_dev thrust npp_dev nvrtc_dev nvml_dev; do
+  INSTALL_ARGS="${INSTALL_ARGS} ${component}_${CUDA_SHORT}"
+done
+
+# Install CUDA:
+curl -k -L --output "${CUDA_FILE}" "${CUDA_URL}/${CUDA_FILE}"
+echo
+echo "Installing from ${CUDA_FILE}..."
+PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"${INSTALL_ARGS}\" -Wait -NoNewWindow"
+echo "Done!"
+rm -f "${CUDA_FILE}"
diff --git a/.github/workflows/cuda/cu126-Linux-env.sh b/.github/workflows/cuda/cu126-Linux-env.sh
new file mode 100644
index 00000000..bad961ef
--- /dev/null
+++ b/.github/workflows/cuda/cu126-Linux-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Build environment for CUDA 12.6 on Linux; meant to be sourced before building.
+
+# Exported so the build process launched after sourcing can see it; a plain
+# assignment would stay local to the sourcing shell.
+export CUDA_HOME=/usr/local/cuda-12.6
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0"
diff --git a/.github/workflows/cuda/cu126-Linux.sh b/.github/workflows/cuda/cu126-Linux.sh
new file mode 100755
index 00000000..99759545
--- /dev/null
+++ b/.github/workflows/cuda/cu126-Linux.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Installs nvcc and the CUDA 12.6 development libraries via NVIDIA's local apt
+# repository package.
+
+OS=ubuntu2004
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda-repo-${OS}-12-6-local_12.6.0-560.28.03-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-12-6-local_12.6.0-560.28.03-1_amd64.deb
+sudo cp /var/cuda-repo-${OS}-12-6-local/cuda-*-keyring.gpg /usr/share/keyrings/
+
+sudo apt-get -qq update
+sudo apt install cuda-nvcc-12-6 cuda-libraries-dev-12-6
+sudo apt clean
+
+# Remove the downloaded installer package. It was saved by wget under its base
+# name in the working directory, so pass that name (not the URL) to rm.
+rm -f cuda-repo-${OS}-12-6-local_12.6.0-560.28.03-1_amd64.deb
diff --git a/.github/workflows/cuda/cu126-Windows-env.sh b/.github/workflows/cuda/cu126-Windows-env.sh
new file mode 100644
index 00000000..c68771bd
--- /dev/null
+++ b/.github/workflows/cuda/cu126-Windows-env.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Build environment for cu126 on Windows; meant to be sourced before building.
+# Prepends the CUDA v12.6 bin directory and the VS 2017 MSBuild directory to
+# PATH, and sets FORCE_CUDA / TORCH_CUDA_ARCH_LIST for the build.
+
+CUDA_HOME="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.6"
+MSBUILD_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
+# MSBuild goes first, so it ends up ahead of the CUDA bin directory.
+PATH="${MSBUILD_DIR}:${CUDA_HOME}/bin:${PATH}"
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="6.0+PTX"
diff --git a/.github/workflows/cuda/cu126-Windows.sh b/.github/workflows/cuda/cu126-Windows.sh
new file mode 100755
index 00000000..193d014b
--- /dev/null
+++ b/.github/workflows/cuda/cu126-Windows.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Prepares a Windows runner for cu126 builds: unpacks the NVIDIA driver DLLs
+# into System32, then runs the CUDA installer for a fixed list of toolkit
+# components.
+
+# Install NVIDIA drivers, see:
+# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
+DRIVER_ZIP="/tmp/gpu_driver_dlls.zip"
+curl -k -L --output "${DRIVER_ZIP}" "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download"
+7z x "${DRIVER_ZIP}" -o"/c/Windows/System32"
+
+export CUDA_SHORT=12.6
+export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
+export CUDA_FILE=cuda_${CUDA_SHORT}.0_560.76_windows.exe
+
+# Installer arguments: the -s flag followed by one "<component>_<version>"
+# entry per toolkit component.
+INSTALL_ARGS="-s"
+for component in nvcc cuobjdump nvprune cupti cublas_dev cudart cufft_dev curand_dev cusolver_dev cusparse_dev thrust npp_dev nvrtc_dev nvml_dev; do
+  INSTALL_ARGS="${INSTALL_ARGS} ${component}_${CUDA_SHORT}"
+done
+
+# Install CUDA:
+curl -k -L --output "${CUDA_FILE}" "${CUDA_URL}/${CUDA_FILE}"
+echo
+echo "Installing from ${CUDA_FILE}..."
+PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"${INSTALL_ARGS}\" -Wait -NoNewWindow"
+echo "Done!"
+rm -f "${CUDA_FILE}"
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 001145fd..78734f7c 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -1,23 +1,24 @@
 name: Linting
 
-on: [push, pull_request]
+# Lint on pushes to master and on every pull request.
+on:
+  push:
+    branches:
+      - master
+  pull_request:
 
 jobs:
 
   flake8:
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      matrix:
-        os: [ubuntu-latest]
-        python-version: [3.6]
+    runs-on: ubuntu-latest
 
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
+
+      - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: '3.8'
 
       - name: Install dependencies
         run: |
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index df8ceb07..1f22fd88 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -1,6 +1,10 @@
 name: Testing
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - master
+  pull_request:
 
 jobs:
 
@@ -8,10 +12,11 @@ jobs:
     runs-on: ${{ matrix.os }}
 
     strategy:
+      fail-fast: false
       matrix:
-        os: [ubuntu-latest, windows-latest]
-        python-version: [3.6]
-        torch-version: [1.9.0, 1.10.0]
+        os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm]
+        python-version: ['3.9']
+        torch-version: [2.5.0, 2.6.0]
 
     steps:
       - uses: actions/checkout@v2
@@ -22,24 +27,28 @@ jobs:
 
       - name: Install PyTorch ${{ matrix.torch-version }}
         run: |
-          pip install torch==${{ matrix.torch-version}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+          pip install torch==${{ matrix.torch-version }} --extra-index-url https://download.pytorch.org/whl/cpu
 
-      - name: Install main package
+      # Overwrites lines 31-38 of torch's ATen/Parallel.h; re-check the range when torch-version changes.
+      - name: Patch PyTorch static constexpr on Windows
+        if: ${{ runner.os == 'Windows' }}
         run: |
-          pip install -e .[test]
+          Torch_DIR=`python -c 'import os; import torch; print(os.path.dirname(torch.__file__))'`
+          sed -i '31,38c\
+          TORCH_API void lazy_init_num_threads();' ${Torch_DIR}/include/ATen/Parallel.h
+        shell: bash
 
-      - name: Run test-suite
+      - name: Install main package
         run: |
-          python setup.py test
+          python setup.py develop
 
-      - name: Generate coverage report
-        if: success()
+      - name: Run test-suite
         run: |
-          pip install coverage
-          coverage xml
+          pip install pytest pytest-cov
+          pytest --cov --cov-report=xml
 
-      - name: Upload coverage report to codecov
-        uses: codecov/codecov-action@v1
+      - name: Upload coverage
+        uses: codecov/codecov-action@v4
         if: success()
         with:
-          file: coverage.xml
+          fail_ci_if_error: false
diff --git a/.gitignore b/.gitignore
index b9233c5b..d334126f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ dist/
 *.aux
 *.log
 *.pdf
+*.hip
+*_hip.cpp
+hip
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b180c461..e0a5e11f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,9 +1,11 @@
 cmake_minimum_required(VERSION 3.0)
 project(torchscatter)
-set(CMAKE_CXX_STANDARD 14)
-set(TORCHSCATTER_VERSION 2.0.9)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)  # set() without a value unsets the variable; ON enforces C++17
+set(TORCHSCATTER_VERSION 2.1.2)
 
 option(WITH_CUDA "Enable CUDA support" OFF)
+option(WITH_PYTHON "Link to Python when building" ON)
 
 if(WITH_CUDA)
   enable_language(CUDA)
@@ -12,21 +14,27 @@ if(WITH_CUDA)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 endif()
 
-find_package(Python3 COMPONENTS Development)
+if (WITH_PYTHON)
+  add_definitions(-DWITH_PYTHON)
+  find_package(Python3 COMPONENTS Development)
+endif()
 find_package(Torch REQUIRED)
 
-file(GLOB HEADERS csrc/scatter.h)
+file(GLOB HEADERS csrc/*.h)
 file(GLOB OPERATOR_SOURCES csrc/cpu/*.h csrc/cpu/*.cpp csrc/*.cpp)
 if(WITH_CUDA)
   file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} csrc/cuda/*.h csrc/cuda/*.cu)
 endif()
 
 add_library(${PROJECT_NAME} SHARED ${OPERATOR_SOURCES})
-target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} Python3::Python)
+target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
+if (WITH_PYTHON)
+  target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python)
+endif()
 set_target_properties(${PROJECT_NAME} PROPERTIES EXPORT_NAME TorchScatter)
 
 target_include_directories(${PROJECT_NAME} INTERFACE
-  $<BUILD_INTERFACE:${HEADERS}>
+  "$<BUILD_INTERFACE:${HEADERS}>"
   $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 
 include(GNUInstallDirs)
diff --git a/README.md b/README.md
index 527e1b00..59e33b92 100644
--- a/README.md
+++ b/README.md
@@ -41,52 +41,46 @@ All included operations are broadcastable, work on varying data types, are imple
 
 ## Installation
 
-### Anaconda
-
-**Update:** You can now install `pytorch-scatter` via [Anaconda](https://anaconda.org/pyg/pytorch-scatter) for all major OS/PyTorch/CUDA combinations 🤗
-Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
-
-```
-conda install pytorch-scatter -c pyg
-```
-
 ### Binaries
 
-We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
+We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
 
-#### PyTorch 1.10.0
+#### PyTorch 2.6
 
-To install the binaries for PyTorch 1.10.0, simply run
+To install the binaries for PyTorch 2.6.0, simply run
 
 ```
-pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html
+pip install torch-scatter -f https://data.pyg.org/whl/torch-2.6.0+${CUDA}.html
 ```
 
-where `${CUDA}` should be replaced by either `cpu`, `cu102`, or `cu113` depending on your PyTorch installation.
+where `${CUDA}` should be replaced by either `cpu`, `cu118`, `cu124`, or `cu126` depending on your PyTorch installation.
+
+|             | `cpu` | `cu118` | `cu124` | `cu126` |
+|-------------|-------|---------|---------|---------|
+| **Linux**   | ✅    | ✅      | ✅      | ✅      |
+| **Windows** | ✅    | ✅      | ✅      | ✅      |
+| **macOS**   | ✅    |         |         |         |
 
-|             | `cpu` | `cu102` | `cu113` |
-|-------------|-------|---------|---------|
-| **Linux**   | ✅    | ✅      | ✅      |
-| **Windows** | ✅    | ✅      | ✅      |
-| **macOS**   | ✅    |         |         |
 
-#### PyTorch 1.9.0/1.9.1
+#### PyTorch 2.5
 
-To install the binaries for PyTorch 1.9.0 and 1.9.1, simply run
+To install the binaries for PyTorch 2.5.0/2.5.1, simply run
 
 ```
-pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+${CUDA}.html
+pip install torch-scatter -f https://data.pyg.org/whl/torch-2.5.0+${CUDA}.html
 ```
 
-where `${CUDA}` should be replaced by either `cpu`, `cu102`, or `cu111` depending on your PyTorch installation.
+where `${CUDA}` should be replaced by either `cpu`, `cu118`, `cu121`, or `cu124` depending on your PyTorch installation.
 
-|             | `cpu` | `cu102` | `cu111` |
-|-------------|-------|---------|---------|
-| **Linux**   | ✅    | ✅      | ✅      |
-| **Windows** | ✅    | ✅      | ✅      |
-| **macOS**   | ✅    |         |         |
+|             | `cpu` | `cu118` | `cu121` | `cu124` |
+|-------------|-------|---------|---------|---------|
+| **Linux**   | ✅    | ✅      | ✅      | ✅      |
+| **Windows** | ✅    | ✅      | ✅      | ✅      |
+| **macOS**   | ✅    |         |         |         |
 
-**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1 and PyTorch 1.8.0/1.8.1 (following the same procedure).
+**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1, PyTorch 1.9.0, PyTorch 1.10.0/1.10.1/1.10.2, PyTorch 1.11.0, PyTorch 1.12.0/1.12.1, PyTorch 1.13.0/1.13.1, PyTorch 2.0.0/2.0.1, PyTorch 2.1.0/2.1.1/2.1.2, PyTorch 2.2.0/2.2.1/2.2.2, PyTorch 2.3.0/2.3.1, and PyTorch 2.4.0/2.4.1 (following the same procedure).
+For older versions, you need to explicitly specify the latest supported version number or install via `pip install --no-index` in order to prevent a manual installation from source.
+You can look up the latest supported version number [here](https://data.pyg.org/whl).
 
 ### From source
 
@@ -141,18 +135,19 @@ tensor([[5, 5, 3, 4, 0, 1]
 ## Running tests
 
 ```
-python setup.py test
+pytest
 ```
 
 ## C++ API
 
 `torch-scatter` also offers a C++ API that contains C++ equivalent of python models.
+For this, we need to add `TorchLib` to the `-DCMAKE_PREFIX_PATH` (run `python -c 'import torch; print(torch.utils.cmake_prefix_path)'` to obtain it).
 
 ```
 mkdir build
 cd build
-# Add -DWITH_CUDA=on support for the CUDA if needed
-cmake ..
+# Add -DWITH_CUDA=on for CUDA support
+cmake -DCMAKE_PREFIX_PATH="..." ..
 make
 make install
 ```
diff --git a/conda/pytorch-scatter/README.md b/conda/pytorch-scatter/README.md
deleted file mode 100644
index f7fe27e8..00000000
--- a/conda/pytorch-scatter/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-```
-./build_conda.sh 3.9 1.9.0 cu111  # python, pytorch and cuda version
-```
diff --git a/conda/pytorch-scatter/build_conda.sh b/conda/pytorch-scatter/build_conda.sh
deleted file mode 100755
index 00f5123c..00000000
--- a/conda/pytorch-scatter/build_conda.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-export PYTHON_VERSION=$1
-export TORCH_VERSION=$2
-export CUDA_VERSION=$3
-
-export CONDA_PYTORCH_CONSTRAINT="pytorch==${TORCH_VERSION%.*}.*"
-
-if [ "${CUDA_VERSION}" = "cpu" ]; then
-  export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly  # [not osx]"
-else
-  case $CUDA_VERSION in
-    cu113)
-      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.3.*"
-      ;;
-    cu111)
-      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.1.*"
-      ;;
-    cu102)
-      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.2.*"
-      ;;
-    cu101)
-      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.1.*"
-      ;;
-    *)
-      echo "Unrecognized CUDA_VERSION=$CUDA_VERSION"
-      exit 1
-      ;;
-  esac
-fi
-
-echo "PyTorch $TORCH_VERSION+$CUDA_VERSION"
-echo "- $CONDA_PYTORCH_CONSTRAINT"
-echo "- $CONDA_CUDATOOLKIT_CONSTRAINT"
-
-conda build . -c nvidia -c pytorch -c default -c conda-forge --output-folder "$HOME/conda-bld"
diff --git a/conda/pytorch-scatter/meta.yaml b/conda/pytorch-scatter/meta.yaml
deleted file mode 100644
index 0b24e406..00000000
--- a/conda/pytorch-scatter/meta.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-package:
-  name: pytorch-scatter
-  version: 2.0.9
-
-source:
-  path: ../..
-
-requirements:
-  build:
-    - {{ compiler('c') }}  # [win]
-
-  host:
-    - pip
-    - python {{ environ.get('PYTHON_VERSION') }}
-    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
-    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}
-
-  run:
-    - python {{ environ.get('PYTHON_VERSION') }}
-    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
-    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}
-
-build:
-  string: py{{ environ.get('PYTHON_VERSION').replace('.', '') }}_torch_{{ environ['TORCH_VERSION'] }}_{{ environ['CUDA_VERSION'] }}
-  script: pip install .
-  script_env:
-    - FORCE_CUDA
-    - TORCH_CUDA_ARCH_LIST
-
-test:
-  imports:
-    - torch_scatter
-
-about:
-  home: https://github.com/rusty1s/pytorch_scatter
-  license: MIT
-  summary: PyTorch Extension Library of Optimized Scatter Operations
diff --git a/csrc/cpu/index_info.h b/csrc/cpu/index_info.h
index 9709a1de..5e9ed0b4 100644
--- a/csrc/cpu/index_info.h
+++ b/csrc/cpu/index_info.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
 #define MAX_TENSORINFO_DIMS 25
 
diff --git a/csrc/cpu/scatter_cpu.cpp b/csrc/cpu/scatter_cpu.cpp
index 77a43969..64b8ac3b 100644
--- a/csrc/cpu/scatter_cpu.cpp
+++ b/csrc/cpu/scatter_cpu.cpp
@@ -4,10 +4,10 @@
 #include "reducer.h"
 #include "utils.h"
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 scatter_cpu(torch::Tensor src, torch::Tensor index, int64_t dim,
-            torch::optional<torch::Tensor> optional_out,
-            torch::optional<int64_t> dim_size, std::string reduce) {
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size, std::string reduce) {
   CHECK_CPU(src);
   CHECK_CPU(index);
   if (optional_out.has_value())
@@ -36,7 +36,7 @@ scatter_cpu(torch::Tensor src, torch::Tensor index, int64_t dim,
     out = torch::empty(sizes, src.options());
   }
 
-  torch::optional<torch::Tensor> arg_out = torch::nullopt;
+  std::optional<torch::Tensor> arg_out = std::nullopt;
   int64_t *arg_out_data = nullptr;
   if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
     arg_out = torch::full_like(out, src.size(dim), index.options());
@@ -57,7 +57,7 @@ scatter_cpu(torch::Tensor src, torch::Tensor index, int64_t dim,
   auto N = out.size(dim);
 
   auto index_info = getTensorInfo<int64_t>(index);
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "scatter_cpu", [&] {
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
diff --git a/csrc/cpu/scatter_cpu.h b/csrc/cpu/scatter_cpu.h
index 25122e70..eb279c50 100644
--- a/csrc/cpu/scatter_cpu.h
+++ b/csrc/cpu/scatter_cpu.h
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 scatter_cpu(torch::Tensor src, torch::Tensor index, int64_t dim,
-            torch::optional<torch::Tensor> optional_out,
-            torch::optional<int64_t> dim_size, std::string reduce);
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size, std::string reduce);
diff --git a/csrc/cpu/segment_coo_cpu.cpp b/csrc/cpu/segment_coo_cpu.cpp
index 8497301f..64f975dd 100644
--- a/csrc/cpu/segment_coo_cpu.cpp
+++ b/csrc/cpu/segment_coo_cpu.cpp
@@ -3,11 +3,12 @@
 #include "index_info.h"
 #include "reducer.h"
 #include "utils.h"
+#include <ATen/OpMathType.h>
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_coo_cpu(torch::Tensor src, torch::Tensor index,
-                torch::optional<torch::Tensor> optional_out,
-                torch::optional<int64_t> dim_size, std::string reduce) {
+                std::optional<torch::Tensor> optional_out,
+                std::optional<int64_t> dim_size, std::string reduce) {
   CHECK_CPU(src);
   CHECK_CPU(index);
   if (optional_out.has_value())
@@ -44,7 +45,7 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index,
     out = torch::empty(sizes, src.options());
   }
 
-  torch::optional<torch::Tensor> arg_out = torch::nullopt;
+  std::optional<torch::Tensor> arg_out = std::nullopt;
   int64_t *arg_out_data = nullptr;
   if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
     arg_out = torch::full_like(out, src.size(dim), index.options());
@@ -69,12 +70,13 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index,
   auto index_info = getTensorInfo<int64_t>(index);
   auto stride = index_info.strides[index_info.dims - 1];
   std::vector<int64_t> args(K);
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "segment_coo_cpu", [&] {
+    using opmath_t = at::opmath_type<scalar_t>;
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
     scalar_t *count_data = nullptr;
 
-    std::vector<scalar_t> vals(K);
+    std::vector<opmath_t> vals(K);
     int64_t idx, next_idx, row_start;
     AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
       if (!optional_out.has_value())
@@ -87,19 +89,19 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index,
         idx = index_info.data[offset];
 
         for (auto k = 0; k < K; k++)
-          vals[k] = out_data[b * N * K + k];
+          vals[k] = static_cast<opmath_t>(out_data[b * N * K + k]);
 
         row_start = 0;
         for (auto e = 0; e < E; e++) {
 
           for (auto k = 0; k < K; k++)
-            Reducer<scalar_t, REDUCE>::update(
-                &vals[k], src_data[b * E * K + e * K + k], &args[k], e);
+            Reducer<opmath_t, REDUCE>::update(
+                &vals[k], static_cast<opmath_t>(src_data[b * E * K + e * K + k]), &args[k], e);
 
           if (e == E - 1) {
             for (auto k = 0; k < K; k++)
               Reducer<scalar_t, REDUCE>::write(
-                  out_data + b * N * K + idx * K + k, vals[k],
+                  out_data + b * N * K + idx * K + k, static_cast<scalar_t>(vals[k]),
                   arg_out_data + b * N * K + idx * K + k, args[k],
                   e + 1 - row_start);
             if (REDUCE == MEAN)
@@ -111,11 +113,11 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index,
             if (idx != next_idx) {
               for (auto k = 0; k < K; k++) {
                 Reducer<scalar_t, REDUCE>::write(
-                    out_data + b * N * K + idx * K + k, vals[k],
+                    out_data + b * N * K + idx * K + k, static_cast<scalar_t>(vals[k]),
                     arg_out_data + b * N * K + idx * K + k, args[k],
                     e + 1 - row_start);
 
-                vals[k] = out_data[b * N * K + next_idx * K + k];
+                vals[k] = static_cast<opmath_t>(out_data[b * N * K + next_idx * K + k]);
               }
               if (REDUCE == MEAN)
                 count_data[b * N + idx] = (scalar_t)(e + 1 - row_start);
@@ -139,7 +141,7 @@ segment_coo_cpu(torch::Tensor src, torch::Tensor index,
 }
 
 torch::Tensor gather_coo_cpu(torch::Tensor src, torch::Tensor index,
-                             torch::optional<torch::Tensor> optional_out) {
+                             std::optional<torch::Tensor> optional_out) {
   CHECK_CPU(src);
   CHECK_CPU(index);
   if (optional_out.has_value())
@@ -178,7 +180,7 @@ torch::Tensor gather_coo_cpu(torch::Tensor src, torch::Tensor index,
 
   auto index_info = getTensorInfo<int64_t>(index);
   auto stride = index_info.strides[index_info.dims - 1];
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "gather_coo_cpu", [&] {
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
diff --git a/csrc/cpu/segment_coo_cpu.h b/csrc/cpu/segment_coo_cpu.h
index feb7a827..e0501209 100644
--- a/csrc/cpu/segment_coo_cpu.h
+++ b/csrc/cpu/segment_coo_cpu.h
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_coo_cpu(torch::Tensor src, torch::Tensor index,
-                torch::optional<torch::Tensor> optional_out,
-                torch::optional<int64_t> dim_size, std::string reduce);
+                std::optional<torch::Tensor> optional_out,
+                std::optional<int64_t> dim_size, std::string reduce);
 
 torch::Tensor gather_coo_cpu(torch::Tensor src, torch::Tensor index,
-                             torch::optional<torch::Tensor> optional_out);
+                             std::optional<torch::Tensor> optional_out);
diff --git a/csrc/cpu/segment_csr_cpu.cpp b/csrc/cpu/segment_csr_cpu.cpp
index a826192c..18be4023 100644
--- a/csrc/cpu/segment_csr_cpu.cpp
+++ b/csrc/cpu/segment_csr_cpu.cpp
@@ -3,10 +3,11 @@
 #include "index_info.h"
 #include "reducer.h"
 #include "utils.h"
+#include <ATen/OpMathType.h>
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_csr_cpu(torch::Tensor src, torch::Tensor indptr,
-                torch::optional<torch::Tensor> optional_out,
+                std::optional<torch::Tensor> optional_out,
                 std::string reduce) {
   CHECK_CPU(src);
   CHECK_CPU(indptr);
@@ -37,7 +38,7 @@ segment_csr_cpu(torch::Tensor src, torch::Tensor indptr,
     out = torch::empty(sizes, src.options());
   }
 
-  torch::optional<torch::Tensor> arg_out = torch::nullopt;
+  std::optional<torch::Tensor> arg_out = std::nullopt;
   int64_t *arg_out_data = nullptr;
   if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
     arg_out = torch::full(out.sizes(), src.size(dim), indptr.options());
@@ -57,11 +58,12 @@ segment_csr_cpu(torch::Tensor src, torch::Tensor indptr,
   auto indptr_info = getTensorInfo<int64_t>(indptr);
   auto stride = indptr_info.strides[indptr_info.dims - 1];
   std::vector<int64_t> args(K);
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "segment_csr_cpu", [&] {
+    using opmath_t = at::opmath_type<scalar_t>;
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
-    std::vector<scalar_t> vals(K);
+    std::vector<opmath_t> vals(K);
     int64_t row_start, row_end;
     AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
       for (auto n = 0; n < N; n++) {
@@ -71,15 +73,15 @@ segment_csr_cpu(torch::Tensor src, torch::Tensor indptr,
 
         offset = (n / (indptr.size(-1) - 1)) * E * K;
         for (auto k = 0; k < K; k++)
-          vals[k] = Reducer<scalar_t, REDUCE>::init();
+          vals[k] = Reducer<opmath_t, REDUCE>::init();
 
         for (auto e = row_start; e < row_end; e++)
           for (auto k = 0; k < K; k++)
-            Reducer<scalar_t, REDUCE>::update(
-                &vals[k], src_data[offset + e * K + k], &args[k], e);
+            Reducer<opmath_t, REDUCE>::update(
+                &vals[k], static_cast<opmath_t>(src_data[offset + e * K + k]), &args[k], e);
 
         for (auto k = 0; k < K; k++)
-          Reducer<scalar_t, REDUCE>::write(out_data + n * K + k, vals[k],
+          Reducer<scalar_t, REDUCE>::write(out_data + n * K + k, static_cast<scalar_t>(vals[k]),
                                            arg_out_data + n * K + k, args[k],
                                            row_end - row_start);
       }
@@ -90,7 +92,7 @@ segment_csr_cpu(torch::Tensor src, torch::Tensor indptr,
 }
 
 torch::Tensor gather_csr_cpu(torch::Tensor src, torch::Tensor indptr,
-                             torch::optional<torch::Tensor> optional_out) {
+                             std::optional<torch::Tensor> optional_out) {
   CHECK_CPU(src);
   CHECK_CPU(indptr);
   if (optional_out.has_value())
@@ -135,7 +137,7 @@ torch::Tensor gather_csr_cpu(torch::Tensor src, torch::Tensor indptr,
 
   auto indptr_info = getTensorInfo<int64_t>(indptr);
   auto stride = indptr_info.strides[indptr_info.dims - 1];
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "gather_csr_cpu", [&] {
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
diff --git a/csrc/cpu/segment_csr_cpu.h b/csrc/cpu/segment_csr_cpu.h
index b93d450b..7b4da0a2 100644
--- a/csrc/cpu/segment_csr_cpu.h
+++ b/csrc/cpu/segment_csr_cpu.h
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_csr_cpu(torch::Tensor src, torch::Tensor indptr,
-                torch::optional<torch::Tensor> optional_out,
+                std::optional<torch::Tensor> optional_out,
                 std::string reduce);
 
 torch::Tensor gather_csr_cpu(torch::Tensor src, torch::Tensor indptr,
-                             torch::optional<torch::Tensor> optional_out);
+                             std::optional<torch::Tensor> optional_out);
diff --git a/csrc/cpu/utils.h b/csrc/cpu/utils.h
index 40dfb344..66ae38bf 100644
--- a/csrc/cpu/utils.h
+++ b/csrc/cpu/utils.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
 #define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
 #define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
diff --git a/csrc/cuda/atomics.cuh b/csrc/cuda/atomics.cuh
index 8a7c4724..16b24fc0 100644
--- a/csrc/cuda/atomics.cuh
+++ b/csrc/cuda/atomics.cuh
@@ -68,8 +68,8 @@
                                                                                \
   template <typename scalar, size_t size> struct Atomic##NAME##DecimalImpl;    \
                                                                                \
-  template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 2> {     \
-    inline __device__ void operator()(scalar *address, scalar val) {           \
+  template <> struct Atomic##NAME##DecimalImpl<at::Half, 2> {                  \
+    inline __device__ void operator()(at::Half *address, at::Half val) {       \
       unsigned int *address_as_ui =                                            \
           (unsigned int *)((char *)address - ((size_t)address & 2));           \
       unsigned int old = *address_as_ui;                                       \
@@ -87,6 +87,25 @@
     }                                                                          \
   };                                                                           \
                                                                                \
+  template <> struct Atomic##NAME##DecimalImpl<at::BFloat16, 2> {              \
+    inline __device__ void operator()(at::BFloat16 *address, at::BFloat16 val){\
+      unsigned int *address_as_ui =                                            \
+          (unsigned int *)((char *)address - ((size_t)address & 2));           \
+      unsigned int old = *address_as_ui;                                       \
+      unsigned int assumed;                                                    \
+                                                                               \
+      do {                                                                     \
+        assumed = old;                                                         \
+        at::BFloat16 hsum;                                                     \
+        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);           \
+        hsum = OP(hsum, val);                                                  \
+        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16)            \
+                                  : (old & 0xffff0000) | hsum.x;               \
+        old = atomicCAS(address_as_ui, assumed, old);                          \
+      } while (assumed != old);                                                \
+    }                                                                          \
+  };                                                                           \
+                                                                               \
   template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 4> {     \
     inline __device__ void operator()(scalar *address, scalar val) {           \
       int *address_as_i = (int *)address;                                      \
@@ -135,7 +154,7 @@ static inline __device__ void atomAdd(int32_t *address, int32_t val) {
 static inline __device__ void atomAdd(int64_t *address, int64_t val) {
   AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
 }
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || CUDA_VERSION < 10000)
+#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || CUDA_VERSION < 10000))
 static inline __device__ void atomAdd(at::Half *address, at::Half val) {
   AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
 }
@@ -156,6 +175,9 @@ static inline __device__ void atomAdd(double *address, double val) {
   atomicAdd(address, val);
 }
 #endif
+static inline __device__ void atomAdd(at::BFloat16 *address, at::BFloat16 val) {
+  AtomicAddDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
+}
 
 #define OP(X, Y) Y *X
 ATOMIC(Mul)
@@ -184,6 +206,9 @@ static inline __device__ void atomMul(at::Half *address, at::Half val) {
 static inline __device__ void atomMul(double *address, double val) {
   AtomicMulDecimalImpl<double, sizeof(double)>()(address, val);
 }
+static inline __device__ void atomMul(at::BFloat16 *address, at::BFloat16 val) {
+  AtomicMulDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
+}
 
 #define OP(X, Y) Y / X
 ATOMIC(Div)
@@ -212,6 +237,9 @@ static inline __device__ void atomDiv(float *address, float val) {
 static inline __device__ void atomDiv(double *address, double val) {
   AtomicDivDecimalImpl<double, sizeof(double)>()(address, val);
 }
+static inline __device__ void atomDiv(at::BFloat16 *address, at::BFloat16 val) {
+  AtomicDivDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
+}
 
 #define OP(X, Y) max(Y, X)
 ATOMIC(Max)
@@ -240,6 +268,9 @@ static inline __device__ void atomMax(float *address, float val) {
 static inline __device__ void atomMax(double *address, double val) {
   AtomicMaxDecimalImpl<double, sizeof(double)>()(address, val);
 }
+static inline __device__ void atomMax(at::BFloat16 *address, at::BFloat16 val) {
+  AtomicMaxDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
+}
 
 #define OP(X, Y) min(Y, X)
 ATOMIC(Min)
@@ -268,3 +299,6 @@ static inline __device__ void atomMin(float *address, float val) {
 static inline __device__ void atomMin(double *address, double val) {
   AtomicMinDecimalImpl<double, sizeof(double)>()(address, val);
 }
+static inline __device__ void atomMin(at::BFloat16 *address, at::BFloat16 val) {
+  AtomicMinDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
+}
diff --git a/csrc/cuda/reducer.cuh b/csrc/cuda/reducer.cuh
index 8c851d20..577f4286 100644
--- a/csrc/cuda/reducer.cuh
+++ b/csrc/cuda/reducer.cuh
@@ -16,27 +16,27 @@ const std::map<std::string, ReductionType> reduce2REDUCE = {
   [&] {                                                                        \
     switch (reduce2REDUCE.at(reduce)) {                                        \
     case SUM: {                                                                \
-      const ReductionType REDUCE = SUM;                                        \
+      static constexpr ReductionType REDUCE = SUM;                                        \
       return __VA_ARGS__();                                                    \
     }                                                                          \
     case MEAN: {                                                               \
-      const ReductionType REDUCE = MEAN;                                       \
+      static constexpr ReductionType REDUCE = MEAN;                                       \
       return __VA_ARGS__();                                                    \
     }                                                                          \
     case MUL: {                                                                \
-      const ReductionType REDUCE = MUL;                                        \
+      static constexpr ReductionType REDUCE = MUL;                                        \
       return __VA_ARGS__();                                                    \
     }                                                                          \
     case DIV: {                                                                \
-      const ReductionType REDUCE = DIV;                                        \
+      static constexpr ReductionType REDUCE = DIV;                                        \
       return __VA_ARGS__();                                                    \
     }                                                                          \
     case MIN: {                                                                \
-      const ReductionType REDUCE = MIN;                                        \
+      static constexpr ReductionType REDUCE = MIN;                                        \
       return __VA_ARGS__();                                                    \
     }                                                                          \
     case MAX: {                                                                \
-      const ReductionType REDUCE = MAX;                                        \
+      static constexpr ReductionType REDUCE = MAX;                                        \
       return __VA_ARGS__();                                                    \
     }                                                                          \
     }                                                                          \
diff --git a/csrc/cuda/scatter_cuda.cu b/csrc/cuda/scatter_cuda.cu
index bcf5d7eb..83db7ccd 100644
--- a/csrc/cuda/scatter_cuda.cu
+++ b/csrc/cuda/scatter_cuda.cu
@@ -7,7 +7,7 @@
 #include "reducer.cuh"
 #include "utils.cuh"
 
-#define THREADS 1024
+#define THREADS 256
 #define BLOCKS(N) (N + THREADS - 1) / THREADS
 
 template <typename scalar_t, ReductionType REDUCE>
@@ -55,15 +55,15 @@ scatter_arg_kernel(const scalar_t *src_data,
   }
 }
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
-             torch::optional<torch::Tensor> optional_out,
-             torch::optional<int64_t> dim_size, std::string reduce) {
+             std::optional<torch::Tensor> optional_out,
+             std::optional<int64_t> dim_size, std::string reduce) {
   CHECK_CUDA(src);
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  cudaSetDevice(src.get_device());
+  c10::cuda::MaybeSetDevice(src.get_device());
 
   CHECK_INPUT(src.dim() == index.dim());
   for (auto i = 0; i < index.dim() - 1; i++)
@@ -89,7 +89,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
     out = torch::empty(sizes, src.options());
   }
 
-  torch::optional<torch::Tensor> arg_out = torch::nullopt;
+  std::optional<torch::Tensor> arg_out = std::nullopt;
   int64_t *arg_out_data = nullptr;
   if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
     arg_out = torch::full_like(out, src.size(dim), index.options());
@@ -111,7 +111,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
 
   auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
   auto stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
diff --git a/csrc/cuda/scatter_cuda.h b/csrc/cuda/scatter_cuda.h
index 95c80642..a0496793 100644
--- a/csrc/cuda/scatter_cuda.h
+++ b/csrc/cuda/scatter_cuda.h
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
-             torch::optional<torch::Tensor> optional_out,
-             torch::optional<int64_t> dim_size, std::string reduce);
+             std::optional<torch::Tensor> optional_out,
+             std::optional<int64_t> dim_size, std::string reduce);
diff --git a/csrc/cuda/segment_coo_cuda.cu b/csrc/cuda/segment_coo_cuda.cu
index ad30f972..29610dfc 100644
--- a/csrc/cuda/segment_coo_cuda.cu
+++ b/csrc/cuda/segment_coo_cuda.cu
@@ -36,8 +36,8 @@ segment_coo_kernel(const scalar_t *src_data,
 #pragma unroll
     for (int i = 1; i < 32; i *= 2) {
       // Parallel reduction inside a single warp.
-      tmp = __shfl_up_sync(FULL_MASK, val, i);
-      next_idx = __shfl_up_sync(FULL_MASK, idx, i);
+      tmp = SHFL_UP_SYNC(FULL_MASK, val, i);
+      next_idx = SHFL_UP_SYNC(FULL_MASK, idx, i);
       if (lane_idx >= i && row_idx / D == (row_idx - i) / D) {
         assert(idx >= next_idx);
         if (idx == next_idx)
@@ -45,7 +45,7 @@ segment_coo_kernel(const scalar_t *src_data,
       }
     }
 
-    next_idx = __shfl_down_sync(FULL_MASK, idx, 1);
+    next_idx = SHFL_DOWN_SYNC(FULL_MASK, idx, 1);
     if (lane_idx == 32 - 1 || row_idx / D != (row_idx + 1) / D ||
         idx != next_idx)
       Reducer<scalar_t, REDUCE>::atomic_write(out_data + out_idx, val);
@@ -149,15 +149,15 @@ __global__ void segment_coo_arg_broadcast_kernel(
   }
 }
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_coo_cuda(torch::Tensor src, torch::Tensor index,
-                 torch::optional<torch::Tensor> optional_out,
-                 torch::optional<int64_t> dim_size, std::string reduce) {
+                 std::optional<torch::Tensor> optional_out,
+                 std::optional<int64_t> dim_size, std::string reduce) {
   CHECK_CUDA(src);
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  cudaSetDevice(src.get_device());
+  c10::cuda::MaybeSetDevice(src.get_device());
 
   CHECK_INPUT(src.dim() >= index.dim());
 
@@ -191,7 +191,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
     out = torch::zeros(sizes, src.options());
   }
 
-  torch::optional<torch::Tensor> arg_out = torch::nullopt;
+  std::optional<torch::Tensor> arg_out = std::nullopt;
   int64_t *arg_out_data = nullptr;
   if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
     arg_out = torch::full_like(out, src.size(dim), index.options());
@@ -214,7 +214,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
 
   auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
   auto stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
@@ -325,12 +325,12 @@ __global__ void gather_coo_broadcast_kernel(
 }
 
 torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
-                              torch::optional<torch::Tensor> optional_out) {
+                              std::optional<torch::Tensor> optional_out) {
   CHECK_CUDA(src);
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  cudaSetDevice(src.get_device());
+  c10::cuda::MaybeSetDevice(src.get_device());
 
   CHECK_INPUT(src.dim() >= index.dim());
 
@@ -365,7 +365,7 @@ torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
 
   auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
   auto stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
diff --git a/csrc/cuda/segment_coo_cuda.h b/csrc/cuda/segment_coo_cuda.h
index 68154775..93f2aee6 100644
--- a/csrc/cuda/segment_coo_cuda.h
+++ b/csrc/cuda/segment_coo_cuda.h
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_coo_cuda(torch::Tensor src, torch::Tensor index,
-                 torch::optional<torch::Tensor> optional_out,
-                 torch::optional<int64_t> dim_size, std::string reduce);
+                 std::optional<torch::Tensor> optional_out,
+                 std::optional<int64_t> dim_size, std::string reduce);
 
 torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
-                              torch::optional<torch::Tensor> optional_out);
+                              std::optional<torch::Tensor> optional_out);
diff --git a/csrc/cuda/segment_csr_cuda.cu b/csrc/cuda/segment_csr_cuda.cu
index d88f3250..7061ba69 100644
--- a/csrc/cuda/segment_csr_cuda.cu
+++ b/csrc/cuda/segment_csr_cuda.cu
@@ -46,9 +46,9 @@ segment_csr_kernel(const scalar_t *src_data,
     for (int i = TB / 2; i > 0; i /= 2) {
       // Parallel reduction inside a single warp.
       if (REDUCE == MIN || REDUCE == MAX)
-        arg_tmp = __shfl_down_sync(FULL_MASK, arg, i);
+        arg_tmp = SHFL_DOWN_SYNC(FULL_MASK, arg, i);
       Reducer<scalar_t, REDUCE>::update(
-          &val, __shfl_down_sync(FULL_MASK, val, i), &arg, arg_tmp);
+          &val, SHFL_DOWN_SYNC(FULL_MASK, val, i), &arg, arg_tmp);
     }
 
     if (lane_idx == 0) {
@@ -94,15 +94,15 @@ __global__ void segment_csr_broadcast_kernel(
   }
 }
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
-                 torch::optional<torch::Tensor> optional_out,
+                 std::optional<torch::Tensor> optional_out,
                  std::string reduce) {
   CHECK_CUDA(src);
   CHECK_CUDA(indptr);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  cudaSetDevice(src.get_device());
+  c10::cuda::MaybeSetDevice(src.get_device());
 
   CHECK_INPUT(src.dim() >= indptr.dim());
 
@@ -128,7 +128,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
     out = torch::empty(sizes, src.options());
   }
 
-  torch::optional<torch::Tensor> arg_out = torch::nullopt;
+  std::optional<torch::Tensor> arg_out = std::nullopt;
   int64_t *arg_out_data = nullptr;
   if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
     arg_out = torch::full(out.sizes(), src.size(dim), indptr.options());
@@ -147,7 +147,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
 
   auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
   auto stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
@@ -217,12 +217,12 @@ __global__ void gather_csr_broadcast_kernel(
 }
 
 torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
-                              torch::optional<torch::Tensor> optional_out) {
+                              std::optional<torch::Tensor> optional_out) {
   CHECK_CUDA(src);
   CHECK_CUDA(indptr);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  cudaSetDevice(src.get_device());
+  c10::cuda::MaybeSetDevice(src.get_device());
 
   CHECK_INPUT(src.dim() >= indptr.dim());
 
@@ -264,7 +264,7 @@ torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
 
   auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
   auto stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
     auto src_data = src.data_ptr<scalar_t>();
     auto out_data = out.data_ptr<scalar_t>();
 
diff --git a/csrc/cuda/segment_csr_cuda.h b/csrc/cuda/segment_csr_cuda.h
index 5f8bd40e..8014e766 100644
--- a/csrc/cuda/segment_csr_cuda.h
+++ b/csrc/cuda/segment_csr_cuda.h
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
-                 torch::optional<torch::Tensor> optional_out,
+                 std::optional<torch::Tensor> optional_out,
                  std::string reduce);
 
 torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
-                              torch::optional<torch::Tensor> optional_out);
+                              std::optional<torch::Tensor> optional_out);
diff --git a/csrc/cuda/utils.cuh b/csrc/cuda/utils.cuh
index d08b0536..396b4fa1 100644
--- a/csrc/cuda/utils.cuh
+++ b/csrc/cuda/utils.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "../extensions.h"
 
 #define CHECK_CUDA(x)                                                          \
   AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
@@ -9,11 +9,32 @@
 __device__ __inline__ at::Half __shfl_up_sync(const unsigned mask,
                                               const at::Half var,
                                               const unsigned int delta) {
-  return __shfl_up_sync(mask, (__half)var, delta);
+  return __shfl_up_sync(mask, var.operator __half(), delta);
 }
 
 __device__ __inline__ at::Half __shfl_down_sync(const unsigned mask,
                                                 const at::Half var,
                                                 const unsigned int delta) {
-  return __shfl_down_sync(mask, (__half)var, delta);
+  return __shfl_down_sync(mask, var.operator __half(), delta);
 }
+
+__device__ __inline__ at::Half __shfl_up(const at::Half var,
+                                         const unsigned int delta) {
+  return __shfl_up(var.operator __half(), delta); // mask-less at::Half overload; passes the value on as __half
+}
+
+__device__ __inline__ at::Half __shfl_down(const at::Half var,
+                                           const unsigned int delta) {
+  return __shfl_down(var.operator __half(), delta); // mask-less at::Half overload; passes the value on as __half
+}
+
+#ifdef USE_ROCM
+__device__ __inline__ at::Half __ldg(const at::Half* ptr) { // at::Half overload of __ldg
+  return __ldg(reinterpret_cast<const __half*>(ptr)); // load through a __half pointer to the same address
+}
+#define SHFL_UP_SYNC(mask, var, delta) __shfl_up(var, delta) // ROCm branch: mask argument is dropped
+#define SHFL_DOWN_SYNC(mask, var, delta) __shfl_down(var, delta) // ROCm branch: mask argument is dropped
+#else
+#define SHFL_UP_SYNC __shfl_up_sync // non-ROCm branch: plain alias, mask is kept
+#define SHFL_DOWN_SYNC __shfl_down_sync // non-ROCm branch: plain alias, mask is kept
+#endif // USE_ROCM
diff --git a/csrc/extensions.h b/csrc/extensions.h
new file mode 100644
index 00000000..91c4df1a
--- /dev/null
+++ b/csrc/extensions.h
@@ -0,0 +1,6 @@
+// Shared include for the scatter headers: brings in the SCATTER_* macros
+// from macros.h and the torch headers.
+#pragma once
+
+#include "macros.h"
+#include <torch/torch.h>
diff --git a/csrc/macros.h b/csrc/macros.h
new file mode 100644
index 00000000..d55e6236
--- /dev/null
+++ b/csrc/macros.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#ifdef _WIN32 // SCATTER_API: DLL export/import marker on Windows
+#if defined(torchscatter_EXPORTS)
+#define SCATTER_API __declspec(dllexport) // torchscatter_EXPORTS is defined
+#else
+#define SCATTER_API __declspec(dllimport) // torchscatter_EXPORTS is not defined
+#endif
+#else
+#define SCATTER_API // expands to nothing when _WIN32 is not defined
+#endif // _WIN32
+
+#if (defined __cpp_inline_variables) || __cplusplus >= 201703L
+#define SCATTER_INLINE_VARIABLE inline // inline variables are available
+#else
+#ifdef _MSC_VER
+#define SCATTER_INLINE_VARIABLE __declspec(selectany) // MSVC fallback
+#else
+#define SCATTER_INLINE_VARIABLE __attribute__((weak)) // non-MSVC fallback
+#endif // _MSC_VER
+#endif // inline-variable support
diff --git a/csrc/scatter.cpp b/csrc/scatter.cpp
index 3a418ab3..0c986c00 100644
--- a/csrc/scatter.cpp
+++ b/csrc/scatter.cpp
@@ -1,7 +1,11 @@
+#ifdef WITH_PYTHON
 #include <Python.h>
+#endif
+
 #include <torch/script.h>
 
 #include "cpu/scatter_cpu.h"
+#include "macros.h"
 #include "utils.h"
 
 #ifdef WITH_CUDA
@@ -9,12 +13,14 @@
 #endif
 
 #ifdef _WIN32
+#ifdef WITH_PYTHON
 #ifdef WITH_CUDA
 PyMODINIT_FUNC PyInit__scatter_cuda(void) { return NULL; }
 #else
 PyMODINIT_FUNC PyInit__scatter_cpu(void) { return NULL; }
 #endif
 #endif
+#endif
 
 torch::Tensor broadcast(torch::Tensor src, torch::Tensor other, int64_t dim) {
   if (src.dim() == 1)
@@ -26,10 +32,10 @@ torch::Tensor broadcast(torch::Tensor src, torch::Tensor other, int64_t dim) {
   return src;
 }
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 scatter_fw(torch::Tensor src, torch::Tensor index, int64_t dim,
-           torch::optional<torch::Tensor> optional_out,
-           torch::optional<int64_t> dim_size, std::string reduce) {
+           std::optional<torch::Tensor> optional_out,
+           std::optional<int64_t> dim_size, std::string reduce) {
   if (src.device().is_cuda()) {
 #ifdef WITH_CUDA
     return scatter_cuda(src, index, dim, optional_out, dim_size, reduce);
@@ -49,8 +55,8 @@ class ScatterSum : public torch::autograd::Function<ScatterSum> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index, int64_t dim,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     dim = dim < 0 ? src.dim() + dim : dim;
     ctx->saved_data["dim"] = dim;
     ctx->saved_data["src_shape"] = src.sizes();
@@ -78,8 +84,8 @@ class ScatterMul : public torch::autograd::Function<ScatterMul> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index, int64_t dim,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     dim = dim < 0 ? src.dim() + dim : dim;
     ctx->saved_data["dim"] = dim;
     ctx->saved_data["src_shape"] = src.sizes();
@@ -110,8 +116,8 @@ class ScatterMean : public torch::autograd::Function<ScatterMean> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index, int64_t dim,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     dim = dim < 0 ? src.dim() + dim : dim;
     ctx->saved_data["dim"] = dim;
     ctx->saved_data["src_shape"] = src.sizes();
@@ -125,7 +131,7 @@ class ScatterMean : public torch::autograd::Function<ScatterMean> {
     auto ones = torch::ones(old_index.sizes(), src.options());
     result = scatter_fw(ones, old_index,
                         old_index.dim() <= dim ? old_index.dim() - 1 : dim,
-                        torch::nullopt, out.size(dim), "sum");
+                        std::nullopt, out.size(dim), "sum");
     auto count = std::get<0>(result);
     count.masked_fill_(count < 1, 1);
     count = broadcast(count, out, dim);
@@ -158,8 +164,8 @@ class ScatterMin : public torch::autograd::Function<ScatterMin> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index, int64_t dim,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     dim = dim < 0 ? src.dim() + dim : dim;
     ctx->saved_data["dim"] = dim;
     ctx->saved_data["src_shape"] = src.sizes();
@@ -194,8 +200,8 @@ class ScatterMax : public torch::autograd::Function<ScatterMax> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index, int64_t dim,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     dim = dim < 0 ? src.dim() + dim : dim;
     ctx->saved_data["dim"] = dim;
     ctx->saved_data["src_shape"] = src.sizes();
@@ -226,36 +232,39 @@ class ScatterMax : public torch::autograd::Function<ScatterMax> {
   }
 };
 
-torch::Tensor scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim,
-                          torch::optional<torch::Tensor> optional_out,
-                          torch::optional<int64_t> dim_size) {
+SCATTER_API torch::Tensor
+scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim,
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size) { // returns element 0 of ScatterSum::apply
   return ScatterSum::apply(src, index, dim, optional_out, dim_size)[0];
 }
 
-torch::Tensor scatter_mul(torch::Tensor src, torch::Tensor index, int64_t dim,
-                          torch::optional<torch::Tensor> optional_out,
-                          torch::optional<int64_t> dim_size) {
+SCATTER_API torch::Tensor
+scatter_mul(torch::Tensor src, torch::Tensor index, int64_t dim,
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size) { // returns element 0 of ScatterMul::apply
   return ScatterMul::apply(src, index, dim, optional_out, dim_size)[0];
 }
 
-torch::Tensor scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim,
-                           torch::optional<torch::Tensor> optional_out,
-                           torch::optional<int64_t> dim_size) {
+SCATTER_API torch::Tensor
+scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim,
+             std::optional<torch::Tensor> optional_out,
+             std::optional<int64_t> dim_size) { // returns element 0 of ScatterMean::apply
   return ScatterMean::apply(src, index, dim, optional_out, dim_size)[0];
 }
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 scatter_min(torch::Tensor src, torch::Tensor index, int64_t dim,
-            torch::optional<torch::Tensor> optional_out,
-            torch::optional<int64_t> dim_size) {
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size) { // returns elements 0 and 1 of ScatterMin::apply as a tuple
   auto result = ScatterMin::apply(src, index, dim, optional_out, dim_size);
   return std::make_tuple(result[0], result[1]);
 }
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 scatter_max(torch::Tensor src, torch::Tensor index, int64_t dim,
-            torch::optional<torch::Tensor> optional_out,
-            torch::optional<int64_t> dim_size) {
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size) { // returns elements 0 and 1 of ScatterMax::apply as a tuple
   auto result = ScatterMax::apply(src, index, dim, optional_out, dim_size);
   return std::make_tuple(result[0], result[1]);
 }
diff --git a/csrc/scatter.h b/csrc/scatter.h
index 629c96f4..f5d9eaaf 100644
--- a/csrc/scatter.h
+++ b/csrc/scatter.h
@@ -1,61 +1,80 @@
 #pragma once
 
-#include <torch/extension.h>
+#include "extensions.h"
 
-int64_t cuda_version();
+namespace scatter {
+SCATTER_API int64_t cuda_version() noexcept;
 
-torch::Tensor scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim,
-                          torch::optional<torch::Tensor> optional_out,
-                          torch::optional<int64_t> dim_size);
+namespace detail { // _cuda_version below is initialized from cuda_version() at namespace scope
+SCATTER_INLINE_VARIABLE int64_t _cuda_version = cuda_version();
+} // namespace detail
+} // namespace scatter
 
-torch::Tensor scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim,
-                           torch::optional<torch::Tensor> optional_out,
-                           torch::optional<int64_t> dim_size);
+SCATTER_API torch::Tensor
+scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim,
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size);
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API torch::Tensor
+scatter_mul(torch::Tensor src, torch::Tensor index, int64_t dim,
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size);
+
+SCATTER_API torch::Tensor
+scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim,
+             std::optional<torch::Tensor> optional_out,
+             std::optional<int64_t> dim_size);
+
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 scatter_min(torch::Tensor src, torch::Tensor index, int64_t dim,
-            torch::optional<torch::Tensor> optional_out,
-            torch::optional<int64_t> dim_size);
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size);
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 scatter_max(torch::Tensor src, torch::Tensor index, int64_t dim,
-            torch::optional<torch::Tensor> optional_out,
-            torch::optional<int64_t> dim_size);
+            std::optional<torch::Tensor> optional_out,
+            std::optional<int64_t> dim_size);
 
-torch::Tensor segment_sum_coo(torch::Tensor src, torch::Tensor index,
-                              torch::optional<torch::Tensor> optional_out,
-                              torch::optional<int64_t> dim_size);
+SCATTER_API torch::Tensor
+segment_sum_coo(torch::Tensor src, torch::Tensor index,
+                std::optional<torch::Tensor> optional_out,
+                std::optional<int64_t> dim_size);
 
-torch::Tensor segment_mean_coo(torch::Tensor src, torch::Tensor index,
-                               torch::optional<torch::Tensor> optional_out,
-                               torch::optional<int64_t> dim_size);
+SCATTER_API torch::Tensor
+segment_mean_coo(torch::Tensor src, torch::Tensor index,
+                 std::optional<torch::Tensor> optional_out,
+                 std::optional<int64_t> dim_size);
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 segment_min_coo(torch::Tensor src, torch::Tensor index,
-                torch::optional<torch::Tensor> optional_out,
-                torch::optional<int64_t> dim_size);
+                std::optional<torch::Tensor> optional_out,
+                std::optional<int64_t> dim_size);
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 segment_max_coo(torch::Tensor src, torch::Tensor index,
-                torch::optional<torch::Tensor> optional_out,
-                torch::optional<int64_t> dim_size);
+                std::optional<torch::Tensor> optional_out,
+                std::optional<int64_t> dim_size);
 
-torch::Tensor gather_coo(torch::Tensor src, torch::Tensor index,
-                         torch::optional<torch::Tensor> optional_out);
+SCATTER_API torch::Tensor
+gather_coo(torch::Tensor src, torch::Tensor index,
+           std::optional<torch::Tensor> optional_out);
 
-torch::Tensor segment_sum_csr(torch::Tensor src, torch::Tensor indptr,
-                              torch::optional<torch::Tensor> optional_out);
+SCATTER_API torch::Tensor
+segment_sum_csr(torch::Tensor src, torch::Tensor indptr,
+                std::optional<torch::Tensor> optional_out);
 
-torch::Tensor segment_mean_csr(torch::Tensor src, torch::Tensor indptr,
-                               torch::optional<torch::Tensor> optional_out);
+SCATTER_API torch::Tensor
+segment_mean_csr(torch::Tensor src, torch::Tensor indptr,
+                 std::optional<torch::Tensor> optional_out);
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 segment_min_csr(torch::Tensor src, torch::Tensor indptr,
-                torch::optional<torch::Tensor> optional_out);
+                std::optional<torch::Tensor> optional_out);
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 segment_max_csr(torch::Tensor src, torch::Tensor indptr,
-                torch::optional<torch::Tensor> optional_out);
+                std::optional<torch::Tensor> optional_out);
 
-torch::Tensor gather_csr(torch::Tensor src, torch::Tensor indptr,
-                         torch::optional<torch::Tensor> optional_out);
+SCATTER_API torch::Tensor
+gather_csr(torch::Tensor src, torch::Tensor indptr,
+           std::optional<torch::Tensor> optional_out);
diff --git a/csrc/segment_coo.cpp b/csrc/segment_coo.cpp
index 234f3ee4..e59877a8 100644
--- a/csrc/segment_coo.cpp
+++ b/csrc/segment_coo.cpp
@@ -1,7 +1,11 @@
+#ifdef WITH_PYTHON
 #include <Python.h>
+#endif
+
 #include <torch/script.h>
 
 #include "cpu/segment_coo_cpu.h"
+#include "macros.h"
 #include "utils.h"
 
 #ifdef WITH_CUDA
@@ -9,17 +13,19 @@
 #endif
 
 #ifdef _WIN32
+#ifdef WITH_PYTHON
 #ifdef WITH_CUDA
 PyMODINIT_FUNC PyInit__segment_coo_cuda(void) { return NULL; }
 #else
 PyMODINIT_FUNC PyInit__segment_coo_cpu(void) { return NULL; }
 #endif
 #endif
+#endif
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_coo_fw(torch::Tensor src, torch::Tensor index,
-               torch::optional<torch::Tensor> optional_out,
-               torch::optional<int64_t> dim_size, std::string reduce) {
+               std::optional<torch::Tensor> optional_out,
+               std::optional<int64_t> dim_size, std::string reduce) {
   if (src.device().is_cuda()) {
 #ifdef WITH_CUDA
     return segment_coo_cuda(src, index, optional_out, dim_size, reduce);
@@ -32,7 +38,7 @@ segment_coo_fw(torch::Tensor src, torch::Tensor index,
 }
 
 torch::Tensor gather_coo_fw(torch::Tensor src, torch::Tensor index,
-                            torch::optional<torch::Tensor> optional_out) {
+                            std::optional<torch::Tensor> optional_out) {
   if (src.device().is_cuda()) {
 #ifdef WITH_CUDA
     return gather_coo_cuda(src, index, optional_out);
@@ -52,8 +58,8 @@ class SegmentSumCOO : public torch::autograd::Function<SegmentSumCOO> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto result = segment_coo_fw(src, index, optional_out, dim_size, "sum");
     auto out = std::get<0>(result);
@@ -78,8 +84,8 @@ class SegmentMeanCOO : public torch::autograd::Function<SegmentMeanCOO> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto result = segment_coo_fw(src, index, optional_out, dim_size, "mean");
     auto out = std::get<0>(result);
@@ -98,7 +104,7 @@ class SegmentMeanCOO : public torch::autograd::Function<SegmentMeanCOO> {
     auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
     auto grad_in = torch::empty(src_shape, grad_out.options());
     gather_coo_fw(grad_out, index, grad_in);
-    count = gather_coo_fw(count, index, torch::nullopt);
+    count = gather_coo_fw(count, index, std::nullopt);
     for (auto i = 0; i < grad_out.dim() - index.dim(); i++)
       count = count.unsqueeze(-1);
     grad_in.true_divide_(count);
@@ -110,8 +116,8 @@ class SegmentMinCOO : public torch::autograd::Function<SegmentMinCOO> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto result = segment_coo_fw(src, index, optional_out, dim_size, "min");
     auto out = std::get<0>(result);
@@ -142,8 +148,8 @@ class SegmentMaxCOO : public torch::autograd::Function<SegmentMaxCOO> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index,
-                               torch::optional<Variable> optional_out,
-                               torch::optional<int64_t> dim_size) {
+                               std::optional<Variable> optional_out,
+                               std::optional<int64_t> dim_size) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto result = segment_coo_fw(src, index, optional_out, dim_size, "max");
     auto out = std::get<0>(result);
@@ -174,7 +180,7 @@ class GatherCOO : public torch::autograd::Function<GatherCOO> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable index,
-                               torch::optional<Variable> optional_out) {
+                               std::optional<Variable> optional_out) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto out = gather_coo_fw(src, index, optional_out);
     ctx->save_for_backward({index});
@@ -190,41 +196,44 @@ class GatherCOO : public torch::autograd::Function<GatherCOO> {
     auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
 
     auto grad_in = torch::zeros(src_shape, grad_out.options());
-    segment_coo_fw(grad_out, index, grad_in, torch::nullopt, "sum");
+    segment_coo_fw(grad_out, index, grad_in, std::nullopt, "sum");
     return {grad_in, Variable(), Variable()};
   }
 };
 
-torch::Tensor segment_sum_coo(torch::Tensor src, torch::Tensor index,
-                              torch::optional<torch::Tensor> optional_out,
-                              torch::optional<int64_t> dim_size) {
+SCATTER_API torch::Tensor
+segment_sum_coo(torch::Tensor src, torch::Tensor index,
+                std::optional<torch::Tensor> optional_out,
+                std::optional<int64_t> dim_size) {
   return SegmentSumCOO::apply(src, index, optional_out, dim_size)[0];
 }
 
-torch::Tensor segment_mean_coo(torch::Tensor src, torch::Tensor index,
-                               torch::optional<torch::Tensor> optional_out,
-                               torch::optional<int64_t> dim_size) {
+SCATTER_API torch::Tensor
+segment_mean_coo(torch::Tensor src, torch::Tensor index,
+                 std::optional<torch::Tensor> optional_out,
+                 std::optional<int64_t> dim_size) {
   return SegmentMeanCOO::apply(src, index, optional_out, dim_size)[0];
 }
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 segment_min_coo(torch::Tensor src, torch::Tensor index,
-                torch::optional<torch::Tensor> optional_out,
-                torch::optional<int64_t> dim_size) {
+                std::optional<torch::Tensor> optional_out,
+                std::optional<int64_t> dim_size) {
   auto result = SegmentMinCOO::apply(src, index, optional_out, dim_size);
   return std::make_tuple(result[0], result[1]);
 }
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 segment_max_coo(torch::Tensor src, torch::Tensor index,
-                torch::optional<torch::Tensor> optional_out,
-                torch::optional<int64_t> dim_size) {
+                std::optional<torch::Tensor> optional_out,
+                std::optional<int64_t> dim_size) {
   auto result = SegmentMaxCOO::apply(src, index, optional_out, dim_size);
   return std::make_tuple(result[0], result[1]);
 }
 
-torch::Tensor gather_coo(torch::Tensor src, torch::Tensor index,
-                         torch::optional<torch::Tensor> optional_out) {
+SCATTER_API torch::Tensor
+gather_coo(torch::Tensor src, torch::Tensor index,
+           std::optional<torch::Tensor> optional_out) {
   return GatherCOO::apply(src, index, optional_out)[0];
 }
 
diff --git a/csrc/segment_csr.cpp b/csrc/segment_csr.cpp
index 4b2ad08c..5f913c21 100644
--- a/csrc/segment_csr.cpp
+++ b/csrc/segment_csr.cpp
@@ -1,7 +1,11 @@
+#ifdef WITH_PYTHON
 #include <Python.h>
+#endif
+
 #include <torch/script.h>
 
 #include "cpu/segment_csr_cpu.h"
+#include "macros.h"
 #include "utils.h"
 
 #ifdef WITH_CUDA
@@ -9,16 +13,18 @@
 #endif
 
 #ifdef _WIN32
+#ifdef WITH_PYTHON
 #ifdef WITH_CUDA
 PyMODINIT_FUNC PyInit__segment_csr_cuda(void) { return NULL; }
 #else
 PyMODINIT_FUNC PyInit__segment_csr_cpu(void) { return NULL; }
 #endif
 #endif
+#endif
 
-std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+std::tuple<torch::Tensor, std::optional<torch::Tensor>>
 segment_csr_fw(torch::Tensor src, torch::Tensor indptr,
-               torch::optional<torch::Tensor> optional_out,
+               std::optional<torch::Tensor> optional_out,
                std::string reduce) {
   if (src.device().is_cuda()) {
 #ifdef WITH_CUDA
@@ -32,7 +38,7 @@ segment_csr_fw(torch::Tensor src, torch::Tensor indptr,
 }
 
 torch::Tensor gather_csr_fw(torch::Tensor src, torch::Tensor indptr,
-                            torch::optional<torch::Tensor> optional_out) {
+                            std::optional<torch::Tensor> optional_out) {
   if (src.device().is_cuda()) {
 #ifdef WITH_CUDA
     return gather_csr_cuda(src, indptr, optional_out);
@@ -52,7 +58,7 @@ class SegmentSumCSR : public torch::autograd::Function<SegmentSumCSR> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable indptr,
-                               torch::optional<Variable> optional_out) {
+                               std::optional<Variable> optional_out) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto out = std::get<0>(segment_csr_fw(src, indptr, optional_out, "sum"));
     ctx->save_for_backward({indptr});
@@ -76,7 +82,7 @@ class SegmentMeanCSR : public torch::autograd::Function<SegmentMeanCSR> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable indptr,
-                               torch::optional<Variable> optional_out) {
+                               std::optional<Variable> optional_out) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto out = std::get<0>(segment_csr_fw(src, indptr, optional_out, "mean"));
     ctx->save_for_backward({indptr});
@@ -96,7 +102,7 @@ class SegmentMeanCSR : public torch::autograd::Function<SegmentMeanCSR> {
       auto indptr1 = indptr.narrow(-1, 0, indptr.size(-1) - 1);
       auto indptr2 = indptr.narrow(-1, 1, indptr.size(-1) - 1);
       auto count = (indptr2 - indptr1).to(grad_in.options());
-      count = gather_csr_fw(count, indptr, torch::nullopt);
+      count = gather_csr_fw(count, indptr, std::nullopt);
       for (auto i = 0; i < grad_out.dim() - indptr.dim(); i++)
         count = count.unsqueeze(-1);
       grad_in.true_divide_(count);
@@ -109,7 +115,7 @@ class SegmentMinCSR : public torch::autograd::Function<SegmentMinCSR> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable indptr,
-                               torch::optional<Variable> optional_out) {
+                               std::optional<Variable> optional_out) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto result = segment_csr_fw(src, indptr, optional_out, "min");
     auto out = std::get<0>(result);
@@ -140,7 +146,7 @@ class SegmentMaxCSR : public torch::autograd::Function<SegmentMaxCSR> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable indptr,
-                               torch::optional<Variable> optional_out) {
+                               std::optional<Variable> optional_out) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto result = segment_csr_fw(src, indptr, optional_out, "max");
     auto out = std::get<0>(result);
@@ -171,7 +177,7 @@ class GatherCSR : public torch::autograd::Function<GatherCSR> {
 public:
   static variable_list forward(AutogradContext *ctx, Variable src,
                                Variable indptr,
-                               torch::optional<Variable> optional_out) {
+                               std::optional<Variable> optional_out) {
     ctx->saved_data["src_shape"] = src.sizes();
     auto out = gather_csr_fw(src, indptr, optional_out);
     ctx->save_for_backward({indptr});
@@ -192,32 +198,35 @@ class GatherCSR : public torch::autograd::Function<GatherCSR> {
   }
 };
 
-torch::Tensor segment_sum_csr(torch::Tensor src, torch::Tensor indptr,
-                              torch::optional<torch::Tensor> optional_out) {
+SCATTER_API torch::Tensor
+segment_sum_csr(torch::Tensor src, torch::Tensor indptr,
+                std::optional<torch::Tensor> optional_out) {
   return SegmentSumCSR::apply(src, indptr, optional_out)[0];
 }
 
-torch::Tensor segment_mean_csr(torch::Tensor src, torch::Tensor indptr,
-                               torch::optional<torch::Tensor> optional_out) {
+SCATTER_API torch::Tensor
+segment_mean_csr(torch::Tensor src, torch::Tensor indptr,
+                 std::optional<torch::Tensor> optional_out) {
   return SegmentMeanCSR::apply(src, indptr, optional_out)[0];
 }
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 segment_min_csr(torch::Tensor src, torch::Tensor indptr,
-                torch::optional<torch::Tensor> optional_out) {
+                std::optional<torch::Tensor> optional_out) {
   auto result = SegmentMinCSR::apply(src, indptr, optional_out);
   return std::make_tuple(result[0], result[1]);
 }
 
-std::tuple<torch::Tensor, torch::Tensor>
+SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
 segment_max_csr(torch::Tensor src, torch::Tensor indptr,
-                torch::optional<torch::Tensor> optional_out) {
+                std::optional<torch::Tensor> optional_out) {
   auto result = SegmentMaxCSR::apply(src, indptr, optional_out);
   return std::make_tuple(result[0], result[1]);
 }
 
-torch::Tensor gather_csr(torch::Tensor src, torch::Tensor indptr,
-                         torch::optional<torch::Tensor> optional_out) {
+SCATTER_API torch::Tensor
+gather_csr(torch::Tensor src, torch::Tensor indptr,
+           std::optional<torch::Tensor> optional_out) {
   return GatherCSR::apply(src, indptr, optional_out)[0];
 }
 
diff --git a/csrc/version.cpp b/csrc/version.cpp
index a003ea81..3efac185 100644
--- a/csrc/version.cpp
+++ b/csrc/version.cpp
@@ -1,25 +1,42 @@
+#ifdef WITH_PYTHON
 #include <Python.h>
+#endif
+
 #include <torch/script.h>
+#include "scatter.h"
+#include "macros.h"
 
 #ifdef WITH_CUDA
+#ifdef USE_ROCM
+#include <hip/hip_version.h>
+#else
 #include <cuda.h>
 #endif
+#endif
 
 #ifdef _WIN32
+#ifdef WITH_PYTHON
 #ifdef WITH_CUDA
 PyMODINIT_FUNC PyInit__version_cuda(void) { return NULL; }
 #else
 PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
 #endif
 #endif
+#endif
 
-int64_t cuda_version() {
+namespace scatter {
+SCATTER_API int64_t cuda_version() noexcept {
 #ifdef WITH_CUDA
+#ifdef USE_ROCM
+  return HIP_VERSION;
+#else
   return CUDA_VERSION;
+#endif
 #else
   return -1;
 #endif
 }
+} // namespace scatter
 
-static auto registry =
-    torch::RegisterOperators().op("torch_scatter::cuda_version", &cuda_version);
+static auto registry = torch::RegisterOperators().op(
+    "torch_scatter::cuda_version", [] { return scatter::cuda_version(); });
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 7d432994..abd2d7c7 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,4 +1,3 @@
-numpy
-https://download.pytorch.org/whl/cpu/torch-1.5.0%2Bcpu-cp37-cp37m-linux_x86_64.whl
+https://download.pytorch.org/whl/cpu/torch-1.11.0%2Bcpu-cp38-cp38-linux_x86_64.whl
 sphinx>=3
 sphinx_rtd_theme
diff --git a/docs/source/conf.py b/docs/source/conf.py
index eccdb5f1..0db6a74a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,16 +1,15 @@
 import datetime
-import sphinx_rtd_theme
 import doctest
+
+import sphinx_rtd_theme
 import torch_scatter
 
 extensions = [
     'sphinx.ext.autodoc',
-    'sphinx.ext.doctest',
     'sphinx.ext.intersphinx',
     'sphinx.ext.mathjax',
     'sphinx.ext.napoleon',
     'sphinx.ext.viewcode',
-    'sphinx.ext.githubpages',
 ]
 
 source_suffix = '.rst'
diff --git a/docs/source/functions/composite.rst b/docs/source/functions/composite.rst
new file mode 100644
index 00000000..2aca0605
--- /dev/null
+++ b/docs/source/functions/composite.rst
@@ -0,0 +1,8 @@
+composite
+=========
+
+.. currentmodule:: torch_scatter.composite
+
+.. automodule:: torch_scatter.composite
+   :members:
+   :undoc-members:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 51551643..3481d4a9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -11,10 +11,13 @@ All included operations are broadcastable, work on varying data types, are imple
 
 .. toctree::
    :glob:
-   :maxdepth: 1
+   :maxdepth: 0
    :caption: Package reference
 
-   functions/*
+   functions/scatter
+   functions/segment_coo
+   functions/segment_csr
+   functions/composite
 
 Indices and tables
 ==================
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..dd14ceb1
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools", "torch"]
+build-backend = "setuptools.build_meta"
diff --git a/readthedocs.yml b/readthedocs.yml
index 2931804e..e4ca6eaa 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -4,7 +4,7 @@ build:
    image: latest
 
 python:
-   version: 3.7
+   version: 3.8
    system_packages: true
    install:
       - requirements: docs/requirements.txt
diff --git a/setup.cfg b/setup.cfg
index 93f7ab99..a13a3c32 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,8 +1,21 @@
 [metadata]
-description-file = README.md
+long_description=file: README.md
+long_description_content_type=text/markdown
+
+classifiers =
+    Development Status :: 5 - Production/Stable
+    License :: OSI Approved :: MIT License
+    Programming Language :: Python
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
+    Programming Language :: Python :: 3.13
+    Programming Language :: Python :: 3 :: Only
 
 [aliases]
 test = pytest
 
 [tool:pytest]
-addopts = --capture=no --cov
+addopts = --capture=no
diff --git a/setup.py b/setup.py
index 2394709a..fba55fc8 100644
--- a/setup.py
+++ b/setup.py
@@ -1,17 +1,22 @@
-import os
-import sys
 import glob
+import os
 import os.path as osp
-from itertools import product
-from setuptools import setup, find_packages
 import platform
+import sys
+from itertools import product
 
 import torch
+from setuptools import find_packages, setup
 from torch.__config__ import parallel_info
-from torch.utils.cpp_extension import BuildExtension
-from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME
+from torch.utils.cpp_extension import (CUDA_HOME, BuildExtension, CppExtension,
+                                       CUDAExtension)
 
-WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
+__version__ = '2.1.2'
+URL = 'https://github.com/rusty1s/pytorch_scatter'
+
+WITH_CUDA = False
+if torch.cuda.is_available():
+    WITH_CUDA = CUDA_HOME is not None or torch.version.hip
 suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu']
 if os.getenv('FORCE_CUDA', '0') == '1':
     suffices = ['cuda', 'cpu']
@@ -21,6 +26,7 @@
     suffices = ['cpu']
 
 BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
+WITH_SYMBOLS = os.getenv('WITH_SYMBOLS', '0') == '1'
 
 
 def get_extensions():
@@ -28,13 +34,20 @@ def get_extensions():
 
     extensions_dir = osp.join('csrc')
     main_files = glob.glob(osp.join(extensions_dir, '*.cpp'))
+    # remove generated 'hip' files, in case of rebuilds
+    main_files = [path for path in main_files if 'hip' not in path]
 
     for main, suffix in product(main_files, suffices):
-        define_macros = []
-        extra_compile_args = {'cxx': ['-O2']}
+        define_macros = [('WITH_PYTHON', None)]
+        undef_macros = []
+
+        if sys.platform == 'win32':
+            define_macros += [('torchscatter_EXPORTS', None)]
+
+        extra_compile_args = {'cxx': ['-O3']}
         if not os.name == 'nt':  # Not on Windows:
             extra_compile_args['cxx'] += ['-Wno-sign-compare']
-        extra_link_args = ['-s']
+        extra_link_args = [] if WITH_SYMBOLS else ['-s']
 
         info = parallel_info()
         if ('backend: OpenMP' in info and 'OpenMP not found' not in info
@@ -48,15 +61,24 @@ def get_extensions():
             print('Compiling without OpenMP...')
 
         # Compile for mac arm64
-        if (sys.platform == 'darwin' and platform.machine() == 'arm64'):
-            extra_compile_args['cxx'] += ['-arch', 'arm64']
-            extra_link_args += ['-arch', 'arm64']
+        if sys.platform == 'darwin':
+            extra_compile_args['cxx'] += ['-D_LIBCPP_DISABLE_AVAILABILITY']
+            if platform.machine() == 'arm64':  # Apple Silicon host
+                extra_compile_args['cxx'] += ['-arch', 'arm64']
+                extra_link_args += ['-arch', 'arm64']
 
         if suffix == 'cuda':
             define_macros += [('WITH_CUDA', None)]
             nvcc_flags = os.getenv('NVCC_FLAGS', '')
             nvcc_flags = [] if nvcc_flags == '' else nvcc_flags.split(' ')
-            nvcc_flags += ['--expt-relaxed-constexpr', '-O2']
+            nvcc_flags += ['-O3']
+            if torch.version.hip:
+                # USE_ROCM was added to later versions of PyTorch.
+                # Define here to support older PyTorch versions as well:
+                define_macros += [('USE_ROCM', None)]
+                undef_macros += ['__HIP_NO_HALF_CONVERSIONS__']
+            else:
+                nvcc_flags += ['--expt-relaxed-constexpr']
             extra_compile_args['nvcc'] = nvcc_flags
 
         name = main.split(os.sep)[-1][:-4]
@@ -76,6 +98,7 @@ def get_extensions():
             sources,
             include_dirs=[extensions_dir],
             define_macros=define_macros,
+            undef_macros=undef_macros,
             extra_compile_args=extra_compile_args,
             extra_link_args=extra_link_args,
         )
@@ -85,27 +108,36 @@ def get_extensions():
 
 
 install_requires = []
-setup_requires = []
-tests_require = ['pytest', 'pytest-runner', 'pytest-cov']
+
+test_requires = [
+    'pytest',
+    'pytest-cov',
+]
+
+# work-around hipify abs paths
+include_package_data = True
+if torch.cuda.is_available() and torch.version.hip:
+    include_package_data = False
 
 setup(
     name='torch_scatter',
-    version='2.0.9',
+    version=__version__,
+    description='PyTorch Extension Library of Optimized Scatter Operations',
     author='Matthias Fey',
     author_email='matthias.fey@tu-dortmund.de',
-    url='https://github.com/rusty1s/pytorch_scatter',
-    description='PyTorch Extension Library of Optimized Scatter Operations',
+    url=URL,
+    download_url=f'{URL}/archive/{__version__}.tar.gz',
     keywords=['pytorch', 'scatter', 'segment', 'gather'],
-    license='MIT',
-    python_requires='>=3.6',
+    python_requires='>=3.8',
     install_requires=install_requires,
-    setup_requires=setup_requires,
-    tests_require=tests_require,
-    extras_require={'test': tests_require},
+    extras_require={
+        'test': test_requires,
+    },
     ext_modules=get_extensions() if not BUILD_DOCS else [],
     cmdclass={
         'build_ext':
         BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False)
     },
     packages=find_packages(),
+    include_package_data=include_package_data,
 )
diff --git a/test/__init__.py b/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/test/composite/test_logsumexp.py b/test/composite/test_logsumexp.py
index 92844b73..a6b3d160 100644
--- a/test/composite/test_logsumexp.py
+++ b/test/composite/test_logsumexp.py
@@ -4,20 +4,37 @@
 
 def test_logsumexp():
     inputs = torch.tensor([
-        0.5, 0.5, 0.0, -2.1, 3.2, 7.0, -1.0, -100.0,
-        float('-inf'),
-        float('-inf'), 0.0
+        0.5,
+        0.5,
+        0.0,
+        -2.1,
+        3.2,
+        7.0,
+        -1.0,
+        -100.0,
     ])
     inputs.requires_grad_()
-    index = torch.tensor([0, 0, 1, 1, 1, 2, 4, 4, 5, 6, 6])
-    splits = [2, 3, 1, 0, 2, 1, 2]
+    index = torch.tensor([0, 0, 1, 1, 1, 2, 4, 4])
+    splits = [2, 3, 1, 0, 2]
 
     outputs = scatter_logsumexp(inputs, index)
 
     for src, out in zip(inputs.split(splits), outputs.unbind()):
-        assert out.tolist() == torch.logsumexp(src, dim=0).tolist()
+        if src.numel() > 0:
+            assert out.tolist() == torch.logsumexp(src, dim=0).tolist()
+        else:
+            assert out.item() == 0.0
 
     outputs.backward(torch.randn_like(outputs))
 
     jit = torch.jit.script(scatter_logsumexp)
     assert jit(inputs, index).tolist() == outputs.tolist()
+
+
+def test_logsumexp_out():
+    src = torch.tensor([-1.0, -50.0])
+    index = torch.tensor([0, 0])
+    out = torch.tensor([-10.0, -10.0])
+
+    scatter_logsumexp(src=src, index=index, out=out)
+    assert out.allclose(torch.tensor([-0.9999, -10.0]), atol=1e-4)
diff --git a/test/test_broadcasting.py b/test/test_broadcasting.py
index cfb3593c..0b332e49 100644
--- a/test/test_broadcasting.py
+++ b/test/test_broadcasting.py
@@ -3,8 +3,7 @@
 import pytest
 import torch
 from torch_scatter import scatter
-
-from .utils import reductions, devices
+from torch_scatter.testing import devices, reductions
 
 
 @pytest.mark.parametrize('reduce,device', product(reductions, devices))
diff --git a/test/test_gather.py b/test/test_gather.py
index 8d0d100f..0b40e5d4 100644
--- a/test/test_gather.py
+++ b/test/test_gather.py
@@ -3,9 +3,8 @@
 import pytest
 import torch
 from torch.autograd import gradcheck
-from torch_scatter import gather_csr, gather_coo
-
-from .utils import tensor, dtypes, devices
+from torch_scatter import gather_coo, gather_csr
+from torch_scatter.testing import devices, dtypes, tensor
 
 tests = [
     {
diff --git a/test/test_multi_gpu.py b/test/test_multi_gpu.py
index cdaf893e..98ed38c4 100644
--- a/test/test_multi_gpu.py
+++ b/test/test_multi_gpu.py
@@ -3,8 +3,7 @@
 import pytest
 import torch
 import torch_scatter
-
-from .utils import reductions, tensor, dtypes
+from torch_scatter.testing import dtypes, reductions, tensor
 
 tests = [
     {
diff --git a/test/test_scatter.py b/test/test_scatter.py
index e3b874a2..93257619 100644
--- a/test/test_scatter.py
+++ b/test/test_scatter.py
@@ -2,10 +2,9 @@
 
 import pytest
 import torch
-from torch.autograd import gradcheck
 import torch_scatter
-
-from .utils import reductions, tensor, dtypes, devices
+from torch.autograd import gradcheck
+from torch_scatter.testing import devices, dtypes, reductions, tensor
 
 reductions = reductions + ['mul']
 
@@ -13,7 +12,7 @@
     {
         'src': [1, 3, 2, 4, 5, 6],
         'index': [0, 1, 0, 1, 1, 3],
-        'dim': 0,
+        'dim': -1,
         'sum': [3, 12, 0, 6],
         'add': [3, 12, 0, 6],
         'mul': [2, 60, 1, 6],
diff --git a/test/test_segment.py b/test/test_segment.py
index 3e8996be..9adc49da 100644
--- a/test/test_segment.py
+++ b/test/test_segment.py
@@ -2,10 +2,9 @@
 
 import pytest
 import torch
-from torch.autograd import gradcheck
 import torch_scatter
-
-from .utils import reductions, tensor, dtypes, devices
+from torch.autograd import gradcheck
+from torch_scatter.testing import devices, dtypes, reductions, tensor
 
 tests = [
     {
diff --git a/test/test_zero_tensors.py b/test/test_zero_tensors.py
index 60855427..f744eb56 100644
--- a/test/test_zero_tensors.py
+++ b/test/test_zero_tensors.py
@@ -2,10 +2,9 @@
 
 import pytest
 import torch
-from torch_scatter import scatter, segment_coo, gather_coo
-from torch_scatter import segment_csr, gather_csr
-
-from .utils import reductions, tensor, grad_dtypes, devices
+from torch_scatter import (gather_coo, gather_csr, scatter, segment_coo,
+                           segment_csr)
+from torch_scatter.testing import devices, grad_dtypes, reductions, tensor
 
 
 @pytest.mark.parametrize('reduce,dtype,device',
diff --git a/torch_scatter/__init__.py b/torch_scatter/__init__.py
index 8fff3818..d9388480 100644
--- a/torch_scatter/__init__.py
+++ b/torch_scatter/__init__.py
@@ -1,10 +1,10 @@
-import os
 import importlib
+import os
 import os.path as osp
 
 import torch
 
-__version__ = '2.0.9'
+__version__ = '2.1.2'
 
 for library in ['_version', '_scatter', '_segment_csr', '_segment_coo']:
     cuda_spec = importlib.machinery.PathFinder().find_spec(
@@ -28,18 +28,18 @@
         torch.ops.torch_scatter.scatter_min = scatter_arg_placeholder
         torch.ops.torch_scatter.scatter_max = scatter_arg_placeholder
 
-        from .placeholder import segment_csr_placeholder
-        from .placeholder import segment_csr_arg_placeholder
-        from .placeholder import gather_csr_placeholder
+        from .placeholder import (gather_csr_placeholder,
+                                  segment_csr_arg_placeholder,
+                                  segment_csr_placeholder)
         torch.ops.torch_scatter.segment_sum_csr = segment_csr_placeholder
         torch.ops.torch_scatter.segment_mean_csr = segment_csr_placeholder
         torch.ops.torch_scatter.segment_min_csr = segment_csr_arg_placeholder
         torch.ops.torch_scatter.segment_max_csr = segment_csr_arg_placeholder
         torch.ops.torch_scatter.gather_csr = gather_csr_placeholder
 
-        from .placeholder import segment_coo_placeholder
-        from .placeholder import segment_coo_arg_placeholder
-        from .placeholder import gather_coo_placeholder
+        from .placeholder import (gather_coo_placeholder,
+                                  segment_coo_arg_placeholder,
+                                  segment_coo_placeholder)
         torch.ops.torch_scatter.segment_sum_coo = segment_coo_placeholder
         torch.ops.torch_scatter.segment_mean_coo = segment_coo_placeholder
         torch.ops.torch_scatter.segment_min_coo = segment_coo_arg_placeholder
@@ -47,7 +47,9 @@
         torch.ops.torch_scatter.gather_coo = gather_coo_placeholder
 
 cuda_version = torch.ops.torch_scatter.cuda_version()
-if torch.cuda.is_available() and cuda_version != -1:  # pragma: no cover
+is_not_hip = torch.version.hip is None
+is_cuda = torch.version.cuda is not None
+if is_not_hip and is_cuda and cuda_version != -1:  # pragma: no cover
     if cuda_version < 10000:
         major, minor = int(str(cuda_version)[0]), int(str(cuda_version)[2])
     else:
diff --git a/torch_scatter/composite/logsumexp.py b/torch_scatter/composite/logsumexp.py
index 8fd47454..1d5ff9b6 100644
--- a/torch_scatter/composite/logsumexp.py
+++ b/torch_scatter/composite/logsumexp.py
@@ -1,15 +1,18 @@
 from typing import Optional
 
 import torch
-from torch_scatter import scatter_sum, scatter_max
-
+from torch_scatter import scatter_max, scatter_sum
 from torch_scatter.utils import broadcast
 
 
-def scatter_logsumexp(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
-                      out: Optional[torch.Tensor] = None,
-                      dim_size: Optional[int] = None,
-                      eps: float = 1e-12) -> torch.Tensor:
+def scatter_logsumexp(
+    src: torch.Tensor,
+    index: torch.Tensor,
+    dim: int = -1,
+    out: Optional[torch.Tensor] = None,
+    dim_size: Optional[int] = None,
+    eps: float = 1e-12,
+) -> torch.Tensor:
     if not torch.is_floating_point(src):
         raise ValueError('`scatter_logsumexp` can only be computed over '
                          'tensors with floating point data types.')
@@ -24,17 +27,30 @@ def scatter_logsumexp(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
 
     size = list(src.size())
     size[dim] = dim_size
-    max_value_per_index = torch.full(size, float('-inf'), dtype=src.dtype,
-                                     device=src.device)
+    max_value_per_index = torch.full(
+        size,
+        fill_value=float('-inf'),
+        dtype=src.dtype,
+        device=src.device,
+    )
     scatter_max(src, index, dim, max_value_per_index, dim_size=dim_size)[0]
     max_per_src_element = max_value_per_index.gather(dim, index)
     recentered_score = src - max_per_src_element
     recentered_score.masked_fill_(torch.isnan(recentered_score), float('-inf'))
 
+    orig_out: Optional[torch.Tensor] = None
     if out is not None:
+        orig_out = out.clone()
         out = out.sub_(max_value_per_index).exp_()
 
     sum_per_index = scatter_sum(recentered_score.exp_(), index, dim, out,
                                 dim_size)
 
-    return sum_per_index.add_(eps).log_().add_(max_value_per_index)
+    out = sum_per_index.add_(eps).log_().add_(max_value_per_index)
+
+    if orig_out is None:
+        return out.nan_to_num_(neginf=0.0)
+
+    mask = ~out.isfinite()
+    out[mask] = orig_out[mask]
+    return out
diff --git a/torch_scatter/scatter.py b/torch_scatter/scatter.py
index b994873d..d4d80503 100644
--- a/torch_scatter/scatter.py
+++ b/torch_scatter/scatter.py
@@ -5,7 +5,9 @@
 from .utils import broadcast
 
 
-def scatter_sum(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
+def scatter_sum(src: torch.Tensor,
+                index: torch.Tensor,
+                dim: int = -1,
                 out: Optional[torch.Tensor] = None,
                 dim_size: Optional[int] = None) -> torch.Tensor:
     index = broadcast(index, src, dim)
@@ -23,19 +25,25 @@ def scatter_sum(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
         return out.scatter_add_(dim, index, src)
 
 
-def scatter_add(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
+def scatter_add(src: torch.Tensor,
+                index: torch.Tensor,
+                dim: int = -1,
                 out: Optional[torch.Tensor] = None,
                 dim_size: Optional[int] = None) -> torch.Tensor:
     return scatter_sum(src, index, dim, out, dim_size)
 
 
-def scatter_mul(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
+def scatter_mul(src: torch.Tensor,
+                index: torch.Tensor,
+                dim: int = -1,
                 out: Optional[torch.Tensor] = None,
                 dim_size: Optional[int] = None) -> torch.Tensor:
     return torch.ops.torch_scatter.scatter_mul(src, index, dim, out, dim_size)
 
 
-def scatter_mean(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
+def scatter_mean(src: torch.Tensor,
+                 index: torch.Tensor,
+                 dim: int = -1,
                  out: Optional[torch.Tensor] = None,
                  dim_size: Optional[int] = None) -> torch.Tensor:
     out = scatter_sum(src, index, dim, out, dim_size)
@@ -59,21 +67,28 @@ def scatter_mean(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
 
 
 def scatter_min(
-        src: torch.Tensor, index: torch.Tensor, dim: int = -1,
+        src: torch.Tensor,
+        index: torch.Tensor,
+        dim: int = -1,
         out: Optional[torch.Tensor] = None,
         dim_size: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
     return torch.ops.torch_scatter.scatter_min(src, index, dim, out, dim_size)
 
 
 def scatter_max(
-        src: torch.Tensor, index: torch.Tensor, dim: int = -1,
+        src: torch.Tensor,
+        index: torch.Tensor,
+        dim: int = -1,
         out: Optional[torch.Tensor] = None,
         dim_size: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
     return torch.ops.torch_scatter.scatter_max(src, index, dim, out, dim_size)
 
 
-def scatter(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
-            out: Optional[torch.Tensor] = None, dim_size: Optional[int] = None,
+def scatter(src: torch.Tensor,
+            index: torch.Tensor,
+            dim: int = -1,
+            out: Optional[torch.Tensor] = None,
+            dim_size: Optional[int] = None,
             reduce: str = "sum") -> torch.Tensor:
     r"""
     |
diff --git a/test/utils.py b/torch_scatter/testing.py
similarity index 52%
rename from test/utils.py
rename to torch_scatter/testing.py
index dcdb19c9..2407b8a0 100644
--- a/test/utils.py
+++ b/torch_scatter/testing.py
@@ -1,14 +1,19 @@
+from typing import Any
+
 import torch
 
 reductions = ['sum', 'add', 'mean', 'min', 'max']
 
-dtypes = [torch.half, torch.float, torch.double, torch.int, torch.long]
+dtypes = [
+    torch.half, torch.bfloat16, torch.float, torch.double, torch.int,
+    torch.long
+]
 grad_dtypes = [torch.float, torch.double]
 
 devices = [torch.device('cpu')]
 if torch.cuda.is_available():
-    devices += [torch.device(f'cuda:{torch.cuda.current_device()}')]
+    devices += [torch.device('cuda:0')]
 
 
-def tensor(x, dtype, device):
+def tensor(x: Any, dtype: torch.dtype, device: torch.device):
     return None if x is None else torch.tensor(x, device=device).to(dtype)