From 06f31e8b6d6510bb5ecc9612cd7dafad72406396 Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Tue, 4 Mar 2025 20:54:12 +0800 Subject: [PATCH 1/3] Enable rocWMMA for Windows CI build --- .github/workflows/build.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b96e1f50acc9e..881b061b5e98b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1203,6 +1203,11 @@ jobs: id: checkout uses: actions/checkout@v4 + - name: Clone rocWMMA repository + id: clone_rocwmma + run: | + git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + - name: Install id: depends run: | @@ -1232,8 +1237,10 @@ jobs: cmake -G "Unix Makefiles" -B build -S . ` -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` + -DCMAKE_CXX_FLAGS="-Irocwmma/library/include/" ` -DCMAKE_BUILD_TYPE=Release ` -DGGML_HIP=ON ` + -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} @@ -1252,6 +1259,11 @@ jobs: with: fetch-depth: 0 + - name: Clone rocWMMA repository + id: clone_rocwmma + run: | + git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 with: @@ -1281,8 +1293,10 @@ jobs: cmake -G "Unix Makefiles" -B build -S . 
` -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` + -DCMAKE_CXX_FLAGS="-Irocwmma/library/include/" ` -DCMAKE_BUILD_TYPE=Release ` -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` + -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} From 6453169032ec93796bf6c6b195ef922e16813e95 Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Tue, 4 Mar 2025 20:56:53 +0800 Subject: [PATCH 2/3] Enable for Ubuntu --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 881b061b5e98b..d22c462f6f901 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -467,6 +467,7 @@ jobs: run: | cmake -B build -S . \ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ -DGGML_HIP=ON cmake --build build --config Release -j $(nproc) @@ -476,6 +477,7 @@ jobs: cmake -B build2 -S . \ -DCMAKE_C_COMPILER=hipcc \ -DCMAKE_CXX_COMPILER=hipcc \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ -DGGML_HIP=ON cmake --build build2 --config Release -j $(nproc) From 353c61c475c3dfdcc3ecbfd5a71020b9313822e6 Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Tue, 4 Mar 2025 21:25:09 +0800 Subject: [PATCH 3/3] GGML_HIP_ROCWMMA_FATTN documentation work --- docs/build.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/build.md b/docs/build.md index b3ecf043d7e48..3d8333328fce0 100644 --- a/docs/build.md +++ b/docs/build.md @@ -235,6 +235,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). 
+ To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system. + + The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager. + + As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checking out the corresponding version tag (e.g. `rocm-6.2.4`) and setting `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows despite not being officially supported by AMD. + Note that if you get the following error: ``` clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library