diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 1f9a43a7e..69ca622ce 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -191,10 +191,11 @@ jobs: DFLASH_CUDA_ARCHES: ${{ github.event_name == 'pull_request' && '86' || '75;80;86;89;90;120' }} # Same split for HIP: PR builds compile gfx1151 only (the verified # reference, Strix Halo); main / release builds widen to consumer - # RDNA so the published :rocm runs on RX 7900 (gfx1100) and RDNA4 - # (gfx1200) too. CDNA datacenter parts (gfx90a/gfx942) stay out - # until someone can test them. - DFLASH_HIP_ARCHES: ${{ github.event_name == 'pull_request' && 'gfx1151' || 'gfx1151;gfx1100;gfx1200' }} + # RDNA so the published :rocm runs on RX 7900 (gfx1100) and both RDNA4 + # code objects — gfx1200 (RX 9060) and gfx1201 (RX 9070 / Radeon AI + # PRO R9700), which are NOT binary-compatible with each other. CDNA + # datacenter parts (gfx90a/gfx942) stay out until someone can test them. + DFLASH_HIP_ARCHES: ${{ github.event_name == 'pull_request' && 'gfx1151' || 'gfx1151;gfx1100;gfx1200;gfx1201' }} with: # bake-action v6+ defaults `source` to the remote git context, which # makes the local metadata-action bake-file (written to the runner diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 4b0787c6c..4786f3eb4 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -26,8 +26,10 @@ ARG DEBIAN_FRONTEND=noninteractive # assumes; pre-RDNA/CDNA1 parts are excluded): # gfx1151 Strix Halo / Ryzen AI MAX+ 395 (the lucebox appliance iGPU) # gfx1100 RDNA3 RX 7900 XTX / W7900 -# gfx1200 RDNA4 RX 9070 +# gfx1200 RDNA4 RX 9060 (Navi 44) +# gfx1201 RDNA4 RX 9070 / Radeon AI PRO R9700 (Navi 48) # gfx942 CDNA3 MI300X / MI300A +# NB: gfx1200 and gfx1201 are NOT code-object compatible; the R9700 needs gfx1201. 
# gfx90a CDNA2 MI200 / MI250 ARG DFLASH_HIP_ARCHES="gfx1151" diff --git a/README.md b/README.md index 90573243f..4b3949e31 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ Reference target: **RTX 3090 (Ampere sm_86)** — all headline numbers. Other NV | | Volta `sm_70` / Pascal `sm_61` | V100, P40 | CUDA 12.0 | 🟡 fallback paths, unbenched | — | | | RDNA3.5 `gfx1151` | Ryzen AI MAX+ 395 / Strix Halo | ROCm 6+ | ✅ 37 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) | | | RDNA3 `gfx1100` | Radeon RX 7900 XTX | ROCm 6+ | ✅ 50 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) | +| | RDNA4 `gfx1201` | Radeon AI PRO R9700 | ROCm 6.4+ | ✅ 55 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) | `server/` (DFlash) builds with CMake 3.18+ and `--recurse-submodules` for `Luce-Org/llama.cpp@luce-dflash` — no PyTorch needed. `optimizations/megakernel/` is the only component requiring PyTorch 2.0+ (CUDAExtension links against torch C++ libs). Power-tune: `sudo nvidia-smi -pl 220` (3090 sweet spot, re-sweep for other cards). diff --git a/docker-bake.hcl b/docker-bake.hcl index 8b377674b..ab9348a25 100644 --- a/docker-bake.hcl +++ b/docker-bake.hcl @@ -47,9 +47,11 @@ variable "DFLASH_CUDA_ARCHES" { default = "75;80;86;89;90;120" } # Fat-binary HIP/gfx arch list for the rocm variant (semicolon-separated). # Default is gfx1151 (Strix Halo, the lucebox appliance iGPU) only, to keep the # build tractable. Widen for a broadly-runnable released image, e.g.: -# DFLASH_HIP_ARCHES="gfx1151;gfx1100;gfx1200;gfx942;gfx90a" docker buildx bake rocm -# (gfx1151 Strix Halo, gfx1100 RX7900/RDNA3, gfx1200 RDNA4, gfx942 MI300, -# gfx90a MI200.) +# DFLASH_HIP_ARCHES="gfx1151;gfx1100;gfx1200;gfx1201;gfx942;gfx90a" docker buildx bake rocm +# (gfx1151 Strix Halo, gfx1100 RX7900/RDNA3, gfx1200 RDNA4 RX9060, +# gfx1201 RDNA4 RX9070/Radeon AI PRO R9700, gfx942 MI300, gfx90a MI200.) 
+# Note: gfx1200 and gfx1201 are NOT code-object compatible — the R9700 needs +# gfx1201 explicitly. variable "DFLASH_HIP_ARCHES" { default = "gfx1151" } # ROCm base-image tag for the rocm variant. gfx1151 needs >= 6.4.1. Default diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 5c258bdc1..c654354bb 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -19,10 +19,19 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) find_package(nlohmann_json CONFIG QUIET) if(NOT nlohmann_json_FOUND) include(FetchContent) + # DOWNLOAD_EXTRACT_TIMESTAMP (and policy CMP0135) only exist on CMake >= 3.24. + # On older CMake (e.g. 3.22 in the Docker/CI base images) the unknown keyword + # is swallowed into the URL list, which fails with "At least one entry of URL + # is a path (invalid in a list)". Apply it only where it is understood; on + # >= 3.24 it still silences the CMP0135 extraction-timestamp dev warning. + set(_json_fc_extra "") + if(NOT CMAKE_VERSION VERSION_LESS 3.24) + list(APPEND _json_fc_extra DOWNLOAD_EXTRACT_TIMESTAMP TRUE) + endif() FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz - DOWNLOAD_EXTRACT_TIMESTAMP TRUE URL_HASH SHA256=d6c65aca6b1ed68e7a182f4757257b107ae403032760ed6ef121c9d55e81757d + ${_json_fc_extra} ) FetchContent_MakeAvailable(json) endif() @@ -590,6 +599,21 @@ if(DFLASH27B_TESTS) set_target_properties(test_flashprefill_kernels PROPERTIES CUDA_ARCHITECTURES "${_dflash_archs}") target_include_directories(test_flashprefill_kernels PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) target_link_libraries(test_flashprefill_kernels PRIVATE dflash_common CUDA::cudart) + elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND DFLASH27B_HIP_SM80_EQUIV AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp") + # HIP build of the rocWMMA flashprefill numerics test. 
The test source is + # written in CUDA spellings; the hip_compat/ shim maps cuda_runtime.h / + # cuda_bf16.h and the cuda* APIs onto HIP, exactly as the kernel sources + # do. Only meaningful when Phase 2 (rocWMMA) is enabled, since that is + # what provides the launch_*_bf16 symbols in dflash_common. Lets the + # rocWMMA path be validated on AMD (gfx1100 / gfx1151 / gfx12xx incl. + # the R9700) instead of CUDA-only. + add_executable(test_flashprefill_kernels test/test_flashprefill_kernels.cpp) + set_source_files_properties(test/test_flashprefill_kernels.cpp PROPERTIES LANGUAGE HIP) + set_target_properties(test_flashprefill_kernels PROPERTIES HIP_ARCHITECTURES "${_dflash_archs}") + target_include_directories(test_flashprefill_kernels PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat) + target_link_libraries(test_flashprefill_kernels PRIVATE dflash_common ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kv_quant.cpp") add_executable(test_kv_quant test/test_kv_quant.cpp) diff --git a/server/README.md b/server/README.md index 01735ba2f..18f5ffb6d 100644 --- a/server/README.md +++ b/server/README.md @@ -513,6 +513,8 @@ cmake --build build --target test_dflash dflash_server -j Same DFlash + PFlash stack on AMD GPUs. PR #119 ports the Phase 2 rocWMMA flashprefill kernels to HIP. End-to-end on a Ryzen AI MAX+ 395 box (Radeon 8060S iGPU, `gfx1151`, 128 GiB LPDDR5X-8000 unified): **37.0 tok/s DFlash decode** on Qwen3.5-27B Q4_K_M, **27.6 s TTFT @ 16K** with NIAH retrieval intact. **3.08× decode and 2.24× prefill over llama.cpp HIP AR** on the same iGPU. End-to-end wall clock at a 16K prompt + 1K generation workload: **2.66× faster** than vanilla llama.cpp. +**RDNA4 — Radeon AI PRO R9700 (`gfx1201`, 32 GB).** First-class RDNA4 target as of this build. 
Qwen3.6-27B Q4_K_M + DFlash draft (`dflash-draft-3.6-q4_k_m.gguf`), `--ddtree-budget=22`: **54.65 tok/s mean DFlash decode** across the 10-prompt HumanEval suite (`bench_he.py --n-gen 256`, AL 7.14, range 36.9–93.0 tok/s) on ROCm 7.1.1. The rocWMMA Phase 2 flashprefill kernels are numerically correct on RDNA4 — ROCm 7.1's rocWMMA handles the gfx12 WMMA operand-format change internally, so no kernel changes are needed (`test_flashprefill_kernels` PASS on `gfx1201`: max diff 5e-4, e2e `flash_prefill_forward_bf16` at S=8192 in 10.7 ms/iter). Note `gfx1200` (RX 9060) and `gfx1201` (RX 9070 / R9700) are **not** code-object compatible — build for `gfx1201` explicitly for the R9700. + ```bash git clone --recurse-submodules https://github.com/Luce-Org/lucebox-hub && cd lucebox-hub/server @@ -525,9 +527,11 @@ cmake -B build -S . \ cmake --build build --target test_dflash -j ``` -`DFLASH27B_HIP_SM80_EQUIV=ON` enables the rocWMMA Phase 2 flashprefill kernels (path that delivers the prefill speedup). `OFF` falls back to ggml's `flash_attn_ext` (slower but no rocwmma headers needed). +`DFLASH27B_HIP_SM80_EQUIV=ON` enables the rocWMMA Phase 2 flashprefill kernels (path that delivers the prefill speedup). `OFF` falls back to ggml's `flash_attn_ext` (slower but no rocwmma headers needed). With `DFLASH27B_HIP_SM80_EQUIV=ON` (and `DFLASH27B_TESTS` enabled) the build also produces `test_flashprefill_kernels` (HIP) — run it on your card to validate the rocWMMA kernels numerically (`HIP_VISIBLE_DEVICES=0 ./build/test_flashprefill_kernels`, substituting your card's device index). + +**Per-arch DDTree tuning:** `gfx1151` (Strix Halo iGPU, bandwidth-bound on LPDDR5X) peaks at `--ddtree-budget=22`. `gfx1100` (7900 XTX, GDDR6) prefers `budget=8` per the [PR #156 cross-arch perf plan](https://github.com/Luce-Org/lucebox-hub/pull/156). `gfx1201` (RDNA4 / R9700, GDDR6) prefers `budget=22` (`budget=8` is a ~9% regression). Run `scripts/bench_he.py --ddtree-budget N` to verify on your card. 
-**Per-arch DDTree tuning:** `gfx1151` (Strix Halo iGPU, bandwidth-bound on LPDDR5X) peaks at `--ddtree-budget=22`. `gfx1100` (7900 XTX, GDDR6) prefers `budget=8` per the [PR #156 cross-arch perf plan](https://github.com/Luce-Org/lucebox-hub/pull/156). Run `scripts/bench_he.py --ddtree-budget N` to verify on your card. +> **Multi-GPU / distro note.** On a host with more than one AMD GPU, pin the bench to the target with `HIP_VISIBLE_DEVICES`. On distros that link PIE executables by default (e.g. Fedora's system ROCm under `/usr`), add `-DCMAKE_EXE_LINKER_FLAGS=-no-pie` to the `cmake` configure line, and point at the toolchain with `-DCMAKE_HIP_COMPILER_ROCM_ROOT=/usr -DROCM_PATH=/usr` if ROCm lives under `/usr` rather than `/opt/rocm`. **Drafter recipe for max decode:** target = Qwen3.5-27B Q4_K_M, drafter = same gen quantized to Q8_0 via `server/scripts/quantize_draft_q8.py`. Matching Q8_0 GGUF on the unsloth Qwen3.6 target needs `DFLASH27B_DRAFT_SWA=2048` for sliding-window correctness.