diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 1f9a43a7e..69ca622ce 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -191,10 +191,11 @@ jobs:
DFLASH_CUDA_ARCHES: ${{ github.event_name == 'pull_request' && '86' || '75;80;86;89;90;120' }}
# Same split for HIP: PR builds compile gfx1151 only (the verified
# reference, Strix Halo); main / release builds widen to consumer
- # RDNA so the published :rocm runs on RX 7900 (gfx1100) and RDNA4
- # (gfx1200) too. CDNA datacenter parts (gfx90a/gfx942) stay out
- # until someone can test them.
- DFLASH_HIP_ARCHES: ${{ github.event_name == 'pull_request' && 'gfx1151' || 'gfx1151;gfx1100;gfx1200' }}
+ # RDNA so the published :rocm runs on RX 7900 (gfx1100) and both RDNA4
+ # code objects — gfx1200 (RX 9060) and gfx1201 (RX 9070 / Radeon AI
+ # PRO R9700), which are NOT binary-compatible with each other. CDNA
+ # datacenter parts (gfx90a/gfx942) stay out until someone can test them.
+ DFLASH_HIP_ARCHES: ${{ github.event_name == 'pull_request' && 'gfx1151' || 'gfx1151;gfx1100;gfx1200;gfx1201' }}
with:
# bake-action v6+ defaults `source` to the remote git context, which
# makes the local metadata-action bake-file (written to the runner
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 4b0787c6c..4786f3eb4 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -26,8 +26,10 @@ ARG DEBIAN_FRONTEND=noninteractive
# assumes; pre-RDNA/CDNA1 parts are excluded):
# gfx1151 Strix Halo / Ryzen AI MAX+ 395 (the lucebox appliance iGPU)
# gfx1100 RDNA3 RX 7900 XTX / W7900
-# gfx1200 RDNA4 RX 9070
+# gfx1200 RDNA4 RX 9060 (Navi 44)
+# gfx1201 RDNA4 RX 9070 / Radeon AI PRO R9700 (Navi 48)
# gfx942 CDNA3 MI300X / MI300A
# gfx90a CDNA2 MI200 / MI250
+# NB: gfx1200 and gfx1201 are NOT code-object compatible; the R9700 needs gfx1201.
ARG DFLASH_HIP_ARCHES="gfx1151"
diff --git a/README.md b/README.md
index 90573243f..4b3949e31 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,7 @@ Reference target: **RTX 3090 (Ampere sm_86)** — all headline numbers. Other NV
|
| Volta `sm_70` / Pascal `sm_61` | V100, P40 | CUDA 12.0 | 🟡 fallback paths, unbenched | — |
|
| RDNA3.5 `gfx1151` | Ryzen AI MAX+ 395 / Strix Halo | ROCm 6+ | ✅ 37 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) |
|
| RDNA3 `gfx1100` | Radeon RX 7900 XTX | ROCm 6+ | ✅ 50 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) |
+| RDNA4 `gfx1201` | Radeon AI PRO R9700 | ROCm 6.4+ | ✅ 55 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) |
`server/` (DFlash) builds with CMake 3.18+ and `--recurse-submodules` for `Luce-Org/llama.cpp@luce-dflash` — no PyTorch needed. `optimizations/megakernel/` is the only component requiring PyTorch 2.0+ (CUDAExtension links against torch C++ libs). Power-tune: `sudo nvidia-smi -pl 220` (3090 sweet spot, re-sweep for other cards).
diff --git a/docker-bake.hcl b/docker-bake.hcl
index 8b377674b..ab9348a25 100644
--- a/docker-bake.hcl
+++ b/docker-bake.hcl
@@ -47,9 +47,11 @@ variable "DFLASH_CUDA_ARCHES" { default = "75;80;86;89;90;120" }
# Fat-binary HIP/gfx arch list for the rocm variant (semicolon-separated).
# Default is gfx1151 (Strix Halo, the lucebox appliance iGPU) only, to keep the
# build tractable. Widen for a broadly-runnable released image, e.g.:
-# DFLASH_HIP_ARCHES="gfx1151;gfx1100;gfx1200;gfx942;gfx90a" docker buildx bake rocm
-# (gfx1151 Strix Halo, gfx1100 RX7900/RDNA3, gfx1200 RDNA4, gfx942 MI300,
-# gfx90a MI200.)
+# DFLASH_HIP_ARCHES="gfx1151;gfx1100;gfx1200;gfx1201;gfx942;gfx90a" docker buildx bake rocm
+# (gfx1151 Strix Halo, gfx1100 RX7900/RDNA3, gfx1200 RDNA4 RX9060,
+# gfx1201 RDNA4 RX9070/Radeon AI PRO R9700, gfx942 MI300, gfx90a MI200.)
+# Note: gfx1200 and gfx1201 are NOT code-object compatible — the R9700 needs
+# gfx1201 explicitly.
variable "DFLASH_HIP_ARCHES" { default = "gfx1151" }
# ROCm base-image tag for the rocm variant. gfx1151 needs >= 6.4.1. Default
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index 5c258bdc1..c654354bb 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -19,10 +19,19 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
find_package(nlohmann_json CONFIG QUIET)
if(NOT nlohmann_json_FOUND)
include(FetchContent)
+ # FetchContent only understands DOWNLOAD_EXTRACT_TIMESTAMP (policy CMP0135)
+ # from CMake 3.24 on. Older releases (e.g. the 3.22 shipped in the Docker/CI
+ # base images) fold the unknown keyword into the URL list and abort with
+ # "At least one entry of URL is a path (invalid in a list)". Forward it only
+ # where it is understood; there it silences the CMP0135 timestamp dev warning.
+ set(_json_fc_extra "")
+ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
+   set(_json_fc_extra DOWNLOAD_EXTRACT_TIMESTAMP TRUE)
+ endif()
FetchContent_Declare(json
URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
- DOWNLOAD_EXTRACT_TIMESTAMP TRUE
URL_HASH SHA256=d6c65aca6b1ed68e7a182f4757257b107ae403032760ed6ef121c9d55e81757d
+ ${_json_fc_extra}
)
FetchContent_MakeAvailable(json)
endif()
@@ -590,6 +599,21 @@ if(DFLASH27B_TESTS)
set_target_properties(test_flashprefill_kernels PROPERTIES CUDA_ARCHITECTURES "${_dflash_archs}")
target_include_directories(test_flashprefill_kernels PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(test_flashprefill_kernels PRIVATE dflash_common CUDA::cudart)
+ elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND DFLASH27B_HIP_SM80_EQUIV AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp")
+ # HIP counterpart of the CUDA target above: the rocWMMA flashprefill numerics
+ # test, so the kernels can be validated on AMD (gfx1100 / gfx1151 / gfx12xx
+ # incl. the R9700). Gated on DFLASH27B_HIP_SM80_EQUIV because Phase 2
+ # (rocWMMA) is what provides the launch_*_bf16 symbols in dflash_common.
+ add_executable(test_flashprefill_kernels test/test_flashprefill_kernels.cpp)
+ # Source is in CUDA spellings: build the .cpp as HIP; hip_compat/ maps cuda_runtime.h /
+ # cuda_bf16.h and the cuda* APIs onto HIP, exactly as for the kernel sources.
+ set_source_files_properties(test/test_flashprefill_kernels.cpp PROPERTIES LANGUAGE HIP)
+ # NOTE(review): assumes _dflash_archs holds gfx targets on the HIP backend — confirm.
+ set_target_properties(test_flashprefill_kernels PROPERTIES HIP_ARCHITECTURES "${_dflash_archs}")
+ target_include_directories(test_flashprefill_kernels PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+ ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat)
+ target_link_libraries(test_flashprefill_kernels PRIVATE dflash_common ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kv_quant.cpp")
add_executable(test_kv_quant test/test_kv_quant.cpp)
diff --git a/server/README.md b/server/README.md
index 01735ba2f..18f5ffb6d 100644
--- a/server/README.md
+++ b/server/README.md
@@ -513,6 +513,8 @@ cmake --build build --target test_dflash dflash_server -j
Same DFlash + PFlash stack on AMD GPUs. PR #119 ports the Phase 2 rocWMMA flashprefill kernels to HIP. End-to-end on a Ryzen AI MAX+ 395 box (Radeon 8060S iGPU, `gfx1151`, 128 GiB LPDDR5X-8000 unified): **37.0 tok/s DFlash decode** on Qwen3.5-27B Q4_K_M, **27.6 s TTFT @ 16K** with NIAH retrieval intact. **3.08× decode and 2.24× prefill over llama.cpp HIP AR** on the same iGPU. End-to-end wall clock at a 16K prompt + 1K generation workload: **2.66× faster** than vanilla llama.cpp.
+**RDNA4 — Radeon AI PRO R9700 (`gfx1201`, 32 GB).** First-class RDNA4 target as of this build. Qwen3.6-27B Q4_K_M + DFlash draft (`dflash-draft-3.6-q4_k_m.gguf`), `--ddtree-budget=22`: **54.65 tok/s mean DFlash decode** across the 10-prompt HumanEval suite (`bench_he.py --n-gen 256`, AL 7.14, range 36.9–93.0 tok/s) on ROCm 7.1.1. The rocWMMA Phase 2 flashprefill kernels are numerically correct on RDNA4 — ROCm 7.1's rocWMMA handles the gfx12 WMMA operand-format change internally, so no kernel changes are needed (`test_flashprefill_kernels` PASS on `gfx1201`: max diff 5e-4, e2e `flash_prefill_forward_bf16` at S=8192 in 10.7 ms/iter). Note `gfx1200` (RX 9060) and `gfx1201` (RX 9070 / R9700) are **not** code-object compatible — build for `gfx1201` explicitly for the R9700.
+
```bash
git clone --recurse-submodules https://github.com/Luce-Org/lucebox-hub && cd lucebox-hub/server
@@ -525,9 +527,11 @@ cmake -B build -S . \
cmake --build build --target test_dflash -j
```
-`DFLASH27B_HIP_SM80_EQUIV=ON` enables the rocWMMA Phase 2 flashprefill kernels (path that delivers the prefill speedup). `OFF` falls back to ggml's `flash_attn_ext` (slower but no rocwmma headers needed).
+`DFLASH27B_HIP_SM80_EQUIV=ON` enables the rocWMMA Phase 2 flashprefill kernels (path that delivers the prefill speedup). `OFF` falls back to ggml's `flash_attn_ext` (slower but no rocwmma headers needed). With `DFLASH27B_HIP_SM80_EQUIV=ON` the build also defines a HIP `test_flashprefill_kernels` target — build it with `cmake --build build --target test_flashprefill_kernels -j` and run it on your card to validate the rocWMMA kernels numerically (`HIP_VISIBLE_DEVICES=0 ./build/test_flashprefill_kernels`, replacing `0` with the index of the GPU under test).
+
+**Per-arch DDTree tuning:** `gfx1151` (Strix Halo iGPU, bandwidth-bound on LPDDR5X) peaks at `--ddtree-budget=22`. `gfx1100` (7900 XTX, GDDR6) prefers `budget=8` per the [PR #156 cross-arch perf plan](https://github.com/Luce-Org/lucebox-hub/pull/156). `gfx1201` (RDNA4 / R9700, GDDR6) prefers `budget=22` (`budget=8` is a ~9% regression). Run `scripts/bench_he.py --ddtree-budget N` to verify on your card.
-**Per-arch DDTree tuning:** `gfx1151` (Strix Halo iGPU, bandwidth-bound on LPDDR5X) peaks at `--ddtree-budget=22`. `gfx1100` (7900 XTX, GDDR6) prefers `budget=8` per the [PR #156 cross-arch perf plan](https://github.com/Luce-Org/lucebox-hub/pull/156). Run `scripts/bench_he.py --ddtree-budget N` to verify on your card.
+> **Multi-GPU / distro note.** On a host with more than one AMD GPU, pin the bench to the target with `HIP_VISIBLE_DEVICES`. On distros that link PIE executables by default (e.g. Fedora's system ROCm under `/usr`), add `-DCMAKE_EXE_LINKER_FLAGS=-no-pie` to the `cmake` configure line, and point at the toolchain with `-DCMAKE_HIP_COMPILER_ROCM_ROOT=/usr -DROCM_PATH=/usr` if ROCm lives under `/usr` rather than `/opt/rocm`.
**Drafter recipe for max decode:** target = Qwen3.5-27B Q4_K_M, drafter = same gen quantized to Q8_0 via `server/scripts/quantize_draft_q8.py`. Matching Q8_0 GGUF on the unsloth Qwen3.6 target needs `DFLASH27B_DRAFT_SWA=2048` for sliding-window correctness.