Luce-Org · DeanoC · Jun 22, 2026 · Jun 22, 2026
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -191,10 +191,11 @@ jobs:
           DFLASH_CUDA_ARCHES: ${{ github.event_name == 'pull_request' && '86' || '75;80;86;89;90;120' }}
           # Same split for HIP: PR builds compile gfx1151 only (the verified
           # reference, Strix Halo); main / release builds widen to consumer
-          # RDNA so the published :rocm runs on RX 7900 (gfx1100) and RDNA4
-          # (gfx1200) too. CDNA datacenter parts (gfx90a/gfx942) stay out
-          # until someone can test them.
-          DFLASH_HIP_ARCHES: ${{ github.event_name == 'pull_request' && 'gfx1151' || 'gfx1151;gfx1100;gfx1200' }}
+          # RDNA so the published :rocm runs on RX 7900 (gfx1100) and both RDNA4
+          # code objects — gfx1200 (RX 9060) and gfx1201 (RX 9070 / Radeon AI
+          # PRO R9700), which are NOT binary-compatible with each other. CDNA
+          # datacenter parts (gfx90a/gfx942) stay out until someone can test them.
+          DFLASH_HIP_ARCHES: ${{ github.event_name == 'pull_request' && 'gfx1151' || 'gfx1151;gfx1100;gfx1200;gfx1201' }}
         with:
           # bake-action v6+ defaults `source` to the remote git context, which
           # makes the local metadata-action bake-file (written to the runner

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
@@ -26,8 +26,10 @@ ARG DEBIAN_FRONTEND=noninteractive
 # assumes; pre-RDNA/CDNA1 parts are excluded):
 #   gfx1151  Strix Halo / Ryzen AI MAX+ 395  (the lucebox appliance iGPU)
 #   gfx1100  RDNA3        RX 7900 XTX / W7900
-#   gfx1200  RDNA4        RX 9070
+#   gfx1200  RDNA4        RX 9060 (Navi 44)
+#   gfx1201  RDNA4        RX 9070 / Radeon AI PRO R9700 (Navi 48)
 #   gfx942   CDNA3        MI300X / MI300A
 #   gfx90a   CDNA2        MI200 / MI250
+# NB: gfx1200 and gfx1201 are NOT code-object compatible; the R9700 needs gfx1201.
 ARG DFLASH_HIP_ARCHES="gfx1151"
 

diff --git a/README.md b/README.md
@@ -93,6 +93,7 @@ Reference target: **RTX 3090 (Ampere sm_86)** — all headline numbers. Other NV
 | <img src="assets/gpus/v100.png" width="750" /> | Volta `sm_70` / Pascal `sm_61` | V100, P40 | CUDA 12.0 | 🟡 fallback paths, unbenched | — |
 | <img src="assets/gpus/ryze395.png" width="750" /> | RDNA3.5 `gfx1151` | Ryzen AI MAX+ 395 / Strix Halo | ROCm 6+ | ✅ 37 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) |
 | <img src="assets/gpus/7900xtx.png" width="750" /> | RDNA3 `gfx1100` | Radeon RX 7900 XTX | ROCm 6+ | ✅ 50 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) |
+| — | RDNA4 `gfx1201` | Radeon AI PRO R9700 | ROCm 6.4+ (benched on 7.1.1) | ✅ 55 tok/s HIP | [↗](server/README.md#amd-hip-backend-strix-halo-rx-7900-xtx) |
 
 `server/` (DFlash) builds with CMake 3.18+ and `--recurse-submodules` for `Luce-Org/llama.cpp@luce-dflash` — no PyTorch needed. `optimizations/megakernel/` is the only component requiring PyTorch 2.0+ (CUDAExtension links against torch C++ libs). Power-tune: `sudo nvidia-smi -pl 220` (3090 sweet spot, re-sweep for other cards).
 

diff --git a/docker-bake.hcl b/docker-bake.hcl
@@ -47,9 +47,11 @@ variable "DFLASH_CUDA_ARCHES" { default = "75;80;86;89;90;120" }
 # Fat-binary HIP/gfx arch list for the rocm variant (semicolon-separated).
 # Default is gfx1151 (Strix Halo, the lucebox appliance iGPU) only, to keep the
 # build tractable. Widen for a broadly-runnable released image, e.g.:
-#   DFLASH_HIP_ARCHES="gfx1151;gfx1100;gfx1200;gfx942;gfx90a" docker buildx bake rocm
-# (gfx1151 Strix Halo, gfx1100 RX7900/RDNA3, gfx1200 RDNA4, gfx942 MI300,
-# gfx90a MI200.)
+#   DFLASH_HIP_ARCHES="gfx1151;gfx1100;gfx1200;gfx1201;gfx942;gfx90a" docker buildx bake rocm
+# (gfx1151 Strix Halo, gfx1100 RX7900/RDNA3, gfx1200 RDNA4 RX9060,
+# gfx1201 RDNA4 RX9070/Radeon AI PRO R9700, gfx942 MI300, gfx90a MI200.)
+# Note: gfx1200 and gfx1201 are NOT code-object compatible — the R9700 needs
+# gfx1201 explicitly.
 variable "DFLASH_HIP_ARCHES" { default = "gfx1151" }
 
 # ROCm base-image tag for the rocm variant. gfx1151 needs >= 6.4.1. Default

diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -19,10 +19,19 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 find_package(nlohmann_json CONFIG QUIET)
 if(NOT nlohmann_json_FOUND)
     include(FetchContent)
+    # The DOWNLOAD_EXTRACT_TIMESTAMP keyword (and policy CMP0135) first appeared
+    # in CMake 3.24. Older releases (e.g. 3.22 in the Docker/CI base images) take
+    # the unknown keyword for another URL entry and abort with "At least one entry
+    # of URL is a path (invalid in a list)". Forward it only where it is known;
+    # there it still silences the CMP0135 extraction-timestamp dev warning.
+    set(_json_fetch_opts "")
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
+        set(_json_fetch_opts DOWNLOAD_EXTRACT_TIMESTAMP TRUE)
+    endif()
     FetchContent_Declare(json
         URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
-        DOWNLOAD_EXTRACT_TIMESTAMP TRUE
         URL_HASH SHA256=d6c65aca6b1ed68e7a182f4757257b107ae403032760ed6ef121c9d55e81757d
+        ${_json_fetch_opts}
     )
     FetchContent_MakeAvailable(json)
 endif()
@@ -590,6 +599,21 @@ if(DFLASH27B_TESTS)
         set_target_properties(test_flashprefill_kernels PROPERTIES CUDA_ARCHITECTURES "${_dflash_archs}")
         target_include_directories(test_flashprefill_kernels PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
         target_link_libraries(test_flashprefill_kernels PRIVATE dflash_common CUDA::cudart)
+    elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND DFLASH27B_HIP_SM80_EQUIV AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp")
+        # HIP build of the rocWMMA flashprefill numerics test, so that path can be
+        # validated on AMD (gfx1100 / gfx1151 / gfx12xx incl. the R9700) rather
+        # than CUDA only. The test source uses CUDA spellings; the hip_compat/
+        # shim maps cuda_runtime.h / cuda_bf16.h and the cuda* APIs onto HIP, as
+        # for the kernel sources. Gated on Phase 2 (rocWMMA), which provides the
+        # launch_*_bf16 symbols in dflash_common. NOTE(review): _dflash_archs also
+        # feeds CUDA_ARCHITECTURES above — confirm it holds gfx* names under hip.
+        add_executable(test_flashprefill_kernels test/test_flashprefill_kernels.cpp)
+        set_source_files_properties(test/test_flashprefill_kernels.cpp PROPERTIES LANGUAGE HIP)
+        set_target_properties(test_flashprefill_kernels PROPERTIES HIP_ARCHITECTURES "${_dflash_archs}")
+        target_include_directories(test_flashprefill_kernels PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src
+            ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat)
+        target_link_libraries(test_flashprefill_kernels PRIVATE dflash_common ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kv_quant.cpp")
         add_executable(test_kv_quant test/test_kv_quant.cpp)

diff --git a/server/README.md b/server/README.md
@@ -513,6 +513,8 @@ cmake --build build --target test_dflash dflash_server -j
 
 Same DFlash + PFlash stack on AMD GPUs. PR #119 ports the Phase 2 rocWMMA flashprefill kernels to HIP. End-to-end on a Ryzen AI MAX+ 395 box (Radeon 8060S iGPU, `gfx1151`, 128 GiB LPDDR5X-8000 unified): **37.0 tok/s DFlash decode** on Qwen3.5-27B Q4_K_M, **27.6 s TTFT @ 16K** with NIAH retrieval intact. **3.08× decode and 2.24× prefill over llama.cpp HIP AR** on the same iGPU. End-to-end wall clock at a 16K prompt + 1K generation workload: **2.66× faster** than vanilla llama.cpp.
 
+**RDNA4 — Radeon AI PRO R9700 (`gfx1201`, 32 GB).** First-class RDNA4 target as of this build. Qwen3.6-27B Q4_K_M + DFlash draft (`dflash-draft-3.6-q4_k_m.gguf`), `--ddtree-budget=22`: **54.65 tok/s mean DFlash decode** across the 10-prompt HumanEval suite (`bench_he.py --n-gen 256`, AL 7.14, range 36.9–93.0 tok/s) on ROCm 7.1.1. The rocWMMA Phase 2 flashprefill kernels are numerically correct on RDNA4 — ROCm 7.1's rocWMMA handles the gfx12 WMMA operand-format change internally, so no kernel changes are needed (`test_flashprefill_kernels` PASS on `gfx1201`: max diff 5e-4, e2e `flash_prefill_forward_bf16` at S=8192 in 10.7 ms/iter). Note `gfx1200` (RX 9060) and `gfx1201` (RX 9070 / R9700) are **not** code-object compatible — build for `gfx1201` explicitly for the R9700.
+
 ```bash
 git clone --recurse-submodules https://github.com/Luce-Org/lucebox-hub && cd lucebox-hub/server
 
@@ -525,9 +527,11 @@ cmake -B build -S . \
 cmake --build build --target test_dflash -j
 ```
 
-`DFLASH27B_HIP_SM80_EQUIV=ON` enables the rocWMMA Phase 2 flashprefill kernels (path that delivers the prefill speedup). `OFF` falls back to ggml's `flash_attn_ext` (slower but no rocwmma headers needed).
+`DFLASH27B_HIP_SM80_EQUIV=ON` enables the rocWMMA Phase 2 flashprefill kernels (path that delivers the prefill speedup). `OFF` falls back to ggml's `flash_attn_ext` (slower but no rocwmma headers needed). With `SM80_EQUIV=ON` and `DFLASH27B_TESTS` enabled, the build also defines a HIP `test_flashprefill_kernels` target; build it explicitly (`cmake --build build --target test_flashprefill_kernels -j`) and run it on your card to validate the rocWMMA kernels numerically (`HIP_VISIBLE_DEVICES=<gpu> ./build/test_flashprefill_kernels`).
+
+**Per-arch DDTree tuning:** `gfx1151` (Strix Halo iGPU, bandwidth-bound on LPDDR5X) peaks at `--ddtree-budget=22`. `gfx1100` (7900 XTX, GDDR6) prefers `budget=8` per the [PR #156 cross-arch perf plan](https://github.com/Luce-Org/lucebox-hub/pull/156). `gfx1201` (RDNA4 / R9700, GDDR6) prefers `budget=22` (`budget=8` is a ~9% regression). Run `scripts/bench_he.py --ddtree-budget N` to verify on your card.
 
-**Per-arch DDTree tuning:** `gfx1151` (Strix Halo iGPU, bandwidth-bound on LPDDR5X) peaks at `--ddtree-budget=22`. `gfx1100` (7900 XTX, GDDR6) prefers `budget=8` per the [PR #156 cross-arch perf plan](https://github.com/Luce-Org/lucebox-hub/pull/156). Run `scripts/bench_he.py --ddtree-budget N` to verify on your card.
+> **Multi-GPU / distro note.** On a host with more than one AMD GPU, pin the bench to the target with `HIP_VISIBLE_DEVICES`. On distros that link PIE executables by default (e.g. Fedora's system ROCm under `/usr`), add `-DCMAKE_EXE_LINKER_FLAGS=-no-pie` to the `cmake` configure line, and point at the toolchain with `-DCMAKE_HIP_COMPILER_ROCM_ROOT=/usr -DROCM_PATH=/usr` if ROCm lives under `/usr` rather than `/opt/rocm`.
 
 **Drafter recipe for max decode:** target = Qwen3.5-27B Q4_K_M, drafter = same gen quantized to Q8_0 via `server/scripts/quantize_draft_q8.py`. Matching Q8_0 GGUF on the unsloth Qwen3.6 target needs `DFLASH27B_DRAFT_SWA=2048` for sliding-window correctness.