Luce-Org · davide221 · Jun 4, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -251,12 +251,14 @@ add_library(dflash_common STATIC
     src/qwen35moe/qwen35moe_ffn.cpp
     src/qwen35moe/qwen35moe_backend.cpp
     src/qwen35moe/qwen35moe_daemon.cpp
-    src/qwen35moe/qwen35moe_routing_stats.cpp
-    src/qwen35moe/qwen35moe_expert_placement.cpp
-    src/qwen35moe/qwen35moe_hybrid_storage.cpp
-    src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
     src/qwen35moe/qwen35moe_pipelined_decode.cpp
-    src/qwen35moe/qwen35moe_swap_manager.cpp
+    # ── Common MoE hybrid infrastructure ──
+    src/common/moe_hybrid_placement.cpp
+    src/common/moe_hybrid_routing_stats.cpp
+    src/common/moe_hybrid_storage.cpp
+    src/common/moe_hybrid_ffn_eval.cpp
+    src/common/cold_ffn_cpu.cpp
+    src/common/moe_hybrid_swap_manager.cpp
     src/qwen35/layer_split_forward.cpp
     src/qwen35/layer_split_daemon.cpp
     src/qwen35/qwen35_backend.cpp
@@ -523,6 +525,11 @@ target_link_libraries(dflash_common
         ggml-base
         nlohmann_json::nlohmann_json
 )
+# OpenMP for parallel cold FFN kernel (saturate memory bandwidth).
+find_package(OpenMP)
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(dflash_common PRIVATE OpenMP::OpenMP_CXX)
+endif()
 if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
     target_link_libraries(dflash_common PRIVATE hip::host)
 endif()
@@ -638,32 +645,32 @@ if(DFLASH27B_TESTS)
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target_laguna.cpp")
         add_executable(smoke_load_target_laguna test/smoke_load_target_laguna.cpp)
         target_include_directories(smoke_load_target_laguna PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_laguna_forward.cpp")
         add_executable(smoke_laguna_forward test/smoke_laguna_forward.cpp)
         target_include_directories(smoke_laguna_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_ttft.cpp")
         add_executable(bench_laguna_ttft test/bench_laguna_ttft.cpp)
         target_include_directories(bench_laguna_ttft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_pflash.cpp")
         add_executable(bench_laguna_pflash test/bench_laguna_pflash.cpp)
         target_include_directories(bench_laguna_pflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_generate.cpp")
         add_executable(bench_laguna_generate test/bench_laguna_generate.cpp)
         target_include_directories(bench_laguna_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_laguna_daemon.cpp")
         add_executable(test_laguna_daemon test/test_laguna_daemon.cpp)
         target_include_directories(test_laguna_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_forward.cpp")
         add_executable(smoke_target_forward test/smoke_target_forward.cpp)

diff --git a/server/README.md b/server/README.md
@@ -231,6 +231,59 @@ Full `bench_llm.py` suite on Qwen3.6-27B UD-Q4_K_XL, 10 prompts, n_gen=256, RTX
 | Math500 | 35.13 | 69.77 | 5.15 | **1.99×** |
 | **Mean** | 34.97 | 69.19 | 5.17 | **1.98×** |
 
+## Hybrid MoE (hot/cold expert split)
+
+For MoE targets whose experts don't fit in VRAM, dflash can split experts
+across the GPU and CPU: the most-used (**hot**) experts stay resident on the
+GPU, the rest (**cold**) live in host RAM and are evaluated on the CPU,
+overlapped with the GPU hot path. This trades some decode/prefill speed for
+VRAM headroom, and is computed automatically at load by a dynamic-placement
+pass. It applies to both MoE arches: `qwen35`/`qwen36` and `laguna`.
+
+**When it triggers.** If the experts fit in the available VRAM budget, all
+experts load to GPU (no split, fastest path). Otherwise placement keeps as
+many hot experts as the budget allows and routes the rest to CPU. You can also
+shrink the budget manually to force a split (e.g. to free VRAM for a longer
+context or a larger target).
+
+### Budget knobs
+
+| Env | Arch | Effect |
+|---|---|---|
+| `DFLASH_EXPERT_BUDGET_MB N` | both | Cap hot-expert VRAM to `N` MB. Applies only when `N` is below the auto-computed budget; experts beyond it go cold (CPU). |
+| `DFLASH_EXPERT_BUDGET_PCT P` | laguna | Keep hot experts to `P`% (`0<P<100`) of total expert bytes. Applies only when below the auto budget. |
+| `DFLASH_MAX_CONTEXT N` | both | Override the max context used when sizing the KV cache (more KV = less VRAM left for hot experts). |
+
+### Placement / tuning knobs (per arch)
+
+Substitute `<ARCH>` = `LAGUNA` or `QWEN35MOE`:
+
+| Env | Effect |
+|---|---|
+| `DFLASH_<ARCH>_HOTNESS <file>` | Expert frequency/hotness file driving which experts are placed hot. |
+| `DFLASH_<ARCH>_TELEMETRY 1` | Log per-layer hot/cold FFN timing telemetry. |
+| `DFLASH_<ARCH>_SWAP_MAX N` | Max hot/cold promotions per request boundary (runtime re-placement); `0` disables swapping. |
+| `DFLASH_<ARCH>_SWAP_MIN_GAIN N` | Min observed-frequency gain before a cold expert is promoted to hot. |
+| `DFLASH_<ARCH>_NEXT_PLACEMENT_OUT <file>` | Dump the placement chosen this run (warm-start the hotness file next time). |
+| `DFLASH_QWEN35MOE_RUNTIME_STATS_OUT <file>` | (qwen only) Dump runtime routing-frequency stats. |
+
+### Example
+
+```bash
+# Force ~8 GB of hot experts on GPU; the rest run cold on the CPU.
+DFLASH_EXPERT_BUDGET_MB=8000 ./build/dflash_server models/laguna-xs2-Q4_K_M.gguf --port 8000
+# Startup log e.g.: "dynamic placement result: 4717 hot experts, 5267 cold experts"
+```
+
+### Caveat: reduced-stack prefill chunking
+
+When a layer's hot-expert stack is **reduced** (i.e. a genuine split), the
+ggml-cuda MMQ `mul_mat_id` kernel illegal-accesses for certain batch sizes on
+**both** HIP/gfx1151 and CUDA/sm_86. As a guard, hybrid-split **prefill** is
+sliced into ≤4-token sub-batches (forcing the stable MMVQ path); **decode**
+(single-token) is unaffected. This costs some prefill throughput on split
+layers and is removed once the kernel is fixed upstream.
+
 ## Laguna-XS.2 target (experimental, Poolside MoE)
 
 [Poolside Laguna-XS.2](https://huggingface.co/poolside/Laguna-XS.2) is a 40-layer MoE LLM with 256 experts (top-8) plus an always-on shared expert, per-layer head counts `[48,64,64,64]×10`, and a per-layer SWA pattern (window 512). It is **architecturally distinct from `qwen35`**, so dflash adds a hand-rolled CUDA forward path (`Path A`, ggml-only — no libllama dependency) that mirrors the qwen35 stack. The Q4_K_M GGUF lands at 18.77 GiB on a single RTX 3090; tok_embd stays CPU-only (110 MiB) to keep the GPU budget under 24 GB.

diff --git a/server/src/common/cold_ffn_compute.h b/server/src/common/cold_ffn_compute.h
@@ -0,0 +1,59 @@
+// ColdFfnCompute: Direct compute interface for cold expert FFN.
+// Bypasses ggml graph dispatch overhead. Shared-memory model (CPU/Halo).
+#pragma once
+
+#include "ggml.h"
+#include <cstdint>
+#include <memory>
+
+namespace dflash::common {
+
+// Per-layer cold weight metadata — raw pointers into shared memory.
+struct ColdFfnLayer {
+    const void * gate_up_data = nullptr;  // fused [n_cold, n_ff*2, n_embd] quantized
+    const void * gate_data = nullptr;     // separate gate [n_cold, n_ff, n_embd]
+    const void * up_data = nullptr;       // separate up   [n_cold, n_ff, n_embd]
+    const void * down_data = nullptr;     // [n_cold, n_embd, n_ff] quantized
+
+    size_t gate_up_stride = 0;   // bytes between experts in gate_up tensor
+    size_t gate_stride = 0;      // bytes between experts in gate tensor
+    size_t up_stride = 0;        // bytes between experts in up tensor
+    size_t down_stride = 0;      // bytes between experts in down tensor
+
+    ggml_type gate_up_type = GGML_TYPE_Q4_K;  // type for fused gate_up
+    ggml_type gate_type = GGML_TYPE_Q4_K;     // type for separate gate
+    ggml_type up_type = GGML_TYPE_Q4_K;       // type for separate up
+    ggml_type down_type = GGML_TYPE_Q4_K;     // type for down projection
+    bool fused_gate_up = false;               // true if gate+up are fused
+
+    // Scale factors (applied after matmul). 1.0 = no scaling.
+    float gate_up_scale = 1.0f;
+    float gate_scale = 1.0f;
+    float up_scale = 1.0f;
+    float down_scale = 1.0f;
+};
+
+// Abstract compute interface. Implementations: CPU (now), Halo (future).
+struct ColdFfnCompute {
+    virtual ~ColdFfnCompute() = default;
+
+    // Compute cold expert FFN contributions and accumulate into output.
+    // input:   [n_embd] F32 — post-norm hidden state
+    // ids:     [n_cold] I32 — local cold expert indices
+    // weights: [n_cold] F32 — routing weights for each cold expert
+    // output:  [n_embd] F32 — accumulated weighted expert outputs (zeroed by callee)
+    virtual void compute(
+        const ColdFfnLayer & layer,
+        const float * input,
+        const int32_t * ids,
+        const float * weights,
+        int n_cold,
+        int n_embd,
+        int n_ff,
+        float * output) = 0;
+};
+
+// Create CPU-based fused cold FFN compute.
+std::unique_ptr<ColdFfnCompute> make_cpu_cold_ffn_compute(int n_ff_max);
+
+}  // namespace dflash::common
diff --git a/server/src/common/cold_ffn_cpu.cpp b/server/src/common/cold_ffn_cpu.cpp
@@ -0,0 +1,190 @@
+// CpuColdFfnCompute: Fused cold expert FFN using ggml vec_dot primitives.
+// Bypasses ggml graph dispatch overhead. Uses OpenMP to saturate memory bandwidth.
+// Memory-bandwidth bound at ~45 GB/s DDR4. Target: 15.7ms → ~3ms/token.
+
+#include "cold_ffn_compute.h"
+#include "ggml-cpu.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <algorithm>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace dflash::common {
+
+class CpuColdFfnCompute : public ColdFfnCompute {
+    int n_ff_max_;
+    int n_threads_;
+
+    // Per-thread scratch buffers for parallel down matmul
+    struct ThreadBuf {
+        std::vector<float> scratch;   // [n_ff * 2] gate_up result + SwiGLU
+        std::vector<uint8_t> mid_conv; // down input converted to vec_dot_type
+    };
+    std::vector<ThreadBuf> thread_bufs_;
+    std::vector<uint8_t> inp_conv_;  // input converted (shared, read-only during matmul)
+
+public:
+    explicit CpuColdFfnCompute(int n_ff_max, int n_threads = 0) : n_ff_max_(n_ff_max) {
+#ifdef _OPENMP
+        if (n_threads <= 0) {
+            const char * env = std::getenv("DFLASH_COLD_THREADS");
+            n_threads = env ? std::atoi(env) : 0;
+        }
+        n_threads_ = n_threads > 0 ? n_threads : std::min(omp_get_max_threads(), 8);
+#else
+        n_threads_ = 1;
+#endif
+        fprintf(stderr, "[cold_ffn] using %d threads\n", n_threads_);
+        thread_bufs_.resize(n_threads_);
+        for (auto & tb : thread_bufs_) {
+            tb.scratch.resize((size_t)n_ff_max * 2);
+        }
+    }
+
+    void compute(
+        const ColdFfnLayer & layer,
+        const float * input,
+        const int32_t * ids,
+        const float * weights,
+        int n_cold,
+        int n_embd,
+        int n_ff,
+        float * output) override {
+
+        if (n_cold <= 0) return;
+        std::memset(output, 0, sizeof(float) * (size_t)n_embd);
+
+        // Gate/up phase type traits
+        const ggml_type gu_type = layer.fused_gate_up ? layer.gate_up_type : layer.gate_type;
+        const auto * gu_cpu_traits = ggml_get_type_traits_cpu(gu_type);
+        const auto gu_vec_dot = gu_cpu_traits->vec_dot;
+        const auto gu_vec_dot_type = gu_cpu_traits->vec_dot_type;
+        const auto gu_from_float = ggml_get_type_traits_cpu(gu_vec_dot_type)->from_float;
+
+        // Down phase type traits (may differ from gate/up)
+        const auto * dn_cpu_traits = ggml_get_type_traits_cpu(layer.down_type);
+        const auto dn_vec_dot = dn_cpu_traits->vec_dot;
+        const auto dn_vec_dot_type = dn_cpu_traits->vec_dot_type;
+        const auto dn_from_float = ggml_get_type_traits_cpu(dn_vec_dot_type)->from_float;
+
+        const size_t inp_row_size = ggml_row_size(gu_vec_dot_type, n_embd);
+        const size_t mid_row_size = ggml_row_size(dn_vec_dot_type, n_ff);
+        const size_t gu_weight_row = ggml_row_size(gu_type, n_embd);
+        const size_t dn_weight_row = ggml_row_size(layer.down_type, n_ff);
+
+        // For separate gate/up — up may have a different type than gate
+        size_t up_weight_row = gu_weight_row;
+        const ggml_type up_type_actual = layer.fused_gate_up ? gu_type : layer.up_type;
+        (void)up_type_actual;
+        ggml_vec_dot_t up_vec_dot = gu_vec_dot;
+        ggml_type up_vdt = gu_vec_dot_type;
+        if (!layer.fused_gate_up && layer.up_type != layer.gate_type) {
+            const auto * up_cpu_traits = ggml_get_type_traits_cpu(layer.up_type);
+            up_vec_dot = up_cpu_traits->vec_dot;
+            up_vdt = up_cpu_traits->vec_dot_type;
+            up_weight_row = ggml_row_size(layer.up_type, n_embd);
+        }
+
+        // Ensure input conversion buffer is large enough
+        if (inp_conv_.size() < inp_row_size) inp_conv_.resize(inp_row_size);
+        // Ensure per-thread mid_conv buffers
+        for (auto & tb : thread_bufs_) {
+            if (tb.mid_conv.size() < mid_row_size) tb.mid_conv.resize(mid_row_size);
+        }
+
+        // Convert input for up if different type
+        std::vector<uint8_t> inp_conv_up;
+        if (!layer.fused_gate_up && up_vdt != gu_vec_dot_type) {
+            size_t up_inp_row_size = ggml_row_size(up_vdt, n_embd);
+            inp_conv_up.resize(up_inp_row_size);
+            auto up_from_float = ggml_get_type_traits_cpu(up_vdt)->from_float;
+            up_from_float(input, inp_conv_up.data(), n_embd);
+        }
+
+        // Convert input to gate's vec_dot format once
+        gu_from_float(input, inp_conv_.data(), n_embd);
+
+        for (int e = 0; e < n_cold; ++e) {
+            const int32_t eid = ids[e];
+            const float w = weights[e];
+            if (w == 0.0f) continue;
+
+            // Use thread 0's scratch for gate_up (serial phase)
+            float * scratch = thread_bufs_[0].scratch.data();
+
+            // ── Phase 1: gate_up matmul → scratch[0..n_ff*2) ──
+            // Parallel over rows (each row is independent, reading shared inp_conv_)
+            if (layer.fused_gate_up) {
+                const char * expert = (const char *)layer.gate_up_data + (size_t)eid * layer.gate_up_stride;
+                const int n_rows = n_ff * 2;
+#ifdef _OPENMP
+                #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+                for (int row = 0; row < n_rows; ++row) {
+                    const void * row_ptr = expert + (size_t)row * gu_weight_row;
+                    gu_vec_dot(n_embd, &scratch[row], 0, row_ptr, 0, inp_conv_.data(), 0, 1);
+                }
+                if (layer.gate_up_scale != 1.0f) {
+                    for (int i = 0; i < n_rows; ++i) scratch[i] *= layer.gate_up_scale;
+                }
+            } else {
+                const char * gate_expert = (const char *)layer.gate_data + (size_t)eid * layer.gate_stride;
+                const char * up_expert = (const char *)layer.up_data + (size_t)eid * layer.up_stride;
+                const uint8_t * up_inp = (!inp_conv_up.empty()) ? inp_conv_up.data() : inp_conv_.data();
+#ifdef _OPENMP
+                #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+                for (int row = 0; row < n_ff; ++row) {
+                    const void * gp = gate_expert + (size_t)row * gu_weight_row;
+                    gu_vec_dot(n_embd, &scratch[row], 0, gp, 0, inp_conv_.data(), 0, 1);
+                    const void * up = up_expert + (size_t)row * up_weight_row;
+                    up_vec_dot(n_embd, &scratch[n_ff + row], 0, up, 0, up_inp, 0, 1);
+                }
+                if (layer.gate_scale != 1.0f) {
+                    for (int i = 0; i < n_ff; ++i) scratch[i] *= layer.gate_scale;
+                }
+                if (layer.up_scale != 1.0f) {
+                    for (int i = 0; i < n_ff; ++i) scratch[n_ff + i] *= layer.up_scale;
+                }
+            }
+
+            // ── Phase 2: SwiGLU activation ──
+            for (int i = 0; i < n_ff; ++i) {
+                const float gate = scratch[i];
+                const float up = scratch[n_ff + i];
+                scratch[i] = (gate / (1.0f + expf(-gate))) * up;
+            }
+
+            // ── Phase 3: down matmul → output (weighted accumulate) ──
+            // Convert SwiGLU result to down's vec_dot format (serial, small)
+            dn_from_float(scratch, thread_bufs_[0].mid_conv.data(), n_ff);
+            const uint8_t * mid_conv_data = thread_bufs_[0].mid_conv.data();
+
+            const char * down_expert = (const char *)layer.down_data + (size_t)eid * layer.down_stride;
+            const float scale = w * layer.down_scale;
+
+            // Parallel down matmul — each thread accumulates its own output rows
+#ifdef _OPENMP
+            #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+            for (int row = 0; row < n_embd; ++row) {
+                float val;
+                const void * row_ptr = down_expert + (size_t)row * dn_weight_row;
+                dn_vec_dot(n_ff, &val, 0, row_ptr, 0, mid_conv_data, 0, 1);
+                output[row] += scale * val;
+            }
+        }
+    }
+};
+
+std::unique_ptr<ColdFfnCompute> make_cpu_cold_ffn_compute(int n_ff_max) {
+    return std::make_unique<CpuColdFfnCompute>(n_ff_max);
+}
+
+}  // namespace dflash::common