diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index ed148cb67..2766b1824 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -251,12 +251,14 @@ add_library(dflash_common STATIC
     src/qwen35moe/qwen35moe_ffn.cpp
     src/qwen35moe/qwen35moe_backend.cpp
     src/qwen35moe/qwen35moe_daemon.cpp
-    src/qwen35moe/qwen35moe_routing_stats.cpp
-    src/qwen35moe/qwen35moe_expert_placement.cpp
-    src/qwen35moe/qwen35moe_hybrid_storage.cpp
-    src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
     src/qwen35moe/qwen35moe_pipelined_decode.cpp
-    src/qwen35moe/qwen35moe_swap_manager.cpp
+    # ── Common MoE hybrid infrastructure ──
+    src/common/moe_hybrid_placement.cpp
+    src/common/moe_hybrid_routing_stats.cpp
+    src/common/moe_hybrid_storage.cpp
+    src/common/moe_hybrid_ffn_eval.cpp
+    src/common/cold_ffn_cpu.cpp
+    src/common/moe_hybrid_swap_manager.cpp
     src/qwen35/layer_split_forward.cpp
     src/qwen35/layer_split_daemon.cpp
     src/qwen35/qwen35_backend.cpp
@@ -523,6 +525,11 @@ target_link_libraries(dflash_common
         ggml-base
         nlohmann_json::nlohmann_json
 )
+# OpenMP (optional) parallelises the cold FFN kernel to saturate memory bandwidth; without it the kernel is single-threaded.
+find_package(OpenMP COMPONENTS CXX)
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(dflash_common PRIVATE OpenMP::OpenMP_CXX)
+endif()
 if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
     target_link_libraries(dflash_common PRIVATE hip::host)
 endif()
@@ -638,32 +645,32 @@ if(DFLASH27B_TESTS)
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target_laguna.cpp")
         add_executable(smoke_load_target_laguna test/smoke_load_target_laguna.cpp)
         target_include_directories(smoke_load_target_laguna PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_laguna_forward.cpp")
         add_executable(smoke_laguna_forward test/smoke_laguna_forward.cpp)
         target_include_directories(smoke_laguna_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_ttft.cpp")
         add_executable(bench_laguna_ttft test/bench_laguna_ttft.cpp)
         target_include_directories(bench_laguna_ttft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_pflash.cpp")
         add_executable(bench_laguna_pflash test/bench_laguna_pflash.cpp)
         target_include_directories(bench_laguna_pflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_generate.cpp")
         add_executable(bench_laguna_generate test/bench_laguna_generate.cpp)
         target_include_directories(bench_laguna_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_laguna_daemon.cpp")
         add_executable(test_laguna_daemon test/test_laguna_daemon.cpp)
         target_include_directories(test_laguna_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ggml-cuda)
+        target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_forward.cpp")
         add_executable(smoke_target_forward test/smoke_target_forward.cpp)
diff --git a/server/README.md b/server/README.md
index ac0d57e97..63c240cb3 100644
--- a/server/README.md
+++ b/server/README.md
@@ -231,6 +231,59 @@ Full `bench_llm.py` suite on Qwen3.6-27B UD-Q4_K_XL, 10 prompts, n_gen=256, RTX
 | Math500 | 35.13 | 69.77 | 5.15 | **1.99×** |
 | **Mean** | 34.97 | 69.19 | 5.17 | **1.98×** |
 
+## Hybrid MoE (hot/cold expert split)
+
+For MoE targets whose experts don't fit in VRAM, dflash can split experts
+across the GPU and CPU: the most-used (**hot**) experts stay resident on the
+GPU, the rest (**cold**) live in host RAM and are evaluated on the CPU,
+overlapped with the GPU hot path. This trades some decode/prefill speed for
+VRAM headroom. The split is computed automatically at load time by a
+dynamic-placement pass and applies to both MoE arches: `qwen35`/`qwen36` and `laguna`.
+
+**When it triggers.** If the experts fit in the available VRAM budget, all
+experts load to GPU (no split, fastest path). Otherwise placement keeps as
+many hot experts as the budget allows and routes the rest to CPU. You can also
+shrink the budget manually to force a split (e.g. to free VRAM for a longer
+context or a larger target).
+
+### Budget knobs
+
+| Env | Arch | Effect |
+|---|---|---|
+| `DFLASH_EXPERT_BUDGET_MB N` | both | Cap hot-expert VRAM to `N` MB. Applies only when `N` is below the auto-computed budget; experts beyond it go cold (CPU). |
+| `DFLASH_EXPERT_BUDGET_PCT P` | laguna | Keep hot experts to `P`% (`0<P<100`) of total expert bytes. Applies only when below the auto budget. |
+| `DFLASH_MAX_CONTEXT N` | both | Override the max context used when sizing the KV cache (more KV = less VRAM left for hot experts). |
+
+### Placement / tuning knobs (per arch)
+
+Substitute `<ARCH>` = `LAGUNA` or `QWEN35MOE`:
+
+| Env | Effect |
+|---|---|
+| `DFLASH_<ARCH>_HOTNESS <file>` | Expert frequency/hotness file driving which experts are placed hot. |
+| `DFLASH_<ARCH>_TELEMETRY 1` | Log per-layer hot/cold FFN timing telemetry. |
+| `DFLASH_<ARCH>_SWAP_MAX N` | Max hot/cold promotions per request boundary (runtime re-placement); `0` disables swapping. |
+| `DFLASH_<ARCH>_SWAP_MIN_GAIN N` | Min observed-frequency gain before a cold expert is promoted to hot. |
+| `DFLASH_<ARCH>_NEXT_PLACEMENT_OUT <file>` | Dump the placement chosen this run (warm-start the hotness file next time). |
+| `DFLASH_QWEN35MOE_RUNTIME_STATS_OUT <file>` | (qwen only) Dump runtime routing-frequency stats. |
+
+### Example
+
+```bash
+# Force ~8 GB of hot experts on GPU; the rest run cold on the CPU.
+DFLASH_EXPERT_BUDGET_MB=8000 ./build/dflash_server models/laguna-xs2-Q4_K_M.gguf --port 8000
+# Startup log e.g.: "dynamic placement result: 4717 hot experts, 5267 cold experts"
+```
+
+### Caveat: reduced-stack prefill chunking
+
+When a layer's hot-expert stack is **reduced** (i.e. a genuine split), the
+ggml-cuda MMQ `mul_mat_id` kernel performs illegal memory accesses for certain
+batch sizes on **both** HIP/gfx1151 and CUDA/sm_86. As a guard, hybrid-split
+**prefill** is sliced into ≤4-token sub-batches (forcing the stable MMVQ path);
+**decode** (single-token) is unaffected. This costs some prefill throughput on
+split layers; the guard will be removed once the kernel is fixed upstream.
+
 ## Laguna-XS.2 target (experimental, Poolside MoE)
 
 [Poolside Laguna-XS.2](https://huggingface.co/poolside/Laguna-XS.2) is a 40-layer MoE LLM with 256 experts (top-8) plus an always-on shared expert, per-layer head counts `[48,64,64,64]×10`, and a per-layer SWA pattern (window 512). It is **architecturally distinct from `qwen35`**, so dflash adds a hand-rolled CUDA forward path (`Path A`, ggml-only — no libllama dependency) that mirrors the qwen35 stack. The Q4_K_M GGUF lands at 18.77 GiB on a single RTX 3090; tok_embd stays CPU-only (110 MiB) to keep the GPU budget under 24 GB.
diff --git a/server/src/common/cold_ffn_compute.h b/server/src/common/cold_ffn_compute.h
new file mode 100644
index 000000000..f1d512dec
--- /dev/null
+++ b/server/src/common/cold_ffn_compute.h
@@ -0,0 +1,59 @@
+// ColdFfnCompute: Direct compute interface for cold expert FFN.
+// Bypasses ggml graph dispatch overhead. Shared-memory model (CPU/Halo).
+#pragma once
+
+#include "ggml.h"
+#include <cstdint>
+#include <memory>
+
+namespace dflash::common {
+
+// Per-layer cold weight metadata — raw pointers into shared memory. fused_gate_up selects fused vs. separate gate/up fields.
+struct ColdFfnLayer {
+    const void * gate_up_data = nullptr;  // fused [n_cold, n_ff*2, n_embd] quantized; CPU kernel reads rows [0,n_ff) as gate, [n_ff,2*n_ff) as up
+    const void * gate_data = nullptr;     // separate gate [n_cold, n_ff, n_embd]; read only when !fused_gate_up
+    const void * up_data = nullptr;       // separate up   [n_cold, n_ff, n_embd]; read only when !fused_gate_up
+    const void * down_data = nullptr;     // [n_cold, n_embd, n_ff] quantized
+
+    size_t gate_up_stride = 0;   // bytes between experts in gate_up tensor
+    size_t gate_stride = 0;      // bytes between experts in gate tensor
+    size_t up_stride = 0;        // bytes between experts in up tensor
+    size_t down_stride = 0;      // bytes between experts in down tensor
+
+    ggml_type gate_up_type = GGML_TYPE_Q4_K;  // type for fused gate_up
+    ggml_type gate_type = GGML_TYPE_Q4_K;     // type for separate gate
+    ggml_type up_type = GGML_TYPE_Q4_K;       // type for separate up (may differ from gate_type)
+    ggml_type down_type = GGML_TYPE_Q4_K;     // type for down projection
+    bool fused_gate_up = false;               // true if gate+up are fused (gate_up_* fields are used instead of gate_*/up_*)
+
+    // Scale factors (applied after matmul). 1.0 = no scaling.
+    float gate_up_scale = 1.0f;
+    float gate_scale = 1.0f;
+    float up_scale = 1.0f;
+    float down_scale = 1.0f;
+};
+
+// Abstract compute interface. Implementations: CPU (now), Halo (future).
+struct ColdFfnCompute {
+    virtual ~ColdFfnCompute() = default;
+
+    // Compute cold expert FFN contributions and accumulate into output.
+    // input:   [n_embd] F32 — post-norm hidden state
+    // ids:     [n_cold] I32 — local cold expert indices (scaled by the layer's *_stride to locate each expert)
+    // weights: [n_cold] F32 — routing weights for each cold expert
+    // output:  [n_embd] F32 — accumulated weighted expert outputs (zeroed by callee)
+    virtual void compute(
+        const ColdFfnLayer & layer,   // weight pointers, strides, types and scales for this layer
+        const float * input,
+        const int32_t * ids,
+        const float * weights,
+        int n_cold,                   // number of entries in ids / weights
+        int n_embd,                   // length of input and output
+        int n_ff,                     // per-expert FFN width (the n_ff of the ColdFfnLayer shape comments)
+        float * output) = 0;
+};
+
+// Create CPU-based fused cold FFN compute.
+std::unique_ptr<ColdFfnCompute> make_cpu_cold_ffn_compute(int n_ff_max);
+
+}  // namespace dflash::common
diff --git a/server/src/common/cold_ffn_cpu.cpp b/server/src/common/cold_ffn_cpu.cpp
new file mode 100644
index 000000000..75a93ff86
--- /dev/null
+++ b/server/src/common/cold_ffn_cpu.cpp
@@ -0,0 +1,190 @@
+// CpuColdFfnCompute: Fused cold expert FFN using ggml vec_dot primitives.
+// Bypasses ggml graph dispatch overhead. Uses OpenMP to saturate memory bandwidth.
+// Memory-bandwidth bound at ~45 GB/s DDR4. Target: 15.7ms → ~3ms/token.
+
+#include "cold_ffn_compute.h"
+#include "ggml-cpu.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <algorithm>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace dflash::common {
+
+// CPU ColdFfnCompute: per cold expert, gate/up matmul -> SwiGLU -> down matmul, accumulated into
+// `output` scaled by the routing weight. Reuses member buffers, so one instance is not reentrant.
+class CpuColdFfnCompute : public ColdFfnCompute {
+    int n_ff_max_;   // n_ff the scratch buffers were pre-sized for (they grow on demand in compute())
+    int n_threads_;  // OpenMP team size for the row loops; 1 without OpenMP
+
+    // Per-thread work buffers (only thread_bufs_[0] is used by the current kernel)
+    struct ThreadBuf {
+        std::vector<float> scratch;   // [n_ff * 2] gate_up result + SwiGLU
+        std::vector<uint8_t> mid_conv; // down input converted to vec_dot_type
+    };
+    std::vector<ThreadBuf> thread_bufs_;
+    std::vector<uint8_t> inp_conv_;  // input converted (shared, read-only during matmul)
+
+public:
+    // n_threads <= 0: use DFLASH_COLD_THREADS if set and positive, else min(omp_get_max_threads(), 8).
+    explicit CpuColdFfnCompute(int n_ff_max, int n_threads = 0) : n_ff_max_(n_ff_max) {
+#ifdef _OPENMP
+        if (n_threads <= 0) {
+            const char * env = std::getenv("DFLASH_COLD_THREADS");
+            n_threads = env ? std::atoi(env) : 0;
+        }
+        n_threads_ = n_threads > 0 ? n_threads : std::min(omp_get_max_threads(), 8);
+#else
+        n_threads_ = 1;
+#endif
+        fprintf(stderr, "[cold_ffn] using %d threads\n", n_threads_);
+        thread_bufs_.resize(n_threads_);
+        for (auto & tb : thread_bufs_) tb.scratch.resize((size_t)n_ff_max * 2);
+    }
+
+    void compute(
+        const ColdFfnLayer & layer,
+        const float * input,
+        const int32_t * ids,
+        const float * weights,
+        int n_cold,
+        int n_embd,
+        int n_ff,
+        float * output) override {
+
+        std::memset(output, 0, sizeof(float) * (size_t)n_embd);  // zero first: the contract holds even with no cold experts
+        if (n_cold <= 0) return;
+
+        // Gate/up phase type traits
+        const ggml_type gu_type = layer.fused_gate_up ? layer.gate_up_type : layer.gate_type;
+        const auto * gu_cpu_traits = ggml_get_type_traits_cpu(gu_type);
+        const auto gu_vec_dot = gu_cpu_traits->vec_dot;
+        const auto gu_vec_dot_type = gu_cpu_traits->vec_dot_type;
+        const auto gu_from_float = ggml_get_type_traits_cpu(gu_vec_dot_type)->from_float;
+
+        // Down phase type traits (may differ from gate/up)
+        const auto * dn_cpu_traits = ggml_get_type_traits_cpu(layer.down_type);
+        const auto dn_vec_dot = dn_cpu_traits->vec_dot;
+        const auto dn_vec_dot_type = dn_cpu_traits->vec_dot_type;
+        const auto dn_from_float = ggml_get_type_traits_cpu(dn_vec_dot_type)->from_float;
+
+        const size_t inp_row_size = ggml_row_size(gu_vec_dot_type, n_embd);
+        const size_t mid_row_size = ggml_row_size(dn_vec_dot_type, n_ff);
+        const size_t gu_weight_row = ggml_row_size(gu_type, n_embd);
+        const size_t dn_weight_row = ggml_row_size(layer.down_type, n_ff);
+
+        // For separate gate/up — up may have a different type than gate
+        size_t up_weight_row = gu_weight_row;
+        ggml_vec_dot_t up_vec_dot = gu_vec_dot;
+        ggml_type up_vdt = gu_vec_dot_type;
+        if (!layer.fused_gate_up && layer.up_type != layer.gate_type) {
+            const auto * up_cpu_traits = ggml_get_type_traits_cpu(layer.up_type);
+            up_vec_dot = up_cpu_traits->vec_dot;
+            up_vdt = up_cpu_traits->vec_dot_type;
+            up_weight_row = ggml_row_size(layer.up_type, n_embd);
+        }
+
+        // Ensure input conversion buffer is large enough
+        if (inp_conv_.size() < inp_row_size) inp_conv_.resize(inp_row_size);
+        // Ensure per-thread buffers; scratch grows if n_ff exceeds the n_ff_max given at construction
+        for (auto & tb : thread_bufs_) {
+            if (tb.mid_conv.size() < mid_row_size) tb.mid_conv.resize(mid_row_size);
+            if (tb.scratch.size() < (size_t)n_ff * 2) tb.scratch.resize((size_t)n_ff * 2);
+        }
+
+        // Convert input for up if different type
+        std::vector<uint8_t> inp_conv_up;
+        if (!layer.fused_gate_up && up_vdt != gu_vec_dot_type) {
+            size_t up_inp_row_size = ggml_row_size(up_vdt, n_embd);
+            inp_conv_up.resize(up_inp_row_size);
+            auto up_from_float = ggml_get_type_traits_cpu(up_vdt)->from_float;
+            up_from_float(input, inp_conv_up.data(), n_embd);
+        }
+
+        // Convert input to gate's vec_dot format once
+        gu_from_float(input, inp_conv_.data(), n_embd);
+
+        for (int e = 0; e < n_cold; ++e) {
+            const int32_t eid = ids[e];
+            const float w = weights[e];
+            if (w == 0.0f) continue;
+
+            // Use thread 0's scratch for gate_up (serial phase)
+            float * scratch = thread_bufs_[0].scratch.data();
+
+            // ── Phase 1: gate_up matmul → scratch[0..n_ff*2) ──
+            // Parallel over rows (each row is independent, reading shared inp_conv_)
+            if (layer.fused_gate_up) {
+                const char * expert = (const char *)layer.gate_up_data + (size_t)eid * layer.gate_up_stride;
+                const int n_rows = n_ff * 2;
+#ifdef _OPENMP
+                #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+                for (int row = 0; row < n_rows; ++row) {
+                    const void * row_ptr = expert + (size_t)row * gu_weight_row;
+                    gu_vec_dot(n_embd, &scratch[row], 0, row_ptr, 0, inp_conv_.data(), 0, 1);
+                }
+                if (layer.gate_up_scale != 1.0f) {
+                    for (int i = 0; i < n_rows; ++i) scratch[i] *= layer.gate_up_scale;
+                }
+            } else {
+                const char * gate_expert = (const char *)layer.gate_data + (size_t)eid * layer.gate_stride;
+                const char * up_expert = (const char *)layer.up_data + (size_t)eid * layer.up_stride;
+                const uint8_t * up_inp = (!inp_conv_up.empty()) ? inp_conv_up.data() : inp_conv_.data();
+#ifdef _OPENMP
+                #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+                for (int row = 0; row < n_ff; ++row) {
+                    const void * gp = gate_expert + (size_t)row * gu_weight_row;
+                    gu_vec_dot(n_embd, &scratch[row], 0, gp, 0, inp_conv_.data(), 0, 1);
+                    const void * up = up_expert + (size_t)row * up_weight_row;
+                    up_vec_dot(n_embd, &scratch[n_ff + row], 0, up, 0, up_inp, 0, 1);
+                }
+                if (layer.gate_scale != 1.0f) {
+                    for (int i = 0; i < n_ff; ++i) scratch[i] *= layer.gate_scale;
+                }
+                if (layer.up_scale != 1.0f) {
+                    for (int i = 0; i < n_ff; ++i) scratch[n_ff + i] *= layer.up_scale;
+                }
+            }
+
+            // ── Phase 2: SwiGLU activation ──
+            for (int i = 0; i < n_ff; ++i) {
+                const float gate = scratch[i];
+                const float up = scratch[n_ff + i];
+                scratch[i] = (gate / (1.0f + expf(-gate))) * up;
+            }
+
+            // ── Phase 3: down matmul → output (weighted accumulate) ──
+            // Convert SwiGLU result to down's vec_dot format (serial, small)
+            dn_from_float(scratch, thread_bufs_[0].mid_conv.data(), n_ff);
+            const uint8_t * mid_conv_data = thread_bufs_[0].mid_conv.data();
+
+            const char * down_expert = (const char *)layer.down_data + (size_t)eid * layer.down_stride;
+            const float scale = w * layer.down_scale;
+
+            // Parallel down matmul — each thread accumulates its own output rows
+#ifdef _OPENMP
+            #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+            for (int row = 0; row < n_embd; ++row) {
+                float val;
+                const void * row_ptr = down_expert + (size_t)row * dn_weight_row;
+                dn_vec_dot(n_ff, &val, 0, row_ptr, 0, mid_conv_data, 0, 1);
+                output[row] += scale * val;
+            }
+        }
+    }
+};
+
+std::unique_ptr<ColdFfnCompute> make_cpu_cold_ffn_compute(int n_ff_max) {
+    return std::make_unique<CpuColdFfnCompute>(n_ff_max);  // n_threads defaults to 0: resolved in the ctor (DFLASH_COLD_THREADS / OpenMP)
+}
+
+}  // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp b/server/src/common/moe_hybrid_ffn_eval.cpp
similarity index 79%
rename from server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
rename to server/src/common/moe_hybrid_ffn_eval.cpp
index 6e44b2d3f..4ded0bc5a 100644
--- a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
+++ b/server/src/common/moe_hybrid_ffn_eval.cpp
@@ -1,214 +1,32 @@
-#include "qwen35moe_hybrid_ffn_eval.h"
-
-#include "qwen35_ops.h"
+#include "moe_hybrid_ffn_eval.h"
 
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#include <algorithm>
 #include <chrono>
 #include <cmath>
+#include <cstdio>
+#include <cstring>
 
 namespace dflash::common {
 
 namespace {
 
+// NVFP4 per-tensor scale ("scale2"): weights that carry one need their matmul result
+// multiplied by it. A scale of exactly 1.0f (non-NVFP4 models) returns the input node as is.
+inline ggml_tensor * apply_scale2(ggml_context * ctx, ggml_tensor * mm_result, float scale) {
+    const bool needs_scaling = (scale != 1.0f);
+    return needs_scaling ? ggml_scale(ctx, mm_result, scale) : mm_result;
+}
+
 using HybridClock = std::chrono::steady_clock;
 
 static uint64_t elapsed_us(HybridClock::time_point start, HybridClock::time_point end) {
     return (uint64_t) std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
 }
 
-} // namespace (anon helpers)
-
-// Build a cached FFN graph for the hot+shared path with a fixed n_hot.
-bool build_cached_hot_graph(
-    CachedFfnGraph & out,
-    ggml_backend_t backend,
-    ggml_tensor * gate_tensor,
-    ggml_tensor * up_tensor,
-    ggml_tensor * down_tensor,
-    ggml_tensor * gate_up_tensor,
-    float gate_scale,
-    float up_scale,
-    float down_scale,
-    float gate_up_scale,
-    const TargetLayer & L,
-    int n_embd,
-    int n_ff_exp,
-    int n_hot) {
-
-    out.free();
-    out.n_hot = n_hot;
-
-    ggml_init_params ip{};
-    ip.mem_size = 48 * 1024 * 1024;
-    ip.mem_buffer = nullptr;
-    ip.no_alloc = true;
-    out.ctx = ggml_init(ip);
-    if (!out.ctx) return false;
-
-    out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1);
-    ggml_set_input(out.inp);
-
-    ggml_tensor * routed = nullptr;
-    if (n_hot > 0) {
-        out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_hot, 1);
-        ggml_set_input(out.ids);
-        out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_hot, 1);
-        ggml_set_input(out.weights);
-
-        ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1);
-        ggml_tensor * gu = nullptr;
-        if (gate_up_tensor) {
-            ggml_tensor * gate_up_e = apply_scale2(out.ctx,
-                ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale);
-            ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e,
-                n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
-                gate_up_e->nb[1], gate_up_e->nb[2], 0);
-            ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e,
-                n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
-                gate_up_e->nb[1], gate_up_e->nb[2],
-                (size_t)n_ff_exp * ggml_element_size(gate_up_e));
-            gate_e = ggml_cont(out.ctx, gate_e);
-            up_e = ggml_cont(out.ctx, up_e);
-            gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
-        } else {
-            ggml_tensor * gate_e = apply_scale2(out.ctx,
-                ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale);
-            ggml_tensor * up_e = apply_scale2(out.ctx,
-                ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale);
-            gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
-        }
-
-        ggml_tensor * experts = apply_scale2(out.ctx,
-            ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale);
-        ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_hot, 1);
-        experts = ggml_mul(out.ctx, experts, w_view);
-
-        for (int i = 0; i < n_hot; ++i) {
-            ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2],
-                                               (size_t)i * experts->nb[1]);
-            routed = (i == 0) ? slice : ggml_add(out.ctx, routed, slice);
-        }
-    }
-
-    ggml_tensor * shared = nullptr;
-    const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
-    if (has_shared) {
-        ggml_tensor * sh_gate = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_gate_shexp, out.inp), L.ffn_gate_shexp_s);
-        ggml_tensor * sh_up   = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_up_shexp,   out.inp), L.ffn_up_shexp_s);
-        ggml_tensor * sh_gu   = ggml_swiglu_split(out.ctx, sh_gate, sh_up);
-        shared = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
-        if (L.ffn_gate_inp_shexp) {
-            ggml_tensor * shared_gate = apply_scale2(out.ctx,
-                ggml_mul_mat(out.ctx, L.ffn_gate_inp_shexp, out.inp), L.ffn_gate_inp_shexp_s);
-            shared_gate = ggml_sigmoid(out.ctx, shared_gate);
-            shared = ggml_mul(out.ctx, shared, shared_gate);
-        }
-    }
-
-    if (routed && shared) {
-        out.output = ggml_add(out.ctx, routed, shared);
-    } else if (routed) {
-        out.output = routed;
-    } else {
-        out.output = shared;
-    }
-    if (!out.output) { out.free(); return false; }
-
-    out.gf = ggml_new_graph_custom(out.ctx, 2048, false);
-    ggml_set_output(out.output);
-    ggml_build_forward_expand(out.gf, out.output);
-    out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {
-        out.free();
-        return false;
-    }
-    return true;
-}
-
-// Build a cached FFN graph for the cold (CPU) routed subset.
-bool build_cached_cold_graph(
-    CachedFfnGraph & out,
-    ggml_backend_t cpu_backend,
-    ggml_tensor * gate_tensor,
-    ggml_tensor * up_tensor,
-    ggml_tensor * down_tensor,
-    ggml_tensor * gate_up_tensor,
-    float gate_scale,
-    float up_scale,
-    float down_scale,
-    float gate_up_scale,
-    int n_embd,
-    int n_ff_exp,
-    int n_cold) {
-
-    out.free();
-    out.n_hot = n_cold;  // reuse field for "n experts in this graph"
-
-    ggml_init_params ip{};
-    ip.mem_size = 32 * 1024 * 1024;
-    ip.mem_buffer = nullptr;
-    ip.no_alloc = true;
-    out.ctx = ggml_init(ip);
-    if (!out.ctx) return false;
-
-    out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1);
-    ggml_set_input(out.inp);
-    out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_cold, 1);
-    ggml_set_input(out.ids);
-    out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_cold, 1);
-    ggml_set_input(out.weights);
-
-    ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1);
-    ggml_tensor * gu = nullptr;
-    if (gate_up_tensor) {
-        ggml_tensor * gate_up_e = apply_scale2(out.ctx,
-            ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale);
-        ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e,
-            n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
-            gate_up_e->nb[1], gate_up_e->nb[2], 0);
-        ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e,
-            n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
-            gate_up_e->nb[1], gate_up_e->nb[2],
-            (size_t)n_ff_exp * ggml_element_size(gate_up_e));
-        gate_e = ggml_cont(out.ctx, gate_e);
-        up_e = ggml_cont(out.ctx, up_e);
-        gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
-    } else {
-        ggml_tensor * gate_e = apply_scale2(out.ctx,
-            ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale);
-        ggml_tensor * up_e = apply_scale2(out.ctx,
-            ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale);
-        gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
-    }
-
-    ggml_tensor * experts = apply_scale2(out.ctx,
-        ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale);
-    ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_cold, 1);
-    experts = ggml_mul(out.ctx, experts, w_view);
-
-    out.output = nullptr;
-    for (int i = 0; i < n_cold; ++i) {
-        ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2],
-                                           (size_t)i * experts->nb[1]);
-        out.output = (i == 0) ? slice : ggml_add(out.ctx, out.output, slice);
-    }
-    if (!out.output) { out.free(); return false; }
-
-    out.gf = ggml_new_graph_custom(out.ctx, 1024, false);
-    ggml_set_output(out.output);
-    ggml_build_forward_expand(out.gf, out.output);
-    out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
-    if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {
-        out.free();
-        return false;
-    }
-    return true;
-}
-
-namespace {
-
+// Run routed expert subset on a given backend (GPU or CPU).
 static bool run_routed_subset(ggml_backend_t backend,
                               ggml_tensor * gate_tensor,
                               ggml_tensor * up_tensor,
@@ -331,14 +149,15 @@ static bool run_routed_subset(ggml_backend_t backend,
     return true;
 }
 
+// Shared expert FFN on GPU.
 static bool run_shared_ffn_gpu(ggml_backend_t backend,
-                               const TargetLayer & L,
+                               const MoeLayerDesc & desc,
                                int n_embd,
                                const float * cur_host,
                                std::vector<float> & out,
                                std::string * err) {
     out.assign((size_t)n_embd, 0.0f);
-    if (!L.ffn_up_shexp || !L.ffn_gate_shexp || !L.ffn_down_shexp) {
+    if (!desc.ffn_up_shexp || !desc.ffn_gate_shexp || !desc.ffn_down_shexp) {
         return true;
     }
 
@@ -355,13 +174,13 @@ static bool run_shared_ffn_gpu(ggml_backend_t backend,
     ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 1);
     ggml_set_input(inp);
 
-    ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s);
-    ggml_tensor * sh_up   = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_up_shexp,   inp), L.ffn_up_shexp_s);
+    ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s);
+    ggml_tensor * sh_up   = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_up_shexp,   inp), desc.ffn_up_shexp_s);
     ggml_tensor * sh_gu   = ggml_swiglu_split(ctx, sh_gate, sh_up);
-    ggml_tensor * shared  = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
-    if (L.ffn_gate_inp_shexp) {
+    ggml_tensor * shared  = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+    if (desc.ffn_gate_inp_shexp) {
         ggml_tensor * shared_gate = apply_scale2(ctx,
-            ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s);
+            ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
         shared_gate = ggml_sigmoid(ctx, shared_gate);
         shared = ggml_mul(ctx, shared, shared_gate);
     }
@@ -391,7 +210,6 @@ static bool run_shared_ffn_gpu(ggml_backend_t backend,
 }
 
 // Fused hot routed + shared FFN in a single GPU graph compute.
-// Eliminates one graph compute phase per layer vs separate run_routed_subset + run_shared_ffn_gpu.
 static bool run_hot_and_shared_ffn_gpu(
     ggml_backend_t backend,
     ggml_tensor * gate_tensor,
@@ -402,7 +220,7 @@ static bool run_hot_and_shared_ffn_gpu(
     float up_scale,
     float down_scale,
     float gate_up_scale,
-    const TargetLayer & L,
+    const MoeLayerDesc & desc,
     int n_embd,
     int n_ff_exp,
     const float * cur_host,
@@ -415,7 +233,7 @@ static bool run_hot_and_shared_ffn_gpu(
     out.assign((size_t)n_embd, 0.0f);
 
     const bool has_hot = (n_hot > 0);
-    const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
+    const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
     if (!has_hot && !has_shared) return true;
 
     ggml_init_params ip{};
@@ -478,13 +296,13 @@ static bool run_hot_and_shared_ffn_gpu(
 
     ggml_tensor * shared = nullptr;
     if (has_shared) {
-        ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s);
-        ggml_tensor * sh_up   = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_up_shexp,   inp), L.ffn_up_shexp_s);
+        ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s);
+        ggml_tensor * sh_up   = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_up_shexp,   inp), desc.ffn_up_shexp_s);
         ggml_tensor * sh_gu   = ggml_swiglu_split(ctx, sh_gate, sh_up);
-        shared = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
-        if (L.ffn_gate_inp_shexp) {
+        shared = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+        if (desc.ffn_gate_inp_shexp) {
             ggml_tensor * shared_gate = apply_scale2(ctx,
-                ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s);
+                ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
             shared_gate = ggml_sigmoid(ctx, shared_gate);
             shared = ggml_mul(ctx, shared, shared_gate);
         }
@@ -533,46 +351,266 @@ static bool run_hot_and_shared_ffn_gpu(
     return true;
 }
 
-} // namespace
-
-bool eval_qwen35moe_reference_ffn_single(
-    ggml_backend_t         gpu_backend,
-    const TargetWeights &  w,
-    const TargetLayer &    L,
-    const float *          cur_host,
-    const int32_t *        selected_ids,
-    const float *          selected_weights,
-    int                    n_selected,
-    std::vector<float> &   out,
-    std::string *          err) {
-    // Reference path: fused hot+shared in one graph (same as hybrid but all experts on GPU)
-    if (!run_hot_and_shared_ffn_gpu(gpu_backend,
-                                    L.ffn_gate_exps, L.ffn_up_exps, L.ffn_down_exps, L.ffn_gate_up_exps,
-                                    L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                    L, w.n_embd, w.n_ff_exp,
-                                    cur_host, selected_ids, selected_weights, n_selected,
-                                    out, err)) {
+// Batched-prefill helper: appends the routed-expert FFN (gate/up -> SwiGLU -> down -> weighted reduce over experts) to ctx and stores the result in *out_routed.
+static bool build_batched_routed_graph(
+    ggml_context * ctx,
+    ggml_tensor * gate_tensor,  // read only when gate_up_tensor is null
+    ggml_tensor * up_tensor,  // read only when gate_up_tensor is null
+    ggml_tensor * down_tensor,
+    ggml_tensor * gate_up_tensor,  // optional fused gate+up weights; selects the fused branch when non-null
+    float gate_scale,
+    float up_scale,
+    float down_scale,
+    float gate_up_scale,
+    ggml_tensor * inp,  // must hold n_embd * n_tokens elements (reshaped to [n_embd, 1, n_tokens] below)
+    ggml_tensor * sel,  // expert ids passed to ggml_mul_mat_id; presumably [n_used, n_tokens] -- confirm at call sites
+    ggml_tensor * wts,  // must hold n_used * n_tokens elements (reshaped to [1, n_used, n_tokens] below)
+    int n_embd, int n_ff_exp, int n_used, int n_tokens,
+    ggml_tensor ** out_routed)  // receives the [n_embd, n_tokens] result; dereferenced unconditionally, so must not be null
+{
+    ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens);
+    ggml_tensor * gu = nullptr;
+    if (gate_up_tensor) {
+        ggml_tensor * gate_up_e = apply_scale2(ctx,
+            ggml_mul_mat_id(ctx, gate_up_tensor, cur_3d, sel), gate_up_scale);
+        ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e,  // gate half: first n_ff_exp entries of each row (offset 0)
+            n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+            gate_up_e->nb[1], gate_up_e->nb[2], 0);
+        ggml_tensor * up_e = ggml_view_3d(ctx, gate_up_e,  // up half: n_ff_exp entries starting n_ff_exp elements into each row
+            n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+            gate_up_e->nb[1], gate_up_e->nb[2],
+            (size_t)n_ff_exp * ggml_element_size(gate_up_e));
+        gate_e = ggml_cont(ctx, gate_e);  // materialize the strided views as contiguous tensors
+        up_e = ggml_cont(ctx, up_e);
+        gu = ggml_swiglu_split(ctx, gate_e, up_e);
+    } else {
+        ggml_tensor * gate_e = apply_scale2(ctx,
+            ggml_mul_mat_id(ctx, gate_tensor, cur_3d, sel), gate_scale);
+        ggml_tensor * up_e = apply_scale2(ctx,
+            ggml_mul_mat_id(ctx, up_tensor, cur_3d, sel), up_scale);
+        gu = ggml_swiglu_split(ctx, gate_e, up_e);
+    }
+
+    ggml_tensor * experts = apply_scale2(ctx,
+        ggml_mul_mat_id(ctx, down_tensor, gu, sel), down_scale);
+
+    // Weight and sum over experts: [n_embd, n_used, n_tokens] * [1, n_used, n_tokens]
+    ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens);
+    experts = ggml_mul(ctx, experts, w_view);
+
+    ggml_tensor * sum_shape = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens);  // target shape for the reduction
+    ggml_tensor * moe_sum = ggml_repeat_back(ctx, experts, sum_shape);  // reduces the n_used axis to size 1
+    *out_routed = ggml_reshape_2d(ctx, moe_sum, n_embd, n_tokens);
+    return true;  // no failure path in this helper: it always returns true
+}
+
+} // namespace (anon)
+
+// ── Public API ──────────────────────────────────────────────────────────────────
+
+bool build_cached_hot_graph(
+    CachedFfnGraph & out,
+    ggml_backend_t backend,
+    ggml_tensor * gate_tensor,  // read only when gate_up_tensor is null
+    ggml_tensor * up_tensor,  // read only when gate_up_tensor is null
+    ggml_tensor * down_tensor,
+    ggml_tensor * gate_up_tensor,  // optional fused gate+up weights; selects the fused branch when non-null
+    float gate_scale,
+    float up_scale,
+    float down_scale,
+    float gate_up_scale,
+    const MoeLayerDesc & desc,  // only the shared-expert (*_shexp) tensors and their scales are read here
+    int n_embd,
+    int n_ff_exp,
+    int n_hot) {
+    // Rebuilds `out` as a reusable single-token graph: n_hot routed experts (if any) plus the optional shared expert; returns false on failure.
+    out.free();  // drop whatever `out` held before (free() is defined elsewhere; presumably releases ctx/alloc -- confirm)
+    out.n_hot = n_hot;
+
+    ggml_init_params ip{};
+    ip.mem_size = 48 * 1024 * 1024;  // 48 MiB ggml context
+    ip.mem_buffer = nullptr;
+    ip.no_alloc = true;  // tensor data is placed later by the graph allocator below
+    out.ctx = ggml_init(ip);
+    if (!out.ctx) return false;
+
+    out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1);  // single column of n_embd floats
+    ggml_set_input(out.inp);
+
+    ggml_tensor * routed = nullptr;
+    if (n_hot > 0) {  // with n_hot <= 0 no routed path is built and out.ids / out.weights are not created here
+        out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_hot, 1);
+        ggml_set_input(out.ids);
+        out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_hot, 1);
+        ggml_set_input(out.weights);
+
+        ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1);
+        ggml_tensor * gu = nullptr;
+        if (gate_up_tensor) {
+            ggml_tensor * gate_up_e = apply_scale2(out.ctx,
+                ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale);
+            ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e,  // gate half: first n_ff_exp entries of each row (offset 0)
+                n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+                gate_up_e->nb[1], gate_up_e->nb[2], 0);
+            ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e,  // up half: n_ff_exp entries starting n_ff_exp elements into each row
+                n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+                gate_up_e->nb[1], gate_up_e->nb[2],
+                (size_t)n_ff_exp * ggml_element_size(gate_up_e));
+            gate_e = ggml_cont(out.ctx, gate_e);  // materialize the strided views as contiguous tensors
+            up_e = ggml_cont(out.ctx, up_e);
+            gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
+        } else {
+            ggml_tensor * gate_e = apply_scale2(out.ctx,
+                ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale);
+            ggml_tensor * up_e = apply_scale2(out.ctx,
+                ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale);
+            gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
+        }
+
+        ggml_tensor * experts = apply_scale2(out.ctx,
+            ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale);
+        ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_hot, 1);
+        experts = ggml_mul(out.ctx, experts, w_view);  // scale each expert's output by its routing weight
+
+        for (int i = 0; i < n_hot; ++i) {  // sum over experts as a chain of adds on per-expert slices
+            ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2],
+                                               (size_t)i * experts->nb[1]);
+            routed = (i == 0) ? slice : ggml_add(out.ctx, routed, slice);
+        }
+    }
+
+    ggml_tensor * shared = nullptr;
+    const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);  // all three shared-expert weights are required
+    if (has_shared) {
+        ggml_tensor * sh_gate = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_gate_shexp, out.inp), desc.ffn_gate_shexp_s);
+        ggml_tensor * sh_up   = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_up_shexp,   out.inp), desc.ffn_up_shexp_s);
+        ggml_tensor * sh_gu   = ggml_swiglu_split(out.ctx, sh_gate, sh_up);
+        shared = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+        if (desc.ffn_gate_inp_shexp) {  // optional gate: sigmoid of a projection of the input, multiplied onto the shared output
+            ggml_tensor * shared_gate = apply_scale2(out.ctx,
+                ggml_mul_mat(out.ctx, desc.ffn_gate_inp_shexp, out.inp), desc.ffn_gate_inp_shexp_s);
+            shared_gate = ggml_sigmoid(out.ctx, shared_gate);
+            shared = ggml_mul(out.ctx, shared, shared_gate);
+        }
+    }
+
+    if (routed && shared) {
+        out.output = ggml_add(out.ctx, routed, shared);
+    } else if (routed) {
+        out.output = routed;
+    } else {
+        out.output = shared;
+    }
+    if (!out.output) { out.free(); return false; }  // neither routed nor shared path exists: nothing to compute
+
+    out.gf = ggml_new_graph_custom(out.ctx, 2048, false);  // graph size 2048; third argument false = no gradients
+    ggml_set_output(out.output);
+    ggml_build_forward_expand(out.gf, out.output);
+    out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {  // allocate graph tensors in backend's default buffer type
+        out.free();
+        return false;
+    }
+    return true;
+}
+
+bool build_cached_cold_graph(
+    CachedFfnGraph & out,
+    ggml_backend_t cpu_backend,
+    ggml_tensor * gate_tensor,  // read only when gate_up_tensor is null
+    ggml_tensor * up_tensor,  // read only when gate_up_tensor is null
+    ggml_tensor * down_tensor,
+    ggml_tensor * gate_up_tensor,  // optional fused gate+up weights; selects the fused branch when non-null
+    float gate_scale,
+    float up_scale,
+    float down_scale,
+    float gate_up_scale,
+    int n_embd,
+    int n_ff_exp,
+    int n_cold) {
+    // Rebuilds `out` as a reusable single-token graph over n_cold routed experts (no shared expert); returns false on failure.
+    out.free();  // drop whatever `out` held before (free() is defined elsewhere; presumably releases ctx/alloc -- confirm)
+    out.n_hot = n_cold;  // reuse field for "n experts in this graph"
+
+    ggml_init_params ip{};
+    ip.mem_size = 32 * 1024 * 1024;  // 32 MiB ggml context
+    ip.mem_buffer = nullptr;
+    ip.no_alloc = true;  // tensor data is placed later by the graph allocator below
+    out.ctx = ggml_init(ip);
+    if (!out.ctx) return false;
+
+    out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1);  // single column of n_embd floats
+    ggml_set_input(out.inp);
+    out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_cold, 1);  // created unconditionally, unlike the hot graph's n_hot > 0 guard
+    ggml_set_input(out.ids);
+    out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_cold, 1);
+    ggml_set_input(out.weights);
+
+    ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1);
+    ggml_tensor * gu = nullptr;
+    if (gate_up_tensor) {
+        ggml_tensor * gate_up_e = apply_scale2(out.ctx,
+            ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale);
+        ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e,  // gate half: first n_ff_exp entries of each row (offset 0)
+            n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+            gate_up_e->nb[1], gate_up_e->nb[2], 0);
+        ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e,  // up half: n_ff_exp entries starting n_ff_exp elements into each row
+            n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+            gate_up_e->nb[1], gate_up_e->nb[2],
+            (size_t)n_ff_exp * ggml_element_size(gate_up_e));
+        gate_e = ggml_cont(out.ctx, gate_e);  // materialize the strided views as contiguous tensors
+        up_e = ggml_cont(out.ctx, up_e);
+        gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
+    } else {
+        ggml_tensor * gate_e = apply_scale2(out.ctx,
+            ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale);
+        ggml_tensor * up_e = apply_scale2(out.ctx,
+            ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale);
+        gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
+    }
+
+    ggml_tensor * experts = apply_scale2(out.ctx,
+        ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale);
+    ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_cold, 1);
+    experts = ggml_mul(out.ctx, experts, w_view);  // scale each expert's output by its routing weight
+
+    out.output = nullptr;
+    for (int i = 0; i < n_cold; ++i) {  // sum over experts as a chain of adds on per-expert slices
+        ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2],
+                                           (size_t)i * experts->nb[1]);
+        out.output = (i == 0) ? slice : ggml_add(out.ctx, out.output, slice);
+    }
+    if (!out.output) { out.free(); return false; }  // loop never ran (n_cold <= 0): treated as failure
+
+    out.gf = ggml_new_graph_custom(out.ctx, 1024, false);  // graph size 1024; third argument false = no gradients
+    ggml_set_output(out.output);
+    ggml_build_forward_expand(out.gf, out.output);
+    out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+    if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {  // allocate graph tensors in cpu_backend's default buffer type
+        out.free();
         return false;
     }
     return true;
 }
 
-bool eval_qwen35moe_hybrid_ffn_single(
-    ggml_backend_t                      gpu_backend,
-    const TargetWeights &               w,
-    const TargetLayer &                 L,
-    Qwen35MoeHybridLayerStorage &       storage,
-    ggml_backend_t                      cpu_backend,
-    const float *                       cur_host,
-    const int32_t *                     selected_ids,
-    const float *                       selected_weights,
-    int                                 n_selected,
-    std::vector<float> &                out,
-    Qwen35MoeHybridFfnTelemetry *       telemetry,
-    std::string *                       err) {
+bool eval_moe_hybrid_ffn_single(
+    ggml_backend_t                  gpu_backend,
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,
+    MoeHybridLayerStorage &         storage,
+    ggml_backend_t                  cpu_backend,
+    const float *                   cur_host,
+    const int32_t *                 selected_ids,
+    const float *                   selected_weights,
+    int                             n_selected,
+    std::vector<float> &            out,
+    MoeHybridFfnTelemetry *         telemetry,
+    std::string *                   err) {
+
     if (telemetry) *telemetry = {};
     const auto ffn_wall_t0 = HybridClock::now();
     const auto partition_t0 = HybridClock::now();
+
     std::vector<int32_t> hot_ids;
     std::vector<float> hot_weights;
     std::vector<int32_t> cold_ids;
@@ -606,7 +644,7 @@ bool eval_qwen35moe_hybrid_ffn_single(
 
     const int n_hot = (int)hot_ids.size();
     const bool has_hot = (n_hot > 0);
-    const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
+    const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
     const bool has_cold = !cold_ids.empty();
     const int n_cold = (int)cold_ids.size();
 
@@ -614,17 +652,17 @@ bool eval_qwen35moe_hybrid_ffn_single(
     bool hot_async_launched = false;
     const auto hot_t0 = HybridClock::now();
     if (!has_hot && !has_shared) {
-        hot_and_shared.assign((size_t)w.n_embd, 0.0f);
+        hot_and_shared.assign((size_t)cfg.n_embd, 0.0f);
     } else {
         // Lazily build cached hot graph on first use
         if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) {
             build_cached_hot_graph(storage.hot_graph, gpu_backend,
                                    storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
-                                   L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                   L, w.n_embd, w.n_ff_exp, n_hot);
+                                   desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+                                   desc, cfg.n_embd, cfg.n_ff_exp, n_hot);
         }
         if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) {
-            ggml_backend_tensor_set(storage.hot_graph.inp, cur_host, 0, sizeof(float) * (size_t)w.n_embd);
+            ggml_backend_tensor_set(storage.hot_graph.inp, cur_host, 0, sizeof(float) * (size_t)cfg.n_embd);
             if (storage.hot_graph.ids && has_hot) {
                 ggml_backend_tensor_set(storage.hot_graph.ids, hot_ids.data(), 0, sizeof(int32_t) * (size_t)n_hot);
             }
@@ -638,8 +676,8 @@ bool eval_qwen35moe_hybrid_ffn_single(
             // Fallback: sync compute (no overlap)
             if (!run_hot_and_shared_ffn_gpu(gpu_backend,
                                             storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
-                                            L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                            L, w.n_embd, w.n_ff_exp,
+                                            desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+                                            desc, cfg.n_embd, cfg.n_ff_exp,
                                             cur_host,
                                             hot_ids.empty() ? nullptr : hot_ids.data(),
                                             hot_weights.empty() ? nullptr : hot_weights.data(),
@@ -655,11 +693,11 @@ bool eval_qwen35moe_hybrid_ffn_single(
         if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) {
             build_cached_cold_graph(storage.cold_graph, cpu_backend,
                                     storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
-                                    L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                    w.n_embd, w.n_ff_exp, n_cold);
+                                    desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+                                    cfg.n_embd, cfg.n_ff_exp, n_cold);
         }
         if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) {
-            ggml_backend_tensor_set(storage.cold_graph.inp, cur_host, 0, sizeof(float) * (size_t)w.n_embd);
+            ggml_backend_tensor_set(storage.cold_graph.inp, cur_host, 0, sizeof(float) * (size_t)cfg.n_embd);
             ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids.data(), 0, sizeof(int32_t) * (size_t)n_cold);
             ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights.data(), 0, sizeof(float) * (size_t)n_cold);
             auto st = ggml_backend_graph_compute(cpu_backend, storage.cold_graph.gf);
@@ -668,28 +706,28 @@ bool eval_qwen35moe_hybrid_ffn_single(
                 if (err) *err = "cached cold graph compute failed";
                 return false;
             }
-            cold.resize((size_t)w.n_embd);
-            ggml_backend_tensor_get(storage.cold_graph.output, cold.data(), 0, sizeof(float) * (size_t)w.n_embd);
+            cold.resize((size_t)cfg.n_embd);
+            ggml_backend_tensor_get(storage.cold_graph.output, cold.data(), 0, sizeof(float) * (size_t)cfg.n_embd);
         } else {
             if (!run_routed_subset(cpu_backend,
                                    storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
-                                   L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                   w.n_embd, w.n_ff_exp,
+                                   desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+                                   cfg.n_embd, cfg.n_ff_exp,
                                    cur_host, cold_ids.data(), cold_weights.data(), n_cold, cold, err)) {
                 if (hot_async_launched) ggml_backend_synchronize(gpu_backend);
                 return false;
             }
         }
     } else {
-        cold.assign((size_t)w.n_embd, 0.0f);
+        cold.assign((size_t)cfg.n_embd, 0.0f);
     }
     const auto cold_t1 = HybridClock::now();
 
     // ── Sync GPU and read result ──
     if ((has_hot || has_shared) && storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) {
         ggml_backend_synchronize(gpu_backend);
-        hot_and_shared.resize((size_t)w.n_embd);
-        ggml_backend_tensor_get(storage.hot_graph.output, hot_and_shared.data(), 0, sizeof(float) * (size_t)w.n_embd);
+        hot_and_shared.resize((size_t)cfg.n_embd);
+        ggml_backend_tensor_get(storage.hot_graph.output, hot_and_shared.data(), 0, sizeof(float) * (size_t)cfg.n_embd);
     }
     const auto hot_t1 = HybridClock::now();
 
@@ -700,8 +738,8 @@ bool eval_qwen35moe_hybrid_ffn_single(
     }
 
     const auto combine_t0 = HybridClock::now();
-    out.assign((size_t)w.n_embd, 0.0f);
-    for (int i = 0; i < w.n_embd; ++i) {
+    out.assign((size_t)cfg.n_embd, 0.0f);
+    for (int i = 0; i < cfg.n_embd; ++i) {
         out[(size_t)i] = hot_and_shared[(size_t)i] + cold[(size_t)i];
     }
     const auto combine_t1 = HybridClock::now();
@@ -712,20 +750,20 @@ bool eval_qwen35moe_hybrid_ffn_single(
     return true;
 }
 
-bool eval_qwen35moe_batched_prefill_ffn(
-    ggml_backend_t         gpu_backend,
-    const TargetWeights &  w,
-    const TargetLayer &    L,
-    const float *          cur_host,
-    const int32_t *        selected_ids,
-    const float *          selected_weights,
-    int                    n_tokens,
-    std::vector<float> &   out,
-    std::string *          err) {
-
-    const int n_embd = w.n_embd;
-    const int n_used = w.n_expert_used;
-    const int n_ff_exp = w.n_ff_exp;
+bool eval_moe_batched_prefill_ffn(
+    ggml_backend_t                  gpu_backend,
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,
+    const float *                   cur_host,
+    const int32_t *                 selected_ids,
+    const float *                   selected_weights,
+    int                             n_tokens,
+    std::vector<float> &            out,
+    std::string *                   err) {
+
+    const int n_embd = cfg.n_embd;
+    const int n_used = cfg.n_expert_used;
+    const int n_ff_exp = cfg.n_ff_exp;
     out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
     if (n_tokens <= 0) return true;
 
@@ -739,11 +777,8 @@ bool eval_qwen35moe_batched_prefill_ffn(
         return false;
     }
 
-    // Input: [n_embd, n_tokens]
     ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
     ggml_set_input(inp);
-
-    // Pre-computed routing: selected [n_used, n_tokens], weights [n_used, n_tokens]
     ggml_tensor * sel = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_used, n_tokens);
     ggml_set_input(sel);
     ggml_tensor * wts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_used, n_tokens);
@@ -752,9 +787,9 @@ bool eval_qwen35moe_batched_prefill_ffn(
     // Routed expert computation using full GPU expert tensors
     ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens);
     ggml_tensor * gu = nullptr;
-    if (L.ffn_gate_up_exps) {
+    if (desc.ffn_gate_up_exps) {
         ggml_tensor * gate_up_e = apply_scale2(ctx,
-            ggml_mul_mat_id(ctx, L.ffn_gate_up_exps, cur_3d, sel), L.ffn_gate_up_exps_s);
+            ggml_mul_mat_id(ctx, desc.ffn_gate_up_exps, cur_3d, sel), desc.ffn_gate_up_exps_s);
         ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e,
             n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
             gate_up_e->nb[1], gate_up_e->nb[2], 0);
@@ -767,14 +802,14 @@ bool eval_qwen35moe_batched_prefill_ffn(
         gu = ggml_swiglu_split(ctx, gate_e, up_e);
     } else {
         ggml_tensor * gate_e = apply_scale2(ctx,
-            ggml_mul_mat_id(ctx, L.ffn_gate_exps, cur_3d, sel), L.ffn_gate_exps_s);
+            ggml_mul_mat_id(ctx, desc.ffn_gate_exps, cur_3d, sel), desc.ffn_gate_exps_s);
         ggml_tensor * up_e = apply_scale2(ctx,
-            ggml_mul_mat_id(ctx, L.ffn_up_exps, cur_3d, sel), L.ffn_up_exps_s);
+            ggml_mul_mat_id(ctx, desc.ffn_up_exps, cur_3d, sel), desc.ffn_up_exps_s);
         gu = ggml_swiglu_split(ctx, gate_e, up_e);
     }
 
     ggml_tensor * experts = apply_scale2(ctx,
-        ggml_mul_mat_id(ctx, L.ffn_down_exps, gu, sel), L.ffn_down_exps_s);
+        ggml_mul_mat_id(ctx, desc.ffn_down_exps, gu, sel), desc.ffn_down_exps_s);
 
     // Weight and sum over experts
     ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens);
@@ -786,17 +821,17 @@ bool eval_qwen35moe_batched_prefill_ffn(
 
     // Shared expert
     ggml_tensor * combined = routed;
-    if (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp) {
+    if (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp) {
         ggml_tensor * sh_gate = apply_scale2(ctx,
-            ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s);
+            ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s);
         ggml_tensor * sh_up = apply_scale2(ctx,
-            ggml_mul_mat(ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s);
+            ggml_mul_mat(ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s);
         ggml_tensor * sh_gu = ggml_swiglu_split(ctx, sh_gate, sh_up);
         ggml_tensor * shared = apply_scale2(ctx,
-            ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
-        if (L.ffn_gate_inp_shexp) {
+            ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+        if (desc.ffn_gate_inp_shexp) {
             ggml_tensor * shared_gate = apply_scale2(ctx,
-                ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s);
+                ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
             shared_gate = ggml_sigmoid(ctx, shared_gate);
             shared = ggml_mul(ctx, shared, shared_gate);
         }
@@ -832,80 +867,24 @@ bool eval_qwen35moe_batched_prefill_ffn(
     return true;
 }
 
-// ── GPU-Resident Residual State ──
-
-// ── Hybrid Batched Prefill FFN ──
-// Processes n_tokens at once with hot experts on GPU and cold experts on CPU
-// concurrently.  Uses pre-computed routing from the pre-FFN graph.
-
-static bool build_batched_routed_graph(
-    ggml_context * ctx,
-    ggml_tensor * gate_tensor,
-    ggml_tensor * up_tensor,
-    ggml_tensor * down_tensor,
-    ggml_tensor * gate_up_tensor,
-    float gate_scale,
-    float up_scale,
-    float down_scale,
-    float gate_up_scale,
-    ggml_tensor * inp,          // [n_embd, n_tokens]
-    ggml_tensor * sel,          // [n_used, n_tokens]
-    ggml_tensor * wts,          // [n_used, n_tokens]
-    int n_embd, int n_ff_exp, int n_used, int n_tokens,
-    ggml_tensor ** out_routed)
-{
-    ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens);
-    ggml_tensor * gu = nullptr;
-    if (gate_up_tensor) {
-        ggml_tensor * gate_up_e = apply_scale2(ctx,
-            ggml_mul_mat_id(ctx, gate_up_tensor, cur_3d, sel), gate_up_scale);
-        ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e,
-            n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
-            gate_up_e->nb[1], gate_up_e->nb[2], 0);
-        ggml_tensor * up_e = ggml_view_3d(ctx, gate_up_e,
-            n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
-            gate_up_e->nb[1], gate_up_e->nb[2],
-            (size_t)n_ff_exp * ggml_element_size(gate_up_e));
-        gate_e = ggml_cont(ctx, gate_e);
-        up_e = ggml_cont(ctx, up_e);
-        gu = ggml_swiglu_split(ctx, gate_e, up_e);
-    } else {
-        ggml_tensor * gate_e = apply_scale2(ctx,
-            ggml_mul_mat_id(ctx, gate_tensor, cur_3d, sel), gate_scale);
-        ggml_tensor * up_e = apply_scale2(ctx,
-            ggml_mul_mat_id(ctx, up_tensor, cur_3d, sel), up_scale);
-        gu = ggml_swiglu_split(ctx, gate_e, up_e);
-    }
-
-    ggml_tensor * experts = apply_scale2(ctx,
-        ggml_mul_mat_id(ctx, down_tensor, gu, sel), down_scale);
-
-    // Weight and sum over experts: [n_embd, n_used, n_tokens] * [1, n_used, n_tokens]
-    ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens);
-    experts = ggml_mul(ctx, experts, w_view);
-
-    ggml_tensor * sum_shape = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens);
-    ggml_tensor * moe_sum = ggml_repeat_back(ctx, experts, sum_shape);
-    *out_routed = ggml_reshape_2d(ctx, moe_sum, n_embd, n_tokens);
-    return true;
-}
-
-bool eval_qwen35moe_hybrid_ffn_batched(
-    ggml_backend_t                      gpu_backend,
-    ggml_backend_t                      cpu_backend,
-    const TargetWeights &               w,
-    const TargetLayer &                 L,
-    Qwen35MoeHybridLayerStorage &       storage,
-    const float *                       cur_host,
-    const int32_t *                     selected_ids,
-    const float *                       selected_weights,
-    int                                 n_tokens,
-    std::vector<float> &                out,
-    std::string *                       err) {
-
-    const int n_embd = w.n_embd;
-    const int n_used = w.n_expert_used;
-    const int n_ff_exp = w.n_ff_exp;
+static bool eval_moe_hybrid_ffn_batched_core(
+    ggml_backend_t                  gpu_backend,
+    ggml_backend_t                  cpu_backend,
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,
+    MoeHybridLayerStorage &         storage,
+    const float *                   cur_host,
+    const int32_t *                 selected_ids,
+    const float *                   selected_weights,
+    int                             n_tokens,
+    std::vector<float> &            out,
+    std::string *                   err,
+    ggml_gallocr_t *                p_hot_alloc,
+    ggml_gallocr_t *                p_cold_alloc) {
+
+    const int n_embd = cfg.n_embd;
+    const int n_used = cfg.n_expert_used;
+    const int n_ff_exp = cfg.n_ff_exp;
     out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
     if (n_tokens <= 0) return true;
 
@@ -942,11 +921,6 @@ bool eval_qwen35moe_hybrid_ffn_batched(
         }
     }
 
-    // Fast path: all hot → use hot stack only (no cold compute needed)
-    // NOTE: Cannot use eval_qwen35moe_batched_prefill_ffn here since
-    // L.ffn_gate_exps is not allocated in hybrid mode (skip_expert_tensors=true).
-    // Fall through to hybrid path which uses storage.gate_hot correctly.
-
     // ── Step 2: Build and run hot GPU graph (includes shared expert always) ──
     std::vector<float> hot_partial((size_t)n_embd * (size_t)n_tokens, 0.0f);
     bool hot_async_launched = false;
@@ -956,8 +930,7 @@ bool eval_qwen35moe_hybrid_ffn_batched(
     ggml_gallocr_t hot_alloc = nullptr;
     ggml_tensor * hot_output = nullptr;
 
-    // Always run GPU graph: shared expert is always on GPU, and hot routed when present
-    const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
+    const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
     if (has_hot || has_shared) {
         ggml_init_params ip{};
         ip.mem_size = 128 * 1024 * 1024;
@@ -969,7 +942,6 @@ bool eval_qwen35moe_hybrid_ffn_batched(
         ggml_tensor * inp = ggml_new_tensor_2d(hot_ctx, GGML_TYPE_F32, n_embd, n_tokens);
         ggml_set_input(inp);
 
-        // Only create routing tensors if we have hot routed experts
         ggml_tensor * sel = nullptr;
         ggml_tensor * wts = nullptr;
         ggml_tensor * routed = nullptr;
@@ -981,7 +953,7 @@ bool eval_qwen35moe_hybrid_ffn_batched(
 
             build_batched_routed_graph(hot_ctx,
                 storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
-                L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
+                desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
                 inp, sel, wts, n_embd, n_ff_exp, n_used, n_tokens, &routed);
         }
 
@@ -989,15 +961,15 @@ bool eval_qwen35moe_hybrid_ffn_batched(
         ggml_tensor * combined = routed;
         if (has_shared) {
             ggml_tensor * sh_gate = apply_scale2(hot_ctx,
-                ggml_mul_mat(hot_ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s);
+                ggml_mul_mat(hot_ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s);
             ggml_tensor * sh_up = apply_scale2(hot_ctx,
-                ggml_mul_mat(hot_ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s);
+                ggml_mul_mat(hot_ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s);
             ggml_tensor * sh_gu = ggml_swiglu_split(hot_ctx, sh_gate, sh_up);
             ggml_tensor * shared = apply_scale2(hot_ctx,
-                ggml_mul_mat(hot_ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
-            if (L.ffn_gate_inp_shexp) {
+                ggml_mul_mat(hot_ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+            if (desc.ffn_gate_inp_shexp) {
                 ggml_tensor * shared_gate = apply_scale2(hot_ctx,
-                    ggml_mul_mat(hot_ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s);
+                    ggml_mul_mat(hot_ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
                 shared_gate = ggml_sigmoid(hot_ctx, shared_gate);
                 shared = ggml_mul(hot_ctx, shared, shared_gate);
             }
@@ -1008,10 +980,17 @@ bool eval_qwen35moe_hybrid_ffn_batched(
         hot_gf = ggml_new_graph_custom(hot_ctx, 4096, false);
         ggml_set_output(hot_output);
         ggml_build_forward_expand(hot_gf, hot_output);
-        hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend));
+        if (p_hot_alloc) {
+            if (!*p_hot_alloc)
+                *p_hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend));
+            hot_alloc = *p_hot_alloc;
+        } else {
+            hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend));
+        }
         if (!ggml_gallocr_alloc_graph(hot_alloc, hot_gf)) {
             if (err) *err = "hybrid batched hot gallocr failed";
-            ggml_gallocr_free(hot_alloc); ggml_free(hot_ctx);
+            if (!p_hot_alloc) ggml_gallocr_free(hot_alloc);
+            ggml_free(hot_ctx);
             return false;
         }
 
@@ -1053,18 +1032,26 @@ bool eval_qwen35moe_hybrid_ffn_batched(
         ggml_tensor * cold_routed = nullptr;
         build_batched_routed_graph(cold_ctx,
             storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
-            L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
+            desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
             inp, sel, wts, n_embd, n_ff_exp, n_used, n_tokens, &cold_routed);
 
         ggml_cgraph * cold_gf = ggml_new_graph_custom(cold_ctx, 4096, false);
         ggml_set_output(cold_routed);
         ggml_build_forward_expand(cold_gf, cold_routed);
-        ggml_gallocr_t cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+        ggml_gallocr_t cold_alloc;
+        if (p_cold_alloc) {
+            if (!*p_cold_alloc)
+                *p_cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+            cold_alloc = *p_cold_alloc;
+        } else {
+            cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+        }
         if (!ggml_gallocr_alloc_graph(cold_alloc, cold_gf)) {
             if (hot_async_launched) ggml_backend_synchronize(gpu_backend);
-            if (hot_alloc) ggml_gallocr_free(hot_alloc);
+            if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc);
             if (hot_ctx) ggml_free(hot_ctx);
-            ggml_gallocr_free(cold_alloc); ggml_free(cold_ctx);
+            if (!p_cold_alloc) ggml_gallocr_free(cold_alloc);
+            ggml_free(cold_ctx);
             if (err) *err = "hybrid batched cold gallocr failed";
             return false;
         }
@@ -1077,16 +1064,17 @@ bool eval_qwen35moe_hybrid_ffn_batched(
         auto st = ggml_backend_graph_compute(cpu_backend, cold_gf);
         if (st != GGML_STATUS_SUCCESS) {
             if (hot_async_launched) ggml_backend_synchronize(gpu_backend);
-            if (hot_alloc) ggml_gallocr_free(hot_alloc);
+            if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc);
             if (hot_ctx) ggml_free(hot_ctx);
-            ggml_gallocr_free(cold_alloc); ggml_free(cold_ctx);
+            if (!p_cold_alloc) ggml_gallocr_free(cold_alloc);
+            ggml_free(cold_ctx);
             if (err) *err = "hybrid batched cold compute failed";
             return false;
         }
 
         ggml_backend_tensor_get(cold_routed, cold_partial.data(), 0,
             sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
-        ggml_gallocr_free(cold_alloc);
+        if (!p_cold_alloc) ggml_gallocr_free(cold_alloc);
         ggml_free(cold_ctx);
     }
 
@@ -1096,7 +1084,7 @@ bool eval_qwen35moe_hybrid_ffn_batched(
         ggml_backend_tensor_get(hot_output, hot_partial.data(), 0,
             sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
     }
-    if (hot_alloc) ggml_gallocr_free(hot_alloc);
+    if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc);
     if (hot_ctx) ggml_free(hot_ctx);
 
     // ── Step 5: Merge hot + cold ──
@@ -1108,6 +1096,60 @@ bool eval_qwen35moe_hybrid_ffn_batched(
     return true;
 }
 
+// Public batched entry. Works around a ggml-cuda/HIP defect: the MMQ
+// mul_mat_id kernel makes an illegal access on gfx1151 when the per-layer hot
+// expert stack is REDUCED (n_hot_stack < n_expert); full (all-hot) stacks are
+// fine. MMVQ replaces MMQ only for small matmul batches (= n_tokens; the Q4_K
+// AMD MMVQ-mmid cap is 4), so reduced stacks are evaluated in <=4-token slices
+// whose results are copied into `out` (n_embd floats per token). Everything
+// else takes the single-shot core call. Returns false if any core call fails.
+bool eval_moe_hybrid_ffn_batched(
+    ggml_backend_t                  gpu_backend,
+    ggml_backend_t                  cpu_backend,
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,
+    MoeHybridLayerStorage &         storage,
+    const float *                   cur_host,
+    const int32_t *                 selected_ids,
+    const float *                   selected_weights,
+    int                             n_tokens,
+    std::vector<float> &            out,
+    std::string *                   err,
+    ggml_gallocr_t *                p_hot_alloc,
+    ggml_gallocr_t *                p_cold_alloc) {
+    const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
+                          : storage.gate_hot    ? (int)storage.gate_hot->ne[2]
+                          : 0;
+    static const int MMQ_SAFE_SUB_BATCH = 4;
+    if (n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) {
+        const size_t n_embd = (size_t)cfg.n_embd;
+        const size_t n_used = (size_t)cfg.n_expert_used;
+        out.assign(n_embd * (size_t)n_tokens, 0.0f);
+        std::vector<float> sub_out;
+        for (int t0 = 0; t0 < n_tokens; t0 += MMQ_SAFE_SUB_BATCH) {
+            const int tc = std::min(MMQ_SAFE_SUB_BATCH, n_tokens - t0);
+            if (!eval_moe_hybrid_ffn_batched_core(
+                    gpu_backend, cpu_backend, cfg, desc, storage,
+                    cur_host + (size_t)t0 * n_embd,
+                    selected_ids + (size_t)t0 * n_used, selected_weights + (size_t)t0 * n_used,
+                    tc, sub_out, err, p_hot_alloc, p_cold_alloc)) {
+                return false;
+            }
+            if (sub_out.size() < n_embd * (size_t)tc) {  // never copy past the core's output
+                if (err) *err = "hybrid batched sub-batch output too small";
+                return false;
+            }
+            std::memcpy(out.data() + (size_t)t0 * n_embd, sub_out.data(),
+                        sizeof(float) * n_embd * (size_t)tc);
+        }
+        return true;
+    }
+    return eval_moe_hybrid_ffn_batched_core(gpu_backend, cpu_backend, cfg, desc, storage,
+        cur_host, selected_ids, selected_weights, n_tokens, out, err, p_hot_alloc, p_cold_alloc);
+}
+
+// ── GPU-Resident Residual State ──
+
 void ResidualCombineGraph::free() {
     if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; }
     if (ctx) { ggml_free(ctx); ctx = nullptr; }
@@ -1165,7 +1207,6 @@ void GpuResidentState::destroy() {
 bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd) {
     out.destroy();
 
-    // Allocate persistent GPU tensor for act_cur
     ggml_init_params ip{};
     ip.mem_size = 1024 * 1024;
     ip.mem_buffer = nullptr;
@@ -1180,38 +1221,31 @@ bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int
         return false;
     }
 
-    // Build the residual combine graph
     if (!build_residual_combine_graph(out.combine, backend, n_embd)) {
         out.destroy();
         return false;
     }
 
-    // Zero out cold_in initially (for all-hot layers, cold stays zero)
     std::vector<float> zeros((size_t)n_embd, 0.0f);
     ggml_backend_tensor_set(out.combine.cold_in, zeros.data(), 0, sizeof(float) * (size_t)n_embd);
 
     return true;
 }
 
-// ─── GPU-Resident hybrid FFN eval ─────────────────────────────────────────────
-// Keeps activation on GPU: only reads router IDs (64B) to CPU, and ffn_post
-// to CPU only when cold experts are selected.  All other data movement is
-// GPU→GPU via ggml_backend_tensor_copy.
-
-bool eval_qwen35moe_hybrid_ffn_gpu_resident(
-    ggml_backend_t                      gpu_backend,
-    const TargetWeights &               w,
-    const TargetLayer &                 L,
-    Qwen35MoeHybridLayerStorage &       storage,
-    ggml_backend_t                      cpu_backend,
-    ggml_tensor *                       ffn_post_gpu,
-    ggml_tensor *                       ffn_residual_gpu,
-    GpuResidentState &                  gpu_state,
-    const int32_t *                     selected_ids,
-    const float *                       selected_weights,
-    int                                 n_selected) {
-
-    const int n_embd = w.n_embd;
+bool eval_moe_hybrid_ffn_gpu_resident(
+    ggml_backend_t                  gpu_backend,
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,
+    MoeHybridLayerStorage &         storage,
+    ggml_backend_t                  cpu_backend,
+    ggml_tensor *                   ffn_post_gpu,
+    ggml_tensor *                   ffn_residual_gpu,
+    GpuResidentState &              gpu_state,
+    const int32_t *                 selected_ids,
+    const float *                   selected_weights,
+    int                             n_selected) {
+
+    const int n_embd = cfg.n_embd;
 
     // ── Partition into hot/cold ──
     std::vector<int32_t> hot_ids;
@@ -1239,7 +1273,7 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
 
     const int n_hot = (int)hot_ids.size();
     const bool has_hot = (n_hot > 0);
-    const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
+    const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
     const bool has_cold = !cold_ids.empty();
     const int n_cold = (int)cold_ids.size();
 
@@ -1252,8 +1286,8 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
         if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) {
             build_cached_hot_graph(storage.hot_graph, gpu_backend,
                                    storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
-                                   L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                   L, n_embd, w.n_ff_exp, n_hot);
+                                   desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+                                   desc, n_embd, cfg.n_ff_exp, n_hot);
         }
         if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) {
             // GPU→GPU copy: ffn_post → hot_graph.inp (no PCIe!)
@@ -1270,7 +1304,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
     }
 
     // ── If cold needed, read ffn_post to CPU BEFORE launching hot async ──
-    // (to avoid serializing GPU queue with a device→host read mid-kernel)
     std::vector<float> post_host;
     if (has_cold) {
         post_host.resize((size_t)n_embd);
@@ -1289,8 +1322,8 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
         if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) {
             build_cached_cold_graph(storage.cold_graph, cpu_backend,
                                     storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
-                                    L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                    n_embd, w.n_ff_exp, n_cold);
+                                    desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+                                    n_embd, cfg.n_ff_exp, n_cold);
         }
         if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) {
             ggml_backend_tensor_set(storage.cold_graph.inp, post_host.data(), 0,
@@ -1308,7 +1341,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
             ggml_backend_tensor_get(storage.cold_graph.output, cold_result.data(), 0,
                                     sizeof(float) * (size_t)n_embd);
         } else {
-            // Fallback: cold graph build failed — shouldn't happen
             if (hot_async_launched) ggml_backend_synchronize(gpu_backend);
             return false;
         }
@@ -1320,7 +1352,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
         // GPU→GPU: hot output → combine.hot_in
         ggml_backend_tensor_copy(storage.hot_graph.output, gpu_state.combine.hot_in);
     } else {
-        // No hot/shared: zero hot_in
         std::vector<float> zeros((size_t)n_embd, 0.0f);
         ggml_backend_tensor_set(gpu_state.combine.hot_in, zeros.data(), 0,
                                 sizeof(float) * (size_t)n_embd);
diff --git a/server/src/common/moe_hybrid_ffn_eval.h b/server/src/common/moe_hybrid_ffn_eval.h
new file mode 100644
index 000000000..bd30755ea
--- /dev/null
+++ b/server/src/common/moe_hybrid_ffn_eval.h
@@ -0,0 +1,191 @@
+// Common MoE hybrid FFN evaluation — hot experts on GPU, cold on CPU, concurrent.
+
+#pragma once
+
+#include "moe_hybrid_types.h"
+#include "moe_hybrid_storage.h"
+#include "ggml-backend.h"
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace dflash::common {
+
+// GPU-resident residual combine graph: output = residual + hot_out + cold_correction.
+struct ResidualCombineGraph {  // move-only; the destructor calls free()
+    ggml_context * ctx = nullptr;  // released by free()
+    ggml_cgraph * gf = nullptr;  // NOTE(review): presumably allocated from ctx — confirm
+    ggml_gallocr_t alloc = nullptr;  // released by free()
+    ggml_tensor * residual_in = nullptr;  // input: the `residual` term
+    ggml_tensor * hot_in = nullptr;  // input: the `hot_out` term
+    ggml_tensor * cold_in = nullptr;  // input: the `cold_correction` term
+    ggml_tensor * output = nullptr;  // graph result
+
+    ResidualCombineGraph() = default;
+    ~ResidualCombineGraph() { free(); }
+    ResidualCombineGraph(const ResidualCombineGraph &) = delete;
+    ResidualCombineGraph & operator=(const ResidualCombineGraph &) = delete;
+    ResidualCombineGraph(ResidualCombineGraph && o) noexcept  // takes over every handle held by o
+        : ctx(o.ctx), gf(o.gf), alloc(o.alloc),
+          residual_in(o.residual_in), hot_in(o.hot_in),
+          cold_in(o.cold_in), output(o.output) {
+        o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;  // o must no longer refer to the moved handles
+        o.residual_in = nullptr; o.hot_in = nullptr;
+        o.cold_in = nullptr; o.output = nullptr;
+    }
+    ResidualCombineGraph & operator=(ResidualCombineGraph && o) noexcept {
+        if (this != &o) {  // self-move leaves the object untouched
+            free();  // release the handles currently held before adopting o's
+            ctx = o.ctx; gf = o.gf; alloc = o.alloc;
+            residual_in = o.residual_in; hot_in = o.hot_in;
+            cold_in = o.cold_in; output = o.output;
+            o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
+            o.residual_in = nullptr; o.hot_in = nullptr;
+            o.cold_in = nullptr; o.output = nullptr;
+        }
+        return *this;
+    }
+    bool valid() const { return ctx && gf && alloc && output; }  // the *_in tensors are not checked
+    void free();
+    void destroy();  // NOTE(review): definition not visible in this change; relation to free() unconfirmed
+};
+// Populates `out` for `backend`; init_gpu_resident_state() aborts when this returns false.
+bool build_residual_combine_graph(ResidualCombineGraph & out, ggml_backend_t backend, int n_embd);
+
+// GPU-resident state for the decode loop.
+struct GpuResidentState {  // move-only; the destructor calls destroy()
+    ggml_context * ctx = nullptr;  // NOTE(review): presumably holds act_cur's tensor metadata — confirm
+    ggml_backend_buffer_t buf = nullptr;  // NOTE(review): presumably the backend buffer behind act_cur — confirm
+    ggml_tensor * act_cur = nullptr;  // persistent GPU activation tensor (set up by init_gpu_resident_state)
+
+    ResidualCombineGraph combine;  // built by init_gpu_resident_state() via build_residual_combine_graph()
+
+    GpuResidentState() = default;
+    ~GpuResidentState() { destroy(); }
+    GpuResidentState(const GpuResidentState &) = delete;
+    GpuResidentState & operator=(const GpuResidentState &) = delete;
+    GpuResidentState(GpuResidentState && o) noexcept  // takes o's handles and its combine graph
+        : ctx(o.ctx), buf(o.buf), act_cur(o.act_cur),
+          combine(std::move(o.combine)) {
+        o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
+    }
+    GpuResidentState & operator=(GpuResidentState && o) noexcept {
+        if (this != &o) {  // self-move leaves the object untouched
+            destroy();  // tear down the current state before adopting o's
+            ctx = o.ctx; buf = o.buf; act_cur = o.act_cur;
+            combine = std::move(o.combine);
+            o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
+        }
+        return *this;
+    }
+    bool valid() const { return ctx && buf && act_cur && combine.valid(); }  // requires the combine graph too
+    void destroy();
+};
+// Destroys any prior state in `out`, rebuilds it, and zero-fills combine.cold_in; false on failure.
+bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd);
+// Optional measurements; eval_moe_hybrid_ffn_single() takes a pointer to one (null by default).
+struct MoeHybridFfnTelemetry {
+    uint64_t ffn_wall_us = 0;  // NOTE(review): the *_us fields read as microsecond durations — confirm
+    uint64_t partition_us = 0;
+    uint64_t hot_us = 0;
+    uint64_t cold_us = 0;
+    uint64_t shared_us = 0;
+    uint64_t combine_us = 0;
+    int hot_selected = 0;  // NOTE(review): presumably how many selected experts were hot — confirm
+    int cold_selected = 0;  // NOTE(review): presumably how many selected experts were cold — confirm
+};
+
+// Single-token hybrid FFN: hot on GPU, cold on CPU, combine on host.
+bool eval_moe_hybrid_ffn_single(
+    ggml_backend_t                  gpu_backend,
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,
+    MoeHybridLayerStorage &         storage,
+    ggml_backend_t                  cpu_backend,  // 5th here but 2nd in eval_moe_hybrid_ffn_batched; same type, so a swap compiles
+    const float *                   cur_host,  // NOTE(review): presumably n_embd host floats for the token — confirm
+    const int32_t *                 selected_ids,  // NOTE(review): presumably n_selected expert ids — confirm
+    const float *                   selected_weights,  // NOTE(review): presumably one weight per selected id — confirm
+    int                             n_selected,
+    std::vector<float> &            out,
+    MoeHybridFfnTelemetry *         telemetry = nullptr,  // optional; may be null
+    std::string *                   err = nullptr);  // optional; may be null
+
+// Batched prefill FFN: all experts on GPU (no hybrid split).
+bool eval_moe_batched_prefill_ffn(
+    ggml_backend_t                  gpu_backend,
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,  // NOTE(review): no storage argument, so expert weights presumably come from desc — confirm they are loaded in hybrid mode
+    const float *                   cur_host,  // NOTE(review): presumably n_tokens * n_embd host floats — confirm
+    const int32_t *                 selected_ids,
+    const float *                   selected_weights,
+    int                             n_tokens,
+    std::vector<float> &            out,
+    std::string *                   err = nullptr);  // optional; may be null
+
+// Batched hybrid prefill FFN: hot on GPU, cold on CPU concurrently.
+bool eval_moe_hybrid_ffn_batched(
+    ggml_backend_t                  gpu_backend,
+    ggml_backend_t                  cpu_backend,  // 2nd here but 5th in the single/gpu_resident entry points
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,
+    MoeHybridLayerStorage &         storage,
+    const float *                   cur_host,  // token-major: n_embd floats per token
+    const int32_t *                 selected_ids,  // token-major: cfg.n_expert_used ids per token
+    const float *                   selected_weights,  // token-major: cfg.n_expert_used weights per token
+    int                             n_tokens,
+    std::vector<float> &            out,  // receives n_embd floats per token
+    std::string *                   err = nullptr,  // optional; set to a message on allocation/compute failure
+    ggml_gallocr_t *                p_hot_alloc = nullptr,  // optional caller-owned slot: created on first use if *p is null, reused, never freed here
+    ggml_gallocr_t *                p_cold_alloc = nullptr);  // same contract as p_hot_alloc, for the CPU-side graph
+
+// GPU-resident single-token hybrid FFN: keeps data on GPU, only reads router
+// IDs to CPU for hot/cold partitioning.
+bool eval_moe_hybrid_ffn_gpu_resident(
+    ggml_backend_t                  gpu_backend,
+    const MoeHybridConfig &         cfg,
+    const MoeLayerDesc &            desc,
+    MoeHybridLayerStorage &         storage,  // its cached hot/cold graphs are rebuilt when the hot/cold count changes
+    ggml_backend_t                  cpu_backend,
+    ggml_tensor *                   ffn_post_gpu,  // copied GPU->GPU into the hot graph; read to host only if a cold expert is selected
+    ggml_tensor *                   ffn_residual_gpu,  // NOTE(review): presumably feeds gpu_state.combine.residual_in — confirm
+    GpuResidentState &              gpu_state,  // combine.hot_in is overwritten (zeros when nothing runs on the GPU)
+    const int32_t *                 selected_ids,
+    const float *                   selected_weights,
+    int                             n_selected);  // no err out-param: a false return carries no message
+
+// Build/rebuild cached hot FFN graph.
+bool build_cached_hot_graph(
+    CachedFfnGraph & out,  // callers re-check out.valid() and out.n_hot after the call
+    ggml_backend_t backend,
+    ggml_tensor * gate_tensor,  // the gpu_resident caller passes storage.gate_hot
+    ggml_tensor * up_tensor,  // ... storage.up_hot
+    ggml_tensor * down_tensor,  // ... storage.down_hot
+    ggml_tensor * gate_up_tensor,  // ... storage.gate_up_hot
+    float gate_scale,  // the gpu_resident caller passes desc.ffn_gate_exps_s
+    float up_scale,  // ... desc.ffn_up_exps_s
+    float down_scale,  // ... desc.ffn_down_exps_s
+    float gate_up_scale,  // ... desc.ffn_gate_up_exps_s
+    const MoeLayerDesc & desc,  // NOTE(review): presumably supplies the shared-expert tensors — confirm
+    int n_embd,
+    int n_ff_exp,
+    int n_hot);  // hot experts selected for this token; a change triggers a rebuild in the caller
+
+// Build/rebuild cached cold FFN graph.
+bool build_cached_cold_graph(
+    CachedFfnGraph & out,  // the caller compares out.n_hot against n_cold to decide on a rebuild
+    ggml_backend_t cpu_backend,
+    ggml_tensor * gate_tensor,  // the gpu_resident caller passes storage.gate_cold
+    ggml_tensor * up_tensor,  // ... storage.up_cold
+    ggml_tensor * down_tensor,  // ... storage.down_cold
+    ggml_tensor * gate_up_tensor,  // ... storage.gate_up_cold
+    float gate_scale,  // the gpu_resident caller passes desc.ffn_gate_exps_s
+    float up_scale,  // ... desc.ffn_up_exps_s
+    float down_scale,  // ... desc.ffn_down_exps_s
+    float gate_up_scale,  // ... desc.ffn_gate_up_exps_s
+    int n_embd,
+    int n_ff_exp,
+    int n_cold);  // cold experts selected for this token
+
+}  // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_expert_placement.cpp b/server/src/common/moe_hybrid_placement.cpp
similarity index 79%
rename from server/src/qwen35moe/qwen35moe_expert_placement.cpp
rename to server/src/common/moe_hybrid_placement.cpp
index d0066e9ba..49bcd5ea1 100644
--- a/server/src/qwen35moe/qwen35moe_expert_placement.cpp
+++ b/server/src/common/moe_hybrid_placement.cpp
@@ -1,4 +1,5 @@
-#include "qwen35moe_expert_placement.h"
+#include "moe_hybrid_placement.h"
+#include "moe_hybrid_routing_stats.h"
 
 #include <nlohmann/json.hpp>
 
@@ -8,20 +9,23 @@
 
 namespace dflash::common {
 
-bool Qwen35MoeExpertPlacement::matches(const TargetWeights & w) const {
-    return w.is_moe &&
-           n_layer == w.n_layer &&
-           n_expert == w.n_expert &&
-           n_expert_used == w.n_expert_used &&
+bool MoeHybridPlacement::matches(int n_layer_, int n_expert_, int n_expert_used_) const {
+    return n_layer == n_layer_ &&
+           n_expert == n_expert_ &&
+           n_expert_used == n_expert_used_ &&
            (int)hot_counts.size() == n_layer &&
            (int)hot_expert_ids.size() == n_layer;
 }
 
-bool Qwen35MoeExpertPlacement::empty() const {
+bool MoeHybridPlacement::matches(const MoeHybridConfig & cfg) const {
+    return matches(cfg.n_layer, cfg.n_expert, cfg.n_expert_used);
+}
+
+bool MoeHybridPlacement::empty() const {
     return hot_counts.empty();
 }
 
-bool Qwen35MoeExpertPlacement::is_hot(int layer_idx, int expert_idx) const {
+bool MoeHybridPlacement::is_hot(int layer_idx, int expert_idx) const {
     if (layer_idx < 0 || layer_idx >= n_layer || expert_idx < 0 || expert_idx >= n_expert) {
         return false;
     }
@@ -29,7 +33,8 @@ bool Qwen35MoeExpertPlacement::is_hot(int layer_idx, int expert_idx) const {
     return std::find(hot.begin(), hot.end(), expert_idx) != hot.end();
 }
 
-bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string * err) const {
+bool MoeHybridPlacement::save_json(const std::string & path, const std::string & arch_name,
+                                   std::string * err) const {
     if (n_layer <= 0 || n_expert <= 0 || (int)hot_counts.size() != n_layer ||
         (int)hot_expert_ids.size() != n_layer) {
         if (err) *err = "placement not initialized";
@@ -37,7 +42,7 @@ bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string *
     }
 
     nlohmann::json j;
-    j["arch"] = "qwen35moe";
+    j["arch"] = arch_name;
     j["version"] = 1;
     j["n_layer"] = n_layer;
     j["n_expert"] = n_expert;
@@ -59,9 +64,9 @@ bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string *
     return true;
 }
 
-bool Qwen35MoeExpertPlacement::load_json(const std::string & path,
-                                         Qwen35MoeExpertPlacement & out,
-                                         std::string * err) {
+bool MoeHybridPlacement::load_json(const std::string & path,
+                                   MoeHybridPlacement & out,
+                                   std::string * err) {
     std::ifstream f(path);
     if (!f) {
         if (err) *err = "failed to open input file";
@@ -76,12 +81,10 @@ bool Qwen35MoeExpertPlacement::load_json(const std::string & path,
         return false;
     }
 
-    if (j.value("arch", std::string()) != "qwen35moe") {
-        if (err) *err = "unexpected arch";
-        return false;
-    }
+    // Accept both legacy "qwen35moe" and new "moe_hybrid" / any arch string.
+    // We don't reject based on arch — the caller validates dimensions.
 
-    Qwen35MoeExpertPlacement tmp;
+    MoeHybridPlacement tmp;
     try {
         tmp.n_layer = j.value("n_layer", 0);
         tmp.n_expert = j.value("n_expert", 0);
@@ -105,11 +108,11 @@ bool Qwen35MoeExpertPlacement::load_json(const std::string & path,
     return true;
 }
 
-bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & stats,
-                                                int total_hot_budget,
-                                                int min_hot_per_layer,
-                                                Qwen35MoeExpertPlacement & out,
-                                                std::string * err) {
+bool MoeHybridPlacement::build_from_stats(const MoeHybridRoutingStats & stats,
+                                          int total_hot_budget,
+                                          int min_hot_per_layer,
+                                          MoeHybridPlacement & out,
+                                          std::string * err) {
     if (stats.empty() || stats.n_layer <= 0 || stats.n_expert <= 0) {
         if (err) *err = "stats not initialized";
         return false;
@@ -127,7 +130,7 @@ bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & st
         return false;
     }
 
-    Qwen35MoeExpertPlacement tmp;
+    MoeHybridPlacement tmp;
     tmp.n_layer = stats.n_layer;
     tmp.n_expert = stats.n_expert;
     tmp.n_expert_used = stats.n_expert_used;
@@ -172,12 +175,12 @@ bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & st
     return true;
 }
 
-bool Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes(
-    const Qwen35MoeRoutingStats & stats,
+bool MoeHybridPlacement::build_from_stats_with_layer_bytes(
+    const MoeHybridRoutingStats & stats,
     const std::vector<uint64_t> & layer_expert_bytes,
     uint64_t total_hot_budget_bytes,
     int min_hot_per_layer,
-    Qwen35MoeExpertPlacement & out,
+    MoeHybridPlacement & out,
     std::string * err) {
     if (stats.empty() || stats.n_layer <= 0 || stats.n_expert <= 0) {
         if (err) *err = "stats not initialized";
@@ -196,18 +199,22 @@ bool Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes(
     const int per_layer_floor = std::min(min_hot_per_layer, stats.n_expert);
     uint64_t floor_bytes = 0;
     for (int il = 0; il < stats.n_layer; ++il) {
-        floor_bytes += (uint64_t)per_layer_floor * layer_expert_bytes[(size_t)il];
+        if (layer_expert_bytes[(size_t)il] > 0)
+            floor_bytes += (uint64_t)per_layer_floor * layer_expert_bytes[(size_t)il];
     }
     if (floor_bytes > total_hot_budget_bytes) {
         if (err) *err = "min_hot_per_layer exceeds byte budget";
         return false;
     }
 
-    Qwen35MoeExpertPlacement tmp;
+    MoeHybridPlacement tmp;
     tmp.n_layer = stats.n_layer;
     tmp.n_expert = stats.n_expert;
     tmp.n_expert_used = stats.n_expert_used;
-    tmp.hot_counts.assign((size_t)tmp.n_layer, per_layer_floor);
+    tmp.hot_counts.resize((size_t)tmp.n_layer);
+    for (int il = 0; il < tmp.n_layer; ++il) {
+        tmp.hot_counts[(size_t)il] = (layer_expert_bytes[(size_t)il] > 0) ? per_layer_floor : 0;
+    }
 
     std::vector<std::vector<int>> ranked((size_t)tmp.n_layer);
     for (int il = 0; il < tmp.n_layer; ++il) {
diff --git a/server/src/qwen35moe/qwen35moe_expert_placement.h b/server/src/common/moe_hybrid_placement.h
similarity index 58%
rename from server/src/qwen35moe/qwen35moe_expert_placement.h
rename to server/src/common/moe_hybrid_placement.h
index 49d810dfc..023228664 100644
--- a/server/src/qwen35moe/qwen35moe_expert_placement.h
+++ b/server/src/common/moe_hybrid_placement.h
@@ -1,8 +1,8 @@
-// qwen35moe expert placement config derived from per-layer routing statistics.
+// Common MoE expert placement — determines which experts are hot (GPU) vs cold (CPU).
 
 #pragma once
 
-#include "qwen35moe_routing_stats.h"
+#include "moe_hybrid_types.h"
 
 #include <cstdint>
 #include <string>
@@ -10,7 +10,9 @@
 
 namespace dflash::common {
 
-struct Qwen35MoeExpertPlacement {
+struct MoeHybridRoutingStats;  // forward decl
+
+struct MoeHybridPlacement {
     int n_layer       = 0;
     int n_expert      = 0;
     int n_expert_used = 0;
@@ -21,27 +23,29 @@ struct Qwen35MoeExpertPlacement {
     // Ranked hot expert ids kept on GPU per layer.
     std::vector<std::vector<int32_t>> hot_expert_ids;
 
-    bool matches(const TargetWeights & w) const;
+    bool matches(int n_layer, int n_expert, int n_expert_used) const;
+    bool matches(const MoeHybridConfig & cfg) const;
     bool empty() const;
     bool is_hot(int layer_idx, int expert_idx) const;
 
-    bool save_json(const std::string & path, std::string * err = nullptr) const;
+    bool save_json(const std::string & path, const std::string & arch_name = "moe_hybrid",
+                   std::string * err = nullptr) const;
     static bool load_json(const std::string & path,
-                          Qwen35MoeExpertPlacement & out,
+                          MoeHybridPlacement & out,
                           std::string * err = nullptr);
 
-    static bool build_from_stats(const Qwen35MoeRoutingStats & stats,
+    static bool build_from_stats(const MoeHybridRoutingStats & stats,
                                  int total_hot_budget,
                                  int min_hot_per_layer,
-                                 Qwen35MoeExpertPlacement & out,
+                                 MoeHybridPlacement & out,
                                  std::string * err = nullptr);
 
     static bool build_from_stats_with_layer_bytes(
-        const Qwen35MoeRoutingStats & stats,
+        const MoeHybridRoutingStats & stats,
         const std::vector<uint64_t> & layer_expert_bytes,
         uint64_t total_hot_budget_bytes,
         int min_hot_per_layer,
-        Qwen35MoeExpertPlacement & out,
+        MoeHybridPlacement & out,
         std::string * err = nullptr);
 };
 
diff --git a/server/src/qwen35moe/qwen35moe_routing_stats.cpp b/server/src/common/moe_hybrid_routing_stats.cpp
similarity index 67%
rename from server/src/qwen35moe/qwen35moe_routing_stats.cpp
rename to server/src/common/moe_hybrid_routing_stats.cpp
index 2e5e7ad17..7d11c5c1a 100644
--- a/server/src/qwen35moe/qwen35moe_routing_stats.cpp
+++ b/server/src/common/moe_hybrid_routing_stats.cpp
@@ -1,4 +1,4 @@
-#include "qwen35moe_routing_stats.h"
+#include "moe_hybrid_routing_stats.h"
 
 #include <algorithm>
 #include <cstdio>
@@ -8,43 +8,50 @@
 
 namespace dflash::common {
 
-size_t Qwen35MoeRoutingStats::index_of(int layer_idx, int expert_idx) const {
+size_t MoeHybridRoutingStats::index_of(int layer_idx, int expert_idx) const {
     return (size_t)layer_idx * (size_t)n_expert + (size_t)expert_idx;
 }
 
-bool Qwen35MoeRoutingStats::init_from_weights(const TargetWeights & w) {
-    if (!w.is_moe || w.n_layer <= 0 || w.n_expert <= 0 || w.n_expert_used <= 0) {
+bool MoeHybridRoutingStats::init(int n_layer_, int n_expert_, int n_expert_used_) {
+    if (n_layer_ <= 0 || n_expert_ <= 0 || n_expert_used_ <= 0) {
         return false;
     }
-    n_layer = w.n_layer;
-    n_expert = w.n_expert;
-    n_expert_used = w.n_expert_used;
+    n_layer = n_layer_;
+    n_expert = n_expert_;
+    n_expert_used = n_expert_used_;
     counts.assign((size_t)n_layer * (size_t)n_expert, 0);
     layer_totals.assign((size_t)n_layer, 0);
     return true;
 }
 
-bool Qwen35MoeRoutingStats::matches(const TargetWeights & w) const {
-    return w.is_moe &&
-           n_layer == w.n_layer &&
-           n_expert == w.n_expert &&
-           n_expert_used == w.n_expert_used &&
+bool MoeHybridRoutingStats::init(const MoeHybridConfig & cfg) {
+    return init(cfg.n_layer, cfg.n_expert, cfg.n_expert_used);
+}
+
+bool MoeHybridRoutingStats::matches(int n_layer_, int n_expert_, int n_expert_used_) const {
+    return n_layer == n_layer_ &&
+           n_expert == n_expert_ &&
+           n_expert_used == n_expert_used_ &&
            counts.size() == (size_t)n_layer * (size_t)n_expert &&
            layer_totals.size() == (size_t)n_layer;
 }
 
-bool Qwen35MoeRoutingStats::empty() const {
+bool MoeHybridRoutingStats::matches(const MoeHybridConfig & cfg) const {
+    return matches(cfg.n_layer, cfg.n_expert, cfg.n_expert_used);
+}
+
+bool MoeHybridRoutingStats::empty() const {
     return counts.empty();
 }
 
-uint64_t Qwen35MoeRoutingStats::count(int layer_idx, int expert_idx) const {
+uint64_t MoeHybridRoutingStats::count(int layer_idx, int expert_idx) const {
     if (layer_idx < 0 || layer_idx >= n_layer || expert_idx < 0 || expert_idx >= n_expert) {
         return 0;
     }
     return counts[index_of(layer_idx, expert_idx)];
 }
 
-bool Qwen35MoeRoutingStats::observe(int layer_idx, const int32_t * expert_ids, int n_ids) {
+bool MoeHybridRoutingStats::observe(int layer_idx, const int32_t * expert_ids, int n_ids) {
     if (!expert_ids || layer_idx < 0 || layer_idx >= n_layer || n_ids < 0) {
         return false;
     }
@@ -57,12 +64,12 @@ bool Qwen35MoeRoutingStats::observe(int layer_idx, const int32_t * expert_ids, i
     for (int i = 0; i < n_ids; ++i) {
         const int expert_idx = expert_ids[i];
         counts[index_of(layer_idx, expert_idx)]++;
-        layer_totals[(size_t) layer_idx]++;
+        layer_totals[(size_t)layer_idx]++;
     }
     return true;
 }
 
-bool Qwen35MoeRoutingStats::observe_selected_tensor(ggml_backend_t backend,
+bool MoeHybridRoutingStats::observe_selected_tensor(ggml_backend_t backend,
                                                     int layer_idx,
                                                     ggml_tensor * selected,
                                                     std::string * err) {
@@ -88,7 +95,7 @@ bool Qwen35MoeRoutingStats::observe_selected_tensor(ggml_backend_t backend,
     return true;
 }
 
-std::vector<int> Qwen35MoeRoutingStats::ranked_experts(int layer_idx) const {
+std::vector<int> MoeHybridRoutingStats::ranked_experts(int layer_idx) const {
     if (layer_idx < 0 || layer_idx >= n_layer) return {};
     std::vector<int> ranked((size_t)n_expert);
     std::iota(ranked.begin(), ranked.end(), 0);
@@ -102,17 +109,16 @@ std::vector<int> Qwen35MoeRoutingStats::ranked_experts(int layer_idx) const {
     return ranked;
 }
 
-std::vector<int> Qwen35MoeRoutingStats::hot_experts(int layer_idx, int hot_count) const {
+std::vector<int> MoeHybridRoutingStats::hot_experts(int layer_idx, int hot_count) const {
     std::vector<int> ranked = ranked_experts(layer_idx);
     if (hot_count < 0) hot_count = 0;
-    if ((size_t) hot_count < ranked.size()) {
-        ranked.resize((size_t) hot_count);
+    if ((size_t)hot_count < ranked.size()) {
+        ranked.resize((size_t)hot_count);
     }
     return ranked;
 }
 
-
-bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err) const {
+bool MoeHybridRoutingStats::save_csv(const std::string & path, std::string * err) const {
     if (n_layer <= 0 || n_expert <= 0 || counts.size() != (size_t)n_layer * (size_t)n_expert) {
         if (err) *err = "routing stats not initialized";
         return false;
@@ -124,7 +130,6 @@ bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err
         return false;
     }
 
-    // Header comments
     f << "# hotness table: n_layer=" << n_layer
       << " n_expert=" << n_expert
       << " n_expert_used=" << n_expert_used << "\n";
@@ -145,8 +150,8 @@ bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err
     return true;
 }
 
-bool Qwen35MoeRoutingStats::load_csv(const std::string & path,
-                                     Qwen35MoeRoutingStats & out,
+bool MoeHybridRoutingStats::load_csv(const std::string & path,
+                                     MoeHybridRoutingStats & out,
                                      std::string * err) {
     std::ifstream f(path);
     if (!f) {
@@ -154,32 +159,28 @@ bool Qwen35MoeRoutingStats::load_csv(const std::string & path,
         return false;
     }
 
-    int n_layer = 0, n_expert = 0, n_expert_used = 0;
+    int file_n_layer = 0, file_n_expert = 0, file_n_expert_used = 0;
     std::vector<uint64_t> all_counts;
     std::string line;
 
     while (std::getline(f, line)) {
-        // Skip comments and empty lines
         if (line.empty() || line[0] == '#') {
-            // Try to parse header metadata from comment
             if (line.find("n_layer=") != std::string::npos) {
                 std::sscanf(line.c_str(), "# hotness table: n_layer=%d n_expert=%d n_expert_used=%d",
-                            &n_layer, &n_expert, &n_expert_used);
+                            &file_n_layer, &file_n_expert, &file_n_expert_used);
             }
             continue;
         }
 
-        // Parse CSV row: comma-separated uint64 values
         std::vector<uint64_t> row;
         const char * p = line.c_str();
         while (*p) {
-            // Skip whitespace
             while (*p == ' ' || *p == '\t') ++p;
             if (!*p) break;
             char * end = nullptr;
             uint64_t val = std::strtoull(p, &end, 10);
             if (end == p) {
-                if (err) *err = "malformed value in row " + std::to_string((int)(all_counts.size() / std::max((size_t)n_expert, (size_t)1)));
+                if (err) *err = "malformed value in row " + std::to_string((int)(all_counts.size() / std::max((size_t)file_n_expert, (size_t)1)));
                 return false;
             }
             row.push_back(val);
@@ -189,40 +190,39 @@ bool Qwen35MoeRoutingStats::load_csv(const std::string & path,
 
         if (row.empty()) continue;
 
-        // Infer n_expert from first data row
-        if (n_expert == 0) {
-            n_expert = (int)row.size();
-        } else if ((int)row.size() != n_expert) {
-            if (err) *err = "inconsistent row width at layer " + std::to_string((int)(all_counts.size() / (size_t)n_expert));
+        if (file_n_expert == 0) {
+            file_n_expert = (int)row.size();
+        } else if ((int)row.size() != file_n_expert) {
+            if (err) *err = "inconsistent row width at layer " + std::to_string((int)(all_counts.size() / (size_t)file_n_expert));
             return false;
         }
 
         all_counts.insert(all_counts.end(), row.begin(), row.end());
     }
 
-    if (n_expert <= 0 || all_counts.empty()) {
+    if (file_n_expert <= 0 || all_counts.empty()) {
         if (err) *err = "no data rows found";
         return false;
     }
 
-    const int detected_layers = (int)(all_counts.size() / (size_t)n_expert);
-    if (n_layer == 0) n_layer = detected_layers;
-    if (n_expert_used == 0) n_expert_used = 8;  // default for Qwen3.5-MoE
+    const int detected_layers = (int)(all_counts.size() / (size_t)file_n_expert);
+    if (file_n_layer == 0) file_n_layer = detected_layers;
+    if (file_n_expert_used == 0) file_n_expert_used = 8;  // header omitted n_expert_used: fall back to 8, the Qwen3.5-MoE value this loader originally assumed
 
-    if ((int)all_counts.size() != n_layer * n_expert) {
-        if (err) *err = "row count (" + std::to_string(detected_layers) + ") doesn't match n_layer (" + std::to_string(n_layer) + ")";
+    if (file_n_layer <= 0 || all_counts.size() != (size_t)file_n_layer * (size_t)file_n_expert) {  // size_t math: an int product of header-supplied values can overflow and falsely match
+        if (err) *err = "row count (" + std::to_string(detected_layers) + ") doesn't match n_layer (" + std::to_string(file_n_layer) + ")";
         return false;
     }
 
-    Qwen35MoeRoutingStats tmp;
-    tmp.n_layer = n_layer;
-    tmp.n_expert = n_expert;
-    tmp.n_expert_used = n_expert_used;
+    MoeHybridRoutingStats tmp;
+    tmp.n_layer = file_n_layer;
+    tmp.n_expert = file_n_expert;
+    tmp.n_expert_used = file_n_expert_used;
     tmp.counts = std::move(all_counts);
-    tmp.layer_totals.assign((size_t)n_layer, 0);
-    for (int il = 0; il < n_layer; ++il) {
+    tmp.layer_totals.assign((size_t)file_n_layer, 0);
+    for (int il = 0; il < file_n_layer; ++il) {
         uint64_t total = 0;
-        for (int ie = 0; ie < n_expert; ++ie) {
+        for (int ie = 0; ie < file_n_expert; ++ie) {
             total += tmp.counts[tmp.index_of(il, ie)];
         }
         tmp.layer_totals[(size_t)il] = total;
diff --git a/server/src/qwen35moe/qwen35moe_routing_stats.h b/server/src/common/moe_hybrid_routing_stats.h
similarity index 71%
rename from server/src/qwen35moe/qwen35moe_routing_stats.h
rename to server/src/common/moe_hybrid_routing_stats.h
index 209ba2a0e..17fa43886 100644
--- a/server/src/qwen35moe/qwen35moe_routing_stats.h
+++ b/server/src/common/moe_hybrid_routing_stats.h
@@ -1,8 +1,11 @@
-// Reusable qwen35moe routing-statistics scaffold for Phase 2 expert placement.
+// Common MoE routing statistics for expert placement decisions.
 
 #pragma once
 
-#include "internal.h"
+#include "moe_hybrid_types.h"
+
+#include "ggml.h"
+#include "ggml-backend.h"
 
 #include <cstdint>
 #include <string>
@@ -10,7 +13,7 @@
 
 namespace dflash::common {
 
-struct Qwen35MoeRoutingStats {
+struct MoeHybridRoutingStats {
     int n_layer       = 0;
     int n_expert      = 0;
     int n_expert_used = 0;
@@ -19,8 +22,10 @@ struct Qwen35MoeRoutingStats {
     std::vector<uint64_t> counts;
     std::vector<uint64_t> layer_totals;
 
-    bool init_from_weights(const TargetWeights & w);
-    bool matches(const TargetWeights & w) const;
+    bool init(int n_layer, int n_expert, int n_expert_used);
+    bool init(const MoeHybridConfig & cfg);
+    bool matches(int n_layer, int n_expert, int n_expert_used) const;
+    bool matches(const MoeHybridConfig & cfg) const;
     bool empty() const;
 
     uint64_t count(int layer_idx, int expert_idx) const;
@@ -35,7 +40,7 @@ struct Qwen35MoeRoutingStats {
 
     bool save_csv(const std::string & path, std::string * err = nullptr) const;
     static bool load_csv(const std::string & path,
-                         Qwen35MoeRoutingStats & out,
+                         MoeHybridRoutingStats & out,
                          std::string * err = nullptr);
 
 private:
diff --git a/server/src/qwen35moe/qwen35moe_hybrid_storage.cpp b/server/src/common/moe_hybrid_storage.cpp
similarity index 67%
rename from server/src/qwen35moe/qwen35moe_hybrid_storage.cpp
rename to server/src/common/moe_hybrid_storage.cpp
index fa66c065c..ca080ce43 100644
--- a/server/src/qwen35moe/qwen35moe_hybrid_storage.cpp
+++ b/server/src/common/moe_hybrid_storage.cpp
@@ -1,4 +1,4 @@
-#include "qwen35moe_hybrid_storage.h"
+#include "moe_hybrid_storage.h"
 
 #include "ggml-cpu.h"
 
@@ -41,7 +41,6 @@ static bool read_expert_slices(ggml_backend_t backend,
     return true;
 }
 
-// Read expert slices from raw memory (e.g. mmap) instead of a GPU tensor.
 static bool read_expert_slices_from_mem(const uint8_t * tensor_data,
                                         size_t tensor_size,
                                         const std::vector<int32_t> & expert_ids,
@@ -90,7 +89,7 @@ static ggml_tensor * new_like_with_expert_count(ggml_context * ctx, ggml_tensor
 
 } // namespace
 
-Qwen35MoeHybridStorage::~Qwen35MoeHybridStorage() {
+MoeHybridStorage::~MoeHybridStorage() {
     for (auto & layer : layers) {
         layer.hot_graph.free();
         layer.cold_graph.free();
@@ -125,67 +124,73 @@ Qwen35MoeHybridStorage::~Qwen35MoeHybridStorage() {
     }
 }
 
-bool Qwen35MoeHybridStorage::matches(const TargetWeights & w) const {
-    return placement.matches(w) && (int)layers.size() == w.n_layer;
+bool MoeHybridStorage::matches(const MoeHybridConfig & cfg) const {
+    return placement.matches(cfg) && (int)layers.size() == cfg.n_layer;
 }
 
-bool Qwen35MoeHybridStorage::empty() const {
+bool MoeHybridStorage::empty() const {
     return layers.empty();
 }
 
-bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
-                                    ggml_backend_t backend,
-                                    const Qwen35MoeExpertPlacement & placement,
-                                    Qwen35MoeHybridStorage & out,
-                                    std::string * err) {
-    if (!placement.matches(w)) {
-        if (err) *err = "placement does not match model";
+bool build_moe_hybrid_storage(const MoeHybridConfig & cfg,
+                              ggml_backend_t gpu_backend,
+                              const MoeHybridPlacement & placement,
+                              const std::vector<MoeLayerDesc> & layer_descs,
+                              MoeHybridStorage & out,
+                              std::string * err) {
+    if (!placement.matches(cfg)) {
+        if (err) *err = "placement does not match config";
         return false;
     }
-    if (!w.is_moe) {
-        if (err) *err = "target is not qwen35moe";
+    if ((int)layer_descs.size() != cfg.n_layer) {
+        if (err) *err = "layer_descs size does not match n_layer";
         return false;
     }
 
-    
     out.placement = placement;
-    out.layers.resize((size_t)w.n_layer);
+    out.layers.resize((size_t)cfg.n_layer);
     out.cpu_backend = ggml_backend_cpu_init();
     if (!out.cpu_backend) {
         if (err) *err = "failed to init cpu backend";
         return false;
     }
-    ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(w.n_expert_used, 8)));
+    ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(cfg.n_expert_used, 8)));
+
+    for (int il = 0; il < cfg.n_layer; ++il) {
+        const MoeLayerDesc & desc = layer_descs[(size_t)il];
+        MoeHybridLayerStorage & dst = out.layers[(size_t)il];
+
+        // Skip dense layers (no experts)
+        if (!desc.ffn_gate_exps && !desc.ffn_up_exps && !desc.ffn_down_exps && !desc.ffn_gate_up_exps) {
+            continue;
+        }
 
-    for (int il = 0; il < w.n_layer; ++il) {
-        const TargetLayer & L = w.layers[(size_t)il];
-        Qwen35MoeHybridLayerStorage & dst = out.layers[(size_t)il];
         dst.hot_expert_ids = placement.hot_expert_ids[(size_t)il];
-        dst.hot_local_by_global.assign((size_t)w.n_expert, -1);
-        dst.cold_local_by_global.assign((size_t)w.n_expert, -1);
+        dst.hot_local_by_global.assign((size_t)cfg.n_expert, -1);
+        dst.cold_local_by_global.assign((size_t)cfg.n_expert, -1);
 
-        std::vector<uint8_t> is_hot((size_t)w.n_expert, 0);
+        std::vector<uint8_t> is_hot((size_t)cfg.n_expert, 0);
         for (size_t i = 0; i < dst.hot_expert_ids.size(); ++i) {
             const int32_t expert = dst.hot_expert_ids[i];
-            if (expert < 0 || expert >= w.n_expert) {
+            if (expert < 0 || expert >= cfg.n_expert) {
                 if (err) *err = "hot expert id out of range";
                 return false;
             }
             dst.hot_local_by_global[(size_t)expert] = (int32_t)i;
             is_hot[(size_t)expert] = 1;
         }
-        for (int expert = 0; expert < w.n_expert; ++expert) {
+        for (int expert = 0; expert < cfg.n_expert; ++expert) {
             if (!is_hot[(size_t)expert]) {
                 dst.cold_local_by_global[(size_t)expert] = (int32_t)dst.cold_expert_ids.size();
                 dst.cold_expert_ids.push_back((int32_t)expert);
             }
         }
 
-        dst.fused_gate_up = (L.ffn_gate_up_exps != nullptr);
-        if (!validate_expert_tensor(L.ffn_gate_exps, w.n_expert, &dst.gate_expert_bytes, err) ||
-            !validate_expert_tensor(L.ffn_up_exps, w.n_expert, &dst.up_expert_bytes, err) ||
-            !validate_expert_tensor(L.ffn_down_exps, w.n_expert, &dst.down_expert_bytes, err) ||
-            !validate_expert_tensor(L.ffn_gate_up_exps, w.n_expert, &dst.gate_up_expert_bytes, err)) {
+        dst.fused_gate_up = desc.has_fused_gate_up();
+        if (!validate_expert_tensor(desc.ffn_gate_exps, cfg.n_expert, &dst.gate_expert_bytes, err) ||
+            !validate_expert_tensor(desc.ffn_up_exps, cfg.n_expert, &dst.up_expert_bytes, err) ||
+            !validate_expert_tensor(desc.ffn_down_exps, cfg.n_expert, &dst.down_expert_bytes, err) ||
+            !validate_expert_tensor(desc.ffn_gate_up_exps, cfg.n_expert, &dst.gate_up_expert_bytes, err)) {
             return false;
         }
 
@@ -204,47 +209,41 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
                 return false;
             }
             if (dst.fused_gate_up) {
-                dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_up_exps, hot_count);
-                dst.down_hot    = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count);
+                dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_up_exps, hot_count);
+                dst.down_hot    = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count);
             } else {
-                dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_exps, hot_count);
-                dst.up_hot   = new_like_with_expert_count(dst.hot_ctx, L.ffn_up_exps, hot_count);
-                dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count);
+                dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_exps, hot_count);
+                dst.up_hot   = new_like_with_expert_count(dst.hot_ctx, desc.ffn_up_exps, hot_count);
+                dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count);
             }
-            dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, backend);
+            dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, gpu_backend);
             if (!dst.hot_buf) {
                 if (err) *err = "failed to allocate hot expert buffer";
                 return false;
             }
 
-            // Copy hot expert slices from full GPU tensors to hot_buf
             std::vector<uint8_t> hot_bytes;
             if (dst.fused_gate_up) {
-                if (!read_expert_slices(backend, L.ffn_gate_up_exps, dst.hot_expert_ids,
-                                        dst.gate_up_expert_bytes, hot_bytes, err)) {
+                if (!read_expert_slices(gpu_backend, desc.ffn_gate_up_exps, dst.hot_expert_ids,
+                                        dst.gate_up_expert_bytes, hot_bytes, err))
                     return false;
-                }
                 ggml_backend_tensor_set(dst.gate_up_hot, hot_bytes.data(), 0, hot_bytes.size());
-                if (!read_expert_slices(backend, L.ffn_down_exps, dst.hot_expert_ids,
-                                        dst.down_expert_bytes, hot_bytes, err)) {
+                if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.hot_expert_ids,
+                                        dst.down_expert_bytes, hot_bytes, err))
                     return false;
-                }
                 ggml_backend_tensor_set(dst.down_hot, hot_bytes.data(), 0, hot_bytes.size());
             } else {
-                if (!read_expert_slices(backend, L.ffn_gate_exps, dst.hot_expert_ids,
-                                        dst.gate_expert_bytes, hot_bytes, err)) {
+                if (!read_expert_slices(gpu_backend, desc.ffn_gate_exps, dst.hot_expert_ids,
+                                        dst.gate_expert_bytes, hot_bytes, err))
                     return false;
-                }
                 ggml_backend_tensor_set(dst.gate_hot, hot_bytes.data(), 0, hot_bytes.size());
-                if (!read_expert_slices(backend, L.ffn_up_exps, dst.hot_expert_ids,
-                                        dst.up_expert_bytes, hot_bytes, err)) {
+                if (!read_expert_slices(gpu_backend, desc.ffn_up_exps, dst.hot_expert_ids,
+                                        dst.up_expert_bytes, hot_bytes, err))
                     return false;
-                }
                 ggml_backend_tensor_set(dst.up_hot, hot_bytes.data(), 0, hot_bytes.size());
-                if (!read_expert_slices(backend, L.ffn_down_exps, dst.hot_expert_ids,
-                                        dst.down_expert_bytes, hot_bytes, err)) {
+                if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.hot_expert_ids,
+                                        dst.down_expert_bytes, hot_bytes, err))
                     return false;
-                }
                 ggml_backend_tensor_set(dst.down_hot, hot_bytes.data(), 0, hot_bytes.size());
             }
         }
@@ -261,64 +260,42 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
                 return false;
             }
             if (dst.fused_gate_up) {
-                dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_up_exps, cold_count);
-                dst.down_cold    = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count);
+                dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_up_exps, cold_count);
+                dst.down_cold    = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count);
             } else {
-                dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_exps, cold_count);
-                dst.up_cold   = new_like_with_expert_count(dst.cold_ctx, L.ffn_up_exps, cold_count);
-                dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count);
+                dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_exps, cold_count);
+                dst.up_cold   = new_like_with_expert_count(dst.cold_ctx, desc.ffn_up_exps, cold_count);
+                dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count);
             }
             dst.cold_buf = ggml_backend_alloc_ctx_tensors(dst.cold_ctx, out.cpu_backend);
             if (!dst.cold_buf) {
                 if (err) *err = "failed to allocate cold expert buffer";
                 return false;
             }
-        }
 
-        if (dst.fused_gate_up) {
-            if (!read_expert_slices(backend, L.ffn_gate_up_exps, dst.cold_expert_ids,
-                                    dst.gate_up_expert_bytes, dst.gate_up_cold_bytes, err)) {
-                return false;
-            }
-        } else {
-            if (!read_expert_slices(backend, L.ffn_gate_exps, dst.cold_expert_ids,
-                                    dst.gate_expert_bytes, dst.gate_cold_bytes, err) ||
-                !read_expert_slices(backend, L.ffn_up_exps, dst.cold_expert_ids,
-                                    dst.up_expert_bytes, dst.up_cold_bytes, err)) {
-                return false;
-            }
-        }
-        if (!read_expert_slices(backend, L.ffn_down_exps, dst.cold_expert_ids,
-                                dst.down_expert_bytes, dst.down_cold_bytes, err)) {
-            return false;
-        }
-
-        if (dst.fused_gate_up) {
-            if (dst.gate_up_cold && !dst.gate_up_cold_bytes.empty()) {
-                ggml_backend_tensor_set(dst.gate_up_cold, dst.gate_up_cold_bytes.data(), 0, dst.gate_up_cold_bytes.size());
-                dst.gate_up_cold_bytes.clear();
-                dst.gate_up_cold_bytes.shrink_to_fit();
-            }
-            if (dst.down_cold && !dst.down_cold_bytes.empty()) {
-                ggml_backend_tensor_set(dst.down_cold, dst.down_cold_bytes.data(), 0, dst.down_cold_bytes.size());
-                dst.down_cold_bytes.clear();
-                dst.down_cold_bytes.shrink_to_fit();
-            }
-        } else {
-            if (dst.gate_cold && !dst.gate_cold_bytes.empty()) {
-                ggml_backend_tensor_set(dst.gate_cold, dst.gate_cold_bytes.data(), 0, dst.gate_cold_bytes.size());
-                dst.gate_cold_bytes.clear();
-                dst.gate_cold_bytes.shrink_to_fit();
-            }
-            if (dst.up_cold && !dst.up_cold_bytes.empty()) {
-                ggml_backend_tensor_set(dst.up_cold, dst.up_cold_bytes.data(), 0, dst.up_cold_bytes.size());
-                dst.up_cold_bytes.clear();
-                dst.up_cold_bytes.shrink_to_fit();
-            }
-            if (dst.down_cold && !dst.down_cold_bytes.empty()) {
-                ggml_backend_tensor_set(dst.down_cold, dst.down_cold_bytes.data(), 0, dst.down_cold_bytes.size());
-                dst.down_cold_bytes.clear();
-                dst.down_cold_bytes.shrink_to_fit();
+            std::vector<uint8_t> cold_bytes;
+            if (dst.fused_gate_up) {
+                if (!read_expert_slices(gpu_backend, desc.ffn_gate_up_exps, dst.cold_expert_ids,
+                                        dst.gate_up_expert_bytes, cold_bytes, err))
+                    return false;
+                ggml_backend_tensor_set(dst.gate_up_cold, cold_bytes.data(), 0, cold_bytes.size());
+                if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.cold_expert_ids,
+                                        dst.down_expert_bytes, cold_bytes, err))
+                    return false;
+                ggml_backend_tensor_set(dst.down_cold, cold_bytes.data(), 0, cold_bytes.size());
+            } else {
+                if (!read_expert_slices(gpu_backend, desc.ffn_gate_exps, dst.cold_expert_ids,
+                                        dst.gate_expert_bytes, cold_bytes, err))
+                    return false;
+                ggml_backend_tensor_set(dst.gate_cold, cold_bytes.data(), 0, cold_bytes.size());
+                if (!read_expert_slices(gpu_backend, desc.ffn_up_exps, dst.cold_expert_ids,
+                                        dst.up_expert_bytes, cold_bytes, err))
+                    return false;
+                ggml_backend_tensor_set(dst.up_cold, cold_bytes.data(), 0, cold_bytes.size());
+                if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.cold_expert_ids,
+                                        dst.down_expert_bytes, cold_bytes, err))
+                    return false;
+                ggml_backend_tensor_set(dst.down_cold, cold_bytes.data(), 0, cold_bytes.size());
             }
         }
     }
@@ -326,67 +303,69 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
     return true;
 }
 
-bool build_qwen35moe_hybrid_storage_from_file(
-    const TargetWeights & w,
+bool build_moe_hybrid_storage_from_file(
+    const MoeHybridConfig & cfg,
     ggml_backend_t gpu_backend,
-    const Qwen35MoeExpertPlacement & placement,
+    const MoeHybridPlacement & placement,
+    const std::vector<MoeLayerDesc> & layer_descs,
     const std::vector<LayerExpertFileData> & file_data,
-    Qwen35MoeHybridStorage & out,
+    MoeHybridStorage & out,
     std::string * err) {
 
-    if (!placement.matches(w)) {
-        if (err) *err = "placement does not match model";
-        return false;
-    }
-    if (!w.is_moe) {
-        if (err) *err = "target is not qwen35moe";
+    if (!placement.matches(cfg)) {
+        if (err) *err = "placement does not match config";
         return false;
     }
-    if ((int)file_data.size() != w.n_layer) {
-        if (err) *err = "file_data size does not match n_layer";
+    if ((int)layer_descs.size() != cfg.n_layer || (int)file_data.size() != cfg.n_layer) {
+        if (err) *err = "layer_descs/file_data size does not match n_layer";
         return false;
     }
 
-    
     out.placement = placement;
-    out.layers.resize((size_t)w.n_layer);
+    out.layers.resize((size_t)cfg.n_layer);
     out.cpu_backend = ggml_backend_cpu_init();
     if (!out.cpu_backend) {
         if (err) *err = "failed to init cpu backend";
         return false;
     }
-    ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(w.n_expert_used, 8)));
+    ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(cfg.n_expert_used, 8)));
 
-    for (int il = 0; il < w.n_layer; ++il) {
-        const TargetLayer & L = w.layers[(size_t)il];
+    for (int il = 0; il < cfg.n_layer; ++il) {
+        const MoeLayerDesc & desc = layer_descs[(size_t)il];
         const LayerExpertFileData & fd = file_data[(size_t)il];
-        Qwen35MoeHybridLayerStorage & dst = out.layers[(size_t)il];
+        MoeHybridLayerStorage & dst = out.layers[(size_t)il];
+
+        // Skip dense layers (no experts)
+        if (!desc.ffn_gate_exps && !desc.ffn_up_exps && !desc.ffn_down_exps && !desc.ffn_gate_up_exps) {
+            continue;
+        }
+
         dst.hot_expert_ids = placement.hot_expert_ids[(size_t)il];
-        dst.hot_local_by_global.assign((size_t)w.n_expert, -1);
-        dst.cold_local_by_global.assign((size_t)w.n_expert, -1);
+        dst.hot_local_by_global.assign((size_t)cfg.n_expert, -1);
+        dst.cold_local_by_global.assign((size_t)cfg.n_expert, -1);
 
-        std::vector<uint8_t> is_hot((size_t)w.n_expert, 0);
+        std::vector<uint8_t> is_hot((size_t)cfg.n_expert, 0);
         for (size_t i = 0; i < dst.hot_expert_ids.size(); ++i) {
             const int32_t expert = dst.hot_expert_ids[i];
-            if (expert < 0 || expert >= w.n_expert) {
+            if (expert < 0 || expert >= cfg.n_expert) {
                 if (err) *err = "hot expert id out of range";
                 return false;
             }
             dst.hot_local_by_global[(size_t)expert] = (int32_t)i;
             is_hot[(size_t)expert] = 1;
         }
-        for (int expert = 0; expert < w.n_expert; ++expert) {
+        for (int expert = 0; expert < cfg.n_expert; ++expert) {
             if (!is_hot[(size_t)expert]) {
                 dst.cold_local_by_global[(size_t)expert] = (int32_t)dst.cold_expert_ids.size();
                 dst.cold_expert_ids.push_back((int32_t)expert);
             }
         }
 
-        dst.fused_gate_up = (L.ffn_gate_up_exps != nullptr);
-        if (!validate_expert_tensor(L.ffn_gate_exps, w.n_expert, &dst.gate_expert_bytes, err) ||
-            !validate_expert_tensor(L.ffn_up_exps, w.n_expert, &dst.up_expert_bytes, err) ||
-            !validate_expert_tensor(L.ffn_down_exps, w.n_expert, &dst.down_expert_bytes, err) ||
-            !validate_expert_tensor(L.ffn_gate_up_exps, w.n_expert, &dst.gate_up_expert_bytes, err)) {
+        dst.fused_gate_up = desc.has_fused_gate_up();
+        if (!validate_expert_tensor(desc.ffn_gate_exps, cfg.n_expert, &dst.gate_expert_bytes, err) ||
+            !validate_expert_tensor(desc.ffn_up_exps, cfg.n_expert, &dst.up_expert_bytes, err) ||
+            !validate_expert_tensor(desc.ffn_down_exps, cfg.n_expert, &dst.down_expert_bytes, err) ||
+            !validate_expert_tensor(desc.ffn_gate_up_exps, cfg.n_expert, &dst.gate_up_expert_bytes, err)) {
             return false;
         }
 
@@ -405,20 +384,22 @@ bool build_qwen35moe_hybrid_storage_from_file(
                 return false;
             }
             if (dst.fused_gate_up) {
-                dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_up_exps, hot_count);
-                dst.down_hot    = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count);
+                dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_up_exps, hot_count);
+                dst.down_hot    = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count);
             } else {
-                dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_exps, hot_count);
-                dst.up_hot   = new_like_with_expert_count(dst.hot_ctx, L.ffn_up_exps, hot_count);
-                dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count);
+                dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_exps, hot_count);
+                dst.up_hot   = new_like_with_expert_count(dst.hot_ctx, desc.ffn_up_exps, hot_count);
+                dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count);
             }
             dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, gpu_backend);
             if (!dst.hot_buf) {
-                if (err) *err = "failed to allocate hot expert GPU buffer";
+                char msg[128];
+                std::snprintf(msg, sizeof(msg),
+                    "failed to allocate hot expert GPU buffer (layer %d, %d hot experts)", il, hot_count);
+                if (err) *err = msg;
                 return false;
             }
 
-            // Load hot expert slices from file
             std::vector<uint8_t> slice_buf;
             if (dst.fused_gate_up) {
                 if (!read_expert_slices_from_mem(fd.gate_up_exps.data, fd.gate_up_exps.size,
@@ -457,12 +438,12 @@ bool build_qwen35moe_hybrid_storage_from_file(
                 return false;
             }
             if (dst.fused_gate_up) {
-                dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_up_exps, cold_count);
-                dst.down_cold    = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count);
+                dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_up_exps, cold_count);
+                dst.down_cold    = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count);
             } else {
-                dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_exps, cold_count);
-                dst.up_cold   = new_like_with_expert_count(dst.cold_ctx, L.ffn_up_exps, cold_count);
-                dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count);
+                dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_exps, cold_count);
+                dst.up_cold   = new_like_with_expert_count(dst.cold_ctx, desc.ffn_up_exps, cold_count);
+                dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count);
             }
             dst.cold_buf = ggml_backend_alloc_ctx_tensors(dst.cold_ctx, out.cpu_backend);
             if (!dst.cold_buf) {
@@ -470,7 +451,6 @@ bool build_qwen35moe_hybrid_storage_from_file(
                 return false;
             }
 
-            // Load cold expert slices from file directly to CPU tensors
             std::vector<uint8_t> slice_buf;
             if (dst.fused_gate_up) {
                 if (!read_expert_slices_from_mem(fd.gate_up_exps.data, fd.gate_up_exps.size,
diff --git a/server/src/qwen35moe/qwen35moe_hybrid_storage.h b/server/src/common/moe_hybrid_storage.h
similarity index 58%
rename from server/src/qwen35moe/qwen35moe_hybrid_storage.h
rename to server/src/common/moe_hybrid_storage.h
index c883da78e..465827ec0 100644
--- a/server/src/qwen35moe/qwen35moe_hybrid_storage.h
+++ b/server/src/common/moe_hybrid_storage.h
@@ -1,8 +1,9 @@
-// Phase 3 hybrid expert storage for qwen35moe.
+// Common MoE hybrid expert storage — manages hot (GPU) and cold (CPU) expert buffers.
 
 #pragma once
 
-#include "qwen35moe_expert_placement.h"
+#include "moe_hybrid_types.h"
+#include "moe_hybrid_placement.h"
 
 #include "ggml-alloc.h"
 
@@ -29,7 +30,7 @@ struct CachedFfnGraph {
     void free();
 };
 
-struct Qwen35MoeHybridLayerStorage {
+struct MoeHybridLayerStorage {
     ggml_context * hot_ctx = nullptr;
     ggml_backend_buffer_t hot_buf = nullptr;
     ggml_tensor * gate_hot = nullptr;
@@ -60,39 +61,31 @@ struct Qwen35MoeHybridLayerStorage {
     std::vector<uint8_t> down_cold_bytes;
     std::vector<uint8_t> gate_up_cold_bytes;
 
-    // Cached FFN graphs: hot_graph for all-hot case (n_expert_used hot experts),
-    // cold_graph for all-cold case (n_expert_used cold experts).
-    // These cover the common case; mixed hot/cold falls back to dynamic build.
-    CachedFfnGraph hot_graph;   // GPU: fused routed(n_expert_used hot) + shared
-    CachedFfnGraph cold_graph;  // CPU: routed(n_expert_used cold)
+    // Cached FFN graphs for common-case expert counts.
+    CachedFfnGraph hot_graph;
+    CachedFfnGraph cold_graph;
 };
 
-struct Qwen35MoeHybridStorage {
-    Qwen35MoeHybridStorage() = default;
-    Qwen35MoeHybridStorage(const Qwen35MoeHybridStorage &) = delete;
-    Qwen35MoeHybridStorage & operator=(const Qwen35MoeHybridStorage &) = delete;
-    Qwen35MoeHybridStorage(Qwen35MoeHybridStorage &&) = delete;
-    Qwen35MoeHybridStorage & operator=(Qwen35MoeHybridStorage &&) = delete;
-    ~Qwen35MoeHybridStorage();
+struct MoeHybridStorage {
+    MoeHybridStorage() = default;
+    MoeHybridStorage(const MoeHybridStorage &) = delete;
+    MoeHybridStorage & operator=(const MoeHybridStorage &) = delete;
+    MoeHybridStorage(MoeHybridStorage &&) = delete;
+    MoeHybridStorage & operator=(MoeHybridStorage &&) = delete;
+    ~MoeHybridStorage();
 
     ggml_backend_t cpu_backend = nullptr;
-    Qwen35MoeExpertPlacement placement;
-    std::vector<Qwen35MoeHybridLayerStorage> layers;
+    MoeHybridPlacement placement;
+    std::vector<MoeHybridLayerStorage> layers;
 
-    bool matches(const TargetWeights & w) const;
+    bool matches(const MoeHybridConfig & cfg) const;
     bool empty() const;
 };
 
-bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
-                                    ggml_backend_t backend,
-                                    const Qwen35MoeExpertPlacement & placement,
-                                    Qwen35MoeHybridStorage & out,
-                                    std::string * err = nullptr);
-
 // Expert tensor file data for split loading (one entry per expert tensor).
 struct ExpertTensorFileData {
-    const uint8_t * data = nullptr;  // pointer into mmap
-    size_t size = 0;                 // total tensor size in bytes
+    const uint8_t * data = nullptr;
+    size_t size = 0;
 };
 
 // Per-layer expert tensor file data for split loading.
@@ -103,15 +96,23 @@ struct LayerExpertFileData {
     ExpertTensorFileData gate_up_exps;  // optional fused
 };
 
+// Build hybrid storage from GPU-resident expert tensors.
+// layer_descs: one MoeLayerDesc per MoE layer (caller constructs from model-specific types).
+bool build_moe_hybrid_storage(const MoeHybridConfig & cfg,
+                              ggml_backend_t gpu_backend,
+                              const MoeHybridPlacement & placement,
+                              const std::vector<MoeLayerDesc> & layer_descs,
+                              MoeHybridStorage & out,
+                              std::string * err = nullptr);
+
 // Build hybrid storage by loading expert data directly from file (mmap).
-// Expert tensors in w are only used for metadata (ne/nb/type); their buffer
-// may be null. Expert data is read from file_data entries.
-bool build_qwen35moe_hybrid_storage_from_file(
-    const TargetWeights & w,
+bool build_moe_hybrid_storage_from_file(
+    const MoeHybridConfig & cfg,
     ggml_backend_t gpu_backend,
-    const Qwen35MoeExpertPlacement & placement,
+    const MoeHybridPlacement & placement,
+    const std::vector<MoeLayerDesc> & layer_descs,
     const std::vector<LayerExpertFileData> & file_data,
-    Qwen35MoeHybridStorage & out,
+    MoeHybridStorage & out,
     std::string * err = nullptr);
 
 }  // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_swap_manager.cpp b/server/src/common/moe_hybrid_swap_manager.cpp
similarity index 86%
rename from server/src/qwen35moe/qwen35moe_swap_manager.cpp
rename to server/src/common/moe_hybrid_swap_manager.cpp
index 3c7006d60..399d9c612 100644
--- a/server/src/qwen35moe/qwen35moe_swap_manager.cpp
+++ b/server/src/common/moe_hybrid_swap_manager.cpp
@@ -1,14 +1,15 @@
-#include "qwen35moe_swap_manager.h"
+#include "moe_hybrid_swap_manager.h"
+#include "moe_hybrid_routing_stats.h"
 
 #include <algorithm>
 
 namespace dflash::common {
 
-bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current,
-                               const Qwen35MoeRoutingStats & stats,
-                               const Qwen35MoeSwapPolicy & policy,
-                               Qwen35MoeSwapPlan & out,
-                               std::string * err) {
+bool build_moe_hybrid_swap_plan(const MoeHybridPlacement & current,
+                                const MoeHybridRoutingStats & stats,
+                                const MoeHybridSwapPolicy & policy,
+                                MoeHybridSwapPlan & out,
+                                std::string * err) {
     if (current.n_layer != stats.n_layer ||
         current.n_expert != stats.n_expert ||
         current.n_expert_used != stats.n_expert_used) {
@@ -16,14 +17,14 @@ bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current,
         return false;
     }
 
-    out = Qwen35MoeSwapPlan{};
+    out = MoeHybridSwapPlan{};
     out.next_placement = current;
     if (policy.max_swaps_total <= 0) {
         return true;
     }
 
     struct Candidate {
-        Qwen35MoeSwapAction action;
+        MoeHybridSwapAction action;
         uint64_t gain_delta = 0;
     };
     std::vector<Candidate> candidates;
diff --git a/server/src/common/moe_hybrid_swap_manager.h b/server/src/common/moe_hybrid_swap_manager.h
new file mode 100644
index 000000000..c24a71740
--- /dev/null
+++ b/server/src/common/moe_hybrid_swap_manager.h
@@ -0,0 +1,39 @@
+// Common MoE hybrid swap manager — promotes/demotes experts at request boundaries.
+
+#pragma once
+
+#include "moe_hybrid_placement.h"
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace dflash::common {
+
+struct MoeHybridRoutingStats;
+
+struct MoeHybridSwapAction {
+    int layer_idx = -1;          // -1 = unset
+    int evict_expert = -1;       // expert to demote; -1 = unset (presumably a global expert id — TODO confirm)
+    int promote_expert = -1;     // expert to promote; -1 = unset (presumably a global expert id — TODO confirm)
+    uint64_t evict_count = 0;    // presumably the routing count of evict_expert — TODO confirm
+    uint64_t promote_count = 0;  // presumably the routing count of promote_expert — TODO confirm
+};
+
+struct MoeHybridSwapPlan {
+    MoeHybridPlacement next_placement;         // build_moe_hybrid_swap_plan starts this as a copy of the current placement
+    std::vector<MoeHybridSwapAction> actions;  // swap actions of the plan (empty when swapping is disabled)
+};
+
+struct MoeHybridSwapPolicy {
+    int max_swaps_total = 0;          // <= 0 = no swaps (the plan keeps the current placement)
+    uint64_t min_promote_gain = 1;    // promoted expert count must exceed evicted by this amount
+};
+
+bool build_moe_hybrid_swap_plan(const MoeHybridPlacement & current,  // placement to start from
+                                const MoeHybridRoutingStats & stats,  // n_layer/n_expert/n_expert_used must match `current`, else returns false
+                                const MoeHybridSwapPolicy & policy,   // max_swaps_total <= 0 returns true without planning swaps
+                                MoeHybridSwapPlan & out,              // reset after the dimension check; next_placement starts as `current`
+                                std::string * err = nullptr);
+
+}  // namespace dflash::common
diff --git a/server/src/common/moe_hybrid_types.h b/server/src/common/moe_hybrid_types.h
new file mode 100644
index 000000000..c3a15e6bd
--- /dev/null
+++ b/server/src/common/moe_hybrid_types.h
@@ -0,0 +1,63 @@
+// Common MoE hybrid mode types and descriptors.
+//
+// Model-agnostic abstractions used by both qwen35moe and laguna backends
+// to implement the hybrid expert offload strategy (hot experts on GPU,
+// cold experts on CPU, concurrent evaluation).
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <cstdint>
+
+namespace dflash::common {
+
+// ─── MoE architecture config (model-agnostic) ──────────────────────────
+
+struct MoeHybridConfig {
+    int n_embd        = 0;   // hidden dimension
+    int n_expert      = 0;   // total experts per layer
+    int n_expert_used = 0;   // top-k selected per token
+    int n_ff_exp      = 0;   // routed expert intermediate dimension
+    int n_ff_shexp    = 0;   // shared expert intermediate dimension (0 = no shared)
+    int n_layer       = 0;   // model layer count as filled by make_moe_hybrid_config (may include leading dense layers)
+    int first_moe_layer = 0; // index of first MoE layer (e.g., 0 for qwen35moe, 1 for laguna)
+};
+
+// ─── Per-layer expert tensor descriptor ─────────────────────────────────
+//
+// Provides a uniform view over model-specific layer structures. All pointers
+// refer to the FULL expert tensor stacks (used for placement validation and
+// metadata queries); in the from-file build path they may be metadata-only.
+// In hybrid mode, the actual hot/cold split tensors live in MoeHybridLayerStorage.
+
+struct MoeLayerDesc {
+    // Routed expert weight tensors (stacked: [dim_in, dim_out, n_expert])
+    ggml_tensor * ffn_gate_exps    = nullptr;
+    ggml_tensor * ffn_up_exps      = nullptr;
+    ggml_tensor * ffn_down_exps    = nullptr;
+    ggml_tensor * ffn_gate_up_exps = nullptr;  // optional fused gate+up
+
+    // Shared expert tensors (nullptr if no shared expert)
+    ggml_tensor * ffn_gate_shexp     = nullptr;
+    ggml_tensor * ffn_up_shexp       = nullptr;
+    ggml_tensor * ffn_down_shexp     = nullptr;
+    ggml_tensor * ffn_gate_inp_shexp = nullptr;  // optional shared-expert gating
+
+    // Per-tensor quantization scale factors (1.0f = no scaling)
+    float ffn_gate_exps_s      = 1.0f;
+    float ffn_up_exps_s        = 1.0f;
+    float ffn_down_exps_s      = 1.0f;
+    float ffn_gate_up_exps_s   = 1.0f;
+    float ffn_gate_shexp_s     = 1.0f;
+    float ffn_up_shexp_s       = 1.0f;
+    float ffn_down_shexp_s     = 1.0f;
+    float ffn_gate_inp_shexp_s = 1.0f;
+
+    bool has_fused_gate_up() const { return ffn_gate_up_exps != nullptr; }  // true when the fused gate+up stack is set
+    bool has_shared_expert() const { return ffn_up_shexp != nullptr; }      // keyed on the shared up projection only
+};
+
+}  // namespace dflash::common
+
diff --git a/server/src/common/moe_hybrid_types_impl.h b/server/src/common/moe_hybrid_types_impl.h
new file mode 100644
index 000000000..ddb4355bc
--- /dev/null
+++ b/server/src/common/moe_hybrid_types_impl.h
@@ -0,0 +1,91 @@
+// Inline implementations for MoeHybridConfig/MoeLayerDesc conversion helpers.
+// Include this header AFTER both moe_hybrid_types.h and the relevant
+// model-specific weight struct header (internal.h or laguna_internal.h).
+// The preprocessor guards detect which weight structs are available and only
+// define the matching helpers. NOTE: because of `#pragma once`, the guards are
+// evaluated once per translation unit, at the first inclusion of this header.
+
+#pragma once
+
+namespace dflash::common {
+
+// ─── qwen35 conversions ─────────────────────────────────────────────────
+
+#if defined(DFLASH_INTERNAL_H_INCLUDED)
+
+inline MoeHybridConfig make_moe_hybrid_config(const TargetWeights & w) {
+    MoeHybridConfig hc;               // plain field-by-field copy of the MoE dimensions
+    hc.first_moe_layer = 0;           // qwen35moe: every layer is an MoE layer
+    hc.n_layer         = w.n_layer;
+    hc.n_embd          = w.n_embd;
+    hc.n_ff_exp        = w.n_ff_exp;
+    hc.n_ff_shexp      = w.n_ff_shexp;
+    hc.n_expert        = w.n_expert;
+    hc.n_expert_used   = w.n_expert_used;
+    return hc;
+}
+
+inline MoeLayerDesc make_moe_layer_desc(const TargetLayer & L) {
+    MoeLayerDesc view;  // each tensor pointer is copied together with its scale factor
+    view.ffn_gate_exps        = L.ffn_gate_exps;
+    view.ffn_gate_exps_s      = L.ffn_gate_exps_s;
+    view.ffn_up_exps          = L.ffn_up_exps;
+    view.ffn_up_exps_s        = L.ffn_up_exps_s;
+    view.ffn_down_exps        = L.ffn_down_exps;
+    view.ffn_down_exps_s      = L.ffn_down_exps_s;
+    view.ffn_gate_up_exps     = L.ffn_gate_up_exps;
+    view.ffn_gate_up_exps_s   = L.ffn_gate_up_exps_s;
+    view.ffn_gate_shexp       = L.ffn_gate_shexp;
+    view.ffn_gate_shexp_s     = L.ffn_gate_shexp_s;
+    view.ffn_up_shexp         = L.ffn_up_shexp;
+    view.ffn_up_shexp_s       = L.ffn_up_shexp_s;
+    view.ffn_down_shexp       = L.ffn_down_shexp;
+    view.ffn_down_shexp_s     = L.ffn_down_shexp_s;
+    view.ffn_gate_inp_shexp   = L.ffn_gate_inp_shexp;
+    view.ffn_gate_inp_shexp_s = L.ffn_gate_inp_shexp_s;
+    return view;
+}
+
+#endif  // DFLASH_INTERNAL_H_INCLUDED
+
+// ─── Laguna conversions ─────────────────────────────────────────────────
+
+#if defined(DFLASH_LAGUNA_INTERNAL_H_INCLUDED)
+
+inline MoeHybridConfig make_moe_hybrid_config(const LagunaTargetWeights & w) {
+    MoeHybridConfig hc;                              // plain field-by-field copy of the MoE dimensions
+    hc.first_moe_layer = w.n_layer_dense_lead;       // MoE layers start after the leading dense layer(s)
+    hc.n_layer         = w.n_layer;
+    hc.n_embd          = w.n_embd;
+    hc.n_ff_exp        = w.n_ff_exp;
+    hc.n_ff_shexp      = w.n_ff_shexp;
+    hc.n_expert        = w.n_expert;
+    hc.n_expert_used   = w.n_expert_used;
+    return hc;
+}
+
+inline MoeLayerDesc make_moe_layer_desc(const LagunaTargetLayer & L) {
+    MoeLayerDesc view;
+    // Laguna does not use per-tensor quantization scales: every *_s stays 1.0f.
+    view.ffn_gate_exps        = L.ffn_gate_exps;
+    view.ffn_gate_exps_s      = 1.0f;
+    view.ffn_up_exps          = L.ffn_up_exps;
+    view.ffn_up_exps_s        = 1.0f;
+    view.ffn_down_exps        = L.ffn_down_exps;
+    view.ffn_down_exps_s      = 1.0f;
+    view.ffn_gate_up_exps     = nullptr;  // laguna has no fused gate_up
+    view.ffn_gate_up_exps_s   = 1.0f;
+    view.ffn_gate_shexp       = L.ffn_gate_shexp;
+    view.ffn_gate_shexp_s     = 1.0f;
+    view.ffn_up_shexp         = L.ffn_up_shexp;
+    view.ffn_up_shexp_s       = 1.0f;
+    view.ffn_down_shexp       = L.ffn_down_shexp;
+    view.ffn_down_shexp_s     = 1.0f;
+    view.ffn_gate_inp_shexp   = nullptr;  // laguna has no shared-expert gate
+    view.ffn_gate_inp_shexp_s = 1.0f;
+    return view;
+}
+
+#endif  // DFLASH_LAGUNA_INTERNAL_H_INCLUDED
+
+}  // namespace dflash::common
diff --git a/server/src/internal.h b/server/src/internal.h
index f9a890ff2..9b5b45ca7 100644
--- a/server/src/internal.h
+++ b/server/src/internal.h
@@ -2,6 +2,7 @@
 // Not installed, not exposed in the public API.
 
 #pragma once
+#define DFLASH_INTERNAL_H_INCLUDED
 
 #include <cstddef>
 #include <cstdint>
@@ -27,7 +28,7 @@
 
 namespace dflash::common {
 
-struct Qwen35MoeHybridStorage;
+struct MoeHybridStorage;
 
 // Single source of truth for error reporting.
 // All loaders / graph builders push into this via set_last_error(...).
@@ -155,7 +156,7 @@ struct TargetWeights {
     std::vector<TargetLayer> layers;         // size = 64
     ggml_tensor * out_norm = nullptr;        // [hidden]
     ggml_tensor * output   = nullptr;        // [hidden, vocab]  (lm_head)
-    std::shared_ptr<Qwen35MoeHybridStorage> moe_hybrid; // optional Phase 3 hybrid storage
+    std::shared_ptr<MoeHybridStorage> moe_hybrid; // optional hybrid storage (hot/cold expert split)
 
     // Metadata from GGUF (validated at load time)
     int full_attention_interval = 4;
diff --git a/server/src/laguna/laguna_backend.cpp b/server/src/laguna/laguna_backend.cpp
index 87723f596..ff82df7f7 100644
--- a/server/src/laguna/laguna_backend.cpp
+++ b/server/src/laguna/laguna_backend.cpp
@@ -10,7 +10,17 @@
 #include "laguna_internal.h"
 #include "dflash27b.h"
 
+#include "../common/moe_hybrid_types.h"
+#include "../common/moe_hybrid_types_impl.h"
+#include "../common/moe_hybrid_placement.h"
+#include "../common/moe_hybrid_ffn_eval.h"
+#include "../common/moe_hybrid_storage.h"
+#include "../common/moe_hybrid_routing_stats.h"
+#include "../common/moe_hybrid_swap_manager.h"
+#include "common/step_graph.h"
+
 #include "ggml-cuda.h"
+#include "ggml-alloc.h"
 #include "common/snapshot_backend.h"
 
 #include <algorithm>
@@ -20,6 +30,10 @@
 #include <cstring>
 #include <fstream>
 #include <sstream>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
 
 namespace dflash::common {
 
@@ -44,8 +58,9 @@ bool LagunaBackend::init() {
         return false;
     }
 
-    if (!load_target_gguf_laguna(args_.target_path, backend_, w_)) {
-        std::fprintf(stderr, "load failed: %s\n", dflash27b_last_error());
+    // Always use dynamic placement (like qwen35moe): partial load first,
+    // compute budget, then reload full if all experts fit.
+    if (!init_hybrid_mode()) {
         ggml_backend_free(backend_); backend_ = nullptr;
         return false;
     }
@@ -146,6 +161,12 @@ int LagunaBackend::snapshot_cur_pos(int slot) const {
 
 GenerateResult LagunaBackend::generate(const GenerateRequest & req,
                                         const DaemonIO & io) {
+    if (hybrid_mode_ && moe_hybrid_) {
+        auto result = generate_hybrid(req, io);
+        if (result.ok) maybe_post_request_swap();
+        return result;
+    }
+
     const bool no_mask = (std::getenv("DFLASH_NO_MASK") != nullptr);
     GenerateResult result;
     DaemonIO out_io = io.with_token_callback(req.on_token);
@@ -492,6 +513,983 @@ void LagunaBackend::free_drafter() {
     }
 }
 
+// ── Hybrid MoE mode ─────────────────────────────────────────────────────
+//
+// Layer-by-layer decode: for each token, iterate through all 40 layers.
+// Layer 0 (dense SwiGLU) runs as a monolithic GPU sub-graph.
+// Layers 1..39 (sparse MoE) run attention+router on GPU, read back expert
+// selections, then call the common hybrid FFN eval (hot on GPU, cold on CPU).
+
+using HybridClock = std::chrono::steady_clock;  // monotonic clock for hybrid-path timing
+static inline uint64_t elapsed_us(HybridClock::time_point t0, HybridClock::time_point t1) {
+    return static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
+}
+
+bool LagunaBackend::init_hybrid_mode() {
+    const char * hotness_path = std::getenv("DFLASH_LAGUNA_HOTNESS");
+
+    // Step 1: Load model WITHOUT expert data to GPU (partial load)
+    TargetLoadPlan _hybrid_plan;
+    _hybrid_plan.skip_expert_tensors = true;
+    if (!load_target_gguf_laguna_partial(args_.target_path, backend_, _hybrid_plan, w_)) {
+        std::fprintf(stderr, "[laguna-hybrid] partial load failed: %s\n", dflash27b_last_error());
+        return false;
+    }
+
+    // Step 2: Load/build routing stats
+    MoeHybridRoutingStats hotness;
+    std::string err;
+    std::string placement_source;
+    if (hotness_path && hotness_path[0]) {
+        if (!MoeHybridRoutingStats::load_csv(std::string(hotness_path), hotness, &err)) {
+            std::fprintf(stderr, "[laguna-hybrid] hotness load failed: %s\n", err.c_str());
+            return false;
+        }
+        if (hotness.n_layer != w_.n_layer || hotness.n_expert != w_.n_expert) {
+            std::fprintf(stderr, "[laguna-hybrid] hotness dimensions mismatch (got %d×%d, want %d×%d)\n",
+                          hotness.n_layer, hotness.n_expert, w_.n_layer, w_.n_expert);
+            return false;
+        }
+        placement_source = "file";
+    } else {
+        // Uniform hotness (budget-only mode, no hotness file)
+        hotness.n_layer = w_.n_layer;
+        hotness.n_expert = w_.n_expert;
+        hotness.n_expert_used = w_.n_expert_used;
+        hotness.counts.assign((size_t)w_.n_layer * (size_t)w_.n_expert, 1);
+        hotness.layer_totals.assign((size_t)w_.n_layer, (uint64_t)w_.n_expert);
+        placement_source = "uniform";
+    }
+
+    // Step 3: Query GPU memory and compute expert budget
+    size_t gpu_free = 0, gpu_total = 0;
+    ggml_backend_dev_t dev = ggml_backend_get_device(backend_);
+    if (dev) {
+        ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
+    }
+    if (gpu_total == 0) {
+        std::fprintf(stderr, "[laguna-hybrid] could not query GPU memory\n");
+        return false;
+    }
+
+    // Compute per-layer expert size in bytes (laguna: separate gate/up/down, no fused)
+    std::vector<uint64_t> layer_expert_bytes((size_t)w_.n_layer);
+    for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) {
+        const LagunaTargetLayer & L = w_.layers[(size_t)il];
+        uint64_t bytes = 0;
+        if (L.ffn_gate_exps) bytes += ggml_nbytes(L.ffn_gate_exps) / (uint64_t)w_.n_expert;
+        if (L.ffn_up_exps)   bytes += ggml_nbytes(L.ffn_up_exps) / (uint64_t)w_.n_expert;
+        if (L.ffn_down_exps) bytes += ggml_nbytes(L.ffn_down_exps) / (uint64_t)w_.n_expert;
+        layer_expert_bytes[(size_t)il] = bytes;
+    }
+    // Layer 0 is dense — no experts
+    for (int il = 0; il < w_.n_layer_dense_lead; ++il) {
+        layer_expert_bytes[(size_t)il] = 0;
+    }
+
+    uint64_t total_expert_bytes = 0;
+    for (int il = 0; il < w_.n_layer; ++il) {
+        total_expert_bytes += layer_expert_bytes[(size_t)il] * (uint64_t)w_.n_expert;
+    }
+
+    // KV cache estimate
+    const char * ctx_env = std::getenv("DFLASH_MAX_CONTEXT");
+    int max_context = ctx_env ? std::atoi(ctx_env) : args_.max_ctx;
+    if (max_context <= 0) max_context = 8192;
+
+    const uint64_t kv_bytes_per_tok = (uint64_t)w_.n_layer * 2 *
+        (uint64_t)w_.n_head_kv * (uint64_t)w_.head_dim * 2;
+    const uint64_t kv_total = kv_bytes_per_tok * (uint64_t)max_context;
+
+    const uint64_t warm_cache_bytes = 200ULL * 1024 * 1024;
+    const uint64_t safety_bytes = 512ULL * 1024 * 1024;
+    const uint64_t core_bytes = gpu_total - gpu_free;
+
+    uint64_t expert_budget = 0;
+    if (gpu_total > core_bytes + kv_total + warm_cache_bytes + safety_bytes) {
+        expert_budget = gpu_total - core_bytes - kv_total - warm_cache_bytes - safety_bytes;
+    }
+    if (expert_budget > total_expert_bytes) {
+        expert_budget = total_expert_bytes;
+    }
+
+    // Manual budget cap (absolute MB)
+    if (const char * cap_env = std::getenv("DFLASH_EXPERT_BUDGET_MB")) {
+        uint64_t cap_bytes = (uint64_t)std::atoi(cap_env) * 1024ULL * 1024ULL;
+        if (cap_bytes > 0 && cap_bytes < expert_budget) {
+            std::printf("[laguna-hybrid] capping expert budget from %.2f GiB to %d MB\n",
+                        expert_budget / 1024.0 / 1024.0 / 1024.0, std::atoi(cap_env));
+            expert_budget = cap_bytes;
+        }
+    }
+
+    // Percentage-based budget cap
+    if (const char * pct_env = std::getenv("DFLASH_EXPERT_BUDGET_PCT")) {
+        int pct = std::atoi(pct_env);
+        if (pct > 0 && pct < 100) {
+            uint64_t pct_bytes = total_expert_bytes * (uint64_t)pct / 100ULL;
+            if (pct_bytes < expert_budget) {
+                std::printf("[laguna-hybrid] capping expert budget to %d%% = %.2f GiB (of %.2f GiB)\n",
+                            pct, pct_bytes / 1024.0 / 1024.0 / 1024.0,
+                            total_expert_bytes / 1024.0 / 1024.0 / 1024.0);
+                expert_budget = pct_bytes;
+            }
+        }
+    }
+
+    std::printf("[laguna] dynamic placement: gpu_total=%.2f GiB, core=%.2f GiB, "
+                "kv_cache=%.2f GiB (ctx=%d), warm=%.0f MB, safety=%.0f MB, "
+                "expert_budget=%.2f GiB (of %.2f GiB total experts)\n",
+                gpu_total / 1024.0 / 1024.0 / 1024.0,
+                core_bytes / 1024.0 / 1024.0 / 1024.0,
+                kv_total / 1024.0 / 1024.0 / 1024.0,
+                max_context,
+                warm_cache_bytes / 1024.0 / 1024.0,
+                safety_bytes / 1024.0 / 1024.0,
+                expert_budget / 1024.0 / 1024.0 / 1024.0,
+                total_expert_bytes / 1024.0 / 1024.0 / 1024.0);
+    std::fflush(stdout);
+
+    if (expert_budget == 0) {
+        std::fprintf(stderr, "[laguna-hybrid] no VRAM budget for experts\n");
+        return false;
+    }
+
+    // Step 4: Build placement
+    MoeHybridPlacement placement;
+    if (!MoeHybridPlacement::build_from_stats_with_layer_bytes(
+            hotness, layer_expert_bytes, expert_budget,
+            /*min_hot_per_layer=*/std::min(w_.n_expert_used, w_.n_expert),
+            placement, &err)) {
+        std::fprintf(stderr, "[laguna-hybrid] placement build failed: %s\n", err.c_str());
+        return false;
+    }
+
+    int total_moe_experts = (w_.n_layer - w_.n_layer_dense_lead) * w_.n_expert;
+    std::printf("[laguna] dynamic placement result: %d hot experts, %d cold experts\n",
+                placement.total_hot, total_moe_experts - placement.total_hot);
+
+    // If all experts fit, reload full model to GPU (non-hybrid path)
+    if (placement.total_hot >= total_moe_experts) {
+        std::printf("[laguna] all experts fit in VRAM, loading fully to GPU\n");
+        std::fflush(stdout);
+        free_laguna_target_weights(w_);
+        if (!load_target_gguf_laguna(args_.target_path, backend_, w_)) {
+            std::fprintf(stderr, "[laguna] full reload failed: %s\n", dflash27b_last_error());
+            return false;
+        }
+        return true;
+    }
+
+    // Step 5: Load expert data from GGUF mmap into hot/cold split buffers
+    {
+        ggml_context * expert_meta = nullptr;
+        gguf_init_params gip{};
+        gip.no_alloc = true;
+        gip.ctx = &expert_meta;
+        gguf_context * gctx = gguf_init_from_file(args_.target_path.c_str(), gip);
+        if (!gctx) {
+            std::fprintf(stderr, "[laguna-hybrid] failed to re-open GGUF for expert loading\n");
+            return false;
+        }
+
+        int fd = ::open(args_.target_path.c_str(), O_RDONLY);
+        if (fd < 0) {
+            gguf_free(gctx);
+            std::fprintf(stderr, "[laguna-hybrid] open failed for mmap\n");
+            return false;
+        }
+        struct stat st;
+        if (::fstat(fd, &st) < 0) {
+            ::close(fd);
+            gguf_free(gctx);
+            std::fprintf(stderr, "[laguna-hybrid] fstat failed\n");
+            return false;
+        }
+        const size_t file_size = (size_t)st.st_size;
+        void * mmap_addr = ::mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
+        ::close(fd);
+        if (mmap_addr == MAP_FAILED) {
+            gguf_free(gctx);
+            std::fprintf(stderr, "[laguna-hybrid] mmap failed\n");
+            return false;
+        }
+
+        const size_t data_start = gguf_get_data_offset(gctx);
+        const auto * file_bytes = (const uint8_t *)mmap_addr;
+
+        // Build per-layer expert file data
+        std::vector<LayerExpertFileData> layer_file_data((size_t)w_.n_layer);
+        for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) {
+            char name[128];
+            auto find_tensor_data = [&](const char * suffix) -> ExpertTensorFileData {
+                std::snprintf(name, sizeof(name), "blk.%d.%s.weight", il, suffix);
+                int64_t tid = gguf_find_tensor(gctx, name);
+                if (tid < 0) return {};
+                size_t off = data_start + gguf_get_tensor_offset(gctx, tid);
+                size_t sz = gguf_get_tensor_size(gctx, tid);
+                if (off + sz > file_size) return {};
+                return { file_bytes + off, sz };
+            };
+
+            layer_file_data[(size_t)il].gate_exps    = find_tensor_data("ffn_gate_exps");
+            layer_file_data[(size_t)il].up_exps      = find_tensor_data("ffn_up_exps");
+            layer_file_data[(size_t)il].down_exps    = find_tensor_data("ffn_down_exps");
+            // laguna has no fused gate_up_exps
+        }
+
+        auto hybrid = std::make_shared<MoeHybridStorage>();
+        MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(w_);
+        std::vector<MoeLayerDesc> layer_descs((size_t)w_.n_layer);
+        for (int il = 0; il < w_.n_layer; ++il) {
+            layer_descs[(size_t)il] = make_moe_layer_desc(w_.layers[(size_t)il]);
+        }
+        if (!build_moe_hybrid_storage_from_file(hybrid_cfg, backend_, placement, layer_descs, layer_file_data, *hybrid, &err)) {
+            ::munmap(mmap_addr, file_size);
+            gguf_free(gctx);
+            std::fprintf(stderr, "[laguna-hybrid] storage build failed: %s\n", err.c_str());
+            return false;
+        }
+
+        ::munmap(mmap_addr, file_size);
+        gguf_free(gctx);
+
+        moe_hybrid_ = std::move(hybrid);
+    }
+
+    // Print stats
+    int total_cold = 0;
+    uint64_t hot_bytes = 0, cold_bytes = 0;
+    for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) {
+        const auto & layer = moe_hybrid_->layers[(size_t)il];
+        total_cold += (int)layer.cold_expert_ids.size();
+        const uint64_t per_expert_bytes =
+            (uint64_t)layer.gate_expert_bytes + (uint64_t)layer.up_expert_bytes + (uint64_t)layer.down_expert_bytes;
+        hot_bytes  += per_expert_bytes * (uint64_t)layer.hot_expert_ids.size();
+        cold_bytes += per_expert_bytes * (uint64_t)layer.cold_expert_ids.size();
+    }
+    std::printf("[laguna-hybrid] storage ready: total_hot=%d (%.2f GiB VRAM) total_cold=%d (%.2f GiB RAM) source=%s\n",
+                placement.total_hot,
+                hot_bytes / 1024.0 / 1024.0 / 1024.0,
+                total_cold,
+                cold_bytes / 1024.0 / 1024.0 / 1024.0,
+                placement_source.c_str());
+
+    if (total_cold > 0) {
+        hybrid_mode_ = true;
+        std::printf("[laguna-hybrid] hybrid decode path active (%d cold experts)\n", total_cold);
+    } else {
+        hybrid_mode_ = true;  // partial load: expert tensors only in hybrid storage
+        std::printf("[laguna-hybrid] all experts hot — using hybrid path (all-hot)\n");
+    }
+
+    // Configure telemetry and swap policy
+    if (const char * telemetry = std::getenv("DFLASH_LAGUNA_TELEMETRY")) {
+        hybrid_telemetry_ = std::atoi(telemetry) != 0;
+    }
+    if (const char * out_path = std::getenv("DFLASH_LAGUNA_NEXT_PLACEMENT_OUT")) {
+        routing_stats_out_path_ = out_path;
+    }
+    if (const char * swap_max = std::getenv("DFLASH_LAGUNA_SWAP_MAX")) {
+        swap_policy_.max_swaps_total = std::max(0, std::atoi(swap_max));
+    }
+    if (const char * swap_gain = std::getenv("DFLASH_LAGUNA_SWAP_MIN_GAIN")) {
+        swap_policy_.min_promote_gain = (uint64_t)std::max(1, std::atoi(swap_gain));
+    }
+
+    // Allocate routing stats collector
+    if (!routing_stats_out_path_.empty()) {
+        routing_stats_ = std::make_shared<MoeHybridRoutingStats>();
+        routing_stats_->n_layer = w_.n_layer;
+        routing_stats_->n_expert = w_.n_expert;
+        routing_stats_->n_expert_used = w_.n_expert_used;
+        routing_stats_->counts.assign((size_t)w_.n_layer * (size_t)w_.n_expert, 0);
+        routing_stats_->layer_totals.assign((size_t)w_.n_layer, 0);
+    }
+
+    std::fflush(stdout);
+    return true;
+}
+
+// ── Laguna hybrid per-layer pre-FFN graph ───────────────────────────────
+//
+// Rebuilds `sg` (any previous graph is released via step_graph_free) with the
+// attention block and, for MoE layers, the router of layer `il`. The graph
+// covers `n_tokens` tokens whose KV-cache slots start at `kv_start`, and is
+// allocated with sg.alloc (created on first use, reused afterwards).
+//
+// Inputs the caller must upload before computing sg.gf:
+//   sg.inp_embed = F32 [n_embd, n_tokens] hidden state entering the layer
+//   sg.positions = I32 [n_tokens] positions fed to RoPE
+//   sg.attn_mask = F32 [kv_start + n_tokens, n_tokens] attention mask
+//
+// For MoE layers (il >= w.n_layer_dense_lead), outputs:
+//   sg.ffn_post        = post-attention normed hidden (input to FFN)
+//   sg.ffn_residual    = residual to add after FFN output
+//   sg.moe_selected[0] = top-k expert IDs per token (k = w.n_expert_used)
+//   sg.moe_weights     = [n_expert_used, n_tokens] combine weights
+// For dense lead layers (il < w.n_layer_dense_lead), the MLP runs inside this
+// graph and the full layer result is exposed as sg.hidden_input.
+//
+// Returns false if the ggml context cannot be created or graph allocation
+// fails; nothing is cleaned up here on failure, so the caller still owns `sg`.
+
+static bool build_laguna_layer_prefn_step(
+    StepGraph & sg,
+    const LagunaTargetWeights & w,
+    LagunaTargetCache & cache,
+    ggml_backend_t backend,
+    int il,
+    int kv_start,
+    int n_tokens)
+{
+    // Drop the previous graph/context; sg.alloc is checked and reused below.
+    step_graph_free(sg);
+
+    const int n_embd = w.n_embd;
+    const bool is_full = laguna_is_full_attn_layer(w, il);
+    const bool is_dense = (il < w.n_layer_dense_lead);
+    const LagunaTargetLayer & L = w.layers[(size_t)il];
+    // Number of KV positions visible to attention after this step's write.
+    const int kv_len = kv_start + n_tokens;
+    // The query head count is looked up per layer; the KV head count is global.
+    const int n_head = w.n_head_arr[il];
+    const int n_head_kv = w.n_head_kv;
+    const int head_dim = w.head_dim;
+
+    // Metadata-only context (no_alloc): tensor data is placed later by gallocr.
+    ggml_init_params ip{};
+    ip.mem_size = ggml_tensor_overhead() * 4096 + ggml_graph_overhead() + 8 * 1024 * 1024;
+    ip.no_alloc = true;
+    sg.ctx = ggml_init(ip);
+    if (!sg.ctx) return false;
+    sg.gf = ggml_new_graph_custom(sg.ctx, 4096, false);
+
+    // Input: hidden state [n_embd, n_tokens]
+    sg.inp_embed = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F32, n_embd, n_tokens);
+    ggml_set_input(sg.inp_embed);
+    ggml_set_name(sg.inp_embed, "inp_embed");
+
+    // Positions
+    sg.positions = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(sg.positions);
+
+    // Attention mask input [kv_len, n_tokens]. Only the tensor is created here;
+    // its contents (causal / sliding-window pattern) are uploaded by the caller.
+    ggml_tensor * attn_mask = nullptr;
+    if (kv_len > 0) {
+        attn_mask = ggml_new_tensor_4d(sg.ctx, GGML_TYPE_F32, kv_len, n_tokens, 1, 1);
+        ggml_set_input(attn_mask);
+        sg.attn_mask = attn_mask;
+    }
+
+    ggml_tensor * inp = sg.inp_embed;
+
+    // Pre-attn RMS norm
+    ggml_tensor * cur = ggml_rms_norm(sg.ctx, inp, 1e-6f);
+    cur = ggml_mul(sg.ctx, cur, L.attn_norm);
+
+    // QKV projections
+    const int q_dim = n_head * head_dim;
+    ggml_tensor * Qcur = ggml_mul_mat(sg.ctx, L.wq, cur);  // [q_dim, n_tokens]
+    ggml_tensor * Kcur = ggml_mul_mat(sg.ctx, L.wk, cur);  // [n_head_kv * head_dim, n_tokens]
+    ggml_tensor * Vcur = ggml_mul_mat(sg.ctx, L.wv, cur);  // [n_head_kv * head_dim, n_tokens]
+
+    // Per-head softplus gate, computed from the normed input here and applied
+    // to the attention output further down.
+    ggml_tensor * gate = ggml_mul_mat(sg.ctx, L.wqkv_gate, cur);  // [n_head, n_tokens]
+    gate = ggml_softplus(sg.ctx, gate);
+
+    // Split the projections into heads: Q -> [head_dim, n_head, n_tokens],
+    // K/V -> [head_dim, n_head_kv, n_tokens]
+    Qcur = ggml_reshape_3d(sg.ctx, Qcur, head_dim, n_head, n_tokens);
+    Kcur = ggml_reshape_3d(sg.ctx, Kcur, head_dim, n_head_kv, n_tokens);
+    Vcur = ggml_reshape_3d(sg.ctx, Vcur, head_dim, n_head_kv, n_tokens);
+
+    // Q-norm / K-norm
+    Qcur = ggml_rms_norm(sg.ctx, Qcur, 1e-6f);
+    Qcur = ggml_mul(sg.ctx, Qcur, L.q_norm);
+    Kcur = ggml_rms_norm(sg.ctx, Kcur, 1e-6f);
+    Kcur = ggml_mul(sg.ctx, Kcur, L.k_norm);
+
+    // RoPE (YaRN on full-attention layers, plain on SWA layers).
+    // SWA layers use ext_factor 0, freq_scale 1 and n_ctx_orig 0; full layers
+    // take their YaRN parameters from the weights.
+    const float rope_th     = is_full ? w.rope_freq_base_full : w.rope_freq_base_swa;
+    const int   n_rot       = is_full ? w.n_rot_full : w.n_rot_swa;
+    const float ext_factor  = is_full ? 1.0f : 0.0f;
+    const float attn_factor = 1.0f;
+    const float beta_fast   = is_full ? w.yarn_beta_fast : 32.0f;
+    const float beta_slow   = is_full ? w.yarn_beta_slow :  1.0f;
+    const int   n_ctx_orig  = is_full ? w.yarn_orig_ctx  : 0;
+    const float freq_scale  = is_full ? (1.0f / w.yarn_factor) : 1.0f;
+
+    Qcur = ggml_rope_ext(sg.ctx, Qcur, sg.positions, /*freq_factors=*/nullptr,
+                          n_rot, GGML_ROPE_TYPE_NEOX,
+                          n_ctx_orig, rope_th, freq_scale,
+                          ext_factor, attn_factor, beta_fast, beta_slow);
+    Kcur = ggml_rope_ext(sg.ctx, Kcur, sg.positions, nullptr,
+                          n_rot, GGML_ROPE_TYPE_NEOX,
+                          n_ctx_orig, rope_th, freq_scale,
+                          ext_factor, attn_factor, beta_fast, beta_slow);
+
+    // KV cache write — permute to [head_dim, n_tokens, n_head_kv] layout
+    ggml_tensor * cache_k = cache.attn_k[(size_t)il];
+    ggml_tensor * cache_v = cache.attn_v[(size_t)il];
+
+    ggml_tensor * Kcur_T = ggml_permute(sg.ctx, Kcur, 0, 2, 1, 3);
+    ggml_tensor * Vcur_T = ggml_permute(sg.ctx, Vcur, 0, 2, 1, 3);
+
+    // Destination views cover cache slots [kv_start, kv_start + n_tokens); the
+    // byte offset is kv_start rows of stride nb[1].
+    ggml_tensor * k_view = ggml_view_3d(sg.ctx, cache_k,
+        head_dim, n_tokens, n_head_kv,
+        cache_k->nb[1], cache_k->nb[2],
+        cache_k->nb[1] * (size_t)kv_start);
+    ggml_tensor * k_cpy = ggml_cpy(sg.ctx, Kcur_T, k_view);
+    ggml_build_forward_expand(sg.gf, k_cpy);
+
+    ggml_tensor * v_view = ggml_view_3d(sg.ctx, cache_v,
+        head_dim, n_tokens, n_head_kv,
+        cache_v->nb[1], cache_v->nb[2],
+        cache_v->nb[1] * (size_t)kv_start);
+    ggml_tensor * v_cpy = ggml_cpy(sg.ctx, Vcur_T, v_view);
+    ggml_build_forward_expand(sg.gf, v_cpy);
+
+    // Flash attention: Q is made contiguous in [head_dim, n_tokens, n_head];
+    // K/V are read straight from the cache over positions [0, kv_len), which
+    // includes the slots written just above.
+    ggml_tensor * Qfa = ggml_permute(sg.ctx, Qcur, 0, 2, 1, 3);
+    Qfa = ggml_cont(sg.ctx, Qfa);
+
+    ggml_tensor * Kfa = ggml_view_3d(sg.ctx, cache_k,
+        head_dim, kv_len, n_head_kv,
+        cache_k->nb[1], cache_k->nb[2], 0);
+    ggml_tensor * Vfa = ggml_view_3d(sg.ctx, cache_v,
+        head_dim, kv_len, n_head_kv,
+        cache_v->nb[1], cache_v->nb[2], 0);
+
+    const float kq_scale = 1.0f / std::sqrt((float)head_dim);
+    // The mask is uploaded as F32 by the caller and cast to F16 inside the graph.
+    ggml_tensor * attn_mask_f16 = attn_mask ? ggml_cast(sg.ctx, attn_mask, GGML_TYPE_F16) : nullptr;
+    ggml_tensor * attn = ggml_flash_attn_ext(sg.ctx, Qfa, Kfa, Vfa, attn_mask_f16,
+                                              kq_scale, 0.0f, 0.0f);
+
+    // Apply the per-head softplus gate: reshaped to [1, n_head, n_tokens] so it
+    // broadcasts over head_dim, and cast to the attention output type.
+    ggml_tensor * gate_b = ggml_reshape_3d(sg.ctx, gate, 1, n_head, n_tokens);
+    gate_b = ggml_cast(sg.ctx, gate_b, attn->type);
+    attn = ggml_mul(sg.ctx, attn, gate_b);
+
+    // Merge heads back into [q_dim, n_tokens]
+    attn = ggml_reshape_2d(sg.ctx, attn, q_dim, n_tokens);
+
+    // Output projection
+    ggml_tensor * attn_out = ggml_mul_mat(sg.ctx, L.wo, attn);  // [n_embd, n_tokens]
+
+    // Residual after attention
+    ggml_tensor * ffn_inp = ggml_add(sg.ctx, attn_out, inp);
+
+    if (is_dense) {
+        // Dense lead layer: run the full SwiGLU MLP in this graph
+        ggml_tensor * normed = ggml_rms_norm(sg.ctx, ffn_inp, 1e-6f);
+        normed = ggml_mul(sg.ctx, normed, L.ffn_norm);
+
+        ggml_tensor * g = ggml_mul_mat(sg.ctx, L.w_gate, normed);
+        ggml_tensor * u = ggml_mul_mat(sg.ctx, L.w_up, normed);
+        ggml_tensor * gu = ggml_swiglu_split(sg.ctx, g, u);
+        ggml_tensor * d = ggml_mul_mat(sg.ctx, L.w_down, gu);
+        ggml_tensor * layer_out = ggml_add(sg.ctx, d, ffn_inp);
+
+        // The finished layer output is handed back through sg.hidden_input.
+        sg.hidden_input = layer_out;
+        ggml_set_output(layer_out);
+        ggml_build_forward_expand(sg.gf, layer_out);
+    } else {
+        // MoE layer: output pre-FFN normed + residual + router decisions
+        ggml_tensor * normed = ggml_rms_norm(sg.ctx, ffn_inp, 1e-6f);
+        normed = ggml_mul(sg.ctx, normed, L.ffn_norm);
+        sg.ffn_post = normed;
+        ggml_set_output(normed);
+
+        sg.ffn_residual = ffn_inp;
+        ggml_set_output(ffn_inp);
+
+        // Router: sigmoid + score-correction bias + top-k.
+        // The bias only influences which experts are selected.
+        // NOTE(review): callers read `selected` with one flat
+        // ggml_backend_tensor_get; confirm the vendored ggml_top_k yields a
+        // contiguous tensor when n_tokens > 1.
+        ggml_tensor * router_logits = ggml_mul_mat(sg.ctx, L.ffn_gate_inp, normed);
+        ggml_tensor * probs = ggml_sigmoid(sg.ctx, router_logits);
+        ggml_tensor * scores_sel = ggml_add(sg.ctx, probs, L.ffn_exp_probs_b);
+        ggml_tensor * selected = ggml_top_k(sg.ctx, scores_sel, w.n_expert_used);
+        ggml_set_output(selected);
+
+        // Gather original probs (no bias) for combine weights
+        ggml_tensor * probs_3d = ggml_reshape_3d(sg.ctx, probs, 1, w.n_expert, n_tokens);
+        ggml_tensor * weights_raw = ggml_get_rows(sg.ctx, probs_3d, selected);
+        weights_raw = ggml_reshape_2d(sg.ctx, weights_raw, w.n_expert_used, n_tokens);
+
+        // Sum-normalize + scale (the scale op is skipped when the factor is 1)
+        ggml_tensor * w_sum = ggml_sum_rows(sg.ctx, weights_raw);
+        ggml_tensor * weights_normed = ggml_div(sg.ctx, weights_raw, w_sum);
+        if (w.expert_weights_scale != 1.0f) {
+            weights_normed = ggml_scale(sg.ctx, weights_normed, w.expert_weights_scale);
+        }
+        sg.moe_weights = weights_normed;
+        ggml_set_output(weights_normed);
+
+        // A single graph is built here, so only slot 0 of moe_selected is used.
+        sg.moe_selected.resize(1);
+        sg.moe_selected[0] = selected;
+
+        // Register every tensor the caller reads back as a graph root.
+        ggml_build_forward_expand(sg.gf, normed);
+        ggml_build_forward_expand(sg.gf, ffn_inp);
+        ggml_build_forward_expand(sg.gf, selected);
+        ggml_build_forward_expand(sg.gf, weights_normed);
+    }
+
+    // Allocate: the gallocr is created once per StepGraph and reused afterwards
+    if (!sg.alloc) {
+        sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    }
+    if (!ggml_gallocr_alloc_graph(sg.alloc, sg.gf)) {
+        return false;
+    }
+
+    return true;
+}
+
+// ── Hybrid forward: one token through all 40 layers ─────────────────────
+
+// Runs one decode token through every layer on the hybrid path and reports the
+// greedy (argmax) next token.
+//
+// Each layer is evaluated as a pre-FFN graph (attention, plus the router for
+// MoE layers) on `backend_`. Dense lead layers finish inside that graph; MoE
+// layers read the router decision back to the host and hand the FFN to
+// eval_moe_hybrid_ffn_gpu_resident. The running hidden state stays in
+// `gpu_state.act_cur` between layers.
+//
+// Parameters:
+//   tok        token id to feed.
+//   kv_pos     KV-cache slot / position of this token; keys [0, kv_pos] are
+//              candidates for attention.
+//   act_cur    host scratch for the hidden state; grown to n_embd if smaller.
+//              On success it holds the final (pre-output-norm) hidden state.
+//   argmax_out receives the index of the largest logit.
+//
+// Returns false on any embed / build / compute / allocation failure.
+bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos,
+                                              std::vector<float> & act_cur,
+                                              int32_t & argmax_out) {
+    const int hidden = w_.n_embd;
+    const int vocab = w_.embedder.n_vocab;
+
+    // The argmax below reads logits[0] unconditionally, so refuse degenerate sizes.
+    if (hidden <= 0 || vocab <= 0) return false;
+
+    // The embedder writes `hidden` floats; make sure the caller's buffer can hold them.
+    if (act_cur.size() < (size_t)hidden) act_cur.resize((size_t)hidden);
+
+    // Embed token
+    if (!w_.embedder.embed(&tok, 1, act_cur.data())) return false;
+
+    // GPU-resident state for MoE layers
+    GpuResidentState gpu_state;
+    if (!init_gpu_resident_state(gpu_state, backend_, hidden)) return false;
+    ggml_backend_tensor_set(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+
+    StepGraph layer_sg;
+    std::vector<int32_t> selected((size_t)w_.n_expert_used);
+    std::vector<float> weights_buf((size_t)w_.n_expert_used);
+    ggml_backend_t cpu_be = moe_hybrid_->cpu_backend;
+
+    // Shared failure path: release the per-call graph and GPU state.
+    auto fail = [&]() -> bool {
+        step_graph_destroy(layer_sg);
+        gpu_state.destroy();
+        return false;
+    };
+
+    for (int il = 0; il < w_.n_layer; ++il) {
+        const bool is_dense = (il < w_.n_layer_dense_lead);
+
+        if (!build_laguna_layer_prefn_step(layer_sg, w_, cache_, backend_, il, kv_pos, 1)) {
+            return fail();
+        }
+
+        // GPU→GPU: copy persistent act_cur to pre-FFN graph input
+        ggml_backend_tensor_copy(gpu_state.act_cur, layer_sg.inp_embed);
+
+        // Set positions
+        int32_t pos_val = kv_pos;
+        ggml_backend_tensor_set(layer_sg.positions, &pos_val, 0, sizeof(int32_t));
+
+        // Mask for single-token decode. Full-attention layers see every key in
+        // [0, kv_pos]; sliding-window layers only see the last
+        // w_.sliding_window keys, using the same window formula as the prefill
+        // mask in generate_hybrid so both paths attend to the same keys.
+        if (layer_sg.attn_mask) {
+            const int kv_len = kv_pos + 1;
+            const bool is_full = laguna_is_full_attn_layer(w_, il);
+            const int win_lo = is_full ? 0 : std::max(0, kv_pos - w_.sliding_window + 1);
+            std::vector<float> mask_data((size_t)kv_len, -INFINITY);
+            for (int k = win_lo; k < kv_len; ++k) {
+                mask_data[(size_t)k] = 0.0f;
+            }
+            ggml_backend_tensor_set(layer_sg.attn_mask, mask_data.data(), 0, sizeof(float) * (size_t)kv_len);
+        }
+
+        if (ggml_backend_graph_compute(backend_, layer_sg.gf) != GGML_STATUS_SUCCESS) {
+            return fail();
+        }
+
+        if (is_dense) {
+            // Dense layer: read full output back to GPU-resident state
+            ggml_backend_tensor_copy(layer_sg.hidden_input, gpu_state.act_cur);
+            continue;
+        }
+
+        // MoE layer: read router decisions, then do hybrid FFN eval
+        ggml_tensor * sel_tensor = layer_sg.moe_selected[0];
+        ggml_backend_tensor_get(sel_tensor, selected.data(), 0,
+                                 sizeof(int32_t) * selected.size());
+        ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0,
+                                 sizeof(float) * weights_buf.size());
+
+        if (routing_stats_) {
+            routing_stats_->observe(il, selected.data(), (int)selected.size());
+        }
+
+        // Hybrid FFN: hot on GPU, cold on CPU, combine on GPU
+        auto & storage = moe_hybrid_->layers[(size_t)il];
+        MoeHybridConfig cfg = make_moe_hybrid_config(w_);
+        MoeLayerDesc desc = make_moe_layer_desc(w_.layers[(size_t)il]);
+        if (!eval_moe_hybrid_ffn_gpu_resident(
+                backend_, cfg, desc, storage, cpu_be,
+                layer_sg.ffn_post, layer_sg.ffn_residual,
+                gpu_state,
+                selected.data(), weights_buf.data(),
+                (int)selected.size())) {
+            return fail();
+        }
+    }
+
+    // Read final hidden state, then release the per-layer resources before
+    // building the (separate) logits graph.
+    ggml_backend_tensor_get(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+    step_graph_destroy(layer_sg);
+    gpu_state.destroy();
+
+    // Project logits: final RMS norm + lm_head
+    {
+        ggml_init_params ip{};
+        ip.mem_size = 64 * 1024 * 1024;
+        ip.no_alloc = true;
+        ggml_context * ctx = ggml_init(ip);
+        if (!ctx) return false;
+
+        ggml_tensor * h_in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden, 1);
+        ggml_set_input(h_in);
+        ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false);
+
+        ggml_tensor * normed = ggml_rms_norm(ctx, h_in, 1e-6f);
+        normed = ggml_mul(ctx, normed, w_.out_norm);
+        ggml_tensor * logits = ggml_mul_mat(ctx, w_.output, normed);
+        ggml_set_output(logits);
+        ggml_build_forward_expand(gf, logits);
+
+        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend_));
+        auto release_logits = [&]() {
+            ggml_gallocr_free(alloc);
+            ggml_free(ctx);
+        };
+        if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+            release_logits();
+            return false;
+        }
+        ggml_backend_tensor_set(h_in, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+        if (ggml_backend_graph_compute(backend_, gf) != GGML_STATUS_SUCCESS) {
+            release_logits();
+            return false;
+        }
+
+        std::vector<float> logits_buf((size_t)vocab);
+        ggml_backend_tensor_get(logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab);
+        release_logits();
+
+        // Argmax (first maximum wins on ties)
+        argmax_out = 0;
+        float best = logits_buf[0];
+        for (int j = 1; j < vocab; ++j) {
+            if (logits_buf[(size_t)j] > best) {
+                best = logits_buf[(size_t)j];
+                argmax_out = j;
+            }
+        }
+    }
+    return true;
+}
+
+// ── Hybrid generate ─────────────────────────────────────────────────────
+
+// Serves one generation request on the hybrid (hot-GPU / cold-CPU expert) path.
+//
+// Prefill walks the model layer by layer and, within a layer, chunk by chunk:
+// the pre-FFN graph runs on `backend_`, router decisions are read back, and MoE
+// layers finish through eval_moe_hybrid_ffn_batched. `embed_all` holds the
+// hidden state of every prompt token and is overwritten in place after each
+// layer. Decode then feeds one token at a time to hybrid_forward_one_token.
+//
+// Returns a GenerateResult whose `ok` is true only if no error string was set.
+// Error strings: "overflow", "empty_prompt", "embed_prefill", "prefill_*",
+// "decode", or whatever eval_moe_hybrid_ffn_batched reports.
+GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req,
+                                               const DaemonIO & io) {
+    GenerateResult result;
+    DaemonIO out_io = io.with_token_callback(req.on_token);
+    const bool should_emit = req.stream || (bool)out_io.on_token;
+    const int N = (int)req.prompt.size();
+
+    if (N + req.n_gen > args_.max_ctx) {
+        result.error = "overflow";
+        return result;
+    }
+    // The logits projection below reads the hidden state of token N-1, so an
+    // empty prompt would index before the start of embed_all.
+    if (N <= 0) {
+        result.error = "empty_prompt";
+        return result;
+    }
+
+    reset_laguna_target_cache(cache_);
+
+    // ── Hybrid Prefill: layer-by-layer pre-FFN + batched hybrid FFN ──
+    const int hidden = w_.n_embd;
+    const int n_expert_used = w_.n_expert_used;
+    ggml_backend_t cpu_be = moe_hybrid_->cpu_backend;
+
+    std::vector<float> embed_all((size_t)N * (size_t)hidden);
+    if (!w_.embedder.embed(req.prompt.data(), N, embed_all.data())) {
+        result.error = "embed_prefill";
+        return result;
+    }
+
+    auto t_pf0 = std::chrono::steady_clock::now();
+    // Clamp to >= 1: a non-positive configured chunk would otherwise make the
+    // chunk loop below never advance.
+    const int prefill_chunk = std::max(1, std::min(args_.chunk, N));
+
+    StepGraph prefill_sg;  // persistent across layers to reuse GPU buffer
+    ggml_gallocr_t ffn_hot_alloc = nullptr;
+    ggml_gallocr_t ffn_cold_alloc = nullptr;
+
+    // Releases everything the prefill loop owns; used on success and failure.
+    auto release_prefill = [&]() {
+        step_graph_destroy(prefill_sg);
+        if (ffn_hot_alloc) { ggml_gallocr_free(ffn_hot_alloc); ffn_hot_alloc = nullptr; }
+        if (ffn_cold_alloc) { ggml_gallocr_free(ffn_cold_alloc); ffn_cold_alloc = nullptr; }
+    };
+
+    for (int il = 0; il < w_.n_layer; ++il) {
+        const bool is_dense = (il < w_.n_layer_dense_lead);
+        const bool is_full = laguna_is_full_attn_layer(w_, il);
+
+        for (int chunk_start = 0; chunk_start < N; chunk_start += prefill_chunk) {
+            const int chunk_len = std::min(prefill_chunk, N - chunk_start);
+            const size_t chunk_floats = (size_t)chunk_len * (size_t)hidden;
+            float * chunk_hidden = embed_all.data() + (size_t)chunk_start * (size_t)hidden;
+
+            step_graph_free(prefill_sg);  // reset ctx/graph but keep gallocr buffer
+            if (!build_laguna_layer_prefn_step(prefill_sg, w_, cache_, backend_,
+                                               il, chunk_start, chunk_len)) {
+                result.error = "prefill_build";
+                release_prefill();
+                return result;
+            }
+
+            // Set input embeddings
+            ggml_backend_tensor_set(prefill_sg.inp_embed, chunk_hidden, 0,
+                                    sizeof(float) * chunk_floats);
+
+            // Set positions
+            std::vector<int32_t> pos_data((size_t)chunk_len);
+            for (int i = 0; i < chunk_len; ++i) pos_data[(size_t)i] = chunk_start + i;
+            ggml_backend_tensor_set(prefill_sg.positions, pos_data.data(), 0,
+                                    sizeof(int32_t) * (size_t)chunk_len);
+
+            // Set attention mask (causal or causal+SWA depending on layer):
+            // everything starts hidden, then each query row opens keys
+            // [win_lo, abs_q].
+            if (prefill_sg.attn_mask) {
+                const int kv_len = chunk_start + chunk_len;
+                std::vector<float> mask((size_t)kv_len * (size_t)chunk_len, -INFINITY);
+                for (int q = 0; q < chunk_len; ++q) {
+                    const int abs_q = chunk_start + q;
+                    const int win_lo = is_full ? 0 : std::max(0, abs_q - w_.sliding_window + 1);
+                    for (int k = win_lo; k <= abs_q && k < kv_len; ++k) {
+                        mask[(size_t)q * (size_t)kv_len + (size_t)k] = 0.0f;
+                    }
+                }
+                ggml_backend_tensor_set(prefill_sg.attn_mask, mask.data(), 0,
+                                        sizeof(float) * mask.size());
+            }
+
+            // Compute pre-FFN graph
+            if (ggml_backend_graph_compute(backend_, prefill_sg.gf) != GGML_STATUS_SUCCESS) {
+                result.error = "prefill_compute";
+                release_prefill();
+                return result;
+            }
+
+            if (is_dense) {
+                // Dense layer outputs full result directly into this chunk's
+                // slice of embed_all.
+                ggml_backend_tensor_get(prefill_sg.hidden_input, chunk_hidden, 0,
+                                        sizeof(float) * chunk_floats);
+                continue;
+            }
+
+            // MoE layer: read router decisions, run hybrid FFN
+            ggml_tensor * sel_tensor = prefill_sg.moe_selected.empty() ? nullptr : prefill_sg.moe_selected[0];
+            if (!sel_tensor || !prefill_sg.moe_weights) {
+                result.error = "prefill_router_outputs";
+                release_prefill();
+                return result;
+            }
+
+            std::vector<float> chunk_residuals(chunk_floats);
+            std::vector<float> chunk_post(chunk_floats);
+            std::vector<int32_t> chunk_selected((size_t)chunk_len * (size_t)n_expert_used);
+            std::vector<float> chunk_weights((size_t)chunk_len * (size_t)n_expert_used);
+
+            ggml_backend_tensor_get(prefill_sg.ffn_residual, chunk_residuals.data(), 0,
+                                    sizeof(float) * chunk_residuals.size());
+            ggml_backend_tensor_get(prefill_sg.ffn_post, chunk_post.data(), 0,
+                                    sizeof(float) * chunk_post.size());
+            ggml_backend_tensor_get(sel_tensor, chunk_selected.data(), 0,
+                                    sizeof(int32_t) * chunk_selected.size());
+            ggml_backend_tensor_get(prefill_sg.moe_weights, chunk_weights.data(), 0,
+                                    sizeof(float) * chunk_weights.size());
+
+            // Observe routing stats, one call per token in the chunk
+            if (routing_stats_) {
+                for (int i = 0; i < chunk_len; ++i) {
+                    routing_stats_->observe(il, chunk_selected.data() + (size_t)i * (size_t)n_expert_used, n_expert_used);
+                }
+            }
+
+            // Batched hybrid FFN evaluation
+            auto & storage = moe_hybrid_->layers[(size_t)il];
+            MoeHybridConfig chunk_cfg = make_moe_hybrid_config(w_);
+            MoeLayerDesc chunk_desc = make_moe_layer_desc(w_.layers[(size_t)il]);
+            std::vector<float> ffn_batch_out;
+            if (!eval_moe_hybrid_ffn_batched(
+                    backend_, cpu_be, chunk_cfg, chunk_desc, storage,
+                    chunk_post.data(),
+                    chunk_selected.data(),
+                    chunk_weights.data(),
+                    chunk_len, ffn_batch_out, &result.error,
+                    &ffn_hot_alloc, &ffn_cold_alloc)) {
+                // Make sure the failure is visible even if no message was set.
+                if (result.error.empty()) result.error = "prefill_ffn";
+                release_prefill();
+                return result;
+            }
+            // The combine loop indexes chunk_len * hidden floats; refuse short output.
+            if (ffn_batch_out.size() < chunk_floats) {
+                result.error = "prefill_ffn_output";
+                release_prefill();
+                return result;
+            }
+
+            // Combine: FFN output + residual → embed_all for next layer
+            for (size_t j = 0; j < chunk_floats; ++j) {
+                chunk_hidden[j] = ffn_batch_out[j] + chunk_residuals[j];
+            }
+        }
+    }
+    release_prefill();
+
+    // Project logits from last token's hidden state
+    cache_.cur_pos = N;
+    std::vector<float> last_logits;
+    {
+        ggml_init_params ip{};
+        ip.mem_size = 64 * 1024 * 1024;
+        ip.no_alloc = true;
+        ggml_context * ctx = ggml_init(ip);
+        if (!ctx) {
+            result.error = "prefill_logits_ctx";
+            return result;
+        }
+        ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false);
+
+        ggml_tensor * h_in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden, 1);
+        ggml_set_input(h_in);
+        ggml_tensor * normed = ggml_rms_norm(ctx, h_in, 1e-6f);
+        normed = ggml_mul(ctx, normed, w_.out_norm);
+        ggml_tensor * logits = ggml_mul_mat(ctx, w_.output, normed);
+        ggml_set_output(logits);
+        ggml_build_forward_expand(gf, logits);
+
+        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend_));
+        if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+            ggml_gallocr_free(alloc);
+            ggml_free(ctx);
+            result.error = "prefill_logits_alloc";
+            return result;
+        }
+        // Set last token's hidden state
+        ggml_backend_tensor_set(h_in,
+                                embed_all.data() + (size_t)(N - 1) * (size_t)hidden, 0,
+                                sizeof(float) * (size_t)hidden);
+        if (ggml_backend_graph_compute(backend_, gf) != GGML_STATUS_SUCCESS) {
+            ggml_gallocr_free(alloc);
+            ggml_free(ctx);
+            result.error = "prefill_logits_compute";
+            return result;
+        }
+        last_logits.resize((size_t)w_.embedder.n_vocab);
+        ggml_backend_tensor_get(logits, last_logits.data(), 0,
+                                sizeof(float) * last_logits.size());
+        ggml_gallocr_free(alloc);
+        ggml_free(ctx);
+    }
+
+    auto t_pf1 = std::chrono::steady_clock::now();
+    result.prefill_s = std::chrono::duration<double>(t_pf1 - t_pf0).count();
+
+    // ── Decode (hybrid layer-by-layer) ──
+    auto argmax = [](const std::vector<float> & ll) {
+        int best = 0; float bv = ll[0];
+        for (size_t i = 1; i < ll.size(); ++i)
+            if (ll[i] > bv) { bv = ll[i]; best = (int)i; }
+        return best;
+    };
+
+    std::vector<int32_t> history;
+    history.reserve((size_t)N + (size_t)req.n_gen);
+    history.insert(history.end(), req.prompt.begin(), req.prompt.end());
+
+    // Chooses the first generated token from the prefill logits.
+    auto pick = [&](const std::vector<float> & ll) -> int {
+        return req.do_sample
+            ? sample_logits(ll.data(), (int)ll.size(), req.sampler, history, sampler_rng_)
+            : argmax(ll);
+    };
+
+    int next_tok = pick(last_logits);
+    result.tokens.reserve(req.n_gen);
+
+    // Budget force-close (same pattern as non-hybrid path).
+    // Once the remaining budget drops to hard_limit_remaining, the configured
+    // close-token sequence is injected in place of the model's own tokens.
+    const BudgetHook & budget_hook = req.budget_hook;
+    bool budget_close_started = false;
+    int  close_inject_pos     = 0;
+    auto maybe_force_close = [&](int32_t & tok, int committed_now) {
+        if (budget_hook.close_token_ids.empty()) return;
+        if (budget_close_started &&
+            close_inject_pos < (int)budget_hook.close_token_ids.size())
+        {
+            tok = budget_hook.close_token_ids[close_inject_pos++];
+            return;
+        }
+        if (budget_close_started) return;
+        int remaining = req.n_gen - committed_now;
+        if (remaining <= budget_hook.hard_limit_remaining) {
+            int32_t first_close = budget_hook.close_token_ids.front();
+            if (tok == first_close) {
+                // The model already produced the first close token on its own;
+                // continue the sequence without flagging a forced close.
+                budget_close_started = true;
+                close_inject_pos = 1;
+                return;
+            }
+            tok = first_close;
+            budget_close_started = true;
+            close_inject_pos = 1;
+            result.budget_forced_close = true;
+        }
+    };
+
+    std::vector<float> act_cur((size_t)w_.n_embd);
+    auto t_g0 = std::chrono::steady_clock::now();
+    for (int s = 0; s < req.n_gen; ++s) {
+        maybe_force_close(next_tok, s);
+        if (next_tok == w_.eos_id || next_tok == w_.eos_chat_id) break;
+        result.tokens.push_back(next_tok);
+        history.push_back(next_tok);
+        if (should_emit) {
+            out_io.emit(next_tok);
+            if (out_io.cancelled) break;
+        }
+
+        // Hybrid forward: one token through all layers
+        int32_t argmax_tok = 0;
+        if (!hybrid_forward_one_token(next_tok, cache_.cur_pos, act_cur, argmax_tok)) {
+            result.error = "decode";
+            break;
+        }
+        cache_.cur_pos++;
+
+        // FIXME: hybrid_forward_one_token only reports the argmax, so requests
+        // with req.do_sample fall back to greedy decoding after the first
+        // generated token. Return the logits from the forward pass to fix this.
+        next_tok = argmax_tok;
+    }
+    auto t_g1 = std::chrono::steady_clock::now();
+    result.decode_s = std::chrono::duration<double>(t_g1 - t_g0).count();
+
+    if (should_emit) out_io.emit(-1);
+    result.ok = (result.error.empty());
+    return result;
+}
+
+// Request-boundary hook: asks the swap planner for a new hot/cold expert
+// placement based on the collected routing stats and, if it proposes any
+// actions, rebuilds the hybrid storage and installs it in place of the old one.
+// Does nothing unless hybrid mode is active, storage exists, the swap budget
+// (swap_policy_.max_swaps_total) is positive and routing stats are collected.
+// Planner and rebuild failures are logged to stderr and leave the current
+// storage untouched.
+void LagunaBackend::maybe_post_request_swap() {
+    const bool swapping_enabled =
+        hybrid_mode_ && moe_hybrid_ && swap_policy_.max_swaps_total > 0;
+    if (!swapping_enabled || !routing_stats_) {
+        return;
+    }
+
+    std::string err;
+    MoeHybridSwapPlan plan;
+    const bool planned = build_moe_hybrid_swap_plan(
+        moe_hybrid_->placement, *routing_stats_, swap_policy_, plan, &err);
+    if (!planned) {
+        std::fprintf(stderr, "[laguna-hybrid] swap plan failed: %s\n", err.c_str());
+        return;
+    }
+    if (plan.actions.empty()) {
+        return;  // planner found nothing worth swapping
+    }
+
+    // Describe every layer, then build a fresh storage for the next placement.
+    const size_t layer_count = (size_t)w_.n_layer;
+    std::vector<MoeLayerDesc> swap_descs(layer_count);
+    for (size_t idx = 0; idx < layer_count; ++idx) {
+        swap_descs[idx] = make_moe_layer_desc(w_.layers[idx]);
+    }
+    MoeHybridConfig swap_cfg = make_moe_hybrid_config(w_);
+    auto rebuilt = std::make_shared<MoeHybridStorage>();
+    const bool built = build_moe_hybrid_storage(
+        swap_cfg, backend_, plan.next_placement, swap_descs, *rebuilt, &err);
+    if (!built) {
+        std::fprintf(stderr, "[laguna-hybrid] swap rebuild failed: %s\n", err.c_str());
+        return;
+    }
+
+    // Only swap the pointer once the replacement storage is fully built.
+    moe_hybrid_ = std::move(rebuilt);
+
+    // Save updated routing stats if configured
+    const bool want_stats_file = !routing_stats_out_path_.empty();
+    if (want_stats_file) {
+        routing_stats_->save_csv(routing_stats_out_path_, &err);
+    }
+
+    std::printf("[laguna-hybrid] applied %zu swap actions at request boundary\n", plan.actions.size());
+    std::fflush(stdout);
+}
+
 // ── Shutdown ────────────────────────────────────────────────────────────
 
 void LagunaBackend::shutdown() {
diff --git a/server/src/laguna/laguna_backend.h b/server/src/laguna/laguna_backend.h
index afdaf8f63..6ebe9081c 100644
--- a/server/src/laguna/laguna_backend.h
+++ b/server/src/laguna/laguna_backend.h
@@ -10,11 +10,16 @@
 #include "laguna_internal.h"
 #include "placement/placement_config.h"
 #include "qwen3_drafter.h"
+#include "../common/moe_hybrid_ffn_eval.h"
+#include "../common/moe_hybrid_storage.h"
+#include "../common/moe_hybrid_routing_stats.h"
+#include "../common/moe_hybrid_swap_manager.h"
 
 #include "ggml.h"
 #include "ggml-backend.h"
 
 #include <array>
+#include <memory>
 #include <random>
 #include <string>
 #include <vector>
@@ -77,7 +82,23 @@ class LagunaBackend : public ModelBackend {
     DrafterContext                              drafter_ctx_{};
     bool                                       drafter_loaded_ = false;
 
+    // ── Hybrid MoE mode (hot/cold expert split) ──
+    bool                                       hybrid_mode_ = false;      // maybe_post_request_swap() is a no-op while false
+    std::shared_ptr<MoeHybridStorage>          moe_hybrid_;               // replaced by maybe_post_request_swap() after a successful rebuild
+    std::shared_ptr<MoeHybridRoutingStats>     routing_stats_;            // input to the swap planner; null disables swapping
+    std::string                                routing_stats_out_path_;   // if non-empty, routing stats CSV is saved here after a swap
+    MoeHybridSwapPolicy                        swap_policy_;              // max_swaps_total <= 0 disables swapping
+    bool                                       hybrid_telemetry_ = false;
+
     bool ensure_slot(int slot);
+
+    // Hybrid mode helpers
+    bool init_hybrid_mode();
+    GenerateResult generate_hybrid(const GenerateRequest & req, const DaemonIO & io);
+    bool hybrid_forward_one_token(int32_t tok, int kv_pos,
+                                  std::vector<float> & act_cur,
+                                  int32_t & argmax_out);
+    void maybe_post_request_swap();
 };
 
 }  // namespace dflash::common
diff --git a/server/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h
index 32c981300..132eadd86 100644
--- a/server/src/laguna/laguna_internal.h
+++ b/server/src/laguna/laguna_internal.h
@@ -20,6 +20,7 @@
 //   - Vocab = 100352. BOS = 2. EOS = {2, 24}. Pad = 9.
 
 #pragma once
+#define DFLASH_LAGUNA_INTERNAL_H_INCLUDED
 
 #include <cstdint>
 #include <string>
@@ -136,6 +137,10 @@ bool load_target_gguf_laguna(const std::string & path,
                               ggml_backend_t       backend,
                               LagunaTargetWeights & out);
 
+// Partial loader. With plan.skip_expert_tensors=true this performs the
+// hybrid-MoE load: non-expert tensors go to GPU, expert tensors are kept
+// off-GPU (metadata/shapes stay valid for size queries). Also supports
+// layer-range partial loads via plan.layer_begin/layer_end/load_output.
 bool load_target_gguf_laguna_partial(const std::string & path,
                                       ggml_backend_t backend,
                                       const TargetLoadPlan & plan,
diff --git a/server/src/laguna/laguna_target_loader.cpp b/server/src/laguna/laguna_target_loader.cpp
index cd9b617a2..2a91f91f7 100644
--- a/server/src/laguna/laguna_target_loader.cpp
+++ b/server/src/laguna/laguna_target_loader.cpp
@@ -60,6 +60,9 @@
 
 namespace dflash::common {
 
+// Forward declaration: defined at the end of namespace dflash::common; used by should_load_laguna_tensor().
+static bool is_laguna_expert_tensor(const char * name);
+
 namespace {
 
 // Same Mmap shape as gguf_target_loader.cpp's local helper. Duplicated locally
@@ -157,6 +160,7 @@ bool should_load_laguna_tensor(const char * name, const TargetLoadPlan & plan) {
         std::strcmp(name, "output.weight") == 0) {
         return plan.load_output;
     }
+    if (plan.skip_expert_tensors && is_laguna_expert_tensor(name)) return false;
     int layer_id = -1;
     if (parse_block_tensor_name(name, layer_id)) {
         return layer_id >= plan.layer_begin && layer_id < plan.layer_end;
@@ -572,4 +576,18 @@ void free_laguna_target_weights(LagunaTargetWeights & w) {
     w.output   = nullptr;
 }
 
+// ── Expert-tensor filter (hybrid mode) ──────────────────────────────────
+//
+// When plan.skip_expert_tensors is set, tensors matched by the predicate below
+// are not uploaded to the GPU. Tensor metadata (shapes, offsets) is still
+// parsed so that the hybrid storage builder can use ggml_nbytes() to compute
+// per-expert sizes; expert data is loaded via mmap into the hot/cold buffers.
+
+static bool is_laguna_expert_tensor(const char * name) {
+    // Per-layer expert stacks are named blk.<N>.ffn_{gate,up,down}_exps.weight.
+    static const char * const kExpertStems[] = {"ffn_gate_exps", "ffn_up_exps", "ffn_down_exps"};
+    for (const char * stem : kExpertStems)
+        if (std::strstr(name, stem) != nullptr) return true;
+    return false;
+}
 } // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
index 67cdaef5b..2667680a9 100644
--- a/server/src/qwen35moe/qwen35moe_backend.cpp
+++ b/server/src/qwen35moe/qwen35moe_backend.cpp
@@ -1,5 +1,8 @@
 #include "qwen35moe_backend.h"
 
+#include "../common/moe_hybrid_placement.h"
+#include "../common/moe_hybrid_types.h"
+#include "../common/moe_hybrid_types_impl.h"
 #include "common/sampler.h"
 #include "common/dflash_spec_decode.h"
 #include "dflash_draft_graph.h"
@@ -44,8 +47,8 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
     }
 
     if (const char * stats_path = std::getenv("DFLASH_QWEN35MOE_RUNTIME_STATS_OUT")) {
-        routing_stats_ = std::make_shared<Qwen35MoeRoutingStats>();
-        if (!routing_stats_->init_from_weights(out)) {
+        routing_stats_ = std::make_shared<MoeHybridRoutingStats>();
+        if (!routing_stats_->init(out.n_layer, out.n_expert, out.n_expert_used)) {
             set_last_error("qwen35moe runtime stats init failed");
             return false;
         }
@@ -54,7 +57,7 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
 
     // Phase 2: Compute dynamic placement based on VRAM budget.
     // Expert tensor metadata (ne/nb) is valid even without GPU allocation.
-    Qwen35MoeExpertPlacement placement;
+    MoeHybridPlacement placement;
     std::string placement_source;
     std::string err;
 
@@ -139,8 +142,13 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
             layer_file_data[(size_t)il].gate_up_exps = find_tensor_data("ffn_gate_up_exps");
         }
 
-        auto hybrid = std::make_shared<Qwen35MoeHybridStorage>();
-        if (!build_qwen35moe_hybrid_storage_from_file(out, backend, placement, layer_file_data, *hybrid, &err)) {
+        auto hybrid = std::make_shared<MoeHybridStorage>();
+        MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(out);
+        std::vector<MoeLayerDesc> layer_descs((size_t)out.n_layer);
+        for (int il = 0; il < out.n_layer; ++il) {
+            layer_descs[(size_t)il] = make_moe_layer_desc(out.layers[(size_t)il]);
+        }
+        if (!build_moe_hybrid_storage_from_file(hybrid_cfg, backend, placement, layer_descs, layer_file_data, *hybrid, &err)) {
             ::munmap(mmap_addr, file_size);
             gguf_free(gctx);
             set_last_error(std::string("qwen35moe hybrid storage build failed: ") + err);
@@ -218,24 +226,29 @@ void Qwen35MoeBackend::maybe_post_request_swap() {
 
     if (!target_weights().moe_hybrid || swap_policy_.max_swaps_total <= 0) return;
 
-    Qwen35MoeSwapPlan plan;
+    MoeHybridSwapPlan plan;
     std::string err;
-    if (!build_qwen35moe_swap_plan(target_weights().moe_hybrid->placement, *routing_stats_,
+    if (!build_moe_hybrid_swap_plan(target_weights().moe_hybrid->placement, *routing_stats_,
                                    swap_policy_, plan, &err)) {
         std::fprintf(stderr, "[qwen35moe] swap plan failed: %s\n", err.c_str());
         return;
     }
     if (plan.actions.empty()) return;
 
-    auto rebuilt = std::make_shared<Qwen35MoeHybridStorage>();
-    if (!build_qwen35moe_hybrid_storage(target_weights(), target_backend(),
-                                        plan.next_placement, *rebuilt, &err)) {
+    auto rebuilt = std::make_shared<MoeHybridStorage>();
+    MoeHybridConfig swap_cfg = make_moe_hybrid_config(target_weights());
+    std::vector<MoeLayerDesc> swap_descs((size_t)target_weights().n_layer);
+    for (int il = 0; il < target_weights().n_layer; ++il) {
+        swap_descs[(size_t)il] = make_moe_layer_desc(target_weights().layers[(size_t)il]);
+    }
+    if (!build_moe_hybrid_storage(swap_cfg, target_backend(),
+                                        plan.next_placement, swap_descs, *rebuilt, &err)) {
         std::fprintf(stderr, "[qwen35moe] swap rebuild failed: %s\n", err.c_str());
         return;
     }
     target_weights().moe_hybrid = std::move(rebuilt);
     if (!placement_out_path_.empty()) {
-        if (!plan.next_placement.save_json(placement_out_path_, &err)) {
+        if (!plan.next_placement.save_json(placement_out_path_, "qwen35moe", &err)) {
             std::fprintf(stderr, "[qwen35moe] failed to save next placement: %s\n", err.c_str());
         }
     }
@@ -259,7 +272,8 @@ bool Qwen35MoeBackend::ensure_pipe_state(int kv_start) {
     if (pipe_state_ && pipe_state_->valid()) return true;
     pipe_state_ = std::make_unique<PipelinedDecodeState>();
     if (!init_pipelined_decode_state(*pipe_state_, target_backend(), target_weights(),
-                                     target_cache(), kv_start, cfg_.kq_stride_pad)) {
+                                     target_cache(), *target_weights().moe_hybrid,
+                                     kv_start, cfg_.kq_stride_pad)) {
         pipe_state_.reset();
         return false;
     }
@@ -274,6 +288,15 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
     std::vector<float> logits_buf((size_t)vocab);
     std::vector<float> act_cur((size_t)hidden);
 
+    // Telemetry accumulators for the full decode loop
+    using DecodeClock = std::chrono::steady_clock;
+    uint64_t tel_embed_us = 0;
+    uint64_t tel_layers_us = 0;
+    uint64_t tel_logits_us = 0;
+    uint64_t tel_sample_us = 0;
+    PipelinedDecodeTelemetry tel_layers_accum{};
+    int tel_n_tokens = 0;
+
     // Persistent logits graph (built once, reused per token)
     StepGraph logits_sg;
     auto project_logits = [&]() -> bool {
@@ -297,7 +320,9 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
                 return false;
             }
         }
-        ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+        // GPU→GPU: pipe act_cur directly into logits graph (no host bounce)
+        ggml_backend_tensor_copy_async(target_backend(), target_backend(),
+                                       pipe_state_->gpu_state.act_cur, logits_sg.hidden_input);
         auto st = ggml_backend_graph_compute(target_backend(), logits_sg.gf);
         if (st != GGML_STATUS_SUCCESS) return false;
         ggml_backend_tensor_get(logits_sg.logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab);
@@ -312,7 +337,7 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
             ggml_backend_tensor_get(target_step_graph().logits, logits_buf.data(),
                                     prefill_logits_offset(), sizeof(float) * (size_t)vocab);
             first_tok = sample_logits(logits_buf.data(), vocab, sampler_config(),
-                                      out_tokens, sampler_rng_engine());
+                                     out_tokens, sampler_rng_engine());
         } else {
             first_tok = target_cache().last_tok;
         }
@@ -329,30 +354,36 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
     }
 
     for (int step = 1; step < n_gen; ++step) {
+        const auto tok_t0 = DecodeClock::now();
+
         int32_t tok = out_tokens.back();
         if (!target_weights().embedder.embed(&tok, 1, act_cur.data())) {
             return false;
         }
-        ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
-                                sizeof(float) * (size_t)hidden);
+        ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur,
+                                      act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+        const auto embed_done = DecodeClock::now();
 
+        PipelinedDecodeTelemetry tel;
         if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(),
-                                        target_cache(), *target_weights().moe_hybrid,
-                                        committed, cfg_.kq_stride_pad, nullptr)) {
+                                       target_cache(), *target_weights().moe_hybrid,
+                                       committed, cfg_.kq_stride_pad,
+                                       hybrid_telemetry_ ? &tel : nullptr)) {
             return false;
         }
+        const auto layers_done = DecodeClock::now();
 
-        ggml_backend_tensor_get(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
-                                sizeof(float) * (size_t)hidden);
+        // act_cur stays on GPU — project_logits reads it via GPU→GPU copy
         if (!project_logits()) {
             step_graph_destroy(logits_sg);
             return false;
         }
+        const auto logits_done = DecodeClock::now();
 
         int32_t next_tok;
         if (sampler_config().temp > 0) {
             next_tok = sample_logits(logits_buf.data(), vocab, sampler_config(),
-                                     out_tokens, sampler_rng_engine());
+                                    out_tokens, sampler_rng_engine());
         } else {
             next_tok = 0;
             float best = logits_buf[0];
@@ -363,6 +394,48 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
                 }
             }
         }
+        const auto sample_done = DecodeClock::now();
+
+        if (hybrid_telemetry_) {
+            auto us = [](DecodeClock::time_point a, DecodeClock::time_point b) -> uint64_t {
+                return (uint64_t)std::chrono::duration_cast<std::chrono::microseconds>(b - a).count();
+            };
+            tel_embed_us += us(tok_t0, embed_done);
+            tel_layers_us += us(embed_done, layers_done);
+            tel_logits_us += us(layers_done, logits_done);
+            tel_sample_us += us(logits_done, sample_done);
+            tel_n_tokens++;
+            // Accumulate per-layer telemetry
+            tel_layers_accum.total_us += tel.total_us;
+            tel_layers_accum.prefn_graph_build_us += tel.prefn_graph_build_us;
+            tel_layers_accum.prefn_compute_us += tel.prefn_compute_us;
+            tel_layers_accum.routing_readback_us += tel.routing_readback_us;
+            tel_layers_accum.ffn_us += tel.ffn_us;
+            tel_layers_accum.ffn_allhot_us += tel.ffn_allhot_us;
+            tel_layers_accum.ffn_mixed_us += tel.ffn_mixed_us;
+            tel_layers_accum.gpu_idle_us += tel.gpu_idle_us;
+            tel_layers_accum.tensor_io_us += tel.tensor_io_us;
+            tel_layers_accum.combine_overhead_us += tel.combine_overhead_us;
+            tel_layers_accum.cold_cpu_us += tel.cold_cpu_us;
+            tel_layers_accum.cold_compute_us += tel.cold_compute_us;
+            tel_layers_accum.hot_graph_build_us += tel.hot_graph_build_us;
+            tel_layers_accum.ffn_post_get_us += tel.ffn_post_get_us;
+            tel_layers_accum.sync_wait_us += tel.sync_wait_us;
+            tel_layers_accum.allhot_layers += tel.allhot_layers;
+            tel_layers_accum.mixed_layers += tel.mixed_layers;
+            tel_layers_accum.total_layers += tel.total_layers;
+            tel_layers_accum.hot_graph_rebuilds += tel.hot_graph_rebuilds;
+            tel_layers_accum.routed_ffn_layers += tel.routed_ffn_layers;
+            tel_layers_accum.routed_prefn_us += tel.routed_prefn_us;
+            tel_layers_accum.routed_sync_us += tel.routed_sync_us;
+            tel_layers_accum.routed_readback_us += tel.routed_readback_us;
+            tel_layers_accum.routed_cpu_remap_us += tel.routed_cpu_remap_us;
+            tel_layers_accum.routed_ffn_dispatch_us += tel.routed_ffn_dispatch_us;
+            tel_layers_accum.routed_final_sync_us += tel.routed_final_sync_us;
+            tel_layers_accum.routed_cold_expert_hits += tel.routed_cold_expert_hits;
+            tel_layers_accum.routed_total_expert_slots += tel.routed_total_expert_slots;
+        }
+
         out_tokens.push_back(next_tok);
         io.emit(next_tok);
         committed++;
@@ -371,6 +444,58 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
         if (is_eos_tok(next_tok, target_weights())) break;
     }
 
+    // ── Print decode telemetry ──
+    if (hybrid_telemetry_ && tel_n_tokens > 0) {
+        const double total_us = (double)(tel_embed_us + tel_layers_us + tel_logits_us + tel_sample_us);
+        std::printf("[qwen35moe-ar] === AR DECODE TELEMETRY (n_tokens=%d, %.1f tok/s) ===\n",
+                    tel_n_tokens, tel_n_tokens / (total_us / 1e6));
+        std::printf("  per-token breakdown:\n");
+        std::printf("    embed=%.2fms  layers=%.2fms  logits=%.2fms  sample=%.2fms\n",
+                    tel_embed_us / 1000.0 / tel_n_tokens,
+                    tel_layers_us / 1000.0 / tel_n_tokens,
+                    tel_logits_us / 1000.0 / tel_n_tokens,
+                    tel_sample_us / 1000.0 / tel_n_tokens);
+        std::printf("  time budget: embed=%.1f%% layers=%.1f%% logits=%.1f%% sample=%.1f%%\n",
+                    100.0 * tel_embed_us / total_us,
+                    100.0 * tel_layers_us / total_us,
+                    100.0 * tel_logits_us / total_us,
+                    100.0 * tel_sample_us / total_us);
+        // Routed path breakdown (the dominant path)
+        if (tel_layers_accum.routed_ffn_layers > 0) {
+            const int rl = tel_layers_accum.routed_ffn_layers;
+            std::printf("  routed FFN path (%d layer-evals, %d cold_hits / %d slots = %.1f%% cold):\n",
+                        rl,
+                        tel_layers_accum.routed_cold_expert_hits,
+                        tel_layers_accum.routed_total_expert_slots,
+                        tel_layers_accum.routed_total_expert_slots > 0
+                            ? 100.0 * tel_layers_accum.routed_cold_expert_hits / tel_layers_accum.routed_total_expert_slots
+                            : 0.0);
+            std::printf("    per-layer avg: prefn_dispatch=%.1fus sync_stall=%.1fus readback=%.1fus remap=%.1fus ffn_dispatch=%.1fus\n",
+                        (double)tel_layers_accum.routed_prefn_us / rl,
+                        (double)tel_layers_accum.routed_sync_us / rl,
+                        (double)tel_layers_accum.routed_readback_us / rl,
+                        (double)tel_layers_accum.routed_cpu_remap_us / rl,
+                        (double)tel_layers_accum.routed_ffn_dispatch_us / rl);
+            std::printf("    total: sync_stall=%.1fms (%.1f%% of layers time)\n",
+                        tel_layers_accum.routed_sync_us / 1000.0,
+                        100.0 * tel_layers_accum.routed_sync_us / (double)tel_layers_us);
+            std::printf("    final_sync=%.1fms (%.1f%% of layers time)\n",
+                        tel_layers_accum.routed_final_sync_us / 1000.0,
+                        100.0 * tel_layers_accum.routed_final_sync_us / (double)tel_layers_us);
+        }
+        // Split path stats (if any)
+        if (tel_layers_accum.mixed_layers > 0) {
+            std::printf("  split path: mixed=%d layers, cold_cpu=%.1fms, ffn_mixed=%.1fms\n",
+                        tel_layers_accum.mixed_layers,
+                        tel_layers_accum.cold_cpu_us / 1000.0,
+                        tel_layers_accum.ffn_mixed_us / 1000.0);
+        }
+        std::printf("  split path allhot=%d layers, hot_graph_rebuilds=%d\n",
+                    tel_layers_accum.allhot_layers - tel_layers_accum.routed_ffn_layers,
+                    tel_layers_accum.hot_graph_rebuilds);
+        std::fflush(stdout);
+    }
+
     step_graph_destroy(logits_sg);
     return true;
 }
@@ -417,7 +542,7 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
 
     const int n_layer = target_weights().n_layer;
     uint64_t build_us_total = 0, compute_us_total = 0, readback_us_total = 0, ffn_us_total = 0;
-    Qwen35MoeHybridFfnTelemetry ffn_tel_accum{};
+    MoeHybridFfnTelemetry ffn_tel_accum{};
 
     StepGraph logits_sg;  // Persistent logits graph (used by spec-decode branch)
 
@@ -426,7 +551,7 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
     };
 
     // Helper: compute logits from act_cur (persistent graph, built once)
-    auto compute_logits = [&]() -> bool {
+    auto compute_logits = [&](ggml_tensor* gpu_src = nullptr) -> bool {
         if (!logits_sg.ctx) {
             // First call: build the logits graph
             ggml_init_params ip{};
@@ -449,7 +574,13 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
                 return false;
             }
         }
-        ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+        if (gpu_src) {
+            // GPU→GPU: pipe act_cur directly without host bounce
+            ggml_backend_tensor_copy_async(target_backend(), target_backend(),
+                                            gpu_src, logits_sg.hidden_input);
+        } else {
+            ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+        }
         auto st = ggml_backend_graph_compute(target_backend(), logits_sg.gf);
         if (st != GGML_STATUS_SUCCESS) return false;
         ggml_backend_tensor_get(logits_sg.logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab);
@@ -474,9 +605,12 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
     }
 
     // Process layer by layer, chunked within each layer
+    StepGraph prefill_sg;  // persistent across layers to reuse GPU buffer
+    ggml_gallocr_t ffn_hot_alloc = nullptr;
+    ggml_gallocr_t ffn_cold_alloc = nullptr;
+
     for (int il = 0; il < n_layer; ++il) {
         auto & storage = target_weights().moe_hybrid->layers[(size_t)il];
-        const auto & L = target_weights().layers[(size_t)il];
 
         for (int chunk_start = 0; chunk_start < prompt_len; chunk_start += prefill_chunk) {
             const int chunk_len = std::min(prefill_chunk, prompt_len - chunk_start);
@@ -485,12 +619,14 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
             const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (chunk_len > 1);
 
             // Build pre-FFN graph for this chunk
-            StepGraph prefill_sg;
+            step_graph_free(prefill_sg);  // reset ctx/graph but keep gallocr buffer
             if (!build_layer_prefn_step(prefill_sg, target_weights(), target_cache(), target_backend(),
                                         il, /*kv_start=*/chunk_start, /*n_tokens=*/chunk_len,
                                         with_mask, /*fa_window=*/0, cfg_.kq_stride_pad)) {
                 result.error = "prefill_build";
                 step_graph_destroy(prefill_sg);
+                if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+                if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
                 cleanup_graphs();
                 return result;
             }
@@ -531,6 +667,8 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
             if (st != GGML_STATUS_SUCCESS) {
                 result.error = "prefill_compute";
                 step_graph_destroy(prefill_sg);
+                if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+                if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
                 cleanup_graphs();
                 return result;
             }
@@ -570,33 +708,43 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
                 }
             }
 
-            // Batched hybrid FFN for this chunk.
-            // The routed-expert mul_mat_id MMQ kernel writes out of bounds on
-            // Ampere when the per-call token count exceeds ~8: the expert token
-            // distribution overshoots the destination tiles on the
-            // need_check=false write path, silently corrupting neighbouring GPU
-            // allocations during prefill and crashing with an illegal memory
-            // access at a later decode sync (~4th request under the server).
-            // Sub-batch the FFN to a safe width so the attention prefill can
-            // stay at the full chunk size.
-            std::vector<float> ffn_batch_out((size_t)chunk_len * (size_t)hidden);
-            constexpr int kFfnSafeBatch = 8;
-            for (int fb = 0; fb < chunk_len; fb += kFfnSafeBatch) {
-                const int fl = std::min(kFfnSafeBatch, chunk_len - fb);
-                std::vector<float> sub_out;
-                if (!eval_qwen35moe_hybrid_ffn_batched(
-                        target_backend(), cpu_be, target_weights(), L, storage,
-                        chunk_post.data()     + (size_t)fb * (size_t)hidden,
-                        chunk_selected.data() + (size_t)fb * (size_t)n_expert_used,
-                        chunk_weights.data()  + (size_t)fb * (size_t)n_expert_used,
-                        fl, sub_out, &result.error)) {
-                    step_graph_destroy(prefill_sg);
-                    cleanup_graphs();
-                    return result;
+            // Hybrid FFN — skip batched path when cold experts exist (CUDA mul_mat_id bug on sm_75)
+            MoeHybridConfig chunk_cfg = make_moe_hybrid_config(target_weights());
+            MoeLayerDesc chunk_desc = make_moe_layer_desc(target_weights().layers[(size_t)il]);
+            std::vector<float> ffn_batch_out;
+            bool ffn_ok = false;
+            if (storage.cold_expert_ids.empty()) {
+                // All experts hot — safe to use batched path
+                ffn_ok = eval_moe_hybrid_ffn_batched(
+                        target_backend(), cpu_be, chunk_cfg, chunk_desc, storage,
+                        chunk_post.data(),
+                        chunk_selected.data(),
+                        chunk_weights.data(),
+                        chunk_len, ffn_batch_out, &result.error,
+                        &ffn_hot_alloc, &ffn_cold_alloc);
+            }
+            if (!ffn_ok) {
+                // Per-token fallback (avoids sm_75 mul_mat_id assertion with cold experts)
+                result.error.clear();
+                ffn_batch_out.assign((size_t)hidden * (size_t)chunk_len, 0.0f);
+                std::vector<float> single_out;
+                for (int ti = 0; ti < chunk_len; ++ti) {
+                    const float * tok_post = chunk_post.data() + (size_t)ti * (size_t)hidden;
+                    const int32_t * tok_sel = chunk_selected.data() + (size_t)ti * (size_t)n_expert_used;
+                    const float * tok_wts = chunk_weights.data() + (size_t)ti * (size_t)n_expert_used;
+                    if (!eval_moe_hybrid_ffn_single(
+                            target_backend(), chunk_cfg, chunk_desc, storage, cpu_be,
+                            tok_post, tok_sel, tok_wts, n_expert_used, single_out)) {
+                        result.error = "prefill_ffn_single";
+                        step_graph_destroy(prefill_sg);
+                        if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+                        if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
+                        cleanup_graphs();
+                        return result;
+                    }
+                    std::memcpy(ffn_batch_out.data() + (size_t)ti * (size_t)hidden,
+                                single_out.data(), sizeof(float) * (size_t)hidden);
                 }
-                std::memcpy(ffn_batch_out.data() + (size_t)fb * (size_t)hidden,
-                            sub_out.data(),
-                            (size_t)fl * (size_t)hidden * sizeof(float));
             }
 
             // Combine FFN output + residual → embed_all for next layer
@@ -634,10 +782,11 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
             }
             const auto t4 = HybridClock::now();
             ffn_us_total += elapsed_us(t3, t4);
-
-            step_graph_destroy(prefill_sg);
         }
     }
+    step_graph_destroy(prefill_sg);
+    if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+    if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
 
     // Copy last token's output to act_cur for decode
     std::memcpy(act_cur.data(), embed_all.data() + (size_t)(prompt_len - 1) * (size_t)hidden,
@@ -729,8 +878,9 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
                         cleanup_graphs();
                         return result;
                     }
-                    ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
-                                            sizeof(float) * (size_t)hidden);
+                    // Upload embedding async on compute stream
+                    ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur,
+                                                  act_cur.data(), 0, sizeof(float) * (size_t)hidden);
 
                     PipelinedDecodeTelemetry tel;
                     if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(),
@@ -749,14 +899,31 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
                         decode_tel_accum.ffn_us += tel.ffn_us;
                         decode_tel_accum.ffn_allhot_us += tel.ffn_allhot_us;
                         decode_tel_accum.ffn_mixed_us += tel.ffn_mixed_us;
+                        decode_tel_accum.gpu_idle_us += tel.gpu_idle_us;
+                        decode_tel_accum.tensor_io_us += tel.tensor_io_us;
+                        decode_tel_accum.combine_overhead_us += tel.combine_overhead_us;
+                        decode_tel_accum.cold_cpu_us += tel.cold_cpu_us;
+                        decode_tel_accum.cold_compute_us += tel.cold_compute_us;
+                        decode_tel_accum.hot_graph_build_us += tel.hot_graph_build_us;
+                        decode_tel_accum.ffn_post_get_us += tel.ffn_post_get_us;
+                        decode_tel_accum.sync_wait_us += tel.sync_wait_us;
                         decode_tel_accum.allhot_layers += tel.allhot_layers;
                         decode_tel_accum.mixed_layers += tel.mixed_layers;
                         decode_tel_accum.total_layers += tel.total_layers;
+                        decode_tel_accum.hot_graph_rebuilds += tel.hot_graph_rebuilds;
+                        decode_tel_accum.routed_ffn_layers += tel.routed_ffn_layers;
+                        decode_tel_accum.routed_prefn_us += tel.routed_prefn_us;
+                        decode_tel_accum.routed_sync_us += tel.routed_sync_us;
+                        decode_tel_accum.routed_readback_us += tel.routed_readback_us;
+                        decode_tel_accum.routed_cpu_remap_us += tel.routed_cpu_remap_us;
+                        decode_tel_accum.routed_ffn_dispatch_us += tel.routed_ffn_dispatch_us;
+                        decode_tel_accum.routed_final_sync_us += tel.routed_final_sync_us;
+                        decode_tel_accum.routed_cold_expert_hits += tel.routed_cold_expert_hits;
+                        decode_tel_accum.routed_total_expert_slots += tel.routed_total_expert_slots;
                     }
 
-                    ggml_backend_tensor_get(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
-                                            sizeof(float) * (size_t)hidden);
-                    if (!compute_logits()) {
+                    // act_cur stays on GPU — compute_logits reads it via GPU→GPU copy
+                    if (!compute_logits(pipe_state_->gpu_state.act_cur)) {
                         result.error = "decode_logits";
                         cleanup_graphs();
                         return result;
@@ -793,12 +960,51 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
                                 decode_tel_accum.ffn_mixed_us / 1000.0,
                                 decode_tel_accum.allhot_layers,
                                 decode_tel_accum.mixed_layers);
-                    if (n_dec > 0) {
+                    std::printf("  GPU IDLE: tensor_io=%.1fms combine=%.1fms sync_wait=%.1fms\n",
+                                decode_tel_accum.tensor_io_us / 1000.0,
+                                decode_tel_accum.combine_overhead_us / 1000.0,
+                                decode_tel_accum.sync_wait_us / 1000.0);
+                    std::printf("  CPU TIME: cold_total=%.1fms cold_compute=%.1fms hot_graph_build=%.1fms ffn_post_get=%.1fms\n",
+                                decode_tel_accum.cold_cpu_us / 1000.0,
+                                decode_tel_accum.cold_compute_us / 1000.0,
+                                decode_tel_accum.hot_graph_build_us / 1000.0,
+                                decode_tel_accum.ffn_post_get_us / 1000.0);
+                    std::printf("  hot_graph_rebuilds=%d routed_ffn_layers=%d\n",
+                                decode_tel_accum.hot_graph_rebuilds,
+                                decode_tel_accum.routed_ffn_layers);
+                    // Routed path breakdown
+                    if (decode_tel_accum.routed_ffn_layers > 0) {
+                        const int rl = decode_tel_accum.routed_ffn_layers;
+                        std::printf("  ROUTED PATH (%d layer-evals, %d cold / %d slots = %.1f%% cold):\n",
+                                    rl, decode_tel_accum.routed_cold_expert_hits,
+                                    decode_tel_accum.routed_total_expert_slots,
+                                    decode_tel_accum.routed_total_expert_slots > 0
+                                        ? 100.0 * decode_tel_accum.routed_cold_expert_hits / decode_tel_accum.routed_total_expert_slots
+                                        : 0.0);
+                        std::printf("    per-layer: prefn=%.1fus sync=%.1fus readback=%.1fus remap=%.1fus ffn_dispatch=%.1fus\n",
+                                    (double)decode_tel_accum.routed_prefn_us / rl,
+                                    (double)decode_tel_accum.routed_sync_us / rl,
+                                    (double)decode_tel_accum.routed_readback_us / rl,
+                                    (double)decode_tel_accum.routed_cpu_remap_us / rl,
+                                    (double)decode_tel_accum.routed_ffn_dispatch_us / rl);
+                        std::printf("    totals: sync_stall=%.1fms final_sync=%.1fms\n",
+                                    decode_tel_accum.routed_sync_us / 1000.0,
+                                    decode_tel_accum.routed_final_sync_us / 1000.0);
+                    }
+                    if (n_dec > 0 && decode_tel_accum.total_us > 0) {
+                        const double gpu_compute_us = (double)decode_tel_accum.prefn_compute_us + (double)decode_tel_accum.ffn_us - (double)decode_tel_accum.cold_cpu_us;  // cast per term so unsigned counters cannot wrap
+                        const double gpu_util_pct = gpu_compute_us > 0.0 ? 100.0 * gpu_compute_us / (double)decode_tel_accum.total_us : 0.0;  // estimate only; clamp at 0
                         std::printf("  per-token avg: prefn_build=%.2fms prefn_compute=%.2fms readback=%.2fms ffn=%.2fms\n",
                                     decode_tel_accum.prefn_graph_build_us / 1000.0 / n_dec,
                                     decode_tel_accum.prefn_compute_us / 1000.0 / n_dec,
                                     decode_tel_accum.routing_readback_us / 1000.0 / n_dec,
                                     decode_tel_accum.ffn_us / 1000.0 / n_dec);
+                        std::printf("  per-token avg: tensor_io=%.2fms combine=%.2fms cold_cpu=%.2fms cold_compute=%.2fms\n",
+                                    decode_tel_accum.tensor_io_us / 1000.0 / n_dec,
+                                    decode_tel_accum.combine_overhead_us / 1000.0 / n_dec,
+                                    decode_tel_accum.cold_cpu_us / 1000.0 / n_dec,
+                                    decode_tel_accum.cold_compute_us / 1000.0 / n_dec);
+                        std::printf("  estimated GPU utilization: %.1f%%\n", gpu_util_pct);
                     }
                     std::fflush(stdout);
                 }
@@ -871,8 +1077,8 @@ bool Qwen35MoeBackend::hybrid_forward_one_token(int32_t tok, int kv_pos,
     // Ensure pipelined state
     if (!ensure_pipe_state(kv_pos)) return false;
 
-    // Upload to GPU-resident act_cur
-    ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
+    // Upload to GPU-resident act_cur (async — compute stream ordering guarantees correctness)
+    ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
                             sizeof(float) * (size_t)hidden);
 
     // Run pipelined decode (all 40 layers with cached DeltaNet + hot/cold FFN)
@@ -1157,12 +1363,12 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
 bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
                                                ggml_backend_t backend,
                                                const TargetWeights & w,
-                                               Qwen35MoeExpertPlacement & out,
+                                               MoeHybridPlacement & out,
                                                std::string * err) {
     // Load hotness table or assume uniform hotness
-    Qwen35MoeRoutingStats hotness;
+    MoeHybridRoutingStats hotness;
     if (hotness_path && hotness_path[0]) {
-        if (!Qwen35MoeRoutingStats::load_csv(std::string(hotness_path), hotness, err)) {
+        if (!MoeHybridRoutingStats::load_csv(std::string(hotness_path), hotness, err)) {
             return false;
         }
         if (hotness.n_layer != w.n_layer || hotness.n_expert != w.n_expert) {
@@ -1277,7 +1483,7 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
     }
 
     // Build placement using greedy knapsack with byte budget
-    if (!Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes(
+    if (!MoeHybridPlacement::build_from_stats_with_layer_bytes(
             hotness, layer_expert_bytes, expert_budget,
             /*min_hot_per_layer=*/std::min(w.n_expert_used, w.n_expert),
             out, err)) {
diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h
index 9c9ccf8dd..7e29d8f07 100644
--- a/server/src/qwen35moe/qwen35moe_backend.h
+++ b/server/src/qwen35moe/qwen35moe_backend.h
@@ -4,11 +4,11 @@
 
 #include "qwen35_backend.h"
 #include "graph_builders.h"
-#include "qwen35moe_hybrid_ffn_eval.h"
-#include "qwen35moe_hybrid_storage.h"
 #include "qwen35moe_pipelined_decode.h"
-#include "qwen35moe_routing_stats.h"
-#include "qwen35moe_swap_manager.h"
+#include "../common/moe_hybrid_ffn_eval.h"
+#include "../common/moe_hybrid_storage.h"
+#include "../common/moe_hybrid_routing_stats.h"
+#include "../common/moe_hybrid_swap_manager.h"
 
 #include <memory>
 #include <string>
@@ -36,17 +36,17 @@ class Qwen35MoeBackend : public Qwen35Backend {
     void after_target_compute(StepGraph & sg, int kv_start, int n_tokens) override;
 
 private:
-    std::shared_ptr<Qwen35MoeRoutingStats> routing_stats_;
+    std::shared_ptr<MoeHybridRoutingStats> routing_stats_;
     std::string routing_stats_out_path_;
     std::string placement_out_path_;
-    Qwen35MoeSwapPolicy swap_policy_;
+    MoeHybridSwapPolicy swap_policy_;
     bool hybrid_telemetry_ = false;
 
     void maybe_post_request_swap();
     bool load_dynamic_placement(const char * hotness_path,
                                 ggml_backend_t backend,
                                 const TargetWeights & w,
-                                Qwen35MoeExpertPlacement & out,
+                                MoeHybridPlacement & out,
                                 std::string * err);
 
     // Hybrid speculative decode: draft tokens using DFlash draft model,
diff --git a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
deleted file mode 100644
index 55f824a03..000000000
--- a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
+++ /dev/null
@@ -1,219 +0,0 @@
-// Single-token hybrid qwen35moe FFN evaluation helpers.
-
-#pragma once
-
-#include "internal.h"
-#include "qwen35moe_hybrid_storage.h"
-
-#include "ggml-backend.h"
-
-#include <cstdint>
-#include <string>
-#include <vector>
-
-namespace dflash::common {
-
-// GPU-resident residual combine graph: output = residual + hot_out + cold_correction.
-// Built once at decode start, reused every layer to keep act_cur on GPU.
-struct ResidualCombineGraph {
-    ggml_context * ctx = nullptr;
-    ggml_cgraph * gf = nullptr;
-    ggml_gallocr_t alloc = nullptr;
-    ggml_tensor * residual_in = nullptr;   // [n_embd] F32 input
-    ggml_tensor * hot_in = nullptr;        // [n_embd] F32 input
-    ggml_tensor * cold_in = nullptr;       // [n_embd] F32 input (zeros when no cold)
-    ggml_tensor * output = nullptr;        // [n_embd] F32 output
-
-    ResidualCombineGraph() = default;
-    ~ResidualCombineGraph() { free(); }
-    ResidualCombineGraph(const ResidualCombineGraph &) = delete;
-    ResidualCombineGraph & operator=(const ResidualCombineGraph &) = delete;
-    ResidualCombineGraph(ResidualCombineGraph && o) noexcept
-        : ctx(o.ctx), gf(o.gf), alloc(o.alloc),
-          residual_in(o.residual_in), hot_in(o.hot_in),
-          cold_in(o.cold_in), output(o.output) {
-        o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
-        o.residual_in = nullptr; o.hot_in = nullptr;
-        o.cold_in = nullptr; o.output = nullptr;
-    }
-    ResidualCombineGraph & operator=(ResidualCombineGraph && o) noexcept {
-        if (this != &o) {
-            free();
-            ctx = o.ctx; gf = o.gf; alloc = o.alloc;
-            residual_in = o.residual_in; hot_in = o.hot_in;
-            cold_in = o.cold_in; output = o.output;
-            o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
-            o.residual_in = nullptr; o.hot_in = nullptr;
-            o.cold_in = nullptr; o.output = nullptr;
-        }
-        return *this;
-    }
-    bool valid() const { return ctx && gf && alloc && output; }
-    void free();
-    void destroy();
-};
-
-// Build the residual combine graph on the given GPU backend.
-bool build_residual_combine_graph(ResidualCombineGraph & out, ggml_backend_t backend, int n_embd);
-
-// GPU-resident state for the decode loop: persistent act_cur + combine graph.
-struct GpuResidentState {
-    ggml_context * ctx = nullptr;
-    ggml_backend_buffer_t buf = nullptr;
-    ggml_tensor * act_cur = nullptr;       // [n_embd] F32 persistent GPU tensor
-
-    ResidualCombineGraph combine;
-
-    GpuResidentState() = default;
-    ~GpuResidentState() { destroy(); }
-    GpuResidentState(const GpuResidentState &) = delete;
-    GpuResidentState & operator=(const GpuResidentState &) = delete;
-    GpuResidentState(GpuResidentState && o) noexcept
-        : ctx(o.ctx), buf(o.buf), act_cur(o.act_cur),
-          combine(std::move(o.combine)) {
-        o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
-    }
-    GpuResidentState & operator=(GpuResidentState && o) noexcept {
-        if (this != &o) {
-            destroy();
-            ctx = o.ctx; buf = o.buf; act_cur = o.act_cur;
-            combine = std::move(o.combine);
-            o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
-        }
-        return *this;
-    }
-    bool valid() const { return ctx && buf && act_cur && combine.valid(); }
-    void destroy();
-};
-
-// Initialize GPU-resident state for decode.
-bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd);
-
-struct Qwen35MoeHybridFfnTelemetry {
-    uint64_t ffn_wall_us = 0;
-    uint64_t partition_us = 0;
-    uint64_t hot_us = 0;
-    uint64_t cold_us = 0;
-    uint64_t shared_us = 0;
-    uint64_t combine_us = 0;
-    int hot_selected = 0;
-    int cold_selected = 0;
-};
-
-bool eval_qwen35moe_reference_ffn_single(
-    ggml_backend_t         gpu_backend,
-    const TargetWeights &  w,
-    const TargetLayer &    L,
-    const float *          cur_host,
-    const int32_t *        selected_ids,
-    const float *          selected_weights,
-    int                    n_selected,
-    std::vector<float> &   out,
-    std::string *          err = nullptr);
-
-bool eval_qwen35moe_hybrid_ffn_single(
-    ggml_backend_t                      gpu_backend,
-    const TargetWeights &               w,
-    const TargetLayer &                 L,
-    Qwen35MoeHybridLayerStorage &       storage,
-    ggml_backend_t                      cpu_backend,
-    const float *                       cur_host,
-    const int32_t *                     selected_ids,
-    const float *                       selected_weights,
-    int                                 n_selected,
-    std::vector<float> &                out,
-    Qwen35MoeHybridFfnTelemetry *       telemetry = nullptr,
-    std::string *                       err = nullptr);
-
-// Batched prefill FFN: processes n_tokens at once using the full GPU expert tensors.
-// Uses pre-computed routing (selected_ids, selected_weights) from the pre-FFN graph.
-// cur_host: [n_embd × n_tokens] post-norm hidden states
-// selected_ids: [n_expert_used × n_tokens] expert selections (global IDs)
-// selected_weights: [n_expert_used × n_tokens] routing weights
-// out: [n_embd × n_tokens] output (resized internally)
-bool eval_qwen35moe_batched_prefill_ffn(
-    ggml_backend_t         gpu_backend,
-    const TargetWeights &  w,
-    const TargetLayer &    L,
-    const float *          cur_host,
-    const int32_t *        selected_ids,
-    const float *          selected_weights,
-    int                    n_tokens,
-    std::vector<float> &   out,
-    std::string *          err = nullptr);
-
-// Batched hybrid prefill FFN: processes n_tokens at once with hot experts on GPU
-// and cold experts on CPU concurrently.  Uses pre-computed routing from the pre-FFN
-// graph.  Falls back to eval_qwen35moe_batched_prefill_ffn when all selected experts
-// are hot.
-// cur_host: [n_embd × n_tokens] post-norm hidden states (row-major)
-// selected_ids: [n_expert_used × n_tokens] expert selections (global IDs)
-// selected_weights: [n_expert_used × n_tokens] routing weights
-// out: [n_embd × n_tokens] output (resized internally)
-bool eval_qwen35moe_hybrid_ffn_batched(
-    ggml_backend_t                      gpu_backend,
-    ggml_backend_t                      cpu_backend,
-    const TargetWeights &               w,
-    const TargetLayer &                 L,
-    Qwen35MoeHybridLayerStorage &       storage,
-    const float *                       cur_host,
-    const int32_t *                     selected_ids,
-    const float *                       selected_weights,
-    int                                 n_tokens,
-    std::vector<float> &                out,
-    std::string *                       err = nullptr);
-
-// GPU-resident single-token hybrid FFN eval: keeps data on GPU, only reads
-// router IDs to CPU for hot/cold partitioning.  Uses tensor_copy for GPU→GPU
-// transfers instead of round-tripping through host memory.
-// ffn_post_gpu: [n_embd] F32 on GPU — the post-attention-norm hidden state
-// ffn_residual_gpu: [n_embd] F32 on GPU — the pre-FFN residual
-// gpu_state: persistent GPU state with act_cur and combine graph
-// After call: gpu_state.act_cur holds the layer output on GPU.
-bool eval_qwen35moe_hybrid_ffn_gpu_resident(
-    ggml_backend_t                      gpu_backend,
-    const TargetWeights &               w,
-    const TargetLayer &                 L,
-    Qwen35MoeHybridLayerStorage &       storage,
-    ggml_backend_t                      cpu_backend,
-    ggml_tensor *                       ffn_post_gpu,
-    ggml_tensor *                       ffn_residual_gpu,
-    GpuResidentState &                  gpu_state,
-    const int32_t *                     selected_ids,
-    const float *                       selected_weights,
-    int                                 n_selected);
-
-// Build/rebuild cached hot FFN graph for a given number of hot experts.
-bool build_cached_hot_graph(
-    CachedFfnGraph & out,
-    ggml_backend_t backend,
-    ggml_tensor * gate_tensor,
-    ggml_tensor * up_tensor,
-    ggml_tensor * down_tensor,
-    ggml_tensor * gate_up_tensor,
-    float gate_scale,
-    float up_scale,
-    float down_scale,
-    float gate_up_scale,
-    const TargetLayer & L,
-    int n_embd,
-    int n_ff_exp,
-    int n_hot);
-
-// Build/rebuild cached cold FFN graph for a given number of cold experts.
-bool build_cached_cold_graph(
-    CachedFfnGraph & out,
-    ggml_backend_t cpu_backend,
-    ggml_tensor * gate_tensor,
-    ggml_tensor * up_tensor,
-    ggml_tensor * down_tensor,
-    ggml_tensor * gate_up_tensor,
-    float gate_scale,
-    float up_scale,
-    float down_scale,
-    float gate_up_scale,
-    int n_embd,
-    int n_ff_exp,
-    int n_cold);
-
-}  // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
index 78039e4ad..e89f9d057 100644
--- a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
+++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
@@ -3,6 +3,8 @@
 
 #include "qwen35moe_pipelined_decode.h"
 
+#include "../common/moe_hybrid_types_impl.h"
+
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
@@ -31,6 +33,7 @@ void CachedPrefnGraph::free() {
     moe_weights = nullptr;
 }
 
+
 // Build a cached pre-FFN graph for a DeltaNet layer.
 // DeltaNet layers have no kv_start-dependent views — the graph structure is
 // identical across tokens. We build once and reuse by updating inp_embed data.
@@ -96,6 +99,8 @@ static bool build_cached_deltanet_prefn(
 void PipelinedDecodeState::destroy() {
     for (auto & cpg : cached_prefn) cpg.free();
     cached_prefn.clear();
+    for (auto & rff : cached_routed_ffn) rff.free();
+    cached_routed_ffn.clear();
     gpu_state.destroy();
     routing_ids_buf.clear();
     routing_weights_buf.clear();
@@ -109,6 +114,7 @@ bool init_pipelined_decode_state(
     ggml_backend_t backend,
     const TargetWeights & w,
     TargetCache & cache,
+    MoeHybridStorage & hybrid,
     int kv_start,
     int kq_stride_pad) {
 
@@ -129,27 +135,90 @@ bool init_pipelined_decode_state(
     out.routing_weights_buf.resize((size_t)w.n_expert_used);
     out.ffn_post_host_buf.resize((size_t)w.n_embd);
 
-    // Build cached pre-FFN graphs for DeltaNet layers
+    // Check if routed FFN pipeline is disabled
+    const bool routed_disabled = (std::getenv("DFLASH_QWEN35MOE_NO_ROUTED") != nullptr);
+
+    // Cold experts are computed on the cold backend (CPU/Halo) by default.
+    // Defining DFLASH_DROP_COLD (any value, even "0" or empty) skips cold computation (fast but lossy).
+    out.cold_compute = (std::getenv("DFLASH_DROP_COLD") == nullptr);
+
+    // Build cached pre-FFN graphs for all DeltaNet layers.
     out.cached_prefn.resize((size_t)w.n_layer);
-    int cached_count = 0;
+    int cached_prefn_count = 0;
     for (int il = 0; il < w.n_layer; ++il) {
         const bool is_attn = (((il + 1) % w.full_attention_interval) == 0);
         if (!is_attn) {
-            // DeltaNet layer: cache the graph
             if (!build_cached_deltanet_prefn(
                     out.cached_prefn[(size_t)il], backend, w, cache, il, kv_start, kq_stride_pad)) {
                 std::fprintf(stderr, "[pipelined] failed to cache DeltaNet prefn for layer %d\n", il);
-                // Non-fatal: will fall back to dynamic build for this layer
             } else {
-                cached_count++;
+                cached_prefn_count++;
             }
         }
-        // Attention layers: cached_prefn[il] remains invalid (rebuilt per-token)
     }
 
-    out.cold_in_zeroed = true;
-    // cold_in was already zeroed in init_gpu_resident_state
+    // Build cached routed FFN graphs for ALL layers (StreamMoE-inspired pipeline).
+    // Includes attention layers — eliminates expensive split-path FFN for mixed layers.
+    // Cold entries get weight=0 at runtime, contributing nothing to output.
+    out.cached_routed_ffn.resize((size_t)w.n_layer);
+    int routed_count = 0;
+    if (!routed_disabled) {
+        for (int il = 0; il < w.n_layer; ++il) {
+            if ((size_t)il >= hybrid.layers.size()) continue;
+
+            auto & storage = hybrid.layers[(size_t)il];
+            const TargetLayer & L = w.layers[(size_t)il];
+
+            if (!build_cached_hot_graph(
+                    out.cached_routed_ffn[(size_t)il], backend,
+                    storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
+                    L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
+                    make_moe_layer_desc(L), w.n_embd, w.n_ff_exp, w.n_expert_used)) {
+                // Non-fatal: fall back to split path for this layer
+            } else {
+                routed_count++;
+            }
+        }
+    }
 
+    std::fprintf(stderr, "[pipelined] cached %d prefn + %d routed FFN graphs%s\n",
+                 cached_prefn_count, routed_count,
+                 out.cold_compute ? "" : " (drop_cold=lossy)");
+
+    // Initialize fused cold FFN compute (bypasses ggml graph dispatch)
+    if (out.cold_compute) {
+        out.cold_ffn_compute = make_cpu_cold_ffn_compute(w.n_ff_exp);
+        out.cold_ffn_layers.resize((size_t)w.n_layer);
+        out.cold_output_buf.resize((size_t)w.n_embd);
+        for (int il = 0; il < w.n_layer && (size_t)il < hybrid.layers.size(); ++il) {
+            auto & storage = hybrid.layers[(size_t)il];
+            const TargetLayer & L = w.layers[(size_t)il];
+            auto & cl = out.cold_ffn_layers[(size_t)il];
+            cl.fused_gate_up = (storage.gate_up_cold != nullptr);
+            if (cl.fused_gate_up) {
+                cl.gate_up_data = storage.gate_up_cold->data;  // non-null here: fused_gate_up is (gate_up_cold != nullptr)
+                cl.gate_up_stride = storage.gate_up_cold->nb[2];
+                cl.gate_up_type = storage.gate_up_cold->type;
+                cl.gate_up_scale = L.ffn_gate_up_exps_s;
+            } else {
+                cl.gate_data = storage.gate_cold ? storage.gate_cold->data : nullptr;
+                cl.up_data = storage.up_cold ? storage.up_cold->data : nullptr;
+                cl.gate_stride = storage.gate_cold ? storage.gate_cold->nb[2] : 0;
+                cl.up_stride = storage.up_cold ? storage.up_cold->nb[2] : 0;
+                cl.gate_type = storage.gate_cold ? storage.gate_cold->type : GGML_TYPE_Q4_K;
+                cl.up_type = storage.up_cold ? storage.up_cold->type : GGML_TYPE_Q4_K;
+                cl.gate_scale = L.ffn_gate_exps_s;
+                cl.up_scale = L.ffn_up_exps_s;
+            }
+            cl.down_data = storage.down_cold ? storage.down_cold->data : nullptr;
+            cl.down_stride = storage.down_cold ? storage.down_cold->nb[2] : 0;
+            cl.down_type = storage.down_cold ? storage.down_cold->type : GGML_TYPE_Q4_K;
+            cl.down_scale = L.ffn_down_exps_s;
+        }
+        std::fprintf(stderr, "[pipelined] cold FFN: fused kernel (bypasses ggml graph)\n");
+    }
+
+    out.cold_in_zeroed = true;
     return true;
 }
 
@@ -160,7 +229,7 @@ bool pipelined_decode_one_token(
     ggml_backend_t backend,
     const TargetWeights & w,
     TargetCache & cache,
-    Qwen35MoeHybridStorage & hybrid,
+    MoeHybridStorage & hybrid,
     int kv_pos,
     int kq_stride_pad,
     PipelinedDecodeTelemetry * tel) {
@@ -171,16 +240,7 @@ bool pipelined_decode_one_token(
     ggml_backend_t cpu_be = hybrid.cpu_backend;
 
     if (tel) {
-        tel->total_us = 0;
-        tel->prefn_graph_build_us = 0;
-        tel->prefn_compute_us = 0;
-        tel->routing_readback_us = 0;
-        tel->ffn_us = 0;
-        tel->ffn_allhot_us = 0;
-        tel->ffn_mixed_us = 0;
-        tel->allhot_layers = 0;
-        tel->mixed_layers = 0;
-        tel->total_layers = 0;
+        *tel = PipelinedDecodeTelemetry{};
     }
 
     const auto tok_t0 = PipelineClock::now();
@@ -190,6 +250,158 @@ bool pipelined_decode_one_token(
         const bool is_attn = (((il + 1) % state.full_attention_interval) == 0);
         const auto prefn_build_t0 = PipelineClock::now();
 
+        // ══════════════════════════════════════════════════════════════════════
+        // ROUTED FFN FAST PATH (StreamMoE-inspired async pipeline):
+        // prefn(async) → sync → routing readback → rffn(async) + cold(CPU parallel) → combine(async)
+        // Handles both all-hot and mixed layers. Cold compute runs on CPU
+        // in parallel with GPU rffn — zero overhead when all experts are hot.
+        // ══════════════════════════════════════════════════════════════════════
+        if (!is_attn && n_expert_used <= 8 && n_embd <= 8192  // bounds of the fixed [8] stack arrays and zeros[8192] below; otherwise use the split path
+            && state.cached_prefn[(size_t)il].valid()
+            && state.cached_routed_ffn[(size_t)il].valid()) {
+
+            auto & cpg = state.cached_prefn[(size_t)il];
+            auto & rffn = state.cached_routed_ffn[(size_t)il];
+
+            // 1. Copy act_cur → prefn input (GPU→GPU async)
+            ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed);
+
+            if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now());
+
+            // 2. Run prefn graph (DeltaNet + router)
+            const auto prefn_compute_t0 = PipelineClock::now();
+            ggml_backend_graph_compute_async(backend, cpg.gf);
+
+            // 3. Sync to read routing decisions from prefn output
+            const auto sync_t0 = PipelineClock::now();
+            ggml_backend_synchronize(backend);
+            const auto sync_t1 = PipelineClock::now();
+
+            // Read routing decisions from GPU
+            int32_t global_ids[8];
+            float   router_weights[8];
+            ggml_backend_tensor_get(cpg.moe_selected, global_ids, 0,
+                                    sizeof(int32_t) * (size_t)n_expert_used);
+            ggml_backend_tensor_get(cpg.moe_weights, router_weights, 0,
+                                    sizeof(float) * (size_t)n_expert_used);
+            const auto readback_t1 = PipelineClock::now();
+
+            // CPU-side local ID mapping + cold partition (trivial: 8 lookups)
+            auto & storage = hybrid.layers[(size_t)il];
+            int32_t local_ids[8];
+            float   masked_weights[8];
+            int32_t cold_ids[8];
+            float   cold_weights[8];
+            int n_cold = 0;
+            int layer_cold_hits = 0;
+            for (int i = 0; i < n_expert_used; ++i) {
+                int32_t gid = global_ids[i];
+                int32_t lid = (gid >= 0 && gid < (int)storage.hot_local_by_global.size())
+                              ? storage.hot_local_by_global[(size_t)gid] : -1;
+                if (lid >= 0) {
+                    local_ids[i] = lid;
+                    masked_weights[i] = router_weights[i];
+                } else {
+                    local_ids[i] = 0;       // safe: maps to expert 0 (result zeroed by weight)
+                    masked_weights[i] = 0.0f; // cold expert contributes nothing to hot path
+                    layer_cold_hits++;
+                    // Record for cold compute
+                    if (state.cold_ffn_compute && gid >= 0 && gid < (int)storage.cold_local_by_global.size()) {
+                        int32_t cold_local = storage.cold_local_by_global[(size_t)gid];
+                        if (cold_local >= 0) {
+                            cold_ids[n_cold] = cold_local;
+                            cold_weights[n_cold] = router_weights[i];
+                            n_cold++;
+                        }
+                    }
+                }
+            }
+            const bool has_cold_selected = (n_cold > 0);
+            const auto remap_t1 = PipelineClock::now();
+
+            // D2H ffn_post for cold compute (GPU already synced, data is ready)
+            if (has_cold_selected) {
+                ggml_backend_tensor_get(cpg.ffn_post, state.ffn_post_host_buf.data(), 0,
+                                        sizeof(float) * (size_t)n_embd);
+            }
+
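+            // Upload pre-computed inputs to rffn graph (H→D async on compute stream). NOTE(review): local_ids/masked_weights are stack arrays that go out of scope at `continue`; presumably the backend stages host data before the call returns — confirm.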
+            ggml_backend_tensor_set_async(backend, rffn.ids, local_ids, 0,
+                                          sizeof(int32_t) * (size_t)n_expert_used);
+            ggml_backend_tensor_set_async(backend, rffn.weights, masked_weights, 0,
+                                          sizeof(float) * (size_t)n_expert_used);
+            // Copy ffn_post from prefn output → rffn input (GPU→GPU, already synced)
+            ggml_backend_tensor_copy_async(backend, backend, cpg.ffn_post, rffn.inp);
+
+            // 4. Copy residual to combine input (async)
+            ggml_backend_tensor_copy_async(backend, backend, cpg.ffn_residual, state.gpu_state.combine.residual_in);
+
+            // 5. Run routed FFN graph (async — mul_mat_id + shared expert)
+            ggml_backend_graph_compute_async(backend, rffn.gf);
+
+            // 6. Cold compute on CPU (parallel with GPU rffn above)
+            const auto cold_t0 = PipelineClock::now();
+            if (has_cold_selected) {
+                state.cold_ffn_compute->compute(
+                    state.cold_ffn_layers[(size_t)il],
+                    state.ffn_post_host_buf.data(),
+                    cold_ids, cold_weights, n_cold,
+                    n_embd, w.n_ff_exp,
+                    state.cold_output_buf.data());
+            }
+            if (tel && has_cold_selected) tel->cold_compute_us += pipe_elapsed_us(cold_t0, PipelineClock::now());
+
+            // 7. Copy FFN output → combine.hot_in (async, ordered after FFN on GPU stream)
+            ggml_backend_tensor_copy_async(backend, backend, rffn.output, state.gpu_state.combine.hot_in);
+
+            // 8. Upload cold result or ensure cold_in is zero
+            if (has_cold_selected) {
+                ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in,
+                                              state.cold_output_buf.data(), 0,
+                                              sizeof(float) * (size_t)n_embd);
+                state.cold_in_zeroed = false;
+            } else if (!state.cold_in_zeroed) {
+                static float zeros[8192] = {};
+                ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0,
+                                               sizeof(float) * (size_t)n_embd);
+                state.cold_in_zeroed = true;
+            }
+
+            // 9. Run combine graph (async — adds residual + hot + cold)
+            ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf);
+
+            // 10. Copy combine output → act_cur for next layer (async)
+            ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur);
+
+            if (tel) {
+                tel->prefn_compute_us += pipe_elapsed_us(prefn_compute_t0, PipelineClock::now());
+                tel->routed_prefn_us += pipe_elapsed_us(prefn_compute_t0, sync_t0);
+                tel->routed_sync_us += pipe_elapsed_us(sync_t0, sync_t1);
+                tel->routed_readback_us += pipe_elapsed_us(sync_t1, readback_t1);
+                tel->routed_cpu_remap_us += pipe_elapsed_us(readback_t1, remap_t1);
+                tel->routed_ffn_dispatch_us += pipe_elapsed_us(remap_t1, PipelineClock::now());
+                tel->routed_cold_expert_hits += layer_cold_hits;
+                tel->routed_total_expert_slots += n_expert_used;
+                if (has_cold_selected) {
+                    tel->mixed_layers++;
+                } else {
+                    tel->allhot_layers++;
+                }
+                tel->total_layers++;
+                tel->routed_ffn_layers++;
+            }
+            continue;
+        }
+
+        // ══════════════════════════════════════════════════════════════════════
+        // SPLIT PATH: separate prefn + routing readback + FFN (original logic)
+        // Used for attention layers or layers without routed FFN graph.
+        // ══════════════════════════════════════════════════════════════════════
+
+        // Sync any pending async work before entering the split path
+        // (split path needs synchronous access to GPU data)
+        ggml_backend_synchronize(backend);
+
         ggml_tensor * ffn_post_gpu = nullptr;
         ggml_tensor * ffn_residual_gpu = nullptr;
         ggml_tensor * moe_selected_tensor = nullptr;
@@ -203,11 +415,11 @@ bool pipelined_decode_one_token(
                 step_graph_destroy(dyn_sg);
                 return false;
             }
-            // Copy act_cur to graph input (GPU→GPU)
-            ggml_backend_tensor_copy(state.gpu_state.act_cur, dyn_sg.inp_embed);
+            // Copy act_cur to graph input (GPU→GPU) — async on compute stream
+            ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, dyn_sg.inp_embed);
             if (dyn_sg.positions) {
                 int32_t pos4[4] = {kv_pos, kv_pos, kv_pos, 0};
-                ggml_backend_tensor_set(dyn_sg.positions, pos4, 0, sizeof(pos4));
+                ggml_backend_tensor_set_async(backend, dyn_sg.positions, pos4, 0, sizeof(pos4));
             }
 
             if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now());
@@ -228,7 +440,8 @@ bool pipelined_decode_one_token(
         } else {
             // DeltaNet layer: reuse cached graph, just update input
             auto & cpg = state.cached_prefn[(size_t)il];
-            ggml_backend_tensor_copy(state.gpu_state.act_cur, cpg.inp_embed);
+            // Async copy on compute stream — ordered before next graph_compute
+            ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed);
 
             if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now());
 
@@ -244,19 +457,122 @@ bool pipelined_decode_one_token(
         }
 
         // ── Read routing decisions (tiny: 32 + 32 bytes) ──
+        // Use get_async + single sync instead of 2 separate sync tensor_gets.
+        // After graph_compute (SYNC) above, data is ready — just need D2H copy.
         const auto routing_t0 = PipelineClock::now();
         if (!moe_selected_tensor || !moe_weights_tensor) return false;
-        ggml_backend_tensor_get(moe_selected_tensor, state.routing_ids_buf.data(), 0,
+        ggml_backend_tensor_get_async(backend, moe_selected_tensor, state.routing_ids_buf.data(), 0,
                                 sizeof(int32_t) * (size_t)n_expert_used);
-        ggml_backend_tensor_get(moe_weights_tensor, state.routing_weights_buf.data(), 0,
+        ggml_backend_tensor_get_async(backend, moe_weights_tensor, state.routing_weights_buf.data(), 0,
                                 sizeof(float) * (size_t)n_expert_used);
+        ggml_backend_synchronize(backend);
         if (tel) tel->routing_readback_us += pipe_elapsed_us(routing_t0, PipelineClock::now());
 
-        // ── FFN: hot/cold partition + compute ──
+        // ── FFN: use routed FFN (cold-masking) if graph available, else split path ──
         const auto ffn_t0 = PipelineClock::now();
         auto & storage = hybrid.layers[(size_t)il];
         const auto & L = w.layers[(size_t)il];
 
+        // Try routed FFN path for this layer (works for attention layers too)
+        // Handles cold experts inline — cold compute runs parallel with GPU rffn.
+        auto & rffn = state.cached_routed_ffn[(size_t)il];
+        if (rffn.valid()) {
+            // Partition hot/cold: remap global→local, zero cold weights for hot path
+            int32_t local_ids[8];
+            float   masked_weights[8];
+            int32_t cold_ids[8];
+            float   cold_weights[8];
+            int n_cold = 0;
+            int layer_cold_hits = 0;
+            for (int i = 0; i < n_expert_used; ++i) {
+                int32_t gid = state.routing_ids_buf[(size_t)i];
+                int32_t lid = (gid >= 0 && gid < (int)storage.hot_local_by_global.size())
+                              ? storage.hot_local_by_global[(size_t)gid] : -1;
+                if (lid >= 0) {
+                    local_ids[i] = lid;
+                    masked_weights[i] = state.routing_weights_buf[(size_t)i];
+                } else {
+                    local_ids[i] = 0;
+                    masked_weights[i] = 0.0f;
+                    layer_cold_hits++;
+                    if (state.cold_ffn_compute && gid >= 0 && gid < (int)storage.cold_local_by_global.size()) {
+                        int32_t cold_local = storage.cold_local_by_global[(size_t)gid];
+                        if (cold_local >= 0) {
+                            cold_ids[n_cold] = cold_local;
+                            cold_weights[n_cold] = state.routing_weights_buf[(size_t)i];
+                            n_cold++;
+                        }
+                    }
+                }
+            }
+            const bool has_cold_selected = (n_cold > 0);
+
+            // D2H ffn_post for cold compute (GPU already synced after routing readback)
+            if (has_cold_selected) {
+                ggml_backend_tensor_get(ffn_post_gpu, state.ffn_post_host_buf.data(), 0,
+                                        sizeof(float) * (size_t)n_embd);
+            }
+
+            // Upload IDs + weights, copy inputs, dispatch rffn (all async)
+            ggml_backend_tensor_set_async(backend, rffn.ids, local_ids, 0,
+                                          sizeof(int32_t) * (size_t)n_expert_used);
+            ggml_backend_tensor_set_async(backend, rffn.weights, masked_weights, 0,
+                                          sizeof(float) * (size_t)n_expert_used);
+            ggml_backend_tensor_copy_async(backend, backend, ffn_post_gpu, rffn.inp);
+            ggml_backend_tensor_copy_async(backend, backend, ffn_residual_gpu, state.gpu_state.combine.residual_in);
+            ggml_backend_graph_compute_async(backend, rffn.gf);
+
+            // Cold compute on CPU (parallel with GPU rffn above)
+            const auto cold_t0 = PipelineClock::now();
+            if (has_cold_selected) {
+                state.cold_ffn_compute->compute(
+                    state.cold_ffn_layers[(size_t)il],
+                    state.ffn_post_host_buf.data(),
+                    cold_ids, cold_weights, n_cold,
+                    n_embd, w.n_ff_exp,
+                    state.cold_output_buf.data());
+            }
+            if (tel && has_cold_selected) tel->cold_compute_us += pipe_elapsed_us(cold_t0, PipelineClock::now());
+
+            // Copy hot result → combine input (async, ordered after rffn on GPU stream)
+            ggml_backend_tensor_copy_async(backend, backend, rffn.output, state.gpu_state.combine.hot_in);
+
+            // Upload cold result or ensure cold_in is zero
+            if (has_cold_selected) {
+                ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in,
+                                              state.cold_output_buf.data(), 0,
+                                              sizeof(float) * (size_t)n_embd);
+                state.cold_in_zeroed = false;
+            } else if (!state.cold_in_zeroed) {
+                static float zeros[8192] = {};
+                ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0,
+                                               sizeof(float) * (size_t)n_embd);
+                state.cold_in_zeroed = true;
+            }
+
+            ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf);
+            ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur);
+
+            if (tel) {
+                uint64_t ffn_layer_us = pipe_elapsed_us(ffn_t0, PipelineClock::now());
+                tel->ffn_us += ffn_layer_us;
+                tel->total_layers++;
+                tel->routed_ffn_layers++;
+                if (has_cold_selected) {
+                    tel->mixed_layers++;
+                    tel->ffn_mixed_us += ffn_layer_us;
+                } else {
+                    tel->allhot_layers++;
+                    tel->ffn_allhot_us += ffn_layer_us;
+                }
+                tel->routed_cold_expert_hits += layer_cold_hits;
+                tel->routed_total_expert_slots += n_expert_used;
+            }
+            continue;
+        }
+
+        // ── Fallback: full split path (no routed FFN graph for this layer) ──
+
         // Partition into hot/cold (fast: just a lookup table scan, ~8 iterations)
         int n_hot = 0, n_cold = 0;
         int32_t hot_ids[8], cold_ids[8];
@@ -287,99 +603,125 @@ bool pipelined_decode_one_token(
         // ── Read ffn_post to CPU NOW (before hot launch) ──
         // The routing readback above already synced the GPU stream, so ffn_post
         // is guaranteed ready. Reading it here avoids a sync AFTER hot launch.
+        const auto tensor_io_t0 = PipelineClock::now();
         if (has_cold) {
             ggml_backend_tensor_get(ffn_post_gpu, state.ffn_post_host_buf.data(), 0,
                                     sizeof(float) * (size_t)n_embd);
         }
+        if (tel) tel->ffn_post_get_us += pipe_elapsed_us(tensor_io_t0, PipelineClock::now());
+
 
-        // ── GPU→GPU: copy residual to combine input ──
-        ggml_backend_tensor_copy(ffn_residual_gpu, state.gpu_state.combine.residual_in);
+        // ── GPU→GPU: copy residual to combine input (async on compute stream) ──
+        ggml_backend_tensor_copy_async(backend, backend, ffn_residual_gpu, state.gpu_state.combine.residual_in);
 
         // ── Prepare + launch hot graph (async — returns immediately) ──
         bool hot_async_launched = false;
         if (has_hot || has_shared) {
             if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) {
+                const auto hbuild_t0 = PipelineClock::now();
                 build_cached_hot_graph(storage.hot_graph, backend,
                                        storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
                                        L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                       L, n_embd, w.n_ff_exp, n_hot);
+                                       make_moe_layer_desc(L), n_embd, w.n_ff_exp, n_hot);
+                if (tel) { tel->hot_graph_build_us += pipe_elapsed_us(hbuild_t0, PipelineClock::now()); tel->hot_graph_rebuilds++; }
             }
             if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) {
-                ggml_backend_tensor_copy(ffn_post_gpu, storage.hot_graph.inp);
+                // All setup on compute stream — no per-op cudaStreamSynchronize
+                ggml_backend_tensor_copy_async(backend, backend, ffn_post_gpu, storage.hot_graph.inp);
                 if (storage.hot_graph.ids && has_hot) {
-                    ggml_backend_tensor_set(storage.hot_graph.ids, hot_ids, 0,
-                                            sizeof(int32_t) * (size_t)n_hot);
+                    ggml_backend_tensor_set_async(backend, storage.hot_graph.ids, hot_ids, 0,
+                                                  sizeof(int32_t) * (size_t)n_hot);
                 }
                 if (storage.hot_graph.weights && has_hot) {
-                    ggml_backend_tensor_set(storage.hot_graph.weights, hot_weights, 0,
-                                            sizeof(float) * (size_t)n_hot);
+                    ggml_backend_tensor_set_async(backend, storage.hot_graph.weights, hot_weights, 0,
+                                                  sizeof(float) * (size_t)n_hot);
                 }
-                // Launch hot GPU async — no sync until combine
+                // Launch hot GPU async — queued after copies on same stream
                 ggml_backend_graph_compute_async(backend, storage.hot_graph.gf);
                 hot_async_launched = true;
             }
         }
+        if (tel) tel->tensor_io_us += pipe_elapsed_us(tensor_io_t0, PipelineClock::now());
 
         // ── Cold path: runs on CPU IN PARALLEL with hot GPU ──
+        const auto cold_t0 = PipelineClock::now();
         if (has_cold) {
             // ffn_post already read above (before hot launch) — no GPU sync here!
-            if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) {
-                build_cached_cold_graph(storage.cold_graph, cpu_be,
-                                        storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
-                                        L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
-                                        n_embd, w.n_ff_exp, n_cold);
-            }
-            if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) {
-                ggml_backend_tensor_set(storage.cold_graph.inp, state.ffn_post_host_buf.data(), 0,
-                                        sizeof(float) * (size_t)n_embd);
-                ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids, 0,
-                                        sizeof(int32_t) * (size_t)n_cold);
-                ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights, 0,
-                                        sizeof(float) * (size_t)n_cold);
-                // CPU cold compute — hot GPU runs concurrently on its stream
-                auto cst = ggml_backend_graph_compute(cpu_be, storage.cold_graph.gf);
-                if (cst != GGML_STATUS_SUCCESS) {
+            const auto cold_compute_t0 = PipelineClock::now();
+            if (state.cold_ffn_compute) {
+                // Fused kernel: bypass ggml graph dispatch entirely
+                state.cold_ffn_compute->compute(
+                    state.cold_ffn_layers[(size_t)il],
+                    state.ffn_post_host_buf.data(),
+                    cold_ids,
+                    cold_weights,
+                    n_cold, n_embd, w.n_ff_exp,
+                    state.cold_output_buf.data());
+            } else {
+                // Fallback: ggml cold graph (legacy path)
+                if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) {
+                    build_cached_cold_graph(storage.cold_graph, cpu_be,
+                                            storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
+                                            L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
+                                            n_embd, w.n_ff_exp, n_cold);
+                }
+                if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) {
+                    ggml_backend_tensor_set(storage.cold_graph.inp, state.ffn_post_host_buf.data(), 0,
+                                            sizeof(float) * (size_t)n_embd);
+                    ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids, 0,
+                                            sizeof(int32_t) * (size_t)n_cold);
+                    ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights, 0,
+                                            sizeof(float) * (size_t)n_cold);
+                    auto cst = ggml_backend_graph_compute(cpu_be, storage.cold_graph.gf);
+                    if (cst != GGML_STATUS_SUCCESS) {
+                        if (hot_async_launched) ggml_backend_synchronize(backend);
+                        return false;
+                    }
+                } else {
                     if (hot_async_launched) ggml_backend_synchronize(backend);
                     return false;
                 }
-            } else {
-                if (hot_async_launched) ggml_backend_synchronize(backend);
-                return false;
             }
+            if (tel) tel->cold_compute_us += pipe_elapsed_us(cold_compute_t0, PipelineClock::now());
         }
+        if (tel) tel->cold_cpu_us += pipe_elapsed_us(cold_t0, PipelineClock::now());
 
-        // ── Sync hot GPU (only now — after cold CPU finished) ──
+        // ── Combine: queue on compute stream (no explicit sync needed) ──
+        const auto combine_t0 = PipelineClock::now();
         if (hot_async_launched) {
-            ggml_backend_synchronize(backend);
-            ggml_backend_tensor_copy(storage.hot_graph.output, state.gpu_state.combine.hot_in);
+            ggml_backend_tensor_copy_async(backend, backend, storage.hot_graph.output, state.gpu_state.combine.hot_in);
         } else {
             float zeros[8192];
             std::memset(zeros, 0, sizeof(float) * (size_t)n_embd);
-            ggml_backend_tensor_set(state.gpu_state.combine.hot_in, zeros, 0,
-                                    sizeof(float) * (size_t)n_embd);
+            ggml_backend_tensor_set_async(backend, state.gpu_state.combine.hot_in, zeros, 0,
+                                           sizeof(float) * (size_t)n_embd);
         }
 
-        // ── Upload cold result (or keep zeros) ──
         if (has_cold) {
-            ggml_backend_tensor_get(storage.cold_graph.output, state.ffn_post_host_buf.data(), 0,
-                                    sizeof(float) * (size_t)n_embd);
-            ggml_backend_tensor_set(state.gpu_state.combine.cold_in, state.ffn_post_host_buf.data(), 0,
-                                    sizeof(float) * (size_t)n_embd);
+            const float * cold_result = state.cold_ffn_compute
+                ? state.cold_output_buf.data()
+                : nullptr;
+            if (!cold_result) {
+                // Legacy path: read from ggml tensor
+                ggml_backend_tensor_get(storage.cold_graph.output, state.ffn_post_host_buf.data(), 0,
+                                        sizeof(float) * (size_t)n_embd);
+                cold_result = state.ffn_post_host_buf.data();
+            }
+            ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, cold_result, 0,
+                                           sizeof(float) * (size_t)n_embd);
             state.cold_in_zeroed = false;
         } else if (!state.cold_in_zeroed) {
             float zeros[8192];
             std::memset(zeros, 0, sizeof(float) * (size_t)n_embd);
-            ggml_backend_tensor_set(state.gpu_state.combine.cold_in, zeros, 0,
-                                    sizeof(float) * (size_t)n_embd);
+            ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0,
+                                           sizeof(float) * (size_t)n_embd);
             state.cold_in_zeroed = true;
         }
 
-        // ── Combine: output = residual + hot + cold ──
-        auto cst = ggml_backend_graph_compute(backend, state.gpu_state.combine.gf);
-        if (cst != GGML_STATUS_SUCCESS) return false;
+        ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf);
 
-        // ── Copy combine output to persistent act_cur ──
-        ggml_backend_tensor_copy(state.gpu_state.combine.output, state.gpu_state.act_cur);
+        ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur);
+        if (tel) tel->combine_overhead_us += pipe_elapsed_us(combine_t0, PipelineClock::now());
 
         const auto ffn_t1 = PipelineClock::now();
         if (tel) {
@@ -398,8 +740,17 @@ bool pipelined_decode_one_token(
 
     step_graph_destroy(dyn_sg);
 
+    // Sync the compute stream before returning — caller needs act_cur on CPU.
+    // All async ops (combine + copy) from the last layer must complete.
+    const auto final_sync_t0 = PipelineClock::now();
+    ggml_backend_synchronize(backend);
+
     if (tel) {
+        tel->routed_final_sync_us = pipe_elapsed_us(final_sync_t0, PipelineClock::now());
         tel->total_us = pipe_elapsed_us(tok_t0, PipelineClock::now());
+        // GPU idle = time in tensor I/O + routing readback + combine overhead
+        // (these are all periods where GPU compute stream is idle)
+        tel->gpu_idle_us = tel->tensor_io_us + tel->routing_readback_us + tel->combine_overhead_us;
     }
     return true;
 }
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.h b/server/src/qwen35moe/qwen35moe_pipelined_decode.h
index bc03264eb..330d72f7b 100644
--- a/server/src/qwen35moe/qwen35moe_pipelined_decode.h
+++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.h
@@ -10,13 +10,15 @@
 #pragma once
 
 #include "internal.h"
-#include "qwen35moe_hybrid_ffn_eval.h"
-#include "qwen35moe_hybrid_storage.h"
+#include "../common/moe_hybrid_ffn_eval.h"
+#include "../common/moe_hybrid_storage.h"
+#include "../common/cold_ffn_compute.h"
 #include "graph_builders.h"
 
 #include "ggml-backend.h"
 
 #include <cstdint>
+#include <memory>
 #include <vector>
 
 namespace dflash::common {
@@ -65,9 +67,30 @@ struct PipelinedDecodeTelemetry {
     uint64_t ffn_us = 0;
     uint64_t ffn_allhot_us = 0;
     uint64_t ffn_mixed_us = 0;
+    // GPU utilization diagnosis: time the GPU is idle waiting for CPU
+    uint64_t gpu_idle_us = 0;       // set at end of token: tensor_io + routing_readback + combine_overhead
+    uint64_t tensor_io_us = 0;      // split-path hot setup: ffn_post D2H + GPU copies + hot graph (re)build + launch
+    uint64_t combine_overhead_us = 0; // hot_in/cold_in uploads + combine graph dispatch + copy to act_cur
+    uint64_t cold_cpu_us = 0;       // cold path total (fused kernel, or legacy graph build + upload + CPU compute)
+    uint64_t cold_compute_us = 0;   // cold FFN time on layers with a cold expert (fused kernel or legacy ggml graph)
+    uint64_t hot_graph_build_us = 0; // hot graph rebuild (cached graph invalid or n_hot changed)
+    uint64_t ffn_post_get_us = 0;   // D2H readback of ffn_post for cold path
+    uint64_t sync_wait_us = 0;      // time in ggml_backend_synchronize (waiting for GPU)
     int allhot_layers = 0;
     int mixed_layers = 0;
     int total_layers = 0;
+    int hot_graph_rebuilds = 0;     // count of hot graph rebuilds
+    int routed_ffn_layers = 0;      // layers handled by routed FFN (async pipeline)
+
+    // ── Routed path breakdown (StreamMoE fast path) ──
+    uint64_t routed_prefn_us = 0;       // prefn graph dispatch, up to the start of the sync wait
+    uint64_t routed_sync_us = 0;        // GPU sync stall waiting for prefn
+    uint64_t routed_readback_us = 0;    // D2H readback of routing IDs + weights
+    uint64_t routed_cpu_remap_us = 0;   // CPU-side local ID mapping + cold masking
+    uint64_t routed_ffn_dispatch_us = 0;// FFN graph dispatch + combine (async)
+    uint64_t routed_final_sync_us = 0;  // final sync at end of token (overwritten per token, not summed)
+    int routed_cold_expert_hits = 0;    // experts masked (weight=0) in routed path
+    int routed_total_expert_slots = 0;  // total expert slots processed
 };
 
 // State for pipelined decode: holds cached DeltaNet pre-FFN graphs +
@@ -79,6 +102,10 @@ struct PipelinedDecodeState {
     // Attention layers (every full_attention_interval-th) are nullptr (rebuilt each token)
     std::vector<CachedPrefnGraph> cached_prefn;
 
+    // Cached routed FFN graphs (StreamMoE-inspired), indexed by layer. A layer whose
+    // entry is not valid() takes the fallback split hot/cold path instead.
+    std::vector<CachedFfnGraph> cached_routed_ffn;
+
     // Persistent host buffers (avoid per-layer allocation)
     std::vector<int32_t> routing_ids_buf;
     std::vector<float> routing_weights_buf;
@@ -87,6 +114,16 @@ struct PipelinedDecodeState {
     // Persistent zero buffer for cold_in (set once at init)
     bool cold_in_zeroed = false;
 
+    // When true (default), cold experts are computed on the cold backend
+    // (CPU/Halo) instead of being dropped via cold-masking. Exact but slower.
+    // Set DFLASH_DROP_COLD=1 to disable (fast but lossy).
+    bool cold_compute = true;
+
+    // Fused cold FFN compute (bypasses ggml graph dispatch overhead)
+    std::unique_ptr<ColdFfnCompute> cold_ffn_compute;
+    std::vector<ColdFfnLayer> cold_ffn_layers;   // per-layer cold weight metadata
+    std::vector<float> cold_output_buf;           // [n_embd] scratch for cold FFN output
+
     // Tracking
     int n_layer = 0;
     int n_embd = 0;
@@ -100,10 +137,15 @@ struct PipelinedDecodeState {
     PipelinedDecodeState(PipelinedDecodeState && o) noexcept
         : gpu_state(std::move(o.gpu_state)),
           cached_prefn(std::move(o.cached_prefn)),
+          cached_routed_ffn(std::move(o.cached_routed_ffn)),
           routing_ids_buf(std::move(o.routing_ids_buf)),
           routing_weights_buf(std::move(o.routing_weights_buf)),
           ffn_post_host_buf(std::move(o.ffn_post_host_buf)),
           cold_in_zeroed(o.cold_in_zeroed),
+          cold_compute(o.cold_compute),
+          cold_ffn_compute(std::move(o.cold_ffn_compute)),
+          cold_ffn_layers(std::move(o.cold_ffn_layers)),
+          cold_output_buf(std::move(o.cold_output_buf)),
           n_layer(o.n_layer), n_embd(o.n_embd),
           n_expert_used(o.n_expert_used),
           full_attention_interval(o.full_attention_interval) {
@@ -114,10 +156,15 @@ struct PipelinedDecodeState {
             destroy();
             gpu_state = std::move(o.gpu_state);
             cached_prefn = std::move(o.cached_prefn);
+            cached_routed_ffn = std::move(o.cached_routed_ffn);
             routing_ids_buf = std::move(o.routing_ids_buf);
             routing_weights_buf = std::move(o.routing_weights_buf);
             ffn_post_host_buf = std::move(o.ffn_post_host_buf);
             cold_in_zeroed = o.cold_in_zeroed;
+            cold_compute = o.cold_compute;
+            cold_ffn_compute = std::move(o.cold_ffn_compute);
+            cold_ffn_layers = std::move(o.cold_ffn_layers);
+            cold_output_buf = std::move(o.cold_output_buf);
             n_layer = o.n_layer; n_embd = o.n_embd;
             n_expert_used = o.n_expert_used;
             full_attention_interval = o.full_attention_interval;
@@ -136,6 +183,7 @@ bool init_pipelined_decode_state(
     ggml_backend_t backend,
     const TargetWeights & w,
     TargetCache & cache,
+    MoeHybridStorage & hybrid,
     int kv_start,           // initial KV position for graph caching
     int kq_stride_pad);
 
@@ -147,7 +195,7 @@ bool pipelined_decode_one_token(
     ggml_backend_t backend,
     const TargetWeights & w,
     TargetCache & cache,
-    Qwen35MoeHybridStorage & hybrid,
+    MoeHybridStorage & hybrid,
     int kv_pos,              // current KV position
     int kq_stride_pad,
     PipelinedDecodeTelemetry * telemetry = nullptr);
diff --git a/server/src/qwen35moe/qwen35moe_swap_manager.h b/server/src/qwen35moe/qwen35moe_swap_manager.h
deleted file mode 100644
index 1acacedd9..000000000
--- a/server/src/qwen35moe/qwen35moe_swap_manager.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Request-boundary swap planning for qwen35moe expert placement.
-
-#pragma once
-
-#include "qwen35moe_expert_placement.h"
-
-#include <cstdint>
-#include <string>
-#include <vector>
-
-namespace dflash::common {
-
-struct Qwen35MoeSwapAction {
-    int layer_idx = -1;
-    int evict_expert = -1;
-    int promote_expert = -1;
-    uint64_t evict_count = 0;
-    uint64_t promote_count = 0;
-};
-
-struct Qwen35MoeSwapPlan {
-    Qwen35MoeExpertPlacement next_placement;
-    std::vector<Qwen35MoeSwapAction> actions;
-};
-
-struct Qwen35MoeSwapPolicy {
-    int max_swaps_total = 0;          // 0 = no swaps
-    uint64_t min_promote_gain = 1;    // promoted expert count must exceed evicted by at least this amount
-};
-
-bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current,
-                               const Qwen35MoeRoutingStats & stats,
-                               const Qwen35MoeSwapPolicy & policy,
-                               Qwen35MoeSwapPlan & out,
-                               std::string * err = nullptr);
-
-}  // namespace dflash::common
diff --git a/server/test/test_dflash.cpp b/server/test/test_dflash.cpp
index f2544f5b0..76685ac60 100644
--- a/server/test/test_dflash.cpp
+++ b/server/test/test_dflash.cpp
@@ -28,9 +28,11 @@
                             // qwen35 + DFlash + DDTree pipeline below.
 #include "qwen35_daemon.h"   // arch dispatch - single-GPU qwen35 daemon mode
 #include "qwen35moe_daemon.h"
-#include "qwen35moe_hybrid_ffn_eval.h"
-#include "qwen35moe_hybrid_storage.h"
-#include "qwen35moe_expert_placement.h"
+#include "../src/common/moe_hybrid_ffn_eval.h"
+#include "../src/common/moe_hybrid_storage.h"
+#include "../src/common/moe_hybrid_placement.h"
+#include "../src/common/moe_hybrid_routing_stats.h"
+#include "../src/common/moe_hybrid_types_impl.h"
 #include "qwen35moe_pipelined_decode.h"
 #include "qwen35_layer_split.h" // multi-GPU layer-split daemon args
 #include "layer_split_daemon_loop.h" // extracted layer-split daemon loop
@@ -1575,7 +1577,7 @@ int main(int argc, char ** argv) {
             // Build placement stats that mark the router's default picks as COLD
             // by giving them zero count (so they're placed cold), while giving
             // all other experts count=1 (so the hottest N are picked as hot).
-            Qwen35MoeRoutingStats biased_stats;
+            MoeHybridRoutingStats biased_stats;
             biased_stats.n_layer = w.n_layer;
             biased_stats.n_expert = w.n_expert;
             biased_stats.n_expert_used = w.n_expert_used;
@@ -1592,9 +1594,9 @@ int main(int argc, char ** argv) {
             }
             std::printf("  forced %d default-route experts to cold for worst-case bench\n", forced_cold_count);
 
-            Qwen35MoeExpertPlacement placement;
+            MoeHybridPlacement placement;
             std::string place_err;
-            if (!Qwen35MoeExpertPlacement::build_from_stats(
+            if (!MoeHybridPlacement::build_from_stats(
                     biased_stats, total_hot_budget,
                     /*min_hot_per_layer=*/std::min(w.n_expert_used, w.n_expert),
                     placement, &place_err)) {
@@ -1651,10 +1653,15 @@ int main(int argc, char ** argv) {
                             layer_file_data[(size_t)il].gate_up_exps = find_tensor_data("ffn_gate_up_exps");
                         }
 
-                        auto hybrid = std::make_shared<Qwen35MoeHybridStorage>();
+                        auto hybrid = std::make_shared<MoeHybridStorage>();
                         std::string hybrid_err;
-                        if (!build_qwen35moe_hybrid_storage_from_file(
-                                w, backend, placement, layer_file_data, *hybrid, &hybrid_err)) {
+                        MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(w);
+                        std::vector<MoeLayerDesc> hybrid_descs((size_t)w.n_layer);
+                        for (int il = 0; il < w.n_layer; ++il) {
+                            hybrid_descs[(size_t)il] = make_moe_layer_desc(w.layers[(size_t)il]);
+                        }
+                        if (!build_moe_hybrid_storage_from_file(
+                                hybrid_cfg, backend, placement, hybrid_descs, layer_file_data, *hybrid, &hybrid_err)) {
                             std::fprintf(stderr, "[time-breakdown] hybrid storage build failed: %s\n",
                                          hybrid_err.c_str());
                         } else {
@@ -1712,8 +1719,8 @@ int main(int argc, char ** argv) {
                                                                     sizeof(int32_t) * selected.size());
                                             ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0,
                                                                     sizeof(float) * weights_buf.size());
-                                            eval_qwen35moe_hybrid_ffn_gpu_resident(
-                                                backend, w, w.layers[(size_t)il],
+                                            eval_moe_hybrid_ffn_gpu_resident(
+                                                backend, make_moe_hybrid_config(w), make_moe_layer_desc(w.layers[(size_t)il]),
                                                 hybrid->layers[(size_t)il], cpu_be,
                                                 layer_sg.ffn_post, layer_sg.ffn_residual,
                                                 gpu_state,
@@ -1750,8 +1757,8 @@ int main(int argc, char ** argv) {
                                         ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0,
                                                                 sizeof(float) * weights_buf.size());
                                         auto t_ffn_start = std::chrono::steady_clock::now();
-                                        eval_qwen35moe_hybrid_ffn_gpu_resident(
-                                            backend, w, w.layers[(size_t)il],
+                                        eval_moe_hybrid_ffn_gpu_resident(
+                                            backend, make_moe_hybrid_config(w), make_moe_layer_desc(w.layers[(size_t)il]),
                                             hybrid->layers[(size_t)il], cpu_be,
                                             layer_sg.ffn_post, layer_sg.ffn_residual,
                                             gpu_state,
@@ -1811,7 +1818,7 @@ int main(int argc, char ** argv) {
 
                                 // Init pipelined state
                                 PipelinedDecodeState pipe_state;
-                                if (!init_pipelined_decode_state(pipe_state, backend, w, cache, ctx, g_kq_stride_pad)) {
+                                if (!init_pipelined_decode_state(pipe_state, backend, w, cache, *hybrid, ctx, g_kq_stride_pad)) {
                                     std::fprintf(stderr, "[time-breakdown] pipelined state init failed\n");
                                     continue;
                                 }
@@ -1882,24 +1889,24 @@ int main(int argc, char ** argv) {
                             std::printf("\n[time-breakdown] === PIPELINED realistic placement (uniform hot/cold) ===\n");
                             {
                                 // Build uniform placement: hottest N experts per layer based on uniform counts
-                                Qwen35MoeRoutingStats uniform_stats;
+                                MoeHybridRoutingStats uniform_stats;
                                 uniform_stats.n_layer = w.n_layer;
                                 uniform_stats.n_expert = w.n_expert;
                                 uniform_stats.n_expert_used = w.n_expert_used;
                                 uniform_stats.counts.assign((size_t)w.n_layer * (size_t)w.n_expert, 1);
                                 uniform_stats.layer_totals.assign((size_t)w.n_layer, (uint64_t)w.n_expert);
 
-                                Qwen35MoeExpertPlacement uniform_placement;
+                                MoeHybridPlacement uniform_placement;
                                 std::string up_err;
-                                if (Qwen35MoeExpertPlacement::build_from_stats(
+                                if (MoeHybridPlacement::build_from_stats(
                                         uniform_stats, total_hot_budget,
                                         std::min(w.n_expert_used, w.n_expert),
                                         uniform_placement, &up_err)) {
 
                                     // Rebuild hybrid storage with uniform placement
-                                    auto hybrid_realistic = std::make_shared<Qwen35MoeHybridStorage>();
-                                    if (build_qwen35moe_hybrid_storage_from_file(
-                                            w, backend, uniform_placement, layer_file_data,
+                                    auto hybrid_realistic = std::make_shared<MoeHybridStorage>();
+                                    if (build_moe_hybrid_storage_from_file(
+                                            hybrid_cfg, backend, uniform_placement, hybrid_descs, layer_file_data,
                                             *hybrid_realistic, &up_err)) {
                                         std::printf("  uniform placement: hot=%d cold=%d — expect ~60%% all-hot layers\n",
                                                     uniform_placement.total_hot,
@@ -1908,7 +1915,7 @@ int main(int argc, char ** argv) {
                                         int ctx = 2000;
                                         if (ctx + 1 <= max_ctx) {
                                             PipelinedDecodeState pipe_state;
-                                            if (init_pipelined_decode_state(pipe_state, backend, w, cache, ctx, g_kq_stride_pad)) {
+                                            if (init_pipelined_decode_state(pipe_state, backend, w, cache, *hybrid_realistic, ctx, g_kq_stride_pad)) {
                                                 std::vector<float> act_cur_pipe((size_t)hidden, 0.0f);
                                                 ggml_backend_tensor_set(pipe_state.gpu_state.act_cur, act_cur_pipe.data(), 0,
                                                                         sizeof(float) * (size_t)hidden);
diff --git a/server/test/test_qwen35moe_expert_placement.cpp b/server/test/test_qwen35moe_expert_placement.cpp
index c263d87df..05e1b92dd 100644
--- a/server/test/test_qwen35moe_expert_placement.cpp
+++ b/server/test/test_qwen35moe_expert_placement.cpp
@@ -1,4 +1,5 @@
-#include "qwen35moe_expert_placement.h"
+#include "../src/common/moe_hybrid_placement.h"
+#include "../src/common/moe_hybrid_routing_stats.h"
 
 #include <cstdio>
 #include <cstdlib>
@@ -15,7 +16,7 @@ static void expect(bool cond, const char * msg) {
 }
 
 int main() {
-    Qwen35MoeRoutingStats stats;
+    MoeHybridRoutingStats stats;
     stats.n_layer = 2;
     stats.n_expert = 4;
     stats.n_expert_used = 2;
@@ -25,11 +26,11 @@ int main() {
     };
     stats.layer_totals = {280, 103};
 
-    Qwen35MoeExpertPlacement placement;
+    MoeHybridPlacement placement;
     std::string err;
-    expect(Qwen35MoeExpertPlacement::build_from_stats(stats, /*total_hot_budget=*/4,
-                                                      /*min_hot_per_layer=*/1,
-                                                      placement, &err), err.c_str());
+    expect(MoeHybridPlacement::build_from_stats(stats, /*total_hot_budget=*/4,
+                                                /*min_hot_per_layer=*/1,
+                                                placement, &err), err.c_str());
     expect(placement.n_layer == 2, "n_layer");
     expect(placement.hot_counts.size() == 2, "hot_counts size");
     expect(placement.hot_counts[0] == 3, "layer0 got extra hot slots");
@@ -41,17 +42,12 @@ int main() {
     expect(placement.is_hot(1, 0), "layer1 expert0 hot");
     expect(!placement.is_hot(1, 1), "layer1 expert1 cold");
 
-    TargetWeights w;
-    w.is_moe = true;
-    w.n_layer = 2;
-    w.n_expert = 4;
-    w.n_expert_used = 2;
-    expect(placement.matches(w), "placement matches weights");
-
-    const auto tmp = std::filesystem::temp_directory_path() / "qwen35moe-placement-test.json";
-    expect(placement.save_json(tmp.string(), &err), err.c_str());
-    Qwen35MoeExpertPlacement loaded;
-    expect(Qwen35MoeExpertPlacement::load_json(tmp.string(), loaded, &err), err.c_str());
+    expect(placement.matches(2, 4, 2), "placement matches dims");
+
+    const auto tmp = std::filesystem::temp_directory_path() / "moe-hybrid-placement-test.json";
+    expect(placement.save_json(tmp.string(), "moe_hybrid", &err), err.c_str());
+    MoeHybridPlacement loaded;
+    expect(MoeHybridPlacement::load_json(tmp.string(), loaded, &err), err.c_str());
     expect(loaded.hot_counts == placement.hot_counts, "loaded hot counts");
     expect(loaded.hot_expert_ids == placement.hot_expert_ids, "loaded hot ids");
     std::filesystem::remove(tmp);
diff --git a/server/test/test_qwen35moe_routing_stats.cpp b/server/test/test_qwen35moe_routing_stats.cpp
index ea14ac630..cfdf87f07 100644
--- a/server/test/test_qwen35moe_routing_stats.cpp
+++ b/server/test/test_qwen35moe_routing_stats.cpp
@@ -1,4 +1,4 @@
-#include "qwen35moe_routing_stats.h"
+#include "../src/common/moe_hybrid_routing_stats.h"
 
 #include <cstdio>
 #include <cstdlib>
@@ -16,15 +16,9 @@ static void expect(bool cond, const char * msg) {
 }
 
 int main() {
-    TargetWeights w;
-    w.is_moe = true;
-    w.n_layer = 2;
-    w.n_expert = 4;
-    w.n_expert_used = 2;
-
-    Qwen35MoeRoutingStats stats;
-    expect(stats.init_from_weights(w), "init_from_weights");
-    expect(stats.matches(w), "matches after init");
+    MoeHybridRoutingStats stats;
+    expect(stats.init(2, 4, 2), "init");
+    expect(stats.matches(2, 4, 2), "matches after init");
 
     const int32_t layer0_a[] = {2, 1};
     const int32_t layer0_b[] = {2, 3};
@@ -49,13 +43,13 @@ int main() {
     expect(hot0.size() == 2, "hot size");
     expect(hot0[0] == 2, "hot leader");
 
-    const auto tmp = std::filesystem::temp_directory_path() / "qwen35moe-routing-stats-test.csv";
+    const auto tmp = std::filesystem::temp_directory_path() / "moe-hybrid-routing-stats-test.csv";
     std::string err;
     expect(stats.save_csv(tmp.string(), &err), err.c_str());
 
-    Qwen35MoeRoutingStats loaded;
-    expect(Qwen35MoeRoutingStats::load_csv(tmp.string(), loaded, &err), err.c_str());
-    expect(loaded.matches(w), "loaded matches weights");
+    MoeHybridRoutingStats loaded;
+    expect(MoeHybridRoutingStats::load_csv(tmp.string(), loaded, &err), err.c_str());
+    expect(loaded.matches(2, 4, 2), "loaded matches dims");
     expect(loaded.count(0, 2) == 2, "loaded count");
     expect(loaded.layer_totals[1] == 2, "loaded total");
 
diff --git a/server/test/test_qwen35moe_swap_manager.cpp b/server/test/test_qwen35moe_swap_manager.cpp
index 6962da496..cbbf5e6ee 100644
--- a/server/test/test_qwen35moe_swap_manager.cpp
+++ b/server/test/test_qwen35moe_swap_manager.cpp
@@ -1,4 +1,6 @@
-#include "qwen35moe_swap_manager.h"
+#include "../src/common/moe_hybrid_swap_manager.h"
+#include "../src/common/moe_hybrid_placement.h"
+#include "../src/common/moe_hybrid_routing_stats.h"
 
 #include <cstdio>
 #include <cstdlib>
@@ -13,7 +15,7 @@ static void expect(bool cond, const char * msg) {
 }
 
 int main() {
-    Qwen35MoeRoutingStats stats;
+    MoeHybridRoutingStats stats;
     stats.n_layer = 2;
     stats.n_expert = 4;
     stats.n_expert_used = 2;
@@ -23,7 +25,7 @@ int main() {
     };
     stats.layer_totals = {205, 194};
 
-    Qwen35MoeExpertPlacement placement;
+    MoeHybridPlacement placement;
     placement.n_layer = 2;
     placement.n_expert = 4;
     placement.n_expert_used = 2;
@@ -31,13 +33,13 @@ int main() {
     placement.hot_counts = {1, 1};
     placement.hot_expert_ids = {{1}, {0}};
 
-    Qwen35MoeSwapPolicy policy;
+    MoeHybridSwapPolicy policy;
     policy.max_swaps_total = 1;
     policy.min_promote_gain = 5;
 
-    Qwen35MoeSwapPlan plan;
+    MoeHybridSwapPlan plan;
     std::string err;
-    expect(build_qwen35moe_swap_plan(placement, stats, policy, plan, &err), err.c_str());
+    expect(build_moe_hybrid_swap_plan(placement, stats, policy, plan, &err), err.c_str());
     expect(plan.actions.size() == 1, "one swap planned");
     expect(plan.actions[0].layer_idx == 0, "layer0 swap");
     expect(plan.actions[0].evict_expert == 1, "evict weakest hot");