diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index ed148cb67..2766b1824 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -251,12 +251,14 @@ add_library(dflash_common STATIC
src/qwen35moe/qwen35moe_ffn.cpp
src/qwen35moe/qwen35moe_backend.cpp
src/qwen35moe/qwen35moe_daemon.cpp
- src/qwen35moe/qwen35moe_routing_stats.cpp
- src/qwen35moe/qwen35moe_expert_placement.cpp
- src/qwen35moe/qwen35moe_hybrid_storage.cpp
- src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
src/qwen35moe/qwen35moe_pipelined_decode.cpp
- src/qwen35moe/qwen35moe_swap_manager.cpp
+ # ── Common MoE hybrid infrastructure ──
+ src/common/moe_hybrid_placement.cpp
+ src/common/moe_hybrid_routing_stats.cpp
+ src/common/moe_hybrid_storage.cpp
+ src/common/moe_hybrid_ffn_eval.cpp
+ src/common/cold_ffn_cpu.cpp
+ src/common/moe_hybrid_swap_manager.cpp
src/qwen35/layer_split_forward.cpp
src/qwen35/layer_split_daemon.cpp
src/qwen35/qwen35_backend.cpp
@@ -523,6 +525,11 @@ target_link_libraries(dflash_common
ggml-base
nlohmann_json::nlohmann_json
)
+# OpenMP for parallel cold FFN kernel (saturate memory bandwidth).
+find_package(OpenMP)
+if(OpenMP_CXX_FOUND)
+ target_link_libraries(dflash_common PRIVATE OpenMP::OpenMP_CXX)
+endif()
if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
target_link_libraries(dflash_common PRIVATE hip::host)
endif()
@@ -638,32 +645,32 @@ if(DFLASH27B_TESTS)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target_laguna.cpp")
add_executable(smoke_load_target_laguna test/smoke_load_target_laguna.cpp)
target_include_directories(smoke_load_target_laguna PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
- target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ggml-cuda)
+ target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_laguna_forward.cpp")
add_executable(smoke_laguna_forward test/smoke_laguna_forward.cpp)
target_include_directories(smoke_laguna_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
- target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ggml-cuda)
+ target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_ttft.cpp")
add_executable(bench_laguna_ttft test/bench_laguna_ttft.cpp)
target_include_directories(bench_laguna_ttft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
- target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ggml-cuda)
+ target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_pflash.cpp")
add_executable(bench_laguna_pflash test/bench_laguna_pflash.cpp)
target_include_directories(bench_laguna_pflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
- target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ggml-cuda)
+ target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_generate.cpp")
add_executable(bench_laguna_generate test/bench_laguna_generate.cpp)
target_include_directories(bench_laguna_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
- target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ggml-cuda)
+ target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_laguna_daemon.cpp")
add_executable(test_laguna_daemon test/test_laguna_daemon.cpp)
target_include_directories(test_laguna_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
- target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ggml-cuda)
+ target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_forward.cpp")
add_executable(smoke_target_forward test/smoke_target_forward.cpp)
diff --git a/server/README.md b/server/README.md
index ac0d57e97..63c240cb3 100644
--- a/server/README.md
+++ b/server/README.md
@@ -231,6 +231,59 @@ Full `bench_llm.py` suite on Qwen3.6-27B UD-Q4_K_XL, 10 prompts, n_gen=256, RTX
| Math500 | 35.13 | 69.77 | 5.15 | **1.99×** |
| **Mean** | 34.97 | 69.19 | 5.17 | **1.98×** |
+## Hybrid MoE (hot/cold expert split)
+
+For MoE targets whose experts don't fit in VRAM, dflash can split experts
+across the GPU and CPU: the most-used (**hot**) experts stay resident on the
+GPU, the rest (**cold**) live in host RAM and are evaluated on the CPU,
+overlapped with the GPU hot path. This trades some decode/prefill speed for
+VRAM headroom, and is computed automatically at load by a dynamic-placement
+pass. It applies to both MoE arches: `qwen35`/`qwen36` and `laguna`.
+
+**When it triggers.** If the experts fit in the available VRAM budget, all
+experts load to GPU (no split, fastest path). Otherwise placement keeps as
+many hot experts as the budget allows and routes the rest to CPU. You can also
+shrink the budget manually to force a split (e.g. to free VRAM for a longer
+context or a larger target).
+
+### Budget knobs
+
+| Env | Arch | Effect |
+|---|---|---|
+| `DFLASH_EXPERT_BUDGET_MB N` | both | Cap hot-expert VRAM to `N` MB. Applies only when `N` is below the auto-computed budget; experts beyond it go cold (CPU). |
+| `DFLASH_EXPERT_BUDGET_PCT P` | laguna | Keep hot experts to `P`% (`0
` = `LAGUNA` or `QWEN35MOE`:
+
+| Env | Effect |
+|---|---|
+| `DFLASH__HOTNESS ` | Expert frequency/hotness file driving which experts are placed hot. |
+| `DFLASH__TELEMETRY 1` | Log per-layer hot/cold FFN timing telemetry. |
+| `DFLASH__SWAP_MAX N` | Max hot/cold promotions per request boundary (runtime re-placement); `0` disables swapping. |
+| `DFLASH__SWAP_MIN_GAIN N` | Min observed-frequency gain before a cold expert is promoted to hot. |
+| `DFLASH__NEXT_PLACEMENT_OUT ` | Dump the placement chosen this run (warm-start the hotness file next time). |
+| `DFLASH_QWEN35MOE_RUNTIME_STATS_OUT ` | (qwen only) Dump runtime routing-frequency stats. |
+
+### Example
+
+```bash
+# Force ~8 GB of hot experts on GPU; the rest run cold on the CPU.
+DFLASH_EXPERT_BUDGET_MB=8000 ./build/dflash_server models/laguna-xs2-Q4_K_M.gguf --port 8000
+# Startup log e.g.: "dynamic placement result: 4717 hot experts, 5267 cold experts"
+```
+
+### Caveat: reduced-stack prefill chunking
+
+When a layer's hot-expert stack is **reduced** (i.e. a genuine split), the
+ggml-cuda MMQ `mul_mat_id` kernel illegal-accesses for certain batch sizes on
+**both** HIP/gfx1151 and CUDA/sm_86. As a guard, hybrid-split **prefill** is
+sliced into ≤4-token sub-batches (forcing the stable MMVQ path); **decode**
+(single-token) is unaffected. This costs some prefill throughput on split
+layers and is removed once the kernel is fixed upstream.
+
## Laguna-XS.2 target (experimental, Poolside MoE)
[Poolside Laguna-XS.2](https://huggingface.co/poolside/Laguna-XS.2) is a 40-layer MoE LLM with 256 experts (top-8) plus an always-on shared expert, per-layer head counts `[48,64,64,64]×10`, and a per-layer SWA pattern (window 512). It is **architecturally distinct from `qwen35`**, so dflash adds a hand-rolled CUDA forward path (`Path A`, ggml-only — no libllama dependency) that mirrors the qwen35 stack. The Q4_K_M GGUF lands at 18.77 GiB on a single RTX 3090; tok_embd stays CPU-only (110 MiB) to keep the GPU budget under 24 GB.
diff --git a/server/src/common/cold_ffn_compute.h b/server/src/common/cold_ffn_compute.h
new file mode 100644
index 000000000..f1d512dec
--- /dev/null
+++ b/server/src/common/cold_ffn_compute.h
@@ -0,0 +1,59 @@
+// ColdFfnCompute: Direct compute interface for cold expert FFN.
+// Bypasses ggml graph dispatch overhead. Shared-memory model (CPU/Halo).
+#pragma once
+
+#include "ggml.h"
+#include
+#include
+
+namespace dflash::common {
+
+// Per-layer cold weight metadata — raw pointers into shared memory.
+struct ColdFfnLayer {
+ const void * gate_up_data = nullptr; // fused [n_cold, n_ff*2, n_embd] quantized
+ const void * gate_data = nullptr; // separate gate [n_cold, n_ff, n_embd]
+ const void * up_data = nullptr; // separate up [n_cold, n_ff, n_embd]
+ const void * down_data = nullptr; // [n_cold, n_embd, n_ff] quantized
+
+ size_t gate_up_stride = 0; // bytes between experts in gate_up tensor
+ size_t gate_stride = 0; // bytes between experts in gate tensor
+ size_t up_stride = 0; // bytes between experts in up tensor
+ size_t down_stride = 0; // bytes between experts in down tensor
+
+ ggml_type gate_up_type = GGML_TYPE_Q4_K; // type for fused gate_up
+ ggml_type gate_type = GGML_TYPE_Q4_K; // type for separate gate
+ ggml_type up_type = GGML_TYPE_Q4_K; // type for separate up
+ ggml_type down_type = GGML_TYPE_Q4_K; // type for down projection
+ bool fused_gate_up = false; // true if gate+up are fused
+
+ // Scale factors (applied after matmul). 1.0 = no scaling.
+ float gate_up_scale = 1.0f;
+ float gate_scale = 1.0f;
+ float up_scale = 1.0f;
+ float down_scale = 1.0f;
+};
+
+// Abstract compute interface. Implementations: CPU (now), Halo (future).
+struct ColdFfnCompute {
+ virtual ~ColdFfnCompute() = default;
+
+ // Compute cold expert FFN contributions and accumulate into output.
+ // input: [n_embd] F32 — post-norm hidden state
+ // ids: [n_cold] I32 — local cold expert indices
+ // weights: [n_cold] F32 — routing weights for each cold expert
+ // output: [n_embd] F32 — accumulated weighted expert outputs (zeroed by callee)
+ virtual void compute(
+ const ColdFfnLayer & layer,
+ const float * input,
+ const int32_t * ids,
+ const float * weights,
+ int n_cold,
+ int n_embd,
+ int n_ff,
+ float * output) = 0;
+};
+
+// Create CPU-based fused cold FFN compute.
+std::unique_ptr make_cpu_cold_ffn_compute(int n_ff_max);
+
+} // namespace dflash::common
diff --git a/server/src/common/cold_ffn_cpu.cpp b/server/src/common/cold_ffn_cpu.cpp
new file mode 100644
index 000000000..75a93ff86
--- /dev/null
+++ b/server/src/common/cold_ffn_cpu.cpp
@@ -0,0 +1,190 @@
+// CpuColdFfnCompute: Fused cold expert FFN using ggml vec_dot primitives.
+// Bypasses ggml graph dispatch overhead. Uses OpenMP to saturate memory bandwidth.
+// Memory-bandwidth bound at ~45 GB/s DDR4. Target: 15.7ms → ~3ms/token.
+
+#include "cold_ffn_compute.h"
+#include "ggml-cpu.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef _OPENMP
+#include
+#endif
+
+namespace dflash::common {
+
+class CpuColdFfnCompute : public ColdFfnCompute {
+ int n_ff_max_;
+ int n_threads_;
+
+ // Per-thread scratch buffers for parallel down matmul
+ struct ThreadBuf {
+ std::vector scratch; // [n_ff * 2] gate_up result + SwiGLU
+ std::vector mid_conv; // down input converted to vec_dot_type
+ };
+ std::vector thread_bufs_;
+ std::vector inp_conv_; // input converted (shared, read-only during matmul)
+
+public:
+ explicit CpuColdFfnCompute(int n_ff_max, int n_threads = 0) : n_ff_max_(n_ff_max) {
+#ifdef _OPENMP
+ if (n_threads <= 0) {
+ const char * env = std::getenv("DFLASH_COLD_THREADS");
+ n_threads = env ? std::atoi(env) : 0;
+ }
+ n_threads_ = n_threads > 0 ? n_threads : std::min(omp_get_max_threads(), 8);
+#else
+ n_threads_ = 1;
+#endif
+ fprintf(stderr, "[cold_ffn] using %d threads\n", n_threads_);
+ thread_bufs_.resize(n_threads_);
+ for (auto & tb : thread_bufs_) {
+ tb.scratch.resize((size_t)n_ff_max * 2);
+ }
+ }
+
+ void compute(
+ const ColdFfnLayer & layer,
+ const float * input,
+ const int32_t * ids,
+ const float * weights,
+ int n_cold,
+ int n_embd,
+ int n_ff,
+ float * output) override {
+
+ if (n_cold <= 0) return;
+ std::memset(output, 0, sizeof(float) * (size_t)n_embd);
+
+ // Gate/up phase type traits
+ const ggml_type gu_type = layer.fused_gate_up ? layer.gate_up_type : layer.gate_type;
+ const auto * gu_cpu_traits = ggml_get_type_traits_cpu(gu_type);
+ const auto gu_vec_dot = gu_cpu_traits->vec_dot;
+ const auto gu_vec_dot_type = gu_cpu_traits->vec_dot_type;
+ const auto gu_from_float = ggml_get_type_traits_cpu(gu_vec_dot_type)->from_float;
+
+ // Down phase type traits (may differ from gate/up)
+ const auto * dn_cpu_traits = ggml_get_type_traits_cpu(layer.down_type);
+ const auto dn_vec_dot = dn_cpu_traits->vec_dot;
+ const auto dn_vec_dot_type = dn_cpu_traits->vec_dot_type;
+ const auto dn_from_float = ggml_get_type_traits_cpu(dn_vec_dot_type)->from_float;
+
+ const size_t inp_row_size = ggml_row_size(gu_vec_dot_type, n_embd);
+ const size_t mid_row_size = ggml_row_size(dn_vec_dot_type, n_ff);
+ const size_t gu_weight_row = ggml_row_size(gu_type, n_embd);
+ const size_t dn_weight_row = ggml_row_size(layer.down_type, n_ff);
+
+ // For separate gate/up — up may have a different type than gate
+ size_t up_weight_row = gu_weight_row;
+ const ggml_type up_type_actual = layer.fused_gate_up ? gu_type : layer.up_type;
+ (void)up_type_actual;
+ ggml_vec_dot_t up_vec_dot = gu_vec_dot;
+ ggml_type up_vdt = gu_vec_dot_type;
+ if (!layer.fused_gate_up && layer.up_type != layer.gate_type) {
+ const auto * up_cpu_traits = ggml_get_type_traits_cpu(layer.up_type);
+ up_vec_dot = up_cpu_traits->vec_dot;
+ up_vdt = up_cpu_traits->vec_dot_type;
+ up_weight_row = ggml_row_size(layer.up_type, n_embd);
+ }
+
+ // Ensure input conversion buffer is large enough
+ if (inp_conv_.size() < inp_row_size) inp_conv_.resize(inp_row_size);
+ // Ensure per-thread mid_conv buffers
+ for (auto & tb : thread_bufs_) {
+ if (tb.mid_conv.size() < mid_row_size) tb.mid_conv.resize(mid_row_size);
+ }
+
+ // Convert input for up if different type
+ std::vector inp_conv_up;
+ if (!layer.fused_gate_up && up_vdt != gu_vec_dot_type) {
+ size_t up_inp_row_size = ggml_row_size(up_vdt, n_embd);
+ inp_conv_up.resize(up_inp_row_size);
+ auto up_from_float = ggml_get_type_traits_cpu(up_vdt)->from_float;
+ up_from_float(input, inp_conv_up.data(), n_embd);
+ }
+
+ // Convert input to gate's vec_dot format once
+ gu_from_float(input, inp_conv_.data(), n_embd);
+
+ for (int e = 0; e < n_cold; ++e) {
+ const int32_t eid = ids[e];
+ const float w = weights[e];
+ if (w == 0.0f) continue;
+
+ // Use thread 0's scratch for gate_up (serial phase)
+ float * scratch = thread_bufs_[0].scratch.data();
+
+ // ── Phase 1: gate_up matmul → scratch[0..n_ff*2) ──
+ // Parallel over rows (each row is independent, reading shared inp_conv_)
+ if (layer.fused_gate_up) {
+ const char * expert = (const char *)layer.gate_up_data + (size_t)eid * layer.gate_up_stride;
+ const int n_rows = n_ff * 2;
+#ifdef _OPENMP
+ #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+ for (int row = 0; row < n_rows; ++row) {
+ const void * row_ptr = expert + (size_t)row * gu_weight_row;
+ gu_vec_dot(n_embd, &scratch[row], 0, row_ptr, 0, inp_conv_.data(), 0, 1);
+ }
+ if (layer.gate_up_scale != 1.0f) {
+ for (int i = 0; i < n_rows; ++i) scratch[i] *= layer.gate_up_scale;
+ }
+ } else {
+ const char * gate_expert = (const char *)layer.gate_data + (size_t)eid * layer.gate_stride;
+ const char * up_expert = (const char *)layer.up_data + (size_t)eid * layer.up_stride;
+ const uint8_t * up_inp = (!inp_conv_up.empty()) ? inp_conv_up.data() : inp_conv_.data();
+#ifdef _OPENMP
+ #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+ for (int row = 0; row < n_ff; ++row) {
+ const void * gp = gate_expert + (size_t)row * gu_weight_row;
+ gu_vec_dot(n_embd, &scratch[row], 0, gp, 0, inp_conv_.data(), 0, 1);
+ const void * up = up_expert + (size_t)row * up_weight_row;
+ up_vec_dot(n_embd, &scratch[n_ff + row], 0, up, 0, up_inp, 0, 1);
+ }
+ if (layer.gate_scale != 1.0f) {
+ for (int i = 0; i < n_ff; ++i) scratch[i] *= layer.gate_scale;
+ }
+ if (layer.up_scale != 1.0f) {
+ for (int i = 0; i < n_ff; ++i) scratch[n_ff + i] *= layer.up_scale;
+ }
+ }
+
+ // ── Phase 2: SwiGLU activation ──
+ for (int i = 0; i < n_ff; ++i) {
+ const float gate = scratch[i];
+ const float up = scratch[n_ff + i];
+ scratch[i] = (gate / (1.0f + expf(-gate))) * up;
+ }
+
+ // ── Phase 3: down matmul → output (weighted accumulate) ──
+ // Convert SwiGLU result to down's vec_dot format (serial, small)
+ dn_from_float(scratch, thread_bufs_[0].mid_conv.data(), n_ff);
+ const uint8_t * mid_conv_data = thread_bufs_[0].mid_conv.data();
+
+ const char * down_expert = (const char *)layer.down_data + (size_t)eid * layer.down_stride;
+ const float scale = w * layer.down_scale;
+
+ // Parallel down matmul — each thread accumulates its own output rows
+#ifdef _OPENMP
+ #pragma omp parallel for num_threads(n_threads_) schedule(static)
+#endif
+ for (int row = 0; row < n_embd; ++row) {
+ float val;
+ const void * row_ptr = down_expert + (size_t)row * dn_weight_row;
+ dn_vec_dot(n_ff, &val, 0, row_ptr, 0, mid_conv_data, 0, 1);
+ output[row] += scale * val;
+ }
+ }
+ }
+};
+
+std::unique_ptr make_cpu_cold_ffn_compute(int n_ff_max) {
+ return std::make_unique(n_ff_max);
+}
+
+} // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp b/server/src/common/moe_hybrid_ffn_eval.cpp
similarity index 79%
rename from server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
rename to server/src/common/moe_hybrid_ffn_eval.cpp
index 6e44b2d3f..4ded0bc5a 100644
--- a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
+++ b/server/src/common/moe_hybrid_ffn_eval.cpp
@@ -1,214 +1,32 @@
-#include "qwen35moe_hybrid_ffn_eval.h"
-
-#include "qwen35_ops.h"
+#include "moe_hybrid_ffn_eval.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
+#include
#include
#include
+#include
+#include
namespace dflash::common {
namespace {
+// NVFP4 scale2: if weight has a per-tensor scale, multiply the matmul result
+// by that scale. No-op when scale==1.0f (non-NVFP4 models).
+inline ggml_tensor * apply_scale2(ggml_context * ctx, ggml_tensor * mm_result, float scale) {
+ if (scale == 1.0f) return mm_result;
+ return ggml_scale(ctx, mm_result, scale);
+}
+
using HybridClock = std::chrono::steady_clock;
static uint64_t elapsed_us(HybridClock::time_point start, HybridClock::time_point end) {
return (uint64_t) std::chrono::duration_cast(end - start).count();
}
-} // namespace (anon helpers)
-
-// Build a cached FFN graph for the hot+shared path with a fixed n_hot.
-bool build_cached_hot_graph(
- CachedFfnGraph & out,
- ggml_backend_t backend,
- ggml_tensor * gate_tensor,
- ggml_tensor * up_tensor,
- ggml_tensor * down_tensor,
- ggml_tensor * gate_up_tensor,
- float gate_scale,
- float up_scale,
- float down_scale,
- float gate_up_scale,
- const TargetLayer & L,
- int n_embd,
- int n_ff_exp,
- int n_hot) {
-
- out.free();
- out.n_hot = n_hot;
-
- ggml_init_params ip{};
- ip.mem_size = 48 * 1024 * 1024;
- ip.mem_buffer = nullptr;
- ip.no_alloc = true;
- out.ctx = ggml_init(ip);
- if (!out.ctx) return false;
-
- out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1);
- ggml_set_input(out.inp);
-
- ggml_tensor * routed = nullptr;
- if (n_hot > 0) {
- out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_hot, 1);
- ggml_set_input(out.ids);
- out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_hot, 1);
- ggml_set_input(out.weights);
-
- ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1);
- ggml_tensor * gu = nullptr;
- if (gate_up_tensor) {
- ggml_tensor * gate_up_e = apply_scale2(out.ctx,
- ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale);
- ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e,
- n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
- gate_up_e->nb[1], gate_up_e->nb[2], 0);
- ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e,
- n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
- gate_up_e->nb[1], gate_up_e->nb[2],
- (size_t)n_ff_exp * ggml_element_size(gate_up_e));
- gate_e = ggml_cont(out.ctx, gate_e);
- up_e = ggml_cont(out.ctx, up_e);
- gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
- } else {
- ggml_tensor * gate_e = apply_scale2(out.ctx,
- ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale);
- ggml_tensor * up_e = apply_scale2(out.ctx,
- ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale);
- gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
- }
-
- ggml_tensor * experts = apply_scale2(out.ctx,
- ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale);
- ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_hot, 1);
- experts = ggml_mul(out.ctx, experts, w_view);
-
- for (int i = 0; i < n_hot; ++i) {
- ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2],
- (size_t)i * experts->nb[1]);
- routed = (i == 0) ? slice : ggml_add(out.ctx, routed, slice);
- }
- }
-
- ggml_tensor * shared = nullptr;
- const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
- if (has_shared) {
- ggml_tensor * sh_gate = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_gate_shexp, out.inp), L.ffn_gate_shexp_s);
- ggml_tensor * sh_up = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_up_shexp, out.inp), L.ffn_up_shexp_s);
- ggml_tensor * sh_gu = ggml_swiglu_split(out.ctx, sh_gate, sh_up);
- shared = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
- if (L.ffn_gate_inp_shexp) {
- ggml_tensor * shared_gate = apply_scale2(out.ctx,
- ggml_mul_mat(out.ctx, L.ffn_gate_inp_shexp, out.inp), L.ffn_gate_inp_shexp_s);
- shared_gate = ggml_sigmoid(out.ctx, shared_gate);
- shared = ggml_mul(out.ctx, shared, shared_gate);
- }
- }
-
- if (routed && shared) {
- out.output = ggml_add(out.ctx, routed, shared);
- } else if (routed) {
- out.output = routed;
- } else {
- out.output = shared;
- }
- if (!out.output) { out.free(); return false; }
-
- out.gf = ggml_new_graph_custom(out.ctx, 2048, false);
- ggml_set_output(out.output);
- ggml_build_forward_expand(out.gf, out.output);
- out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
- if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {
- out.free();
- return false;
- }
- return true;
-}
-
-// Build a cached FFN graph for the cold (CPU) routed subset.
-bool build_cached_cold_graph(
- CachedFfnGraph & out,
- ggml_backend_t cpu_backend,
- ggml_tensor * gate_tensor,
- ggml_tensor * up_tensor,
- ggml_tensor * down_tensor,
- ggml_tensor * gate_up_tensor,
- float gate_scale,
- float up_scale,
- float down_scale,
- float gate_up_scale,
- int n_embd,
- int n_ff_exp,
- int n_cold) {
-
- out.free();
- out.n_hot = n_cold; // reuse field for "n experts in this graph"
-
- ggml_init_params ip{};
- ip.mem_size = 32 * 1024 * 1024;
- ip.mem_buffer = nullptr;
- ip.no_alloc = true;
- out.ctx = ggml_init(ip);
- if (!out.ctx) return false;
-
- out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1);
- ggml_set_input(out.inp);
- out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_cold, 1);
- ggml_set_input(out.ids);
- out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_cold, 1);
- ggml_set_input(out.weights);
-
- ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1);
- ggml_tensor * gu = nullptr;
- if (gate_up_tensor) {
- ggml_tensor * gate_up_e = apply_scale2(out.ctx,
- ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale);
- ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e,
- n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
- gate_up_e->nb[1], gate_up_e->nb[2], 0);
- ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e,
- n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
- gate_up_e->nb[1], gate_up_e->nb[2],
- (size_t)n_ff_exp * ggml_element_size(gate_up_e));
- gate_e = ggml_cont(out.ctx, gate_e);
- up_e = ggml_cont(out.ctx, up_e);
- gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
- } else {
- ggml_tensor * gate_e = apply_scale2(out.ctx,
- ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale);
- ggml_tensor * up_e = apply_scale2(out.ctx,
- ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale);
- gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
- }
-
- ggml_tensor * experts = apply_scale2(out.ctx,
- ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale);
- ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_cold, 1);
- experts = ggml_mul(out.ctx, experts, w_view);
-
- out.output = nullptr;
- for (int i = 0; i < n_cold; ++i) {
- ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2],
- (size_t)i * experts->nb[1]);
- out.output = (i == 0) ? slice : ggml_add(out.ctx, out.output, slice);
- }
- if (!out.output) { out.free(); return false; }
-
- out.gf = ggml_new_graph_custom(out.ctx, 1024, false);
- ggml_set_output(out.output);
- ggml_build_forward_expand(out.gf, out.output);
- out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
- if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {
- out.free();
- return false;
- }
- return true;
-}
-
-namespace {
-
+// Run routed expert subset on a given backend (GPU or CPU).
static bool run_routed_subset(ggml_backend_t backend,
ggml_tensor * gate_tensor,
ggml_tensor * up_tensor,
@@ -331,14 +149,15 @@ static bool run_routed_subset(ggml_backend_t backend,
return true;
}
+// Shared expert FFN on GPU.
static bool run_shared_ffn_gpu(ggml_backend_t backend,
- const TargetLayer & L,
+ const MoeLayerDesc & desc,
int n_embd,
const float * cur_host,
std::vector & out,
std::string * err) {
out.assign((size_t)n_embd, 0.0f);
- if (!L.ffn_up_shexp || !L.ffn_gate_shexp || !L.ffn_down_shexp) {
+ if (!desc.ffn_up_shexp || !desc.ffn_gate_shexp || !desc.ffn_down_shexp) {
return true;
}
@@ -355,13 +174,13 @@ static bool run_shared_ffn_gpu(ggml_backend_t backend,
ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 1);
ggml_set_input(inp);
- ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s);
- ggml_tensor * sh_up = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s);
+ ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s);
+ ggml_tensor * sh_up = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s);
ggml_tensor * sh_gu = ggml_swiglu_split(ctx, sh_gate, sh_up);
- ggml_tensor * shared = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
- if (L.ffn_gate_inp_shexp) {
+ ggml_tensor * shared = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+ if (desc.ffn_gate_inp_shexp) {
ggml_tensor * shared_gate = apply_scale2(ctx,
- ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s);
+ ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
shared_gate = ggml_sigmoid(ctx, shared_gate);
shared = ggml_mul(ctx, shared, shared_gate);
}
@@ -391,7 +210,6 @@ static bool run_shared_ffn_gpu(ggml_backend_t backend,
}
// Fused hot routed + shared FFN in a single GPU graph compute.
-// Eliminates one graph compute phase per layer vs separate run_routed_subset + run_shared_ffn_gpu.
static bool run_hot_and_shared_ffn_gpu(
ggml_backend_t backend,
ggml_tensor * gate_tensor,
@@ -402,7 +220,7 @@ static bool run_hot_and_shared_ffn_gpu(
float up_scale,
float down_scale,
float gate_up_scale,
- const TargetLayer & L,
+ const MoeLayerDesc & desc,
int n_embd,
int n_ff_exp,
const float * cur_host,
@@ -415,7 +233,7 @@ static bool run_hot_and_shared_ffn_gpu(
out.assign((size_t)n_embd, 0.0f);
const bool has_hot = (n_hot > 0);
- const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
+ const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
if (!has_hot && !has_shared) return true;
ggml_init_params ip{};
@@ -478,13 +296,13 @@ static bool run_hot_and_shared_ffn_gpu(
ggml_tensor * shared = nullptr;
if (has_shared) {
- ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s);
- ggml_tensor * sh_up = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s);
+ ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s);
+ ggml_tensor * sh_up = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s);
ggml_tensor * sh_gu = ggml_swiglu_split(ctx, sh_gate, sh_up);
- shared = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
- if (L.ffn_gate_inp_shexp) {
+ shared = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+ if (desc.ffn_gate_inp_shexp) {
ggml_tensor * shared_gate = apply_scale2(ctx,
- ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s);
+ ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
shared_gate = ggml_sigmoid(ctx, shared_gate);
shared = ggml_mul(ctx, shared, shared_gate);
}
@@ -533,46 +351,266 @@ static bool run_hot_and_shared_ffn_gpu(
return true;
}
-} // namespace
-
-bool eval_qwen35moe_reference_ffn_single(
- ggml_backend_t gpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- const float * cur_host,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_selected,
- std::vector & out,
- std::string * err) {
- // Reference path: fused hot+shared in one graph (same as hybrid but all experts on GPU)
- if (!run_hot_and_shared_ffn_gpu(gpu_backend,
- L.ffn_gate_exps, L.ffn_up_exps, L.ffn_down_exps, L.ffn_gate_up_exps,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- L, w.n_embd, w.n_ff_exp,
- cur_host, selected_ids, selected_weights, n_selected,
- out, err)) {
+// Build batched routed graph helper for batched prefill.
+static bool build_batched_routed_graph(
+ ggml_context * ctx,
+ ggml_tensor * gate_tensor,
+ ggml_tensor * up_tensor,
+ ggml_tensor * down_tensor,
+ ggml_tensor * gate_up_tensor,
+ float gate_scale,
+ float up_scale,
+ float down_scale,
+ float gate_up_scale,
+ ggml_tensor * inp,
+ ggml_tensor * sel,
+ ggml_tensor * wts,
+ int n_embd, int n_ff_exp, int n_used, int n_tokens,
+ ggml_tensor ** out_routed)
+{
+ ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens);
+ ggml_tensor * gu = nullptr;
+ if (gate_up_tensor) {
+ ggml_tensor * gate_up_e = apply_scale2(ctx,
+ ggml_mul_mat_id(ctx, gate_up_tensor, cur_3d, sel), gate_up_scale);
+ ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e,
+ n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+ gate_up_e->nb[1], gate_up_e->nb[2], 0);
+ ggml_tensor * up_e = ggml_view_3d(ctx, gate_up_e,
+ n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+ gate_up_e->nb[1], gate_up_e->nb[2],
+ (size_t)n_ff_exp * ggml_element_size(gate_up_e));
+ gate_e = ggml_cont(ctx, gate_e);
+ up_e = ggml_cont(ctx, up_e);
+ gu = ggml_swiglu_split(ctx, gate_e, up_e);
+ } else {
+ ggml_tensor * gate_e = apply_scale2(ctx,
+ ggml_mul_mat_id(ctx, gate_tensor, cur_3d, sel), gate_scale);
+ ggml_tensor * up_e = apply_scale2(ctx,
+ ggml_mul_mat_id(ctx, up_tensor, cur_3d, sel), up_scale);
+ gu = ggml_swiglu_split(ctx, gate_e, up_e);
+ }
+
+ ggml_tensor * experts = apply_scale2(ctx,
+ ggml_mul_mat_id(ctx, down_tensor, gu, sel), down_scale);
+
+ // Weight and sum over experts: [n_embd, n_used, n_tokens] * [1, n_used, n_tokens]
+ ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens);
+ experts = ggml_mul(ctx, experts, w_view);
+
+ ggml_tensor * sum_shape = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens);
+ ggml_tensor * moe_sum = ggml_repeat_back(ctx, experts, sum_shape);
+ *out_routed = ggml_reshape_2d(ctx, moe_sum, n_embd, n_tokens);
+ return true;
+}
+
+} // namespace (anon)
+
+// ── Public API ──────────────────────────────────────────────────────────────────
+
+bool build_cached_hot_graph(
+ CachedFfnGraph & out,
+ ggml_backend_t backend,
+ ggml_tensor * gate_tensor,
+ ggml_tensor * up_tensor,
+ ggml_tensor * down_tensor,
+ ggml_tensor * gate_up_tensor,
+ float gate_scale,
+ float up_scale,
+ float down_scale,
+ float gate_up_scale,
+ const MoeLayerDesc & desc,
+ int n_embd,
+ int n_ff_exp,
+ int n_hot) {
+
+ out.free();
+ out.n_hot = n_hot;
+
+ ggml_init_params ip{};
+ ip.mem_size = 48 * 1024 * 1024;
+ ip.mem_buffer = nullptr;
+ ip.no_alloc = true;
+ out.ctx = ggml_init(ip);
+ if (!out.ctx) return false;
+
+ out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1);
+ ggml_set_input(out.inp);
+
+ ggml_tensor * routed = nullptr;
+ if (n_hot > 0) {
+ out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_hot, 1);
+ ggml_set_input(out.ids);
+ out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_hot, 1);
+ ggml_set_input(out.weights);
+
+ ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1);
+ ggml_tensor * gu = nullptr;
+ if (gate_up_tensor) {
+ ggml_tensor * gate_up_e = apply_scale2(out.ctx,
+ ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale);
+ ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e,
+ n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+ gate_up_e->nb[1], gate_up_e->nb[2], 0);
+ ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e,
+ n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+ gate_up_e->nb[1], gate_up_e->nb[2],
+ (size_t)n_ff_exp * ggml_element_size(gate_up_e));
+ gate_e = ggml_cont(out.ctx, gate_e);
+ up_e = ggml_cont(out.ctx, up_e);
+ gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
+ } else {
+ ggml_tensor * gate_e = apply_scale2(out.ctx,
+ ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale);
+ ggml_tensor * up_e = apply_scale2(out.ctx,
+ ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale);
+ gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
+ }
+
+ ggml_tensor * experts = apply_scale2(out.ctx,
+ ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale);
+ ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_hot, 1);
+ experts = ggml_mul(out.ctx, experts, w_view);
+
+ for (int i = 0; i < n_hot; ++i) {
+ ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2],
+ (size_t)i * experts->nb[1]);
+ routed = (i == 0) ? slice : ggml_add(out.ctx, routed, slice);
+ }
+ }
+
+ ggml_tensor * shared = nullptr;
+ const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
+ if (has_shared) {
+ ggml_tensor * sh_gate = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_gate_shexp, out.inp), desc.ffn_gate_shexp_s);
+ ggml_tensor * sh_up = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_up_shexp, out.inp), desc.ffn_up_shexp_s);
+ ggml_tensor * sh_gu = ggml_swiglu_split(out.ctx, sh_gate, sh_up);
+ shared = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+ if (desc.ffn_gate_inp_shexp) {
+ ggml_tensor * shared_gate = apply_scale2(out.ctx,
+ ggml_mul_mat(out.ctx, desc.ffn_gate_inp_shexp, out.inp), desc.ffn_gate_inp_shexp_s);
+ shared_gate = ggml_sigmoid(out.ctx, shared_gate);
+ shared = ggml_mul(out.ctx, shared, shared_gate);
+ }
+ }
+
+ if (routed && shared) {
+ out.output = ggml_add(out.ctx, routed, shared);
+ } else if (routed) {
+ out.output = routed;
+ } else {
+ out.output = shared;
+ }
+ if (!out.output) { out.free(); return false; }
+
+ out.gf = ggml_new_graph_custom(out.ctx, 2048, false);
+ ggml_set_output(out.output);
+ ggml_build_forward_expand(out.gf, out.output);
+ out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+ if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {
+ out.free();
+ return false;
+ }
+ return true;
+}
+
+bool build_cached_cold_graph(
+ CachedFfnGraph & out,
+ ggml_backend_t cpu_backend,
+ ggml_tensor * gate_tensor,
+ ggml_tensor * up_tensor,
+ ggml_tensor * down_tensor,
+ ggml_tensor * gate_up_tensor,
+ float gate_scale,
+ float up_scale,
+ float down_scale,
+ float gate_up_scale,
+ int n_embd,
+ int n_ff_exp,
+ int n_cold) {
+
+ out.free();
+ out.n_hot = n_cold; // reuse field for "n experts in this graph"
+
+ ggml_init_params ip{};
+ ip.mem_size = 32 * 1024 * 1024;
+ ip.mem_buffer = nullptr;
+ ip.no_alloc = true;
+ out.ctx = ggml_init(ip);
+ if (!out.ctx) return false;
+
+ out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1);
+ ggml_set_input(out.inp);
+ out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_cold, 1);
+ ggml_set_input(out.ids);
+ out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_cold, 1);
+ ggml_set_input(out.weights);
+
+ ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1);
+ ggml_tensor * gu = nullptr;
+ if (gate_up_tensor) {
+ ggml_tensor * gate_up_e = apply_scale2(out.ctx,
+ ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale);
+ ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e,
+ n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+ gate_up_e->nb[1], gate_up_e->nb[2], 0);
+ ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e,
+ n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
+ gate_up_e->nb[1], gate_up_e->nb[2],
+ (size_t)n_ff_exp * ggml_element_size(gate_up_e));
+ gate_e = ggml_cont(out.ctx, gate_e);
+ up_e = ggml_cont(out.ctx, up_e);
+ gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
+ } else {
+ ggml_tensor * gate_e = apply_scale2(out.ctx,
+ ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale);
+ ggml_tensor * up_e = apply_scale2(out.ctx,
+ ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale);
+ gu = ggml_swiglu_split(out.ctx, gate_e, up_e);
+ }
+
+ ggml_tensor * experts = apply_scale2(out.ctx,
+ ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale);
+ ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_cold, 1);
+ experts = ggml_mul(out.ctx, experts, w_view);
+
+ out.output = nullptr;
+ for (int i = 0; i < n_cold; ++i) {
+ ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2],
+ (size_t)i * experts->nb[1]);
+ out.output = (i == 0) ? slice : ggml_add(out.ctx, out.output, slice);
+ }
+ if (!out.output) { out.free(); return false; }
+
+ out.gf = ggml_new_graph_custom(out.ctx, 1024, false);
+ ggml_set_output(out.output);
+ ggml_build_forward_expand(out.gf, out.output);
+ out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+ if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) {
+ out.free();
return false;
}
return true;
}
-bool eval_qwen35moe_hybrid_ffn_single(
- ggml_backend_t gpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- Qwen35MoeHybridLayerStorage & storage,
- ggml_backend_t cpu_backend,
- const float * cur_host,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_selected,
- std::vector & out,
- Qwen35MoeHybridFfnTelemetry * telemetry,
- std::string * err) {
+bool eval_moe_hybrid_ffn_single(
+ ggml_backend_t gpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ MoeHybridLayerStorage & storage,
+ ggml_backend_t cpu_backend,
+ const float * cur_host,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_selected,
+ std::vector & out,
+ MoeHybridFfnTelemetry * telemetry,
+ std::string * err) {
+
if (telemetry) *telemetry = {};
const auto ffn_wall_t0 = HybridClock::now();
const auto partition_t0 = HybridClock::now();
+
std::vector hot_ids;
std::vector hot_weights;
std::vector cold_ids;
@@ -606,7 +644,7 @@ bool eval_qwen35moe_hybrid_ffn_single(
const int n_hot = (int)hot_ids.size();
const bool has_hot = (n_hot > 0);
- const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
+ const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
const bool has_cold = !cold_ids.empty();
const int n_cold = (int)cold_ids.size();
@@ -614,17 +652,17 @@ bool eval_qwen35moe_hybrid_ffn_single(
bool hot_async_launched = false;
const auto hot_t0 = HybridClock::now();
if (!has_hot && !has_shared) {
- hot_and_shared.assign((size_t)w.n_embd, 0.0f);
+ hot_and_shared.assign((size_t)cfg.n_embd, 0.0f);
} else {
// Lazily build cached hot graph on first use
if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) {
build_cached_hot_graph(storage.hot_graph, gpu_backend,
storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- L, w.n_embd, w.n_ff_exp, n_hot);
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+ desc, cfg.n_embd, cfg.n_ff_exp, n_hot);
}
if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) {
- ggml_backend_tensor_set(storage.hot_graph.inp, cur_host, 0, sizeof(float) * (size_t)w.n_embd);
+ ggml_backend_tensor_set(storage.hot_graph.inp, cur_host, 0, sizeof(float) * (size_t)cfg.n_embd);
if (storage.hot_graph.ids && has_hot) {
ggml_backend_tensor_set(storage.hot_graph.ids, hot_ids.data(), 0, sizeof(int32_t) * (size_t)n_hot);
}
@@ -638,8 +676,8 @@ bool eval_qwen35moe_hybrid_ffn_single(
// Fallback: sync compute (no overlap)
if (!run_hot_and_shared_ffn_gpu(gpu_backend,
storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- L, w.n_embd, w.n_ff_exp,
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+ desc, cfg.n_embd, cfg.n_ff_exp,
cur_host,
hot_ids.empty() ? nullptr : hot_ids.data(),
hot_weights.empty() ? nullptr : hot_weights.data(),
@@ -655,11 +693,11 @@ bool eval_qwen35moe_hybrid_ffn_single(
if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) {
build_cached_cold_graph(storage.cold_graph, cpu_backend,
storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- w.n_embd, w.n_ff_exp, n_cold);
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+ cfg.n_embd, cfg.n_ff_exp, n_cold);
}
if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) {
- ggml_backend_tensor_set(storage.cold_graph.inp, cur_host, 0, sizeof(float) * (size_t)w.n_embd);
+ ggml_backend_tensor_set(storage.cold_graph.inp, cur_host, 0, sizeof(float) * (size_t)cfg.n_embd);
ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids.data(), 0, sizeof(int32_t) * (size_t)n_cold);
ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights.data(), 0, sizeof(float) * (size_t)n_cold);
auto st = ggml_backend_graph_compute(cpu_backend, storage.cold_graph.gf);
@@ -668,28 +706,28 @@ bool eval_qwen35moe_hybrid_ffn_single(
if (err) *err = "cached cold graph compute failed";
return false;
}
- cold.resize((size_t)w.n_embd);
- ggml_backend_tensor_get(storage.cold_graph.output, cold.data(), 0, sizeof(float) * (size_t)w.n_embd);
+ cold.resize((size_t)cfg.n_embd);
+ ggml_backend_tensor_get(storage.cold_graph.output, cold.data(), 0, sizeof(float) * (size_t)cfg.n_embd);
} else {
if (!run_routed_subset(cpu_backend,
storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- w.n_embd, w.n_ff_exp,
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+ cfg.n_embd, cfg.n_ff_exp,
cur_host, cold_ids.data(), cold_weights.data(), n_cold, cold, err)) {
if (hot_async_launched) ggml_backend_synchronize(gpu_backend);
return false;
}
}
} else {
- cold.assign((size_t)w.n_embd, 0.0f);
+ cold.assign((size_t)cfg.n_embd, 0.0f);
}
const auto cold_t1 = HybridClock::now();
// ── Sync GPU and read result ──
if ((has_hot || has_shared) && storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) {
ggml_backend_synchronize(gpu_backend);
- hot_and_shared.resize((size_t)w.n_embd);
- ggml_backend_tensor_get(storage.hot_graph.output, hot_and_shared.data(), 0, sizeof(float) * (size_t)w.n_embd);
+ hot_and_shared.resize((size_t)cfg.n_embd);
+ ggml_backend_tensor_get(storage.hot_graph.output, hot_and_shared.data(), 0, sizeof(float) * (size_t)cfg.n_embd);
}
const auto hot_t1 = HybridClock::now();
@@ -700,8 +738,8 @@ bool eval_qwen35moe_hybrid_ffn_single(
}
const auto combine_t0 = HybridClock::now();
- out.assign((size_t)w.n_embd, 0.0f);
- for (int i = 0; i < w.n_embd; ++i) {
+ out.assign((size_t)cfg.n_embd, 0.0f);
+ for (int i = 0; i < cfg.n_embd; ++i) {
out[(size_t)i] = hot_and_shared[(size_t)i] + cold[(size_t)i];
}
const auto combine_t1 = HybridClock::now();
@@ -712,20 +750,20 @@ bool eval_qwen35moe_hybrid_ffn_single(
return true;
}
-bool eval_qwen35moe_batched_prefill_ffn(
- ggml_backend_t gpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- const float * cur_host,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_tokens,
- std::vector & out,
- std::string * err) {
-
- const int n_embd = w.n_embd;
- const int n_used = w.n_expert_used;
- const int n_ff_exp = w.n_ff_exp;
+bool eval_moe_batched_prefill_ffn(
+ ggml_backend_t gpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ const float * cur_host,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_tokens,
+ std::vector & out,
+ std::string * err) {
+
+ const int n_embd = cfg.n_embd;
+ const int n_used = cfg.n_expert_used;
+ const int n_ff_exp = cfg.n_ff_exp;
out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
if (n_tokens <= 0) return true;
@@ -739,11 +777,8 @@ bool eval_qwen35moe_batched_prefill_ffn(
return false;
}
- // Input: [n_embd, n_tokens]
ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
ggml_set_input(inp);
-
- // Pre-computed routing: selected [n_used, n_tokens], weights [n_used, n_tokens]
ggml_tensor * sel = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_used, n_tokens);
ggml_set_input(sel);
ggml_tensor * wts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_used, n_tokens);
@@ -752,9 +787,9 @@ bool eval_qwen35moe_batched_prefill_ffn(
// Routed expert computation using full GPU expert tensors
ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens);
ggml_tensor * gu = nullptr;
- if (L.ffn_gate_up_exps) {
+ if (desc.ffn_gate_up_exps) {
ggml_tensor * gate_up_e = apply_scale2(ctx,
- ggml_mul_mat_id(ctx, L.ffn_gate_up_exps, cur_3d, sel), L.ffn_gate_up_exps_s);
+ ggml_mul_mat_id(ctx, desc.ffn_gate_up_exps, cur_3d, sel), desc.ffn_gate_up_exps_s);
ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e,
n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
gate_up_e->nb[1], gate_up_e->nb[2], 0);
@@ -767,14 +802,14 @@ bool eval_qwen35moe_batched_prefill_ffn(
gu = ggml_swiglu_split(ctx, gate_e, up_e);
} else {
ggml_tensor * gate_e = apply_scale2(ctx,
- ggml_mul_mat_id(ctx, L.ffn_gate_exps, cur_3d, sel), L.ffn_gate_exps_s);
+ ggml_mul_mat_id(ctx, desc.ffn_gate_exps, cur_3d, sel), desc.ffn_gate_exps_s);
ggml_tensor * up_e = apply_scale2(ctx,
- ggml_mul_mat_id(ctx, L.ffn_up_exps, cur_3d, sel), L.ffn_up_exps_s);
+ ggml_mul_mat_id(ctx, desc.ffn_up_exps, cur_3d, sel), desc.ffn_up_exps_s);
gu = ggml_swiglu_split(ctx, gate_e, up_e);
}
ggml_tensor * experts = apply_scale2(ctx,
- ggml_mul_mat_id(ctx, L.ffn_down_exps, gu, sel), L.ffn_down_exps_s);
+ ggml_mul_mat_id(ctx, desc.ffn_down_exps, gu, sel), desc.ffn_down_exps_s);
// Weight and sum over experts
ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens);
@@ -786,17 +821,17 @@ bool eval_qwen35moe_batched_prefill_ffn(
// Shared expert
ggml_tensor * combined = routed;
- if (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp) {
+ if (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp) {
ggml_tensor * sh_gate = apply_scale2(ctx,
- ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s);
+ ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s);
ggml_tensor * sh_up = apply_scale2(ctx,
- ggml_mul_mat(ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s);
+ ggml_mul_mat(ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s);
ggml_tensor * sh_gu = ggml_swiglu_split(ctx, sh_gate, sh_up);
ggml_tensor * shared = apply_scale2(ctx,
- ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
- if (L.ffn_gate_inp_shexp) {
+ ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+ if (desc.ffn_gate_inp_shexp) {
ggml_tensor * shared_gate = apply_scale2(ctx,
- ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s);
+ ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
shared_gate = ggml_sigmoid(ctx, shared_gate);
shared = ggml_mul(ctx, shared, shared_gate);
}
@@ -832,80 +867,24 @@ bool eval_qwen35moe_batched_prefill_ffn(
return true;
}
-// ── GPU-Resident Residual State ──
-
-// ── Hybrid Batched Prefill FFN ──
-// Processes n_tokens at once with hot experts on GPU and cold experts on CPU
-// concurrently. Uses pre-computed routing from the pre-FFN graph.
-
-static bool build_batched_routed_graph(
- ggml_context * ctx,
- ggml_tensor * gate_tensor,
- ggml_tensor * up_tensor,
- ggml_tensor * down_tensor,
- ggml_tensor * gate_up_tensor,
- float gate_scale,
- float up_scale,
- float down_scale,
- float gate_up_scale,
- ggml_tensor * inp, // [n_embd, n_tokens]
- ggml_tensor * sel, // [n_used, n_tokens]
- ggml_tensor * wts, // [n_used, n_tokens]
- int n_embd, int n_ff_exp, int n_used, int n_tokens,
- ggml_tensor ** out_routed)
-{
- ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens);
- ggml_tensor * gu = nullptr;
- if (gate_up_tensor) {
- ggml_tensor * gate_up_e = apply_scale2(ctx,
- ggml_mul_mat_id(ctx, gate_up_tensor, cur_3d, sel), gate_up_scale);
- ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e,
- n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
- gate_up_e->nb[1], gate_up_e->nb[2], 0);
- ggml_tensor * up_e = ggml_view_3d(ctx, gate_up_e,
- n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2],
- gate_up_e->nb[1], gate_up_e->nb[2],
- (size_t)n_ff_exp * ggml_element_size(gate_up_e));
- gate_e = ggml_cont(ctx, gate_e);
- up_e = ggml_cont(ctx, up_e);
- gu = ggml_swiglu_split(ctx, gate_e, up_e);
- } else {
- ggml_tensor * gate_e = apply_scale2(ctx,
- ggml_mul_mat_id(ctx, gate_tensor, cur_3d, sel), gate_scale);
- ggml_tensor * up_e = apply_scale2(ctx,
- ggml_mul_mat_id(ctx, up_tensor, cur_3d, sel), up_scale);
- gu = ggml_swiglu_split(ctx, gate_e, up_e);
- }
-
- ggml_tensor * experts = apply_scale2(ctx,
- ggml_mul_mat_id(ctx, down_tensor, gu, sel), down_scale);
-
- // Weight and sum over experts: [n_embd, n_used, n_tokens] * [1, n_used, n_tokens]
- ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens);
- experts = ggml_mul(ctx, experts, w_view);
-
- ggml_tensor * sum_shape = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens);
- ggml_tensor * moe_sum = ggml_repeat_back(ctx, experts, sum_shape);
- *out_routed = ggml_reshape_2d(ctx, moe_sum, n_embd, n_tokens);
- return true;
-}
-
-bool eval_qwen35moe_hybrid_ffn_batched(
- ggml_backend_t gpu_backend,
- ggml_backend_t cpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- Qwen35MoeHybridLayerStorage & storage,
- const float * cur_host,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_tokens,
- std::vector & out,
- std::string * err) {
-
- const int n_embd = w.n_embd;
- const int n_used = w.n_expert_used;
- const int n_ff_exp = w.n_ff_exp;
+static bool eval_moe_hybrid_ffn_batched_core(
+ ggml_backend_t gpu_backend,
+ ggml_backend_t cpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ MoeHybridLayerStorage & storage,
+ const float * cur_host,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_tokens,
+ std::vector & out,
+ std::string * err,
+ ggml_gallocr_t * p_hot_alloc,
+ ggml_gallocr_t * p_cold_alloc) {
+
+ const int n_embd = cfg.n_embd;
+ const int n_used = cfg.n_expert_used;
+ const int n_ff_exp = cfg.n_ff_exp;
out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
if (n_tokens <= 0) return true;
@@ -942,11 +921,6 @@ bool eval_qwen35moe_hybrid_ffn_batched(
}
}
- // Fast path: all hot → use hot stack only (no cold compute needed)
- // NOTE: Cannot use eval_qwen35moe_batched_prefill_ffn here since
- // L.ffn_gate_exps is not allocated in hybrid mode (skip_expert_tensors=true).
- // Fall through to hybrid path which uses storage.gate_hot correctly.
-
// ── Step 2: Build and run hot GPU graph (includes shared expert always) ──
std::vector hot_partial((size_t)n_embd * (size_t)n_tokens, 0.0f);
bool hot_async_launched = false;
@@ -956,8 +930,7 @@ bool eval_qwen35moe_hybrid_ffn_batched(
ggml_gallocr_t hot_alloc = nullptr;
ggml_tensor * hot_output = nullptr;
- // Always run GPU graph: shared expert is always on GPU, and hot routed when present
- const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
+ const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
if (has_hot || has_shared) {
ggml_init_params ip{};
ip.mem_size = 128 * 1024 * 1024;
@@ -969,7 +942,6 @@ bool eval_qwen35moe_hybrid_ffn_batched(
ggml_tensor * inp = ggml_new_tensor_2d(hot_ctx, GGML_TYPE_F32, n_embd, n_tokens);
ggml_set_input(inp);
- // Only create routing tensors if we have hot routed experts
ggml_tensor * sel = nullptr;
ggml_tensor * wts = nullptr;
ggml_tensor * routed = nullptr;
@@ -981,7 +953,7 @@ bool eval_qwen35moe_hybrid_ffn_batched(
build_batched_routed_graph(hot_ctx,
storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
inp, sel, wts, n_embd, n_ff_exp, n_used, n_tokens, &routed);
}
@@ -989,15 +961,15 @@ bool eval_qwen35moe_hybrid_ffn_batched(
ggml_tensor * combined = routed;
if (has_shared) {
ggml_tensor * sh_gate = apply_scale2(hot_ctx,
- ggml_mul_mat(hot_ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s);
+ ggml_mul_mat(hot_ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s);
ggml_tensor * sh_up = apply_scale2(hot_ctx,
- ggml_mul_mat(hot_ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s);
+ ggml_mul_mat(hot_ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s);
ggml_tensor * sh_gu = ggml_swiglu_split(hot_ctx, sh_gate, sh_up);
ggml_tensor * shared = apply_scale2(hot_ctx,
- ggml_mul_mat(hot_ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s);
- if (L.ffn_gate_inp_shexp) {
+ ggml_mul_mat(hot_ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s);
+ if (desc.ffn_gate_inp_shexp) {
ggml_tensor * shared_gate = apply_scale2(hot_ctx,
- ggml_mul_mat(hot_ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s);
+ ggml_mul_mat(hot_ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s);
shared_gate = ggml_sigmoid(hot_ctx, shared_gate);
shared = ggml_mul(hot_ctx, shared, shared_gate);
}
@@ -1008,10 +980,17 @@ bool eval_qwen35moe_hybrid_ffn_batched(
hot_gf = ggml_new_graph_custom(hot_ctx, 4096, false);
ggml_set_output(hot_output);
ggml_build_forward_expand(hot_gf, hot_output);
- hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend));
+ if (p_hot_alloc) {
+ if (!*p_hot_alloc)
+ *p_hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend));
+ hot_alloc = *p_hot_alloc;
+ } else {
+ hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend));
+ }
if (!ggml_gallocr_alloc_graph(hot_alloc, hot_gf)) {
if (err) *err = "hybrid batched hot gallocr failed";
- ggml_gallocr_free(hot_alloc); ggml_free(hot_ctx);
+ if (!p_hot_alloc) ggml_gallocr_free(hot_alloc);
+ ggml_free(hot_ctx);
return false;
}
@@ -1053,18 +1032,26 @@ bool eval_qwen35moe_hybrid_ffn_batched(
ggml_tensor * cold_routed = nullptr;
build_batched_routed_graph(cold_ctx,
storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
inp, sel, wts, n_embd, n_ff_exp, n_used, n_tokens, &cold_routed);
ggml_cgraph * cold_gf = ggml_new_graph_custom(cold_ctx, 4096, false);
ggml_set_output(cold_routed);
ggml_build_forward_expand(cold_gf, cold_routed);
- ggml_gallocr_t cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+ ggml_gallocr_t cold_alloc;
+ if (p_cold_alloc) {
+ if (!*p_cold_alloc)
+ *p_cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+ cold_alloc = *p_cold_alloc;
+ } else {
+ cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend));
+ }
if (!ggml_gallocr_alloc_graph(cold_alloc, cold_gf)) {
if (hot_async_launched) ggml_backend_synchronize(gpu_backend);
- if (hot_alloc) ggml_gallocr_free(hot_alloc);
+ if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc);
if (hot_ctx) ggml_free(hot_ctx);
- ggml_gallocr_free(cold_alloc); ggml_free(cold_ctx);
+ if (!p_cold_alloc) ggml_gallocr_free(cold_alloc);
+ ggml_free(cold_ctx);
if (err) *err = "hybrid batched cold gallocr failed";
return false;
}
@@ -1077,16 +1064,17 @@ bool eval_qwen35moe_hybrid_ffn_batched(
auto st = ggml_backend_graph_compute(cpu_backend, cold_gf);
if (st != GGML_STATUS_SUCCESS) {
if (hot_async_launched) ggml_backend_synchronize(gpu_backend);
- if (hot_alloc) ggml_gallocr_free(hot_alloc);
+ if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc);
if (hot_ctx) ggml_free(hot_ctx);
- ggml_gallocr_free(cold_alloc); ggml_free(cold_ctx);
+ if (!p_cold_alloc) ggml_gallocr_free(cold_alloc);
+ ggml_free(cold_ctx);
if (err) *err = "hybrid batched cold compute failed";
return false;
}
ggml_backend_tensor_get(cold_routed, cold_partial.data(), 0,
sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
- ggml_gallocr_free(cold_alloc);
+ if (!p_cold_alloc) ggml_gallocr_free(cold_alloc);
ggml_free(cold_ctx);
}
@@ -1096,7 +1084,7 @@ bool eval_qwen35moe_hybrid_ffn_batched(
ggml_backend_tensor_get(hot_output, hot_partial.data(), 0,
sizeof(float) * (size_t)n_embd * (size_t)n_tokens);
}
- if (hot_alloc) ggml_gallocr_free(hot_alloc);
+ if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc);
if (hot_ctx) ggml_free(hot_ctx);
// ── Step 5: Merge hot + cold ──
@@ -1108,6 +1096,60 @@ bool eval_qwen35moe_hybrid_ffn_batched(
return true;
}
+// ── GPU-Resident Residual State ──
+
+// Public entry. Workaround for a ggml-cuda/HIP defect: the MMQ mul_mat_id
+// kernel illegal-accesses on gfx1151 when the per-layer hot expert stack is
+// REDUCED (n_hot_stack < n_expert); the full-stack (all-hot) case is fine.
+// MMVQ is used instead of MMQ only when the matmul batch dim (= n_tokens) is
+// small (Q4_K AMD MMVQ-mmid cap is 4). So for reduced hot stacks we slice the
+// prefill batch into <=4-token sub-batches, routing the routed mul_mat_id
+// through the stable MMVQ path. Full stacks keep the fast single-shot MMQ.
+bool eval_moe_hybrid_ffn_batched(
+ ggml_backend_t gpu_backend,
+ ggml_backend_t cpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ MoeHybridLayerStorage & storage,
+ const float * cur_host,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_tokens,
+ std::vector & out,
+ std::string * err,
+ ggml_gallocr_t * p_hot_alloc,
+ ggml_gallocr_t * p_cold_alloc) {
+ const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
+ : storage.gate_hot ? (int)storage.gate_hot->ne[2]
+ : 0;
+ static const int MMQ_SAFE_SUB_BATCH = 4;
+ if (n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) {
+ const int n_embd = cfg.n_embd;
+ const int n_used = cfg.n_expert_used;
+ out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f);
+ std::vector sub_out;
+ for (int t0 = 0; t0 < n_tokens; t0 += MMQ_SAFE_SUB_BATCH) {
+ const int tc = std::min(MMQ_SAFE_SUB_BATCH, n_tokens - t0);
+ if (!eval_moe_hybrid_ffn_batched_core(
+ gpu_backend, cpu_backend, cfg, desc, storage,
+ cur_host + (size_t)t0 * (size_t)n_embd,
+ selected_ids + (size_t)t0 * (size_t)n_used,
+ selected_weights + (size_t)t0 * (size_t)n_used,
+ tc, sub_out, err, p_hot_alloc, p_cold_alloc)) {
+ return false;
+ }
+ std::memcpy(out.data() + (size_t)t0 * (size_t)n_embd,
+ sub_out.data(),
+ sizeof(float) * (size_t)n_embd * (size_t)tc);
+ }
+ return true;
+ }
+ return eval_moe_hybrid_ffn_batched_core(
+ gpu_backend, cpu_backend, cfg, desc, storage,
+ cur_host, selected_ids, selected_weights, n_tokens, out, err,
+ p_hot_alloc, p_cold_alloc);
+}
+
void ResidualCombineGraph::free() {
if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; }
if (ctx) { ggml_free(ctx); ctx = nullptr; }
@@ -1165,7 +1207,6 @@ void GpuResidentState::destroy() {
bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd) {
out.destroy();
- // Allocate persistent GPU tensor for act_cur
ggml_init_params ip{};
ip.mem_size = 1024 * 1024;
ip.mem_buffer = nullptr;
@@ -1180,38 +1221,31 @@ bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int
return false;
}
- // Build the residual combine graph
if (!build_residual_combine_graph(out.combine, backend, n_embd)) {
out.destroy();
return false;
}
- // Zero out cold_in initially (for all-hot layers, cold stays zero)
std::vector zeros((size_t)n_embd, 0.0f);
ggml_backend_tensor_set(out.combine.cold_in, zeros.data(), 0, sizeof(float) * (size_t)n_embd);
return true;
}
-// ─── GPU-Resident hybrid FFN eval ─────────────────────────────────────────────
-// Keeps activation on GPU: only reads router IDs (64B) to CPU, and ffn_post
-// to CPU only when cold experts are selected. All other data movement is
-// GPU→GPU via ggml_backend_tensor_copy.
-
-bool eval_qwen35moe_hybrid_ffn_gpu_resident(
- ggml_backend_t gpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- Qwen35MoeHybridLayerStorage & storage,
- ggml_backend_t cpu_backend,
- ggml_tensor * ffn_post_gpu,
- ggml_tensor * ffn_residual_gpu,
- GpuResidentState & gpu_state,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_selected) {
-
- const int n_embd = w.n_embd;
+bool eval_moe_hybrid_ffn_gpu_resident(
+ ggml_backend_t gpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ MoeHybridLayerStorage & storage,
+ ggml_backend_t cpu_backend,
+ ggml_tensor * ffn_post_gpu,
+ ggml_tensor * ffn_residual_gpu,
+ GpuResidentState & gpu_state,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_selected) {
+
+ const int n_embd = cfg.n_embd;
// ── Partition into hot/cold ──
std::vector hot_ids;
@@ -1239,7 +1273,7 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
const int n_hot = (int)hot_ids.size();
const bool has_hot = (n_hot > 0);
- const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp);
+ const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp);
const bool has_cold = !cold_ids.empty();
const int n_cold = (int)cold_ids.size();
@@ -1252,8 +1286,8 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) {
build_cached_hot_graph(storage.hot_graph, gpu_backend,
storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- L, n_embd, w.n_ff_exp, n_hot);
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+ desc, n_embd, cfg.n_ff_exp, n_hot);
}
if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) {
// GPU→GPU copy: ffn_post → hot_graph.inp (no PCIe!)
@@ -1270,7 +1304,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
}
// ── If cold needed, read ffn_post to CPU BEFORE launching hot async ──
- // (to avoid serializing GPU queue with a device→host read mid-kernel)
std::vector post_host;
if (has_cold) {
post_host.resize((size_t)n_embd);
@@ -1289,8 +1322,8 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) {
build_cached_cold_graph(storage.cold_graph, cpu_backend,
storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- n_embd, w.n_ff_exp, n_cold);
+ desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s,
+ n_embd, cfg.n_ff_exp, n_cold);
}
if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) {
ggml_backend_tensor_set(storage.cold_graph.inp, post_host.data(), 0,
@@ -1308,7 +1341,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
ggml_backend_tensor_get(storage.cold_graph.output, cold_result.data(), 0,
sizeof(float) * (size_t)n_embd);
} else {
- // Fallback: cold graph build failed — shouldn't happen
if (hot_async_launched) ggml_backend_synchronize(gpu_backend);
return false;
}
@@ -1320,7 +1352,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident(
// GPU→GPU: hot output → combine.hot_in
ggml_backend_tensor_copy(storage.hot_graph.output, gpu_state.combine.hot_in);
} else {
- // No hot/shared: zero hot_in
std::vector zeros((size_t)n_embd, 0.0f);
ggml_backend_tensor_set(gpu_state.combine.hot_in, zeros.data(), 0,
sizeof(float) * (size_t)n_embd);
diff --git a/server/src/common/moe_hybrid_ffn_eval.h b/server/src/common/moe_hybrid_ffn_eval.h
new file mode 100644
index 000000000..bd30755ea
--- /dev/null
+++ b/server/src/common/moe_hybrid_ffn_eval.h
@@ -0,0 +1,191 @@
+// Common MoE hybrid FFN evaluation — hot experts on GPU, cold on CPU, concurrent.
+
+#pragma once
+
+#include "moe_hybrid_types.h"
+#include "moe_hybrid_storage.h"
+
+#include "ggml-backend.h"
+
+#include
+#include
+#include
+
+namespace dflash::common {
+
+// GPU-resident residual combine graph: output = residual + hot_out + cold_correction.
+struct ResidualCombineGraph {
+ ggml_context * ctx = nullptr;
+ ggml_cgraph * gf = nullptr;
+ ggml_gallocr_t alloc = nullptr;
+ ggml_tensor * residual_in = nullptr;
+ ggml_tensor * hot_in = nullptr;
+ ggml_tensor * cold_in = nullptr;
+ ggml_tensor * output = nullptr;
+
+ ResidualCombineGraph() = default;
+ ~ResidualCombineGraph() { free(); }
+ ResidualCombineGraph(const ResidualCombineGraph &) = delete;
+ ResidualCombineGraph & operator=(const ResidualCombineGraph &) = delete;
+ ResidualCombineGraph(ResidualCombineGraph && o) noexcept
+ : ctx(o.ctx), gf(o.gf), alloc(o.alloc),
+ residual_in(o.residual_in), hot_in(o.hot_in),
+ cold_in(o.cold_in), output(o.output) {
+ o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
+ o.residual_in = nullptr; o.hot_in = nullptr;
+ o.cold_in = nullptr; o.output = nullptr;
+ }
+ ResidualCombineGraph & operator=(ResidualCombineGraph && o) noexcept {
+ if (this != &o) {
+ free();
+ ctx = o.ctx; gf = o.gf; alloc = o.alloc;
+ residual_in = o.residual_in; hot_in = o.hot_in;
+ cold_in = o.cold_in; output = o.output;
+ o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
+ o.residual_in = nullptr; o.hot_in = nullptr;
+ o.cold_in = nullptr; o.output = nullptr;
+ }
+ return *this;
+ }
+ bool valid() const { return ctx && gf && alloc && output; }
+ void free();
+ void destroy();
+};
+
+bool build_residual_combine_graph(ResidualCombineGraph & out, ggml_backend_t backend, int n_embd);
+
+// GPU-resident state for the decode loop.
+struct GpuResidentState {
+ ggml_context * ctx = nullptr;
+ ggml_backend_buffer_t buf = nullptr;
+ ggml_tensor * act_cur = nullptr;
+
+ ResidualCombineGraph combine;
+
+ GpuResidentState() = default;
+ ~GpuResidentState() { destroy(); }
+ GpuResidentState(const GpuResidentState &) = delete;
+ GpuResidentState & operator=(const GpuResidentState &) = delete;
+ GpuResidentState(GpuResidentState && o) noexcept
+ : ctx(o.ctx), buf(o.buf), act_cur(o.act_cur),
+ combine(std::move(o.combine)) {
+ o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
+ }
+ GpuResidentState & operator=(GpuResidentState && o) noexcept {
+ if (this != &o) {
+ destroy();
+ ctx = o.ctx; buf = o.buf; act_cur = o.act_cur;
+ combine = std::move(o.combine);
+ o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
+ }
+ return *this;
+ }
+ bool valid() const { return ctx && buf && act_cur && combine.valid(); }
+ void destroy();
+};
+
+bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd);
+
+struct MoeHybridFfnTelemetry {
+ uint64_t ffn_wall_us = 0;
+ uint64_t partition_us = 0;
+ uint64_t hot_us = 0;
+ uint64_t cold_us = 0;
+ uint64_t shared_us = 0;
+ uint64_t combine_us = 0;
+ int hot_selected = 0;
+ int cold_selected = 0;
+};
+
+// Single-token hybrid FFN: hot on GPU, cold on CPU, combine on host.
+bool eval_moe_hybrid_ffn_single(
+ ggml_backend_t gpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ MoeHybridLayerStorage & storage,
+ ggml_backend_t cpu_backend,
+ const float * cur_host,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_selected,
+ std::vector & out,
+ MoeHybridFfnTelemetry * telemetry = nullptr,
+ std::string * err = nullptr);
+
+// Batched prefill FFN: all experts on GPU (no hybrid split).
+bool eval_moe_batched_prefill_ffn(
+ ggml_backend_t gpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ const float * cur_host,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_tokens,
+ std::vector & out,
+ std::string * err = nullptr);
+
+// Batched hybrid prefill FFN: hot on GPU, cold on CPU concurrently.
+bool eval_moe_hybrid_ffn_batched(
+ ggml_backend_t gpu_backend,
+ ggml_backend_t cpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ MoeHybridLayerStorage & storage,
+ const float * cur_host,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_tokens,
+ std::vector & out,
+ std::string * err = nullptr,
+ ggml_gallocr_t * p_hot_alloc = nullptr,
+ ggml_gallocr_t * p_cold_alloc = nullptr);
+
+// GPU-resident single-token hybrid FFN: keeps data on GPU, only reads router
+// IDs to CPU for hot/cold partitioning.
+bool eval_moe_hybrid_ffn_gpu_resident(
+ ggml_backend_t gpu_backend,
+ const MoeHybridConfig & cfg,
+ const MoeLayerDesc & desc,
+ MoeHybridLayerStorage & storage,
+ ggml_backend_t cpu_backend,
+ ggml_tensor * ffn_post_gpu,
+ ggml_tensor * ffn_residual_gpu,
+ GpuResidentState & gpu_state,
+ const int32_t * selected_ids,
+ const float * selected_weights,
+ int n_selected);
+
+// Build/rebuild cached hot FFN graph.
+bool build_cached_hot_graph(
+ CachedFfnGraph & out,
+ ggml_backend_t backend,
+ ggml_tensor * gate_tensor,
+ ggml_tensor * up_tensor,
+ ggml_tensor * down_tensor,
+ ggml_tensor * gate_up_tensor,
+ float gate_scale,
+ float up_scale,
+ float down_scale,
+ float gate_up_scale,
+ const MoeLayerDesc & desc,
+ int n_embd,
+ int n_ff_exp,
+ int n_hot);
+
+// Build/rebuild cached cold FFN graph.
+bool build_cached_cold_graph(
+ CachedFfnGraph & out,
+ ggml_backend_t cpu_backend,
+ ggml_tensor * gate_tensor,
+ ggml_tensor * up_tensor,
+ ggml_tensor * down_tensor,
+ ggml_tensor * gate_up_tensor,
+ float gate_scale,
+ float up_scale,
+ float down_scale,
+ float gate_up_scale,
+ int n_embd,
+ int n_ff_exp,
+ int n_cold);
+
+} // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_expert_placement.cpp b/server/src/common/moe_hybrid_placement.cpp
similarity index 79%
rename from server/src/qwen35moe/qwen35moe_expert_placement.cpp
rename to server/src/common/moe_hybrid_placement.cpp
index d0066e9ba..49bcd5ea1 100644
--- a/server/src/qwen35moe/qwen35moe_expert_placement.cpp
+++ b/server/src/common/moe_hybrid_placement.cpp
@@ -1,4 +1,5 @@
-#include "qwen35moe_expert_placement.h"
+#include "moe_hybrid_placement.h"
+#include "moe_hybrid_routing_stats.h"
#include
@@ -8,20 +9,23 @@
namespace dflash::common {
-bool Qwen35MoeExpertPlacement::matches(const TargetWeights & w) const {
- return w.is_moe &&
- n_layer == w.n_layer &&
- n_expert == w.n_expert &&
- n_expert_used == w.n_expert_used &&
+bool MoeHybridPlacement::matches(int n_layer_, int n_expert_, int n_expert_used_) const {
+ return n_layer == n_layer_ &&
+ n_expert == n_expert_ &&
+ n_expert_used == n_expert_used_ &&
(int)hot_counts.size() == n_layer &&
(int)hot_expert_ids.size() == n_layer;
}
-bool Qwen35MoeExpertPlacement::empty() const {
+bool MoeHybridPlacement::matches(const MoeHybridConfig & cfg) const {
+ return matches(cfg.n_layer, cfg.n_expert, cfg.n_expert_used);
+}
+
+bool MoeHybridPlacement::empty() const {
return hot_counts.empty();
}
-bool Qwen35MoeExpertPlacement::is_hot(int layer_idx, int expert_idx) const {
+bool MoeHybridPlacement::is_hot(int layer_idx, int expert_idx) const {
if (layer_idx < 0 || layer_idx >= n_layer || expert_idx < 0 || expert_idx >= n_expert) {
return false;
}
@@ -29,7 +33,8 @@ bool Qwen35MoeExpertPlacement::is_hot(int layer_idx, int expert_idx) const {
return std::find(hot.begin(), hot.end(), expert_idx) != hot.end();
}
-bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string * err) const {
+bool MoeHybridPlacement::save_json(const std::string & path, const std::string & arch_name,
+ std::string * err) const {
if (n_layer <= 0 || n_expert <= 0 || (int)hot_counts.size() != n_layer ||
(int)hot_expert_ids.size() != n_layer) {
if (err) *err = "placement not initialized";
@@ -37,7 +42,7 @@ bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string *
}
nlohmann::json j;
- j["arch"] = "qwen35moe";
+ j["arch"] = arch_name;
j["version"] = 1;
j["n_layer"] = n_layer;
j["n_expert"] = n_expert;
@@ -59,9 +64,9 @@ bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string *
return true;
}
-bool Qwen35MoeExpertPlacement::load_json(const std::string & path,
- Qwen35MoeExpertPlacement & out,
- std::string * err) {
+bool MoeHybridPlacement::load_json(const std::string & path,
+ MoeHybridPlacement & out,
+ std::string * err) {
std::ifstream f(path);
if (!f) {
if (err) *err = "failed to open input file";
@@ -76,12 +81,10 @@ bool Qwen35MoeExpertPlacement::load_json(const std::string & path,
return false;
}
- if (j.value("arch", std::string()) != "qwen35moe") {
- if (err) *err = "unexpected arch";
- return false;
- }
+ // Accept both legacy "qwen35moe" and new "moe_hybrid" / any arch string.
+ // We don't reject based on arch — the caller validates dimensions.
- Qwen35MoeExpertPlacement tmp;
+ MoeHybridPlacement tmp;
try {
tmp.n_layer = j.value("n_layer", 0);
tmp.n_expert = j.value("n_expert", 0);
@@ -105,11 +108,11 @@ bool Qwen35MoeExpertPlacement::load_json(const std::string & path,
return true;
}
-bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & stats,
- int total_hot_budget,
- int min_hot_per_layer,
- Qwen35MoeExpertPlacement & out,
- std::string * err) {
+bool MoeHybridPlacement::build_from_stats(const MoeHybridRoutingStats & stats,
+ int total_hot_budget,
+ int min_hot_per_layer,
+ MoeHybridPlacement & out,
+ std::string * err) {
if (stats.empty() || stats.n_layer <= 0 || stats.n_expert <= 0) {
if (err) *err = "stats not initialized";
return false;
@@ -127,7 +130,7 @@ bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & st
return false;
}
- Qwen35MoeExpertPlacement tmp;
+ MoeHybridPlacement tmp;
tmp.n_layer = stats.n_layer;
tmp.n_expert = stats.n_expert;
tmp.n_expert_used = stats.n_expert_used;
@@ -172,12 +175,12 @@ bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & st
return true;
}
-bool Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes(
- const Qwen35MoeRoutingStats & stats,
+bool MoeHybridPlacement::build_from_stats_with_layer_bytes(
+ const MoeHybridRoutingStats & stats,
const std::vector & layer_expert_bytes,
uint64_t total_hot_budget_bytes,
int min_hot_per_layer,
- Qwen35MoeExpertPlacement & out,
+ MoeHybridPlacement & out,
std::string * err) {
if (stats.empty() || stats.n_layer <= 0 || stats.n_expert <= 0) {
if (err) *err = "stats not initialized";
@@ -196,18 +199,22 @@ bool Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes(
const int per_layer_floor = std::min(min_hot_per_layer, stats.n_expert);
uint64_t floor_bytes = 0;
for (int il = 0; il < stats.n_layer; ++il) {
- floor_bytes += (uint64_t)per_layer_floor * layer_expert_bytes[(size_t)il];
+ if (layer_expert_bytes[(size_t)il] > 0)
+ floor_bytes += (uint64_t)per_layer_floor * layer_expert_bytes[(size_t)il];
}
if (floor_bytes > total_hot_budget_bytes) {
if (err) *err = "min_hot_per_layer exceeds byte budget";
return false;
}
- Qwen35MoeExpertPlacement tmp;
+ MoeHybridPlacement tmp;
tmp.n_layer = stats.n_layer;
tmp.n_expert = stats.n_expert;
tmp.n_expert_used = stats.n_expert_used;
- tmp.hot_counts.assign((size_t)tmp.n_layer, per_layer_floor);
+ tmp.hot_counts.resize((size_t)tmp.n_layer);
+ for (int il = 0; il < tmp.n_layer; ++il) {
+ tmp.hot_counts[(size_t)il] = (layer_expert_bytes[(size_t)il] > 0) ? per_layer_floor : 0;
+ }
std::vector> ranked((size_t)tmp.n_layer);
for (int il = 0; il < tmp.n_layer; ++il) {
diff --git a/server/src/qwen35moe/qwen35moe_expert_placement.h b/server/src/common/moe_hybrid_placement.h
similarity index 58%
rename from server/src/qwen35moe/qwen35moe_expert_placement.h
rename to server/src/common/moe_hybrid_placement.h
index 49d810dfc..023228664 100644
--- a/server/src/qwen35moe/qwen35moe_expert_placement.h
+++ b/server/src/common/moe_hybrid_placement.h
@@ -1,8 +1,8 @@
-// qwen35moe expert placement config derived from per-layer routing statistics.
+// Common MoE expert placement — determines which experts are hot (GPU) vs cold (CPU).
#pragma once
-#include "qwen35moe_routing_stats.h"
+#include "moe_hybrid_types.h"
#include
#include
@@ -10,7 +10,9 @@
namespace dflash::common {
-struct Qwen35MoeExpertPlacement {
+struct MoeHybridRoutingStats; // forward decl
+
+struct MoeHybridPlacement {
int n_layer = 0;
int n_expert = 0;
int n_expert_used = 0;
@@ -21,27 +23,29 @@ struct Qwen35MoeExpertPlacement {
// Ranked hot expert ids kept on GPU per layer.
std::vector> hot_expert_ids;
- bool matches(const TargetWeights & w) const;
+ bool matches(int n_layer, int n_expert, int n_expert_used) const;
+ bool matches(const MoeHybridConfig & cfg) const;
bool empty() const;
bool is_hot(int layer_idx, int expert_idx) const;
- bool save_json(const std::string & path, std::string * err = nullptr) const;
+ bool save_json(const std::string & path, const std::string & arch_name = "moe_hybrid",
+ std::string * err = nullptr) const;
static bool load_json(const std::string & path,
- Qwen35MoeExpertPlacement & out,
+ MoeHybridPlacement & out,
std::string * err = nullptr);
- static bool build_from_stats(const Qwen35MoeRoutingStats & stats,
+ static bool build_from_stats(const MoeHybridRoutingStats & stats,
int total_hot_budget,
int min_hot_per_layer,
- Qwen35MoeExpertPlacement & out,
+ MoeHybridPlacement & out,
std::string * err = nullptr);
static bool build_from_stats_with_layer_bytes(
- const Qwen35MoeRoutingStats & stats,
+ const MoeHybridRoutingStats & stats,
const std::vector & layer_expert_bytes,
uint64_t total_hot_budget_bytes,
int min_hot_per_layer,
- Qwen35MoeExpertPlacement & out,
+ MoeHybridPlacement & out,
std::string * err = nullptr);
};
diff --git a/server/src/qwen35moe/qwen35moe_routing_stats.cpp b/server/src/common/moe_hybrid_routing_stats.cpp
similarity index 67%
rename from server/src/qwen35moe/qwen35moe_routing_stats.cpp
rename to server/src/common/moe_hybrid_routing_stats.cpp
index 2e5e7ad17..7d11c5c1a 100644
--- a/server/src/qwen35moe/qwen35moe_routing_stats.cpp
+++ b/server/src/common/moe_hybrid_routing_stats.cpp
@@ -1,4 +1,4 @@
-#include "qwen35moe_routing_stats.h"
+#include "moe_hybrid_routing_stats.h"
#include
#include
@@ -8,43 +8,50 @@
namespace dflash::common {
-size_t Qwen35MoeRoutingStats::index_of(int layer_idx, int expert_idx) const {
+size_t MoeHybridRoutingStats::index_of(int layer_idx, int expert_idx) const {
return (size_t)layer_idx * (size_t)n_expert + (size_t)expert_idx;
}
-bool Qwen35MoeRoutingStats::init_from_weights(const TargetWeights & w) {
- if (!w.is_moe || w.n_layer <= 0 || w.n_expert <= 0 || w.n_expert_used <= 0) {
+bool MoeHybridRoutingStats::init(int n_layer_, int n_expert_, int n_expert_used_) {
+ if (n_layer_ <= 0 || n_expert_ <= 0 || n_expert_used_ <= 0) {
return false;
}
- n_layer = w.n_layer;
- n_expert = w.n_expert;
- n_expert_used = w.n_expert_used;
+ n_layer = n_layer_;
+ n_expert = n_expert_;
+ n_expert_used = n_expert_used_;
counts.assign((size_t)n_layer * (size_t)n_expert, 0);
layer_totals.assign((size_t)n_layer, 0);
return true;
}
-bool Qwen35MoeRoutingStats::matches(const TargetWeights & w) const {
- return w.is_moe &&
- n_layer == w.n_layer &&
- n_expert == w.n_expert &&
- n_expert_used == w.n_expert_used &&
+bool MoeHybridRoutingStats::init(const MoeHybridConfig & cfg) {
+ return init(cfg.n_layer, cfg.n_expert, cfg.n_expert_used);
+}
+
+bool MoeHybridRoutingStats::matches(int n_layer_, int n_expert_, int n_expert_used_) const {
+ return n_layer == n_layer_ &&
+ n_expert == n_expert_ &&
+ n_expert_used == n_expert_used_ &&
counts.size() == (size_t)n_layer * (size_t)n_expert &&
layer_totals.size() == (size_t)n_layer;
}
-bool Qwen35MoeRoutingStats::empty() const {
+bool MoeHybridRoutingStats::matches(const MoeHybridConfig & cfg) const {
+ return matches(cfg.n_layer, cfg.n_expert, cfg.n_expert_used);
+}
+
+bool MoeHybridRoutingStats::empty() const {
return counts.empty();
}
-uint64_t Qwen35MoeRoutingStats::count(int layer_idx, int expert_idx) const {
+uint64_t MoeHybridRoutingStats::count(int layer_idx, int expert_idx) const {
if (layer_idx < 0 || layer_idx >= n_layer || expert_idx < 0 || expert_idx >= n_expert) {
return 0;
}
return counts[index_of(layer_idx, expert_idx)];
}
-bool Qwen35MoeRoutingStats::observe(int layer_idx, const int32_t * expert_ids, int n_ids) {
+bool MoeHybridRoutingStats::observe(int layer_idx, const int32_t * expert_ids, int n_ids) {
if (!expert_ids || layer_idx < 0 || layer_idx >= n_layer || n_ids < 0) {
return false;
}
@@ -57,12 +64,12 @@ bool Qwen35MoeRoutingStats::observe(int layer_idx, const int32_t * expert_ids, i
for (int i = 0; i < n_ids; ++i) {
const int expert_idx = expert_ids[i];
counts[index_of(layer_idx, expert_idx)]++;
- layer_totals[(size_t) layer_idx]++;
+ layer_totals[(size_t)layer_idx]++;
}
return true;
}
-bool Qwen35MoeRoutingStats::observe_selected_tensor(ggml_backend_t backend,
+bool MoeHybridRoutingStats::observe_selected_tensor(ggml_backend_t backend,
int layer_idx,
ggml_tensor * selected,
std::string * err) {
@@ -88,7 +95,7 @@ bool Qwen35MoeRoutingStats::observe_selected_tensor(ggml_backend_t backend,
return true;
}
-std::vector Qwen35MoeRoutingStats::ranked_experts(int layer_idx) const {
+std::vector MoeHybridRoutingStats::ranked_experts(int layer_idx) const {
if (layer_idx < 0 || layer_idx >= n_layer) return {};
std::vector ranked((size_t)n_expert);
std::iota(ranked.begin(), ranked.end(), 0);
@@ -102,17 +109,16 @@ std::vector Qwen35MoeRoutingStats::ranked_experts(int layer_idx) const {
return ranked;
}
-std::vector Qwen35MoeRoutingStats::hot_experts(int layer_idx, int hot_count) const {
+std::vector MoeHybridRoutingStats::hot_experts(int layer_idx, int hot_count) const {
std::vector ranked = ranked_experts(layer_idx);
if (hot_count < 0) hot_count = 0;
- if ((size_t) hot_count < ranked.size()) {
- ranked.resize((size_t) hot_count);
+ if ((size_t)hot_count < ranked.size()) {
+ ranked.resize((size_t)hot_count);
}
return ranked;
}
-
-bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err) const {
+bool MoeHybridRoutingStats::save_csv(const std::string & path, std::string * err) const {
if (n_layer <= 0 || n_expert <= 0 || counts.size() != (size_t)n_layer * (size_t)n_expert) {
if (err) *err = "routing stats not initialized";
return false;
@@ -124,7 +130,6 @@ bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err
return false;
}
- // Header comments
f << "# hotness table: n_layer=" << n_layer
<< " n_expert=" << n_expert
<< " n_expert_used=" << n_expert_used << "\n";
@@ -145,8 +150,8 @@ bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err
return true;
}
-bool Qwen35MoeRoutingStats::load_csv(const std::string & path,
- Qwen35MoeRoutingStats & out,
+bool MoeHybridRoutingStats::load_csv(const std::string & path,
+ MoeHybridRoutingStats & out,
std::string * err) {
std::ifstream f(path);
if (!f) {
@@ -154,32 +159,28 @@ bool Qwen35MoeRoutingStats::load_csv(const std::string & path,
return false;
}
- int n_layer = 0, n_expert = 0, n_expert_used = 0;
+ int file_n_layer = 0, file_n_expert = 0, file_n_expert_used = 0;
std::vector all_counts;
std::string line;
while (std::getline(f, line)) {
- // Skip comments and empty lines
if (line.empty() || line[0] == '#') {
- // Try to parse header metadata from comment
if (line.find("n_layer=") != std::string::npos) {
std::sscanf(line.c_str(), "# hotness table: n_layer=%d n_expert=%d n_expert_used=%d",
- &n_layer, &n_expert, &n_expert_used);
+ &file_n_layer, &file_n_expert, &file_n_expert_used);
}
continue;
}
- // Parse CSV row: comma-separated uint64 values
std::vector row;
const char * p = line.c_str();
while (*p) {
- // Skip whitespace
while (*p == ' ' || *p == '\t') ++p;
if (!*p) break;
char * end = nullptr;
uint64_t val = std::strtoull(p, &end, 10);
if (end == p) {
- if (err) *err = "malformed value in row " + std::to_string((int)(all_counts.size() / std::max((size_t)n_expert, (size_t)1)));
+ if (err) *err = "malformed value in row " + std::to_string((int)(all_counts.size() / std::max((size_t)file_n_expert, (size_t)1)));
return false;
}
row.push_back(val);
@@ -189,40 +190,39 @@ bool Qwen35MoeRoutingStats::load_csv(const std::string & path,
if (row.empty()) continue;
- // Infer n_expert from first data row
- if (n_expert == 0) {
- n_expert = (int)row.size();
- } else if ((int)row.size() != n_expert) {
- if (err) *err = "inconsistent row width at layer " + std::to_string((int)(all_counts.size() / (size_t)n_expert));
+ if (file_n_expert == 0) {
+ file_n_expert = (int)row.size();
+ } else if ((int)row.size() != file_n_expert) {
+ if (err) *err = "inconsistent row width at layer " + std::to_string((int)(all_counts.size() / (size_t)file_n_expert));
return false;
}
all_counts.insert(all_counts.end(), row.begin(), row.end());
}
- if (n_expert <= 0 || all_counts.empty()) {
+ if (file_n_expert <= 0 || all_counts.empty()) {
if (err) *err = "no data rows found";
return false;
}
- const int detected_layers = (int)(all_counts.size() / (size_t)n_expert);
- if (n_layer == 0) n_layer = detected_layers;
- if (n_expert_used == 0) n_expert_used = 8; // default for Qwen3.5-MoE
+ const int detected_layers = (int)(all_counts.size() / (size_t)file_n_expert);
+ if (file_n_layer == 0) file_n_layer = detected_layers;
+ if (file_n_expert_used == 0) file_n_expert_used = 8; // default
- if ((int)all_counts.size() != n_layer * n_expert) {
- if (err) *err = "row count (" + std::to_string(detected_layers) + ") doesn't match n_layer (" + std::to_string(n_layer) + ")";
+ if ((int)all_counts.size() != file_n_layer * file_n_expert) {
+ if (err) *err = "row count (" + std::to_string(detected_layers) + ") doesn't match n_layer (" + std::to_string(file_n_layer) + ")";
return false;
}
- Qwen35MoeRoutingStats tmp;
- tmp.n_layer = n_layer;
- tmp.n_expert = n_expert;
- tmp.n_expert_used = n_expert_used;
+ MoeHybridRoutingStats tmp;
+ tmp.n_layer = file_n_layer;
+ tmp.n_expert = file_n_expert;
+ tmp.n_expert_used = file_n_expert_used;
tmp.counts = std::move(all_counts);
- tmp.layer_totals.assign((size_t)n_layer, 0);
- for (int il = 0; il < n_layer; ++il) {
+ tmp.layer_totals.assign((size_t)file_n_layer, 0);
+ for (int il = 0; il < file_n_layer; ++il) {
uint64_t total = 0;
- for (int ie = 0; ie < n_expert; ++ie) {
+ for (int ie = 0; ie < file_n_expert; ++ie) {
total += tmp.counts[tmp.index_of(il, ie)];
}
tmp.layer_totals[(size_t)il] = total;
diff --git a/server/src/qwen35moe/qwen35moe_routing_stats.h b/server/src/common/moe_hybrid_routing_stats.h
similarity index 71%
rename from server/src/qwen35moe/qwen35moe_routing_stats.h
rename to server/src/common/moe_hybrid_routing_stats.h
index 209ba2a0e..17fa43886 100644
--- a/server/src/qwen35moe/qwen35moe_routing_stats.h
+++ b/server/src/common/moe_hybrid_routing_stats.h
@@ -1,8 +1,11 @@
-// Reusable qwen35moe routing-statistics scaffold for Phase 2 expert placement.
+// Common MoE routing statistics for expert placement decisions.
#pragma once
-#include "internal.h"
+#include "moe_hybrid_types.h"
+
+#include "ggml.h"
+#include "ggml-backend.h"
#include
#include
@@ -10,7 +13,7 @@
namespace dflash::common {
-struct Qwen35MoeRoutingStats {
+struct MoeHybridRoutingStats {
int n_layer = 0;
int n_expert = 0;
int n_expert_used = 0;
@@ -19,8 +22,10 @@ struct Qwen35MoeRoutingStats {
std::vector counts;
std::vector layer_totals;
- bool init_from_weights(const TargetWeights & w);
- bool matches(const TargetWeights & w) const;
+ bool init(int n_layer, int n_expert, int n_expert_used);
+ bool init(const MoeHybridConfig & cfg);
+ bool matches(int n_layer, int n_expert, int n_expert_used) const;
+ bool matches(const MoeHybridConfig & cfg) const;
bool empty() const;
uint64_t count(int layer_idx, int expert_idx) const;
@@ -35,7 +40,7 @@ struct Qwen35MoeRoutingStats {
bool save_csv(const std::string & path, std::string * err = nullptr) const;
static bool load_csv(const std::string & path,
- Qwen35MoeRoutingStats & out,
+ MoeHybridRoutingStats & out,
std::string * err = nullptr);
private:
diff --git a/server/src/qwen35moe/qwen35moe_hybrid_storage.cpp b/server/src/common/moe_hybrid_storage.cpp
similarity index 67%
rename from server/src/qwen35moe/qwen35moe_hybrid_storage.cpp
rename to server/src/common/moe_hybrid_storage.cpp
index fa66c065c..ca080ce43 100644
--- a/server/src/qwen35moe/qwen35moe_hybrid_storage.cpp
+++ b/server/src/common/moe_hybrid_storage.cpp
@@ -1,4 +1,4 @@
-#include "qwen35moe_hybrid_storage.h"
+#include "moe_hybrid_storage.h"
#include "ggml-cpu.h"
@@ -41,7 +41,6 @@ static bool read_expert_slices(ggml_backend_t backend,
return true;
}
-// Read expert slices from raw memory (e.g. mmap) instead of a GPU tensor.
static bool read_expert_slices_from_mem(const uint8_t * tensor_data,
size_t tensor_size,
const std::vector & expert_ids,
@@ -90,7 +89,7 @@ static ggml_tensor * new_like_with_expert_count(ggml_context * ctx, ggml_tensor
} // namespace
-Qwen35MoeHybridStorage::~Qwen35MoeHybridStorage() {
+MoeHybridStorage::~MoeHybridStorage() {
for (auto & layer : layers) {
layer.hot_graph.free();
layer.cold_graph.free();
@@ -125,67 +124,73 @@ Qwen35MoeHybridStorage::~Qwen35MoeHybridStorage() {
}
}
-bool Qwen35MoeHybridStorage::matches(const TargetWeights & w) const {
- return placement.matches(w) && (int)layers.size() == w.n_layer;
+bool MoeHybridStorage::matches(const MoeHybridConfig & cfg) const {
+ return placement.matches(cfg) && (int)layers.size() == cfg.n_layer;
}
-bool Qwen35MoeHybridStorage::empty() const {
+bool MoeHybridStorage::empty() const {
return layers.empty();
}
-bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
- ggml_backend_t backend,
- const Qwen35MoeExpertPlacement & placement,
- Qwen35MoeHybridStorage & out,
- std::string * err) {
- if (!placement.matches(w)) {
- if (err) *err = "placement does not match model";
+bool build_moe_hybrid_storage(const MoeHybridConfig & cfg,
+ ggml_backend_t gpu_backend,
+ const MoeHybridPlacement & placement,
+ const std::vector & layer_descs,
+ MoeHybridStorage & out,
+ std::string * err) {
+ if (!placement.matches(cfg)) {
+ if (err) *err = "placement does not match config";
return false;
}
- if (!w.is_moe) {
- if (err) *err = "target is not qwen35moe";
+ if ((int)layer_descs.size() != cfg.n_layer) {
+ if (err) *err = "layer_descs size does not match n_layer";
return false;
}
-
out.placement = placement;
- out.layers.resize((size_t)w.n_layer);
+ out.layers.resize((size_t)cfg.n_layer);
out.cpu_backend = ggml_backend_cpu_init();
if (!out.cpu_backend) {
if (err) *err = "failed to init cpu backend";
return false;
}
- ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(w.n_expert_used, 8)));
+ ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(cfg.n_expert_used, 8)));
+
+ for (int il = 0; il < cfg.n_layer; ++il) {
+ const MoeLayerDesc & desc = layer_descs[(size_t)il];
+ MoeHybridLayerStorage & dst = out.layers[(size_t)il];
+
+ // Skip dense layers (no experts)
+ if (!desc.ffn_gate_exps && !desc.ffn_up_exps && !desc.ffn_down_exps && !desc.ffn_gate_up_exps) {
+ continue;
+ }
- for (int il = 0; il < w.n_layer; ++il) {
- const TargetLayer & L = w.layers[(size_t)il];
- Qwen35MoeHybridLayerStorage & dst = out.layers[(size_t)il];
dst.hot_expert_ids = placement.hot_expert_ids[(size_t)il];
- dst.hot_local_by_global.assign((size_t)w.n_expert, -1);
- dst.cold_local_by_global.assign((size_t)w.n_expert, -1);
+ dst.hot_local_by_global.assign((size_t)cfg.n_expert, -1);
+ dst.cold_local_by_global.assign((size_t)cfg.n_expert, -1);
- std::vector is_hot((size_t)w.n_expert, 0);
+ std::vector is_hot((size_t)cfg.n_expert, 0);
for (size_t i = 0; i < dst.hot_expert_ids.size(); ++i) {
const int32_t expert = dst.hot_expert_ids[i];
- if (expert < 0 || expert >= w.n_expert) {
+ if (expert < 0 || expert >= cfg.n_expert) {
if (err) *err = "hot expert id out of range";
return false;
}
dst.hot_local_by_global[(size_t)expert] = (int32_t)i;
is_hot[(size_t)expert] = 1;
}
- for (int expert = 0; expert < w.n_expert; ++expert) {
+ for (int expert = 0; expert < cfg.n_expert; ++expert) {
if (!is_hot[(size_t)expert]) {
dst.cold_local_by_global[(size_t)expert] = (int32_t)dst.cold_expert_ids.size();
dst.cold_expert_ids.push_back((int32_t)expert);
}
}
- dst.fused_gate_up = (L.ffn_gate_up_exps != nullptr);
- if (!validate_expert_tensor(L.ffn_gate_exps, w.n_expert, &dst.gate_expert_bytes, err) ||
- !validate_expert_tensor(L.ffn_up_exps, w.n_expert, &dst.up_expert_bytes, err) ||
- !validate_expert_tensor(L.ffn_down_exps, w.n_expert, &dst.down_expert_bytes, err) ||
- !validate_expert_tensor(L.ffn_gate_up_exps, w.n_expert, &dst.gate_up_expert_bytes, err)) {
+ dst.fused_gate_up = desc.has_fused_gate_up();
+ if (!validate_expert_tensor(desc.ffn_gate_exps, cfg.n_expert, &dst.gate_expert_bytes, err) ||
+ !validate_expert_tensor(desc.ffn_up_exps, cfg.n_expert, &dst.up_expert_bytes, err) ||
+ !validate_expert_tensor(desc.ffn_down_exps, cfg.n_expert, &dst.down_expert_bytes, err) ||
+ !validate_expert_tensor(desc.ffn_gate_up_exps, cfg.n_expert, &dst.gate_up_expert_bytes, err)) {
return false;
}
@@ -204,47 +209,41 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
return false;
}
if (dst.fused_gate_up) {
- dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_up_exps, hot_count);
- dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count);
+ dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_up_exps, hot_count);
+ dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count);
} else {
- dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_exps, hot_count);
- dst.up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_up_exps, hot_count);
- dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count);
+ dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_exps, hot_count);
+ dst.up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_up_exps, hot_count);
+ dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count);
}
- dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, backend);
+ dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, gpu_backend);
if (!dst.hot_buf) {
if (err) *err = "failed to allocate hot expert buffer";
return false;
}
- // Copy hot expert slices from full GPU tensors to hot_buf
std::vector hot_bytes;
if (dst.fused_gate_up) {
- if (!read_expert_slices(backend, L.ffn_gate_up_exps, dst.hot_expert_ids,
- dst.gate_up_expert_bytes, hot_bytes, err)) {
+ if (!read_expert_slices(gpu_backend, desc.ffn_gate_up_exps, dst.hot_expert_ids,
+ dst.gate_up_expert_bytes, hot_bytes, err))
return false;
- }
ggml_backend_tensor_set(dst.gate_up_hot, hot_bytes.data(), 0, hot_bytes.size());
- if (!read_expert_slices(backend, L.ffn_down_exps, dst.hot_expert_ids,
- dst.down_expert_bytes, hot_bytes, err)) {
+ if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.hot_expert_ids,
+ dst.down_expert_bytes, hot_bytes, err))
return false;
- }
ggml_backend_tensor_set(dst.down_hot, hot_bytes.data(), 0, hot_bytes.size());
} else {
- if (!read_expert_slices(backend, L.ffn_gate_exps, dst.hot_expert_ids,
- dst.gate_expert_bytes, hot_bytes, err)) {
+ if (!read_expert_slices(gpu_backend, desc.ffn_gate_exps, dst.hot_expert_ids,
+ dst.gate_expert_bytes, hot_bytes, err))
return false;
- }
ggml_backend_tensor_set(dst.gate_hot, hot_bytes.data(), 0, hot_bytes.size());
- if (!read_expert_slices(backend, L.ffn_up_exps, dst.hot_expert_ids,
- dst.up_expert_bytes, hot_bytes, err)) {
+ if (!read_expert_slices(gpu_backend, desc.ffn_up_exps, dst.hot_expert_ids,
+ dst.up_expert_bytes, hot_bytes, err))
return false;
- }
ggml_backend_tensor_set(dst.up_hot, hot_bytes.data(), 0, hot_bytes.size());
- if (!read_expert_slices(backend, L.ffn_down_exps, dst.hot_expert_ids,
- dst.down_expert_bytes, hot_bytes, err)) {
+ if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.hot_expert_ids,
+ dst.down_expert_bytes, hot_bytes, err))
return false;
- }
ggml_backend_tensor_set(dst.down_hot, hot_bytes.data(), 0, hot_bytes.size());
}
}
@@ -261,64 +260,42 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
return false;
}
if (dst.fused_gate_up) {
- dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_up_exps, cold_count);
- dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count);
+ dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_up_exps, cold_count);
+ dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count);
} else {
- dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_exps, cold_count);
- dst.up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_up_exps, cold_count);
- dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count);
+ dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_exps, cold_count);
+ dst.up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_up_exps, cold_count);
+ dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count);
}
dst.cold_buf = ggml_backend_alloc_ctx_tensors(dst.cold_ctx, out.cpu_backend);
if (!dst.cold_buf) {
if (err) *err = "failed to allocate cold expert buffer";
return false;
}
- }
- if (dst.fused_gate_up) {
- if (!read_expert_slices(backend, L.ffn_gate_up_exps, dst.cold_expert_ids,
- dst.gate_up_expert_bytes, dst.gate_up_cold_bytes, err)) {
- return false;
- }
- } else {
- if (!read_expert_slices(backend, L.ffn_gate_exps, dst.cold_expert_ids,
- dst.gate_expert_bytes, dst.gate_cold_bytes, err) ||
- !read_expert_slices(backend, L.ffn_up_exps, dst.cold_expert_ids,
- dst.up_expert_bytes, dst.up_cold_bytes, err)) {
- return false;
- }
- }
- if (!read_expert_slices(backend, L.ffn_down_exps, dst.cold_expert_ids,
- dst.down_expert_bytes, dst.down_cold_bytes, err)) {
- return false;
- }
-
- if (dst.fused_gate_up) {
- if (dst.gate_up_cold && !dst.gate_up_cold_bytes.empty()) {
- ggml_backend_tensor_set(dst.gate_up_cold, dst.gate_up_cold_bytes.data(), 0, dst.gate_up_cold_bytes.size());
- dst.gate_up_cold_bytes.clear();
- dst.gate_up_cold_bytes.shrink_to_fit();
- }
- if (dst.down_cold && !dst.down_cold_bytes.empty()) {
- ggml_backend_tensor_set(dst.down_cold, dst.down_cold_bytes.data(), 0, dst.down_cold_bytes.size());
- dst.down_cold_bytes.clear();
- dst.down_cold_bytes.shrink_to_fit();
- }
- } else {
- if (dst.gate_cold && !dst.gate_cold_bytes.empty()) {
- ggml_backend_tensor_set(dst.gate_cold, dst.gate_cold_bytes.data(), 0, dst.gate_cold_bytes.size());
- dst.gate_cold_bytes.clear();
- dst.gate_cold_bytes.shrink_to_fit();
- }
- if (dst.up_cold && !dst.up_cold_bytes.empty()) {
- ggml_backend_tensor_set(dst.up_cold, dst.up_cold_bytes.data(), 0, dst.up_cold_bytes.size());
- dst.up_cold_bytes.clear();
- dst.up_cold_bytes.shrink_to_fit();
- }
- if (dst.down_cold && !dst.down_cold_bytes.empty()) {
- ggml_backend_tensor_set(dst.down_cold, dst.down_cold_bytes.data(), 0, dst.down_cold_bytes.size());
- dst.down_cold_bytes.clear();
- dst.down_cold_bytes.shrink_to_fit();
+ std::vector cold_bytes;
+ if (dst.fused_gate_up) {
+ if (!read_expert_slices(gpu_backend, desc.ffn_gate_up_exps, dst.cold_expert_ids,
+ dst.gate_up_expert_bytes, cold_bytes, err))
+ return false;
+ ggml_backend_tensor_set(dst.gate_up_cold, cold_bytes.data(), 0, cold_bytes.size());
+ if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.cold_expert_ids,
+ dst.down_expert_bytes, cold_bytes, err))
+ return false;
+ ggml_backend_tensor_set(dst.down_cold, cold_bytes.data(), 0, cold_bytes.size());
+ } else {
+ if (!read_expert_slices(gpu_backend, desc.ffn_gate_exps, dst.cold_expert_ids,
+ dst.gate_expert_bytes, cold_bytes, err))
+ return false;
+ ggml_backend_tensor_set(dst.gate_cold, cold_bytes.data(), 0, cold_bytes.size());
+ if (!read_expert_slices(gpu_backend, desc.ffn_up_exps, dst.cold_expert_ids,
+ dst.up_expert_bytes, cold_bytes, err))
+ return false;
+ ggml_backend_tensor_set(dst.up_cold, cold_bytes.data(), 0, cold_bytes.size());
+ if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.cold_expert_ids,
+ dst.down_expert_bytes, cold_bytes, err))
+ return false;
+ ggml_backend_tensor_set(dst.down_cold, cold_bytes.data(), 0, cold_bytes.size());
}
}
}
@@ -326,67 +303,69 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
return true;
}
-bool build_qwen35moe_hybrid_storage_from_file(
- const TargetWeights & w,
+bool build_moe_hybrid_storage_from_file(
+ const MoeHybridConfig & cfg,
ggml_backend_t gpu_backend,
- const Qwen35MoeExpertPlacement & placement,
+ const MoeHybridPlacement & placement,
+ const std::vector & layer_descs,
const std::vector & file_data,
- Qwen35MoeHybridStorage & out,
+ MoeHybridStorage & out,
std::string * err) {
- if (!placement.matches(w)) {
- if (err) *err = "placement does not match model";
- return false;
- }
- if (!w.is_moe) {
- if (err) *err = "target is not qwen35moe";
+ if (!placement.matches(cfg)) {
+ if (err) *err = "placement does not match config";
return false;
}
- if ((int)file_data.size() != w.n_layer) {
- if (err) *err = "file_data size does not match n_layer";
+ if ((int)layer_descs.size() != cfg.n_layer || (int)file_data.size() != cfg.n_layer) {
+ if (err) *err = "layer_descs/file_data size does not match n_layer";
return false;
}
-
out.placement = placement;
- out.layers.resize((size_t)w.n_layer);
+ out.layers.resize((size_t)cfg.n_layer);
out.cpu_backend = ggml_backend_cpu_init();
if (!out.cpu_backend) {
if (err) *err = "failed to init cpu backend";
return false;
}
- ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(w.n_expert_used, 8)));
+ ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(cfg.n_expert_used, 8)));
- for (int il = 0; il < w.n_layer; ++il) {
- const TargetLayer & L = w.layers[(size_t)il];
+ for (int il = 0; il < cfg.n_layer; ++il) {
+ const MoeLayerDesc & desc = layer_descs[(size_t)il];
const LayerExpertFileData & fd = file_data[(size_t)il];
- Qwen35MoeHybridLayerStorage & dst = out.layers[(size_t)il];
+ MoeHybridLayerStorage & dst = out.layers[(size_t)il];
+
+ // Skip dense layers (no experts)
+ if (!desc.ffn_gate_exps && !desc.ffn_up_exps && !desc.ffn_down_exps && !desc.ffn_gate_up_exps) {
+ continue;
+ }
+
dst.hot_expert_ids = placement.hot_expert_ids[(size_t)il];
- dst.hot_local_by_global.assign((size_t)w.n_expert, -1);
- dst.cold_local_by_global.assign((size_t)w.n_expert, -1);
+ dst.hot_local_by_global.assign((size_t)cfg.n_expert, -1);
+ dst.cold_local_by_global.assign((size_t)cfg.n_expert, -1);
- std::vector is_hot((size_t)w.n_expert, 0);
+ std::vector is_hot((size_t)cfg.n_expert, 0);
for (size_t i = 0; i < dst.hot_expert_ids.size(); ++i) {
const int32_t expert = dst.hot_expert_ids[i];
- if (expert < 0 || expert >= w.n_expert) {
+ if (expert < 0 || expert >= cfg.n_expert) {
if (err) *err = "hot expert id out of range";
return false;
}
dst.hot_local_by_global[(size_t)expert] = (int32_t)i;
is_hot[(size_t)expert] = 1;
}
- for (int expert = 0; expert < w.n_expert; ++expert) {
+ for (int expert = 0; expert < cfg.n_expert; ++expert) {
if (!is_hot[(size_t)expert]) {
dst.cold_local_by_global[(size_t)expert] = (int32_t)dst.cold_expert_ids.size();
dst.cold_expert_ids.push_back((int32_t)expert);
}
}
- dst.fused_gate_up = (L.ffn_gate_up_exps != nullptr);
- if (!validate_expert_tensor(L.ffn_gate_exps, w.n_expert, &dst.gate_expert_bytes, err) ||
- !validate_expert_tensor(L.ffn_up_exps, w.n_expert, &dst.up_expert_bytes, err) ||
- !validate_expert_tensor(L.ffn_down_exps, w.n_expert, &dst.down_expert_bytes, err) ||
- !validate_expert_tensor(L.ffn_gate_up_exps, w.n_expert, &dst.gate_up_expert_bytes, err)) {
+ dst.fused_gate_up = desc.has_fused_gate_up();
+ if (!validate_expert_tensor(desc.ffn_gate_exps, cfg.n_expert, &dst.gate_expert_bytes, err) ||
+ !validate_expert_tensor(desc.ffn_up_exps, cfg.n_expert, &dst.up_expert_bytes, err) ||
+ !validate_expert_tensor(desc.ffn_down_exps, cfg.n_expert, &dst.down_expert_bytes, err) ||
+ !validate_expert_tensor(desc.ffn_gate_up_exps, cfg.n_expert, &dst.gate_up_expert_bytes, err)) {
return false;
}
@@ -405,20 +384,22 @@ bool build_qwen35moe_hybrid_storage_from_file(
return false;
}
if (dst.fused_gate_up) {
- dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_up_exps, hot_count);
- dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count);
+ dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_up_exps, hot_count);
+ dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count);
} else {
- dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_exps, hot_count);
- dst.up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_up_exps, hot_count);
- dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count);
+ dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_exps, hot_count);
+ dst.up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_up_exps, hot_count);
+ dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count);
}
dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, gpu_backend);
if (!dst.hot_buf) {
- if (err) *err = "failed to allocate hot expert GPU buffer";
+ char msg[128];
+ std::snprintf(msg, sizeof(msg),
+ "failed to allocate hot expert GPU buffer (layer %d, %d hot experts)", il, hot_count);
+ if (err) *err = msg;
return false;
}
- // Load hot expert slices from file
std::vector slice_buf;
if (dst.fused_gate_up) {
if (!read_expert_slices_from_mem(fd.gate_up_exps.data, fd.gate_up_exps.size,
@@ -457,12 +438,12 @@ bool build_qwen35moe_hybrid_storage_from_file(
return false;
}
if (dst.fused_gate_up) {
- dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_up_exps, cold_count);
- dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count);
+ dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_up_exps, cold_count);
+ dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count);
} else {
- dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_exps, cold_count);
- dst.up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_up_exps, cold_count);
- dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count);
+ dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_exps, cold_count);
+ dst.up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_up_exps, cold_count);
+ dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count);
}
dst.cold_buf = ggml_backend_alloc_ctx_tensors(dst.cold_ctx, out.cpu_backend);
if (!dst.cold_buf) {
@@ -470,7 +451,6 @@ bool build_qwen35moe_hybrid_storage_from_file(
return false;
}
- // Load cold expert slices from file directly to CPU tensors
std::vector slice_buf;
if (dst.fused_gate_up) {
if (!read_expert_slices_from_mem(fd.gate_up_exps.data, fd.gate_up_exps.size,
diff --git a/server/src/qwen35moe/qwen35moe_hybrid_storage.h b/server/src/common/moe_hybrid_storage.h
similarity index 58%
rename from server/src/qwen35moe/qwen35moe_hybrid_storage.h
rename to server/src/common/moe_hybrid_storage.h
index c883da78e..465827ec0 100644
--- a/server/src/qwen35moe/qwen35moe_hybrid_storage.h
+++ b/server/src/common/moe_hybrid_storage.h
@@ -1,8 +1,9 @@
-// Phase 3 hybrid expert storage for qwen35moe.
+// Common MoE hybrid expert storage — manages hot (GPU) and cold (CPU) expert buffers.
#pragma once
-#include "qwen35moe_expert_placement.h"
+#include "moe_hybrid_types.h"
+#include "moe_hybrid_placement.h"
#include "ggml-alloc.h"
@@ -29,7 +30,7 @@ struct CachedFfnGraph {
void free();
};
-struct Qwen35MoeHybridLayerStorage {
+struct MoeHybridLayerStorage {
ggml_context * hot_ctx = nullptr;
ggml_backend_buffer_t hot_buf = nullptr;
ggml_tensor * gate_hot = nullptr;
@@ -60,39 +61,31 @@ struct Qwen35MoeHybridLayerStorage {
std::vector down_cold_bytes;
std::vector gate_up_cold_bytes;
- // Cached FFN graphs: hot_graph for all-hot case (n_expert_used hot experts),
- // cold_graph for all-cold case (n_expert_used cold experts).
- // These cover the common case; mixed hot/cold falls back to dynamic build.
- CachedFfnGraph hot_graph; // GPU: fused routed(n_expert_used hot) + shared
- CachedFfnGraph cold_graph; // CPU: routed(n_expert_used cold)
+ // Cached FFN graphs for common-case expert counts.
+ CachedFfnGraph hot_graph;
+ CachedFfnGraph cold_graph;
};
-struct Qwen35MoeHybridStorage {
- Qwen35MoeHybridStorage() = default;
- Qwen35MoeHybridStorage(const Qwen35MoeHybridStorage &) = delete;
- Qwen35MoeHybridStorage & operator=(const Qwen35MoeHybridStorage &) = delete;
- Qwen35MoeHybridStorage(Qwen35MoeHybridStorage &&) = delete;
- Qwen35MoeHybridStorage & operator=(Qwen35MoeHybridStorage &&) = delete;
- ~Qwen35MoeHybridStorage();
+struct MoeHybridStorage {
+ MoeHybridStorage() = default;
+ MoeHybridStorage(const MoeHybridStorage &) = delete;
+ MoeHybridStorage & operator=(const MoeHybridStorage &) = delete;
+ MoeHybridStorage(MoeHybridStorage &&) = delete;
+ MoeHybridStorage & operator=(MoeHybridStorage &&) = delete;
+ ~MoeHybridStorage();
ggml_backend_t cpu_backend = nullptr;
- Qwen35MoeExpertPlacement placement;
- std::vector layers;
+ MoeHybridPlacement placement;
+ std::vector layers;
- bool matches(const TargetWeights & w) const;
+ bool matches(const MoeHybridConfig & cfg) const;
bool empty() const;
};
-bool build_qwen35moe_hybrid_storage(const TargetWeights & w,
- ggml_backend_t backend,
- const Qwen35MoeExpertPlacement & placement,
- Qwen35MoeHybridStorage & out,
- std::string * err = nullptr);
-
// Expert tensor file data for split loading (one entry per expert tensor).
struct ExpertTensorFileData {
- const uint8_t * data = nullptr; // pointer into mmap
- size_t size = 0; // total tensor size in bytes
+ const uint8_t * data = nullptr;
+ size_t size = 0;
};
// Per-layer expert tensor file data for split loading.
@@ -103,15 +96,23 @@ struct LayerExpertFileData {
ExpertTensorFileData gate_up_exps; // optional fused
};
+// Build hybrid storage from GPU-resident expert tensors.
+// layer_descs: one MoeLayerDesc per MoE layer (caller constructs from model-specific types).
+bool build_moe_hybrid_storage(const MoeHybridConfig & cfg,
+ ggml_backend_t gpu_backend,
+ const MoeHybridPlacement & placement,
+ const std::vector & layer_descs,
+ MoeHybridStorage & out,
+ std::string * err = nullptr);
+
// Build hybrid storage by loading expert data directly from file (mmap).
-// Expert tensors in w are only used for metadata (ne/nb/type); their buffer
-// may be null. Expert data is read from file_data entries.
-bool build_qwen35moe_hybrid_storage_from_file(
- const TargetWeights & w,
+bool build_moe_hybrid_storage_from_file(
+ const MoeHybridConfig & cfg,
ggml_backend_t gpu_backend,
- const Qwen35MoeExpertPlacement & placement,
+ const MoeHybridPlacement & placement,
+ const std::vector & layer_descs,
const std::vector & file_data,
- Qwen35MoeHybridStorage & out,
+ MoeHybridStorage & out,
std::string * err = nullptr);
} // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_swap_manager.cpp b/server/src/common/moe_hybrid_swap_manager.cpp
similarity index 86%
rename from server/src/qwen35moe/qwen35moe_swap_manager.cpp
rename to server/src/common/moe_hybrid_swap_manager.cpp
index 3c7006d60..399d9c612 100644
--- a/server/src/qwen35moe/qwen35moe_swap_manager.cpp
+++ b/server/src/common/moe_hybrid_swap_manager.cpp
@@ -1,14 +1,15 @@
-#include "qwen35moe_swap_manager.h"
+#include "moe_hybrid_swap_manager.h"
+#include "moe_hybrid_routing_stats.h"
#include
namespace dflash::common {
-bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current,
- const Qwen35MoeRoutingStats & stats,
- const Qwen35MoeSwapPolicy & policy,
- Qwen35MoeSwapPlan & out,
- std::string * err) {
+bool build_moe_hybrid_swap_plan(const MoeHybridPlacement & current,
+ const MoeHybridRoutingStats & stats,
+ const MoeHybridSwapPolicy & policy,
+ MoeHybridSwapPlan & out,
+ std::string * err) {
if (current.n_layer != stats.n_layer ||
current.n_expert != stats.n_expert ||
current.n_expert_used != stats.n_expert_used) {
@@ -16,14 +17,14 @@ bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current,
return false;
}
- out = Qwen35MoeSwapPlan{};
+ out = MoeHybridSwapPlan{};
out.next_placement = current;
if (policy.max_swaps_total <= 0) {
return true;
}
struct Candidate {
- Qwen35MoeSwapAction action;
+ MoeHybridSwapAction action;
uint64_t gain_delta = 0;
};
std::vector candidates;
diff --git a/server/src/common/moe_hybrid_swap_manager.h b/server/src/common/moe_hybrid_swap_manager.h
new file mode 100644
index 000000000..c24a71740
--- /dev/null
+++ b/server/src/common/moe_hybrid_swap_manager.h
@@ -0,0 +1,39 @@
+// Common MoE hybrid swap manager — promotes/demotes experts at request boundaries.
+
+#pragma once
+
+#include "moe_hybrid_placement.h"
+
+#include
+#include
+#include
+
+namespace dflash::common {
+
+struct MoeHybridRoutingStats;
+
+struct MoeHybridSwapAction {
+ int layer_idx = -1;
+ int evict_expert = -1;
+ int promote_expert = -1;
+ uint64_t evict_count = 0;
+ uint64_t promote_count = 0;
+};
+
+struct MoeHybridSwapPlan {
+ MoeHybridPlacement next_placement;
+ std::vector actions;
+};
+
+struct MoeHybridSwapPolicy {
+ int max_swaps_total = 0; // 0 = no swaps
+ uint64_t min_promote_gain = 1; // promoted expert count must exceed evicted by this amount
+};
+
+bool build_moe_hybrid_swap_plan(const MoeHybridPlacement & current,
+ const MoeHybridRoutingStats & stats,
+ const MoeHybridSwapPolicy & policy,
+ MoeHybridSwapPlan & out,
+ std::string * err = nullptr);
+
+} // namespace dflash::common
diff --git a/server/src/common/moe_hybrid_types.h b/server/src/common/moe_hybrid_types.h
new file mode 100644
index 000000000..c3a15e6bd
--- /dev/null
+++ b/server/src/common/moe_hybrid_types.h
@@ -0,0 +1,63 @@
+// Common MoE hybrid mode types and descriptors.
+//
+// Model-agnostic abstractions used by both qwen35moe and laguna backends
+// to implement the hybrid expert offload strategy (hot experts on GPU,
+// cold experts on CPU, concurrent evaluation).
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include
+
+namespace dflash::common {
+
+// ─── MoE architecture config (model-agnostic) ──────────────────────────
+
+struct MoeHybridConfig {
+ int n_embd = 0; // hidden dimension
+ int n_expert = 0; // total experts per layer
+ int n_expert_used = 0; // top-k selected per token
+ int n_ff_exp = 0; // routed expert intermediate dimension
+ int n_ff_shexp = 0; // shared expert intermediate dimension (0 = no shared)
+ int n_layer = 0; // number of MoE layers
+ int first_moe_layer = 0; // index of first MoE layer (e.g., 0 for qwen35moe, 1 for laguna)
+};
+
+// ─── Per-layer expert tensor descriptor ─────────────────────────────────
+//
+// Provides a uniform view over model-specific layer structures. All pointers
+// refer to the FULL expert tensor stacks on GPU (used for placement validation
+// and metadata queries). In hybrid mode, the actual hot/cold split tensors
+// live in MoeHybridLayerStorage.
+
+struct MoeLayerDesc {
+ // Routed expert weight tensors (stacked: [dim_in, dim_out, n_expert])
+ ggml_tensor * ffn_gate_exps = nullptr;
+ ggml_tensor * ffn_up_exps = nullptr;
+ ggml_tensor * ffn_down_exps = nullptr;
+ ggml_tensor * ffn_gate_up_exps = nullptr; // optional fused gate+up
+
+ // Shared expert tensors (nullptr if no shared expert)
+ ggml_tensor * ffn_gate_shexp = nullptr;
+ ggml_tensor * ffn_up_shexp = nullptr;
+ ggml_tensor * ffn_down_shexp = nullptr;
+ ggml_tensor * ffn_gate_inp_shexp = nullptr; // optional shared-expert gating
+
+ // Per-tensor quantization scale factors (1.0f = no scaling)
+ float ffn_gate_exps_s = 1.0f;
+ float ffn_up_exps_s = 1.0f;
+ float ffn_down_exps_s = 1.0f;
+ float ffn_gate_up_exps_s = 1.0f;
+ float ffn_gate_shexp_s = 1.0f;
+ float ffn_up_shexp_s = 1.0f;
+ float ffn_down_shexp_s = 1.0f;
+ float ffn_gate_inp_shexp_s = 1.0f;
+
+ bool has_fused_gate_up() const { return ffn_gate_up_exps != nullptr; }
+ bool has_shared_expert() const { return ffn_up_shexp != nullptr; }
+};
+
+} // namespace dflash::common
+
diff --git a/server/src/common/moe_hybrid_types_impl.h b/server/src/common/moe_hybrid_types_impl.h
new file mode 100644
index 000000000..ddb4355bc
--- /dev/null
+++ b/server/src/common/moe_hybrid_types_impl.h
@@ -0,0 +1,91 @@
+// Inline implementations for MoeHybridConfig/MoeLayerDesc conversion helpers.
+//
+// Include this header AFTER both moe_hybrid_types.h and the relevant
+// model-specific weight struct header (internal.h or laguna_internal.h).
+// The preprocessor guards detect which weight structs are available and
+// only define the corresponding conversion helpers.
+
+#pragma once
+
+namespace dflash::common {
+
+// ─── qwen35 conversions ─────────────────────────────────────────────────
+
+#if defined(DFLASH_INTERNAL_H_INCLUDED)
+
+inline MoeHybridConfig make_moe_hybrid_config(const TargetWeights & w) {
+ MoeHybridConfig cfg;
+ cfg.n_embd = w.n_embd;
+ cfg.n_expert = w.n_expert;
+ cfg.n_expert_used = w.n_expert_used;
+ cfg.n_ff_exp = w.n_ff_exp;
+ cfg.n_ff_shexp = w.n_ff_shexp;
+ cfg.n_layer = w.n_layer;
+ cfg.first_moe_layer = 0; // all layers are MoE in qwen35moe
+ return cfg;
+}
+
+inline MoeLayerDesc make_moe_layer_desc(const TargetLayer & L) {
+ MoeLayerDesc desc;
+ desc.ffn_gate_exps = L.ffn_gate_exps;
+ desc.ffn_up_exps = L.ffn_up_exps;
+ desc.ffn_down_exps = L.ffn_down_exps;
+ desc.ffn_gate_up_exps = L.ffn_gate_up_exps;
+ desc.ffn_gate_shexp = L.ffn_gate_shexp;
+ desc.ffn_up_shexp = L.ffn_up_shexp;
+ desc.ffn_down_shexp = L.ffn_down_shexp;
+ desc.ffn_gate_inp_shexp = L.ffn_gate_inp_shexp;
+ desc.ffn_gate_exps_s = L.ffn_gate_exps_s;
+ desc.ffn_up_exps_s = L.ffn_up_exps_s;
+ desc.ffn_down_exps_s = L.ffn_down_exps_s;
+ desc.ffn_gate_up_exps_s = L.ffn_gate_up_exps_s;
+ desc.ffn_gate_shexp_s = L.ffn_gate_shexp_s;
+ desc.ffn_up_shexp_s = L.ffn_up_shexp_s;
+ desc.ffn_down_shexp_s = L.ffn_down_shexp_s;
+ desc.ffn_gate_inp_shexp_s = L.ffn_gate_inp_shexp_s;
+ return desc;
+}
+
+#endif // DFLASH_INTERNAL_H_INCLUDED
+
+// ─── Laguna conversions ─────────────────────────────────────────────────
+
+#if defined(DFLASH_LAGUNA_INTERNAL_H_INCLUDED)
+
+inline MoeHybridConfig make_moe_hybrid_config(const LagunaTargetWeights & w) {
+ MoeHybridConfig cfg;
+ cfg.n_embd = w.n_embd;
+ cfg.n_expert = w.n_expert;
+ cfg.n_expert_used = w.n_expert_used;
+ cfg.n_ff_exp = w.n_ff_exp;
+ cfg.n_ff_shexp = w.n_ff_shexp;
+ cfg.n_layer = w.n_layer;
+ cfg.first_moe_layer = w.n_layer_dense_lead; // layer 0 is dense in laguna
+ return cfg;
+}
+
+inline MoeLayerDesc make_moe_layer_desc(const LagunaTargetLayer & L) {
+ MoeLayerDesc desc;
+ desc.ffn_gate_exps = L.ffn_gate_exps;
+ desc.ffn_up_exps = L.ffn_up_exps;
+ desc.ffn_down_exps = L.ffn_down_exps;
+ desc.ffn_gate_up_exps = nullptr; // laguna has no fused gate_up
+ desc.ffn_gate_shexp = L.ffn_gate_shexp;
+ desc.ffn_up_shexp = L.ffn_up_shexp;
+ desc.ffn_down_shexp = L.ffn_down_shexp;
+ desc.ffn_gate_inp_shexp = nullptr; // laguna has no shared-expert gate
+ // Laguna does not use per-tensor quantization scales
+ desc.ffn_gate_exps_s = 1.0f;
+ desc.ffn_up_exps_s = 1.0f;
+ desc.ffn_down_exps_s = 1.0f;
+ desc.ffn_gate_up_exps_s = 1.0f;
+ desc.ffn_gate_shexp_s = 1.0f;
+ desc.ffn_up_shexp_s = 1.0f;
+ desc.ffn_down_shexp_s = 1.0f;
+ desc.ffn_gate_inp_shexp_s = 1.0f;
+ return desc;
+}
+
+#endif // DFLASH_LAGUNA_INTERNAL_H_INCLUDED
+
+} // namespace dflash::common
diff --git a/server/src/internal.h b/server/src/internal.h
index f9a890ff2..9b5b45ca7 100644
--- a/server/src/internal.h
+++ b/server/src/internal.h
@@ -2,6 +2,7 @@
// Not installed, not exposed in the public API.
#pragma once
+#define DFLASH_INTERNAL_H_INCLUDED
#include
#include
@@ -27,7 +28,7 @@
namespace dflash::common {
-struct Qwen35MoeHybridStorage;
+struct MoeHybridStorage;
// Single source of truth for error reporting.
// All loaders / graph builders push into this via set_last_error(...).
@@ -155,7 +156,7 @@ struct TargetWeights {
std::vector layers; // size = 64
ggml_tensor * out_norm = nullptr; // [hidden]
ggml_tensor * output = nullptr; // [hidden, vocab] (lm_head)
- std::shared_ptr moe_hybrid; // optional Phase 3 hybrid storage
+ std::shared_ptr moe_hybrid; // optional hybrid storage (hot/cold expert split)
// Metadata from GGUF (validated at load time)
int full_attention_interval = 4;
diff --git a/server/src/laguna/laguna_backend.cpp b/server/src/laguna/laguna_backend.cpp
index 87723f596..ff82df7f7 100644
--- a/server/src/laguna/laguna_backend.cpp
+++ b/server/src/laguna/laguna_backend.cpp
@@ -10,7 +10,17 @@
#include "laguna_internal.h"
#include "dflash27b.h"
+#include "../common/moe_hybrid_types.h"
+#include "../common/moe_hybrid_types_impl.h"
+#include "../common/moe_hybrid_placement.h"
+#include "../common/moe_hybrid_ffn_eval.h"
+#include "../common/moe_hybrid_storage.h"
+#include "../common/moe_hybrid_routing_stats.h"
+#include "../common/moe_hybrid_swap_manager.h"
+#include "common/step_graph.h"
+
#include "ggml-cuda.h"
+#include "ggml-alloc.h"
#include "common/snapshot_backend.h"
#include
@@ -20,6 +30,10 @@
#include
#include
#include
+#include
+#include
+#include
+#include
namespace dflash::common {
@@ -44,8 +58,9 @@ bool LagunaBackend::init() {
return false;
}
- if (!load_target_gguf_laguna(args_.target_path, backend_, w_)) {
- std::fprintf(stderr, "load failed: %s\n", dflash27b_last_error());
+ // Always use dynamic placement (like qwen35moe): partial load first,
+ // compute budget, then reload full if all experts fit.
+ if (!init_hybrid_mode()) {
ggml_backend_free(backend_); backend_ = nullptr;
return false;
}
@@ -146,6 +161,12 @@ int LagunaBackend::snapshot_cur_pos(int slot) const {
GenerateResult LagunaBackend::generate(const GenerateRequest & req,
const DaemonIO & io) {
+ if (hybrid_mode_ && moe_hybrid_) {
+ auto result = generate_hybrid(req, io);
+ if (result.ok) maybe_post_request_swap();
+ return result;
+ }
+
const bool no_mask = (std::getenv("DFLASH_NO_MASK") != nullptr);
GenerateResult result;
DaemonIO out_io = io.with_token_callback(req.on_token);
@@ -492,6 +513,983 @@ void LagunaBackend::free_drafter() {
}
}
+// ── Hybrid MoE mode ─────────────────────────────────────────────────────
+//
+// Layer-by-layer decode: for each token, iterate through all 40 layers.
+// Layer 0 (dense SwiGLU) runs as a monolithic GPU sub-graph.
+// Layers 1..39 (sparse MoE) run attention+router on GPU, read back expert
+// selections, then call the common hybrid FFN eval (hot on GPU, cold on CPU).
+
+using HybridClock = std::chrono::steady_clock;
+static inline uint64_t elapsed_us(HybridClock::time_point t0, HybridClock::time_point t1) {
+ return (uint64_t)std::chrono::duration_cast(t1 - t0).count();
+}
+
+bool LagunaBackend::init_hybrid_mode() {
+ const char * hotness_path = std::getenv("DFLASH_LAGUNA_HOTNESS");
+
+ // Step 1: Load model WITHOUT expert data to GPU (partial load)
+ TargetLoadPlan _hybrid_plan;
+ _hybrid_plan.skip_expert_tensors = true;
+ if (!load_target_gguf_laguna_partial(args_.target_path, backend_, _hybrid_plan, w_)) {
+ std::fprintf(stderr, "[laguna-hybrid] partial load failed: %s\n", dflash27b_last_error());
+ return false;
+ }
+
+ // Step 2: Load/build routing stats
+ MoeHybridRoutingStats hotness;
+ std::string err;
+ std::string placement_source;
+ if (hotness_path && hotness_path[0]) {
+ if (!MoeHybridRoutingStats::load_csv(std::string(hotness_path), hotness, &err)) {
+ std::fprintf(stderr, "[laguna-hybrid] hotness load failed: %s\n", err.c_str());
+ return false;
+ }
+ if (hotness.n_layer != w_.n_layer || hotness.n_expert != w_.n_expert) {
+ std::fprintf(stderr, "[laguna-hybrid] hotness dimensions mismatch (got %d×%d, want %d×%d)\n",
+ hotness.n_layer, hotness.n_expert, w_.n_layer, w_.n_expert);
+ return false;
+ }
+ placement_source = "file";
+ } else {
+ // Uniform hotness (budget-only mode, no hotness file)
+ hotness.n_layer = w_.n_layer;
+ hotness.n_expert = w_.n_expert;
+ hotness.n_expert_used = w_.n_expert_used;
+ hotness.counts.assign((size_t)w_.n_layer * (size_t)w_.n_expert, 1);
+ hotness.layer_totals.assign((size_t)w_.n_layer, (uint64_t)w_.n_expert);
+ placement_source = "uniform";
+ }
+
+ // Step 3: Query GPU memory and compute expert budget
+ size_t gpu_free = 0, gpu_total = 0;
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend_);
+ if (dev) {
+ ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
+ }
+ if (gpu_total == 0) {
+ std::fprintf(stderr, "[laguna-hybrid] could not query GPU memory\n");
+ return false;
+ }
+
+ // Compute per-layer expert size in bytes (laguna: separate gate/up/down, no fused)
+ std::vector layer_expert_bytes((size_t)w_.n_layer);
+ for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) {
+ const LagunaTargetLayer & L = w_.layers[(size_t)il];
+ uint64_t bytes = 0;
+ if (L.ffn_gate_exps) bytes += ggml_nbytes(L.ffn_gate_exps) / (uint64_t)w_.n_expert;
+ if (L.ffn_up_exps) bytes += ggml_nbytes(L.ffn_up_exps) / (uint64_t)w_.n_expert;
+ if (L.ffn_down_exps) bytes += ggml_nbytes(L.ffn_down_exps) / (uint64_t)w_.n_expert;
+ layer_expert_bytes[(size_t)il] = bytes;
+ }
+ // Layer 0 is dense — no experts
+ for (int il = 0; il < w_.n_layer_dense_lead; ++il) {
+ layer_expert_bytes[(size_t)il] = 0;
+ }
+
+ uint64_t total_expert_bytes = 0;
+ for (int il = 0; il < w_.n_layer; ++il) {
+ total_expert_bytes += layer_expert_bytes[(size_t)il] * (uint64_t)w_.n_expert;
+ }
+
+ // KV cache estimate
+ const char * ctx_env = std::getenv("DFLASH_MAX_CONTEXT");
+ int max_context = ctx_env ? std::atoi(ctx_env) : args_.max_ctx;
+ if (max_context <= 0) max_context = 8192;
+
+ const uint64_t kv_bytes_per_tok = (uint64_t)w_.n_layer * 2 *
+ (uint64_t)w_.n_head_kv * (uint64_t)w_.head_dim * 2;
+ const uint64_t kv_total = kv_bytes_per_tok * (uint64_t)max_context;
+
+ const uint64_t warm_cache_bytes = 200ULL * 1024 * 1024;
+ const uint64_t safety_bytes = 512ULL * 1024 * 1024;
+ const uint64_t core_bytes = gpu_total - gpu_free;
+
+ uint64_t expert_budget = 0;
+ if (gpu_total > core_bytes + kv_total + warm_cache_bytes + safety_bytes) {
+ expert_budget = gpu_total - core_bytes - kv_total - warm_cache_bytes - safety_bytes;
+ }
+ if (expert_budget > total_expert_bytes) {
+ expert_budget = total_expert_bytes;
+ }
+
+ // Manual budget cap (absolute MB)
+ if (const char * cap_env = std::getenv("DFLASH_EXPERT_BUDGET_MB")) {
+ uint64_t cap_bytes = (uint64_t)std::atoi(cap_env) * 1024ULL * 1024ULL;
+ if (cap_bytes > 0 && cap_bytes < expert_budget) {
+ std::printf("[laguna-hybrid] capping expert budget from %.2f GiB to %d MB\n",
+ expert_budget / 1024.0 / 1024.0 / 1024.0, std::atoi(cap_env));
+ expert_budget = cap_bytes;
+ }
+ }
+
+ // Percentage-based budget cap
+ if (const char * pct_env = std::getenv("DFLASH_EXPERT_BUDGET_PCT")) {
+ int pct = std::atoi(pct_env);
+ if (pct > 0 && pct < 100) {
+ uint64_t pct_bytes = total_expert_bytes * (uint64_t)pct / 100ULL;
+ if (pct_bytes < expert_budget) {
+ std::printf("[laguna-hybrid] capping expert budget to %d%% = %.2f GiB (of %.2f GiB)\n",
+ pct, pct_bytes / 1024.0 / 1024.0 / 1024.0,
+ total_expert_bytes / 1024.0 / 1024.0 / 1024.0);
+ expert_budget = pct_bytes;
+ }
+ }
+ }
+
+ std::printf("[laguna] dynamic placement: gpu_total=%.2f GiB, core=%.2f GiB, "
+ "kv_cache=%.2f GiB (ctx=%d), warm=%.0f MB, safety=%.0f MB, "
+ "expert_budget=%.2f GiB (of %.2f GiB total experts)\n",
+ gpu_total / 1024.0 / 1024.0 / 1024.0,
+ core_bytes / 1024.0 / 1024.0 / 1024.0,
+ kv_total / 1024.0 / 1024.0 / 1024.0,
+ max_context,
+ warm_cache_bytes / 1024.0 / 1024.0,
+ safety_bytes / 1024.0 / 1024.0,
+ expert_budget / 1024.0 / 1024.0 / 1024.0,
+ total_expert_bytes / 1024.0 / 1024.0 / 1024.0);
+ std::fflush(stdout);
+
+ if (expert_budget == 0) {
+ std::fprintf(stderr, "[laguna-hybrid] no VRAM budget for experts\n");
+ return false;
+ }
+
+ // Step 4: Build placement
+ MoeHybridPlacement placement;
+ if (!MoeHybridPlacement::build_from_stats_with_layer_bytes(
+ hotness, layer_expert_bytes, expert_budget,
+ /*min_hot_per_layer=*/std::min(w_.n_expert_used, w_.n_expert),
+ placement, &err)) {
+ std::fprintf(stderr, "[laguna-hybrid] placement build failed: %s\n", err.c_str());
+ return false;
+ }
+
+ int total_moe_experts = (w_.n_layer - w_.n_layer_dense_lead) * w_.n_expert;
+ std::printf("[laguna] dynamic placement result: %d hot experts, %d cold experts\n",
+ placement.total_hot, total_moe_experts - placement.total_hot);
+
+ // If all experts fit, reload full model to GPU (non-hybrid path)
+ if (placement.total_hot >= total_moe_experts) {
+ std::printf("[laguna] all experts fit in VRAM, loading fully to GPU\n");
+ std::fflush(stdout);
+ free_laguna_target_weights(w_);
+ if (!load_target_gguf_laguna(args_.target_path, backend_, w_)) {
+ std::fprintf(stderr, "[laguna] full reload failed: %s\n", dflash27b_last_error());
+ return false;
+ }
+ return true;
+ }
+
+ // Step 5: Load expert data from GGUF mmap into hot/cold split buffers
+ {
+ ggml_context * expert_meta = nullptr;
+ gguf_init_params gip{};
+ gip.no_alloc = true;
+ gip.ctx = &expert_meta;
+ gguf_context * gctx = gguf_init_from_file(args_.target_path.c_str(), gip);
+ if (!gctx) {
+ std::fprintf(stderr, "[laguna-hybrid] failed to re-open GGUF for expert loading\n");
+ return false;
+ }
+
+ int fd = ::open(args_.target_path.c_str(), O_RDONLY);
+ if (fd < 0) {
+ gguf_free(gctx);
+ std::fprintf(stderr, "[laguna-hybrid] open failed for mmap\n");
+ return false;
+ }
+ struct stat st;
+ if (::fstat(fd, &st) < 0) {
+ ::close(fd);
+ gguf_free(gctx);
+ std::fprintf(stderr, "[laguna-hybrid] fstat failed\n");
+ return false;
+ }
+ const size_t file_size = (size_t)st.st_size;
+ void * mmap_addr = ::mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ ::close(fd);
+ if (mmap_addr == MAP_FAILED) {
+ gguf_free(gctx);
+ std::fprintf(stderr, "[laguna-hybrid] mmap failed\n");
+ return false;
+ }
+
+ const size_t data_start = gguf_get_data_offset(gctx);
+ const auto * file_bytes = (const uint8_t *)mmap_addr;
+
+ // Build per-layer expert file data
+ std::vector layer_file_data((size_t)w_.n_layer);
+ for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) {
+ char name[128];
+ auto find_tensor_data = [&](const char * suffix) -> ExpertTensorFileData {
+ std::snprintf(name, sizeof(name), "blk.%d.%s.weight", il, suffix);
+ int64_t tid = gguf_find_tensor(gctx, name);
+ if (tid < 0) return {};
+ size_t off = data_start + gguf_get_tensor_offset(gctx, tid);
+ size_t sz = gguf_get_tensor_size(gctx, tid);
+ if (off + sz > file_size) return {};
+ return { file_bytes + off, sz };
+ };
+
+ layer_file_data[(size_t)il].gate_exps = find_tensor_data("ffn_gate_exps");
+ layer_file_data[(size_t)il].up_exps = find_tensor_data("ffn_up_exps");
+ layer_file_data[(size_t)il].down_exps = find_tensor_data("ffn_down_exps");
+ // laguna has no fused gate_up_exps
+ }
+
+ auto hybrid = std::make_shared();
+ MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(w_);
+ std::vector layer_descs((size_t)w_.n_layer);
+ for (int il = 0; il < w_.n_layer; ++il) {
+ layer_descs[(size_t)il] = make_moe_layer_desc(w_.layers[(size_t)il]);
+ }
+ if (!build_moe_hybrid_storage_from_file(hybrid_cfg, backend_, placement, layer_descs, layer_file_data, *hybrid, &err)) {
+ ::munmap(mmap_addr, file_size);
+ gguf_free(gctx);
+ std::fprintf(stderr, "[laguna-hybrid] storage build failed: %s\n", err.c_str());
+ return false;
+ }
+
+ ::munmap(mmap_addr, file_size);
+ gguf_free(gctx);
+
+ moe_hybrid_ = std::move(hybrid);
+ }
+
+ // Print stats
+ int total_cold = 0;
+ uint64_t hot_bytes = 0, cold_bytes = 0;
+ for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) {
+ const auto & layer = moe_hybrid_->layers[(size_t)il];
+ total_cold += (int)layer.cold_expert_ids.size();
+ const uint64_t per_expert_bytes =
+ (uint64_t)layer.gate_expert_bytes + (uint64_t)layer.up_expert_bytes + (uint64_t)layer.down_expert_bytes;
+ hot_bytes += per_expert_bytes * (uint64_t)layer.hot_expert_ids.size();
+ cold_bytes += per_expert_bytes * (uint64_t)layer.cold_expert_ids.size();
+ }
+ std::printf("[laguna-hybrid] storage ready: total_hot=%d (%.2f GiB VRAM) total_cold=%d (%.2f GiB RAM) source=%s\n",
+ placement.total_hot,
+ hot_bytes / 1024.0 / 1024.0 / 1024.0,
+ total_cold,
+ cold_bytes / 1024.0 / 1024.0 / 1024.0,
+ placement_source.c_str());
+
+ if (total_cold > 0) {
+ hybrid_mode_ = true;
+ std::printf("[laguna-hybrid] hybrid decode path active (%d cold experts)\n", total_cold);
+ } else {
+ hybrid_mode_ = true; // partial load: expert tensors only in hybrid storage
+ std::printf("[laguna-hybrid] all experts hot — using hybrid path (all-hot)\n");
+ }
+
+ // Configure telemetry and swap policy
+ if (const char * telemetry = std::getenv("DFLASH_LAGUNA_TELEMETRY")) {
+ hybrid_telemetry_ = std::atoi(telemetry) != 0;
+ }
+ if (const char * out_path = std::getenv("DFLASH_LAGUNA_NEXT_PLACEMENT_OUT")) {
+ routing_stats_out_path_ = out_path;
+ }
+ if (const char * swap_max = std::getenv("DFLASH_LAGUNA_SWAP_MAX")) {
+ swap_policy_.max_swaps_total = std::max(0, std::atoi(swap_max));
+ }
+ if (const char * swap_gain = std::getenv("DFLASH_LAGUNA_SWAP_MIN_GAIN")) {
+ swap_policy_.min_promote_gain = (uint64_t)std::max(1, std::atoi(swap_gain));
+ }
+
+ // Allocate routing stats collector
+ if (!routing_stats_out_path_.empty()) {
+ routing_stats_ = std::make_shared();
+ routing_stats_->n_layer = w_.n_layer;
+ routing_stats_->n_expert = w_.n_expert;
+ routing_stats_->n_expert_used = w_.n_expert_used;
+ routing_stats_->counts.assign((size_t)w_.n_layer * (size_t)w_.n_expert, 0);
+ routing_stats_->layer_totals.assign((size_t)w_.n_layer, 0);
+ }
+
+ std::fflush(stdout);
+ return true;
+}
+
+// ── Laguna hybrid per-layer pre-FFN graph ───────────────────────────────
+//
+// Builds attention + router for a single layer. For MoE layers, outputs:
+// sg.ffn_post = post-attention normed hidden (input to FFN)
+// sg.ffn_residual = residual to add after FFN output
+// sg.moe_selected = [n_used] expert IDs
+// sg.moe_weights = [n_used] combine weights
+// For the dense layer 0, outputs the full layer result in sg.hidden_input.
+
+static bool build_laguna_layer_prefn_step(
+ StepGraph & sg,
+ const LagunaTargetWeights & w,
+ LagunaTargetCache & cache,
+ ggml_backend_t backend,
+ int il,
+ int kv_start,
+ int n_tokens)
+{
+ step_graph_free(sg);
+
+ const int n_embd = w.n_embd;
+ const bool is_full = laguna_is_full_attn_layer(w, il);
+ const bool is_dense = (il < w.n_layer_dense_lead);
+ const LagunaTargetLayer & L = w.layers[(size_t)il];
+ const int kv_len = kv_start + n_tokens;
+ const int n_head = w.n_head_arr[il];
+ const int n_head_kv = w.n_head_kv;
+ const int head_dim = w.head_dim;
+
+ ggml_init_params ip{};
+ ip.mem_size = ggml_tensor_overhead() * 4096 + ggml_graph_overhead() + 8 * 1024 * 1024;
+ ip.no_alloc = true;
+ sg.ctx = ggml_init(ip);
+ if (!sg.ctx) return false;
+ sg.gf = ggml_new_graph_custom(sg.ctx, 4096, false);
+
+ // Input: hidden state [n_embd, n_tokens]
+ sg.inp_embed = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F32, n_embd, n_tokens);
+ ggml_set_input(sg.inp_embed);
+ ggml_set_name(sg.inp_embed, "inp_embed");
+
+ // Positions
+ sg.positions = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, n_tokens);
+ ggml_set_input(sg.positions);
+
+ // Attention mask (causal)
+ ggml_tensor * attn_mask = nullptr;
+ if (kv_len > 0) {
+ attn_mask = ggml_new_tensor_4d(sg.ctx, GGML_TYPE_F32, kv_len, n_tokens, 1, 1);
+ ggml_set_input(attn_mask);
+ sg.attn_mask = attn_mask;
+ }
+
+ ggml_tensor * inp = sg.inp_embed;
+
+ // Pre-attn RMS norm
+ ggml_tensor * cur = ggml_rms_norm(sg.ctx, inp, 1e-6f);
+ cur = ggml_mul(sg.ctx, cur, L.attn_norm);
+
+ // QKV projections
+ const int q_dim = n_head * head_dim;
+ ggml_tensor * Qcur = ggml_mul_mat(sg.ctx, L.wq, cur); // [q_dim, n_tokens]
+ ggml_tensor * Kcur = ggml_mul_mat(sg.ctx, L.wk, cur); // [n_head_kv * head_dim, n_tokens]
+ ggml_tensor * Vcur = ggml_mul_mat(sg.ctx, L.wv, cur); // [n_head_kv * head_dim, n_tokens]
+
+ // Per-head softplus gate
+ ggml_tensor * gate = ggml_mul_mat(sg.ctx, L.wqkv_gate, cur); // [n_head, n_tokens]
+ gate = ggml_softplus(sg.ctx, gate);
+
+ // Reshape Q to [head_dim, n_head, n_tokens]
+ Qcur = ggml_reshape_3d(sg.ctx, Qcur, head_dim, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(sg.ctx, Kcur, head_dim, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(sg.ctx, Vcur, head_dim, n_head_kv, n_tokens);
+
+ // Q-norm / K-norm
+ Qcur = ggml_rms_norm(sg.ctx, Qcur, 1e-6f);
+ Qcur = ggml_mul(sg.ctx, Qcur, L.q_norm);
+ Kcur = ggml_rms_norm(sg.ctx, Kcur, 1e-6f);
+ Kcur = ggml_mul(sg.ctx, Kcur, L.k_norm);
+
+ // RoPE (YaRN on full-attention layers, plain on SWA layers)
+ const float rope_th = is_full ? w.rope_freq_base_full : w.rope_freq_base_swa;
+ const int n_rot = is_full ? w.n_rot_full : w.n_rot_swa;
+ const float ext_factor = is_full ? 1.0f : 0.0f;
+ const float attn_factor = 1.0f;
+ const float beta_fast = is_full ? w.yarn_beta_fast : 32.0f;
+ const float beta_slow = is_full ? w.yarn_beta_slow : 1.0f;
+ const int n_ctx_orig = is_full ? w.yarn_orig_ctx : 0;
+ const float freq_scale = is_full ? (1.0f / w.yarn_factor) : 1.0f;
+
+ Qcur = ggml_rope_ext(sg.ctx, Qcur, sg.positions, /*freq_factors=*/nullptr,
+ n_rot, GGML_ROPE_TYPE_NEOX,
+ n_ctx_orig, rope_th, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ Kcur = ggml_rope_ext(sg.ctx, Kcur, sg.positions, nullptr,
+ n_rot, GGML_ROPE_TYPE_NEOX,
+ n_ctx_orig, rope_th, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ // KV cache write — permute to [head_dim, n_tokens, n_head_kv] layout
+ ggml_tensor * cache_k = cache.attn_k[(size_t)il];
+ ggml_tensor * cache_v = cache.attn_v[(size_t)il];
+
+ ggml_tensor * Kcur_T = ggml_permute(sg.ctx, Kcur, 0, 2, 1, 3);
+ ggml_tensor * Vcur_T = ggml_permute(sg.ctx, Vcur, 0, 2, 1, 3);
+
+ ggml_tensor * k_view = ggml_view_3d(sg.ctx, cache_k,
+ head_dim, n_tokens, n_head_kv,
+ cache_k->nb[1], cache_k->nb[2],
+ cache_k->nb[1] * (size_t)kv_start);
+ ggml_tensor * k_cpy = ggml_cpy(sg.ctx, Kcur_T, k_view);
+ ggml_build_forward_expand(sg.gf, k_cpy);
+
+ ggml_tensor * v_view = ggml_view_3d(sg.ctx, cache_v,
+ head_dim, n_tokens, n_head_kv,
+ cache_v->nb[1], cache_v->nb[2],
+ cache_v->nb[1] * (size_t)kv_start);
+ ggml_tensor * v_cpy = ggml_cpy(sg.ctx, Vcur_T, v_view);
+ ggml_build_forward_expand(sg.gf, v_cpy);
+
+ // Flash attention
+ ggml_tensor * Qfa = ggml_permute(sg.ctx, Qcur, 0, 2, 1, 3);
+ Qfa = ggml_cont(sg.ctx, Qfa);
+
+ ggml_tensor * Kfa = ggml_view_3d(sg.ctx, cache_k,
+ head_dim, kv_len, n_head_kv,
+ cache_k->nb[1], cache_k->nb[2], 0);
+ ggml_tensor * Vfa = ggml_view_3d(sg.ctx, cache_v,
+ head_dim, kv_len, n_head_kv,
+ cache_v->nb[1], cache_v->nb[2], 0);
+
+ const float kq_scale = 1.0f / std::sqrt((float)head_dim);
+ ggml_tensor * attn_mask_f16 = attn_mask ? ggml_cast(sg.ctx, attn_mask, GGML_TYPE_F16) : nullptr;
+ ggml_tensor * attn = ggml_flash_attn_ext(sg.ctx, Qfa, Kfa, Vfa, attn_mask_f16,
+ kq_scale, 0.0f, 0.0f);
+
+ // Per-head softplus gate
+ ggml_tensor * gate_b = ggml_reshape_3d(sg.ctx, gate, 1, n_head, n_tokens);
+ gate_b = ggml_cast(sg.ctx, gate_b, attn->type);
+ attn = ggml_mul(sg.ctx, attn, gate_b);
+
+ attn = ggml_reshape_2d(sg.ctx, attn, q_dim, n_tokens);
+
+ // Output projection
+ ggml_tensor * attn_out = ggml_mul_mat(sg.ctx, L.wo, attn); // [n_embd, n_tokens]
+
+ // Residual after attention
+ ggml_tensor * ffn_inp = ggml_add(sg.ctx, attn_out, inp);
+
+ if (is_dense) {
+ // Dense layer 0: run full MLP in this graph
+ ggml_tensor * normed = ggml_rms_norm(sg.ctx, ffn_inp, 1e-6f);
+ normed = ggml_mul(sg.ctx, normed, L.ffn_norm);
+
+ ggml_tensor * g = ggml_mul_mat(sg.ctx, L.w_gate, normed);
+ ggml_tensor * u = ggml_mul_mat(sg.ctx, L.w_up, normed);
+ ggml_tensor * gu = ggml_swiglu_split(sg.ctx, g, u);
+ ggml_tensor * d = ggml_mul_mat(sg.ctx, L.w_down, gu);
+ ggml_tensor * layer_out = ggml_add(sg.ctx, d, ffn_inp);
+
+ sg.hidden_input = layer_out;
+ ggml_set_output(layer_out);
+ ggml_build_forward_expand(sg.gf, layer_out);
+ } else {
+ // MoE layer: output pre-FFN normed + residual + router decisions
+ ggml_tensor * normed = ggml_rms_norm(sg.ctx, ffn_inp, 1e-6f);
+ normed = ggml_mul(sg.ctx, normed, L.ffn_norm);
+ sg.ffn_post = normed;
+ ggml_set_output(normed);
+
+ sg.ffn_residual = ffn_inp;
+ ggml_set_output(ffn_inp);
+
+ // Router: sigmoid + score-correction bias + top-k
+ ggml_tensor * router_logits = ggml_mul_mat(sg.ctx, L.ffn_gate_inp, normed);
+ ggml_tensor * probs = ggml_sigmoid(sg.ctx, router_logits);
+ ggml_tensor * scores_sel = ggml_add(sg.ctx, probs, L.ffn_exp_probs_b);
+ ggml_tensor * selected = ggml_top_k(sg.ctx, scores_sel, w.n_expert_used);
+ ggml_set_output(selected);
+
+ // Gather original probs (no bias) for combine weights
+ ggml_tensor * probs_3d = ggml_reshape_3d(sg.ctx, probs, 1, w.n_expert, n_tokens);
+ ggml_tensor * weights_raw = ggml_get_rows(sg.ctx, probs_3d, selected);
+ weights_raw = ggml_reshape_2d(sg.ctx, weights_raw, w.n_expert_used, n_tokens);
+
+ // Sum-normalize + scale
+ ggml_tensor * w_sum = ggml_sum_rows(sg.ctx, weights_raw);
+ ggml_tensor * weights_normed = ggml_div(sg.ctx, weights_raw, w_sum);
+ if (w.expert_weights_scale != 1.0f) {
+ weights_normed = ggml_scale(sg.ctx, weights_normed, w.expert_weights_scale);
+ }
+ sg.moe_weights = weights_normed;
+ ggml_set_output(weights_normed);
+
+ sg.moe_selected.resize(1);
+ sg.moe_selected[0] = selected;
+
+ ggml_build_forward_expand(sg.gf, normed);
+ ggml_build_forward_expand(sg.gf, ffn_inp);
+ ggml_build_forward_expand(sg.gf, selected);
+ ggml_build_forward_expand(sg.gf, weights_normed);
+ }
+
+ // Allocate
+ if (!sg.alloc) {
+ sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+ }
+ if (!ggml_gallocr_alloc_graph(sg.alloc, sg.gf)) {
+ return false;
+ }
+
+ return true;
+}
+
+// ── Hybrid forward: one token through all 40 layers ─────────────────────
+
+bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos,
+ std::vector & act_cur,
+ int32_t & argmax_out) {
+ const int hidden = w_.n_embd;
+ const int vocab = w_.embedder.n_vocab;
+
+ // Embed token
+ if (!w_.embedder.embed(&tok, 1, act_cur.data())) return false;
+
+ // GPU-resident state for MoE layers
+ GpuResidentState gpu_state;
+ if (!init_gpu_resident_state(gpu_state, backend_, hidden)) return false;
+ ggml_backend_tensor_set(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+
+ StepGraph layer_sg;
+ std::vector selected((size_t)w_.n_expert_used);
+ std::vector weights_buf((size_t)w_.n_expert_used);
+ ggml_backend_t cpu_be = moe_hybrid_->cpu_backend;
+
+ for (int il = 0; il < w_.n_layer; ++il) {
+ const bool is_dense = (il < w_.n_layer_dense_lead);
+
+ if (!build_laguna_layer_prefn_step(layer_sg, w_, cache_, backend_, il, kv_pos, 1)) {
+ step_graph_destroy(layer_sg);
+ gpu_state.destroy();
+ return false;
+ }
+
+ // GPU→GPU: copy persistent act_cur to pre-FFN graph input
+ ggml_backend_tensor_copy(gpu_state.act_cur, layer_sg.inp_embed);
+
+ // Set positions
+ int32_t pos_val = kv_pos;
+ ggml_backend_tensor_set(layer_sg.positions, &pos_val, 0, sizeof(int32_t));
+
+ // Causal mask: single token decode — all positions [0..kv_pos] visible
+ if (layer_sg.attn_mask) {
+ const int kv_len = kv_pos + 1;
+ std::vector mask_data((size_t)kv_len, 0.0f);
+ ggml_backend_tensor_set(layer_sg.attn_mask, mask_data.data(), 0, sizeof(float) * (size_t)kv_len);
+ }
+
+ auto st = ggml_backend_graph_compute(backend_, layer_sg.gf);
+ if (st != GGML_STATUS_SUCCESS) {
+ step_graph_destroy(layer_sg);
+ gpu_state.destroy();
+ return false;
+ }
+
+ if (is_dense) {
+ // Dense layer: read full output back to GPU-resident state
+ ggml_backend_tensor_copy(layer_sg.hidden_input, gpu_state.act_cur);
+ } else {
+ // MoE layer: read router decisions, then do hybrid FFN eval
+ ggml_tensor * sel_tensor = layer_sg.moe_selected[0];
+ ggml_backend_tensor_get(sel_tensor, selected.data(), 0,
+ sizeof(int32_t) * selected.size());
+ ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0,
+ sizeof(float) * weights_buf.size());
+
+ if (routing_stats_) {
+ routing_stats_->observe(il, selected.data(), (int)selected.size());
+ }
+
+ // Hybrid FFN: hot on GPU, cold on CPU, combine on GPU
+ auto & storage = moe_hybrid_->layers[(size_t)il];
+ MoeHybridConfig cfg = make_moe_hybrid_config(w_);
+ MoeLayerDesc desc = make_moe_layer_desc(w_.layers[(size_t)il]);
+ if (!eval_moe_hybrid_ffn_gpu_resident(
+ backend_, cfg, desc, storage, cpu_be,
+ layer_sg.ffn_post, layer_sg.ffn_residual,
+ gpu_state,
+ selected.data(), weights_buf.data(),
+ (int)selected.size())) {
+ step_graph_destroy(layer_sg);
+ gpu_state.destroy();
+ return false;
+ }
+ }
+ }
+
+ // Read final hidden state and project logits
+ ggml_backend_tensor_get(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+ step_graph_destroy(layer_sg);
+ gpu_state.destroy();
+
+ // Project logits: final RMS norm + lm_head
+ {
+ ggml_init_params ip{};
+ ip.mem_size = 64 * 1024 * 1024;
+ ip.no_alloc = true;
+ ggml_context * ctx = ggml_init(ip);
+ if (!ctx) return false;
+
+ ggml_tensor * h_in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden, 1);
+ ggml_set_input(h_in);
+ ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false);
+
+ ggml_tensor * normed = ggml_rms_norm(ctx, h_in, 1e-6f);
+ normed = ggml_mul(ctx, normed, w_.out_norm);
+ ggml_tensor * logits = ggml_mul_mat(ctx, w_.output, normed);
+ ggml_set_output(logits);
+ ggml_build_forward_expand(gf, logits);
+
+ ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend_));
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+ ggml_gallocr_free(alloc);
+ ggml_free(ctx);
+ return false;
+ }
+ ggml_backend_tensor_set(h_in, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+ if (ggml_backend_graph_compute(backend_, gf) != GGML_STATUS_SUCCESS) {
+ ggml_gallocr_free(alloc);
+ ggml_free(ctx);
+ return false;
+ }
+
+ std::vector logits_buf((size_t)vocab);
+ ggml_backend_tensor_get(logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab);
+ ggml_gallocr_free(alloc);
+ ggml_free(ctx);
+
+ // Argmax
+ argmax_out = 0;
+ float best = logits_buf[0];
+ for (int j = 1; j < vocab; ++j) {
+ if (logits_buf[(size_t)j] > best) {
+ best = logits_buf[(size_t)j];
+ argmax_out = j;
+ }
+ }
+ }
+ return true;
+}
+
+// ── Hybrid generate ─────────────────────────────────────────────────────
+
+GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req,
+ const DaemonIO & io) {
+ GenerateResult result;
+ DaemonIO out_io = io.with_token_callback(req.on_token);
+ const bool should_emit = req.stream || (bool)out_io.on_token;
+ const int N = (int)req.prompt.size();
+
+ if (N + req.n_gen > args_.max_ctx) {
+ result.error = "overflow";
+ return result;
+ }
+
+ reset_laguna_target_cache(cache_);
+
+ // ── Hybrid Prefill: layer-by-layer pre-FFN + batched hybrid FFN ──
+ const int hidden = w_.n_embd;
+ const int n_expert_used = w_.n_expert_used;
+ ggml_backend_t cpu_be = moe_hybrid_->cpu_backend;
+
+ std::vector embed_all((size_t)N * (size_t)hidden);
+ if (!w_.embedder.embed(req.prompt.data(), N, embed_all.data())) {
+ result.error = "embed_prefill";
+ return result;
+ }
+
+ auto t_pf0 = std::chrono::steady_clock::now();
+ const int prefill_chunk = std::min(args_.chunk, N);
+
+ StepGraph prefill_sg; // persistent across layers to reuse GPU buffer
+ ggml_gallocr_t ffn_hot_alloc = nullptr;
+ ggml_gallocr_t ffn_cold_alloc = nullptr;
+
+ for (int il = 0; il < w_.n_layer; ++il) {
+ const bool is_dense = (il < w_.n_layer_dense_lead);
+ const bool is_full = laguna_is_full_attn_layer(w_, il);
+
+ for (int chunk_start = 0; chunk_start < N; chunk_start += prefill_chunk) {
+ const int chunk_len = std::min(prefill_chunk, N - chunk_start);
+
+ step_graph_free(prefill_sg); // reset ctx/graph but keep gallocr buffer
+ if (!build_laguna_layer_prefn_step(prefill_sg, w_, cache_, backend_,
+ il, chunk_start, chunk_len)) {
+ result.error = "prefill_build";
+ step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
+ return result;
+ }
+
+ // Set input embeddings
+ ggml_backend_tensor_set(prefill_sg.inp_embed,
+ embed_all.data() + (size_t)chunk_start * (size_t)hidden, 0,
+ sizeof(float) * (size_t)chunk_len * (size_t)hidden);
+
+ // Set positions
+ std::vector pos_data((size_t)chunk_len);
+ for (int i = 0; i < chunk_len; ++i) pos_data[i] = chunk_start + i;
+ ggml_backend_tensor_set(prefill_sg.positions, pos_data.data(), 0,
+ sizeof(int32_t) * (size_t)chunk_len);
+
+ // Set attention mask (causal or causal+SWA depending on layer)
+ if (prefill_sg.attn_mask) {
+ const int kv_len = chunk_start + chunk_len;
+ std::vector mask((size_t)kv_len * (size_t)chunk_len, -INFINITY);
+ for (int q = 0; q < chunk_len; ++q) {
+ const int abs_q = chunk_start + q;
+ const int win_lo = is_full ? 0 : std::max(0, abs_q - w_.sliding_window + 1);
+ for (int k = win_lo; k <= abs_q && k < kv_len; ++k) {
+ mask[(size_t)q * (size_t)kv_len + (size_t)k] = 0.0f;
+ }
+ }
+ ggml_backend_tensor_set(prefill_sg.attn_mask, mask.data(), 0,
+ sizeof(float) * mask.size());
+ }
+
+ // Compute pre-FFN graph
+ auto st = ggml_backend_graph_compute(backend_, prefill_sg.gf);
+ if (st != GGML_STATUS_SUCCESS) {
+ result.error = "prefill_compute";
+ step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
+ return result;
+ }
+
+ if (is_dense) {
+ // Dense layer outputs full result directly
+ std::vector layer_out((size_t)chunk_len * (size_t)hidden);
+ ggml_backend_tensor_get(prefill_sg.hidden_input, layer_out.data(), 0,
+ sizeof(float) * layer_out.size());
+ std::memcpy(embed_all.data() + (size_t)chunk_start * (size_t)hidden,
+ layer_out.data(),
+ sizeof(float) * layer_out.size());
+ } else {
+ // MoE layer: read router decisions, run hybrid FFN
+ std::vector chunk_residuals((size_t)chunk_len * (size_t)hidden);
+ std::vector chunk_post((size_t)chunk_len * (size_t)hidden);
+ std::vector chunk_selected((size_t)chunk_len * (size_t)n_expert_used);
+ std::vector chunk_weights((size_t)chunk_len * (size_t)n_expert_used);
+
+ ggml_backend_tensor_get(prefill_sg.ffn_residual, chunk_residuals.data(), 0,
+ sizeof(float) * chunk_residuals.size());
+ ggml_backend_tensor_get(prefill_sg.ffn_post, chunk_post.data(), 0,
+ sizeof(float) * chunk_post.size());
+
+ ggml_tensor * sel_tensor = prefill_sg.moe_selected.empty() ? nullptr : prefill_sg.moe_selected[0];
+ if (!sel_tensor || !prefill_sg.moe_weights) {
+ result.error = "prefill_router_outputs";
+ step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
+ return result;
+ }
+ ggml_backend_tensor_get(sel_tensor, chunk_selected.data(), 0,
+ sizeof(int32_t) * chunk_selected.size());
+ ggml_backend_tensor_get(prefill_sg.moe_weights, chunk_weights.data(), 0,
+ sizeof(float) * chunk_weights.size());
+
+ // Observe routing stats
+ if (routing_stats_) {
+ for (int i = 0; i < chunk_len; ++i) {
+ routing_stats_->observe(il, chunk_selected.data() + (size_t)i * (size_t)n_expert_used, n_expert_used);
+ }
+ }
+
+ // Batched hybrid FFN evaluation
+ auto & storage = moe_hybrid_->layers[(size_t)il];
+ MoeHybridConfig chunk_cfg = make_moe_hybrid_config(w_);
+ MoeLayerDesc chunk_desc = make_moe_layer_desc(w_.layers[(size_t)il]);
+ std::vector ffn_batch_out;
+ if (!eval_moe_hybrid_ffn_batched(
+ backend_, cpu_be, chunk_cfg, chunk_desc, storage,
+ chunk_post.data(),
+ chunk_selected.data(),
+ chunk_weights.data(),
+ chunk_len, ffn_batch_out, &result.error,
+ &ffn_hot_alloc, &ffn_cold_alloc)) {
+ step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
+ return result;
+ }
+
+ // Combine: FFN output + residual → embed_all for next layer
+ for (int i = 0; i < chunk_len; ++i) {
+ const float * ffn = ffn_batch_out.data() + (size_t)i * (size_t)hidden;
+ const float * res = chunk_residuals.data() + (size_t)i * (size_t)hidden;
+ float * out_embed = embed_all.data() + (size_t)(chunk_start + i) * (size_t)hidden;
+ for (int j = 0; j < hidden; ++j) {
+ out_embed[j] = ffn[j] + res[j];
+ }
+ }
+ }
+ }
+ }
+ step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
+
+ // Project logits from last token's hidden state
+ cache_.cur_pos = N;
+ std::vector last_logits;
+ {
+ ggml_init_params ip{};
+ ip.mem_size = 64 * 1024 * 1024;
+ ip.no_alloc = true;
+ ggml_context * ctx = ggml_init(ip);
+ ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false);
+
+ ggml_tensor * h_in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden, 1);
+ ggml_set_input(h_in);
+ ggml_tensor * normed = ggml_rms_norm(ctx, h_in, 1e-6f);
+ normed = ggml_mul(ctx, normed, w_.out_norm);
+ ggml_tensor * logits = ggml_mul_mat(ctx, w_.output, normed);
+ ggml_set_output(logits);
+ ggml_build_forward_expand(gf, logits);
+
+ ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend_));
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+ ggml_gallocr_free(alloc);
+ ggml_free(ctx);
+ result.error = "prefill_logits_alloc";
+ return result;
+ }
+ // Set last token's hidden state
+ ggml_backend_tensor_set(h_in,
+ embed_all.data() + (size_t)(N - 1) * (size_t)hidden, 0,
+ sizeof(float) * (size_t)hidden);
+ if (ggml_backend_graph_compute(backend_, gf) != GGML_STATUS_SUCCESS) {
+ ggml_gallocr_free(alloc);
+ ggml_free(ctx);
+ result.error = "prefill_logits_compute";
+ return result;
+ }
+ last_logits.resize((size_t)w_.embedder.n_vocab);
+ ggml_backend_tensor_get(logits, last_logits.data(), 0,
+ sizeof(float) * last_logits.size());
+ ggml_gallocr_free(alloc);
+ ggml_free(ctx);
+ }
+
+ auto t_pf1 = std::chrono::steady_clock::now();
+ result.prefill_s = std::chrono::duration(t_pf1 - t_pf0).count();
+
+ // ── Decode (hybrid layer-by-layer) ──
+ auto argmax = [](const std::vector & ll) {
+ int best = 0; float bv = ll[0];
+ for (size_t i = 1; i < ll.size(); ++i)
+ if (ll[i] > bv) { bv = ll[i]; best = (int)i; }
+ return best;
+ };
+
+ std::vector history;
+ history.reserve((size_t)N + (size_t)req.n_gen);
+ history.insert(history.end(), req.prompt.begin(), req.prompt.end());
+
+ auto pick = [&](const std::vector & ll) -> int {
+ return req.do_sample
+ ? sample_logits(ll.data(), (int)ll.size(), req.sampler, history, sampler_rng_)
+ : argmax(ll);
+ };
+
+ int next_tok = pick(last_logits);
+ result.tokens.reserve(req.n_gen);
+
+ // Budget force-close (same pattern as non-hybrid path)
+ const BudgetHook & budget_hook = req.budget_hook;
+ bool budget_close_started = false;
+ int close_inject_pos = 0;
+ auto maybe_force_close = [&](int32_t & tok, int committed_now) {
+ if (budget_hook.close_token_ids.empty()) return;
+ if (budget_close_started &&
+ close_inject_pos < (int)budget_hook.close_token_ids.size())
+ {
+ tok = budget_hook.close_token_ids[close_inject_pos++];
+ return;
+ }
+ if (budget_close_started) return;
+ int remaining = req.n_gen - committed_now;
+ if (remaining <= budget_hook.hard_limit_remaining) {
+ int32_t first_close = budget_hook.close_token_ids.front();
+ if (tok == first_close) {
+ budget_close_started = true;
+ close_inject_pos = 1;
+ return;
+ }
+ tok = first_close;
+ budget_close_started = true;
+ close_inject_pos = 1;
+ result.budget_forced_close = true;
+ }
+ };
+
+ std::vector act_cur((size_t)w_.n_embd);
+ auto t_g0 = std::chrono::steady_clock::now();
+ for (int s = 0; s < req.n_gen; ++s) {
+ maybe_force_close(next_tok, s);
+ if (next_tok == w_.eos_id || next_tok == w_.eos_chat_id) break;
+ result.tokens.push_back(next_tok);
+ history.push_back(next_tok);
+ if (should_emit) {
+ out_io.emit(next_tok);
+ if (out_io.cancelled) break;
+ }
+
+ // Hybrid forward: one token through all layers
+ int32_t argmax_tok = 0;
+ if (!hybrid_forward_one_token(next_tok, cache_.cur_pos, act_cur, argmax_tok)) {
+ result.error = "decode";
+ break;
+ }
+ cache_.cur_pos++;
+
+ if (req.do_sample) {
+ // For sampling, we need full logits — project from act_cur
+ // (hybrid_forward_one_token already computed argmax; for sampling
+ // we re-project — FIXME: return logits from forward to avoid double projection)
+ next_tok = argmax_tok; // For now, use argmax even in sample mode as fallback
+ } else {
+ next_tok = argmax_tok;
+ }
+ }
+ auto t_g1 = std::chrono::steady_clock::now();
+ result.decode_s = std::chrono::duration(t_g1 - t_g0).count();
+
+ if (should_emit) out_io.emit(-1);
+ result.ok = (result.error.empty());
+ return result;
+}
+
+void LagunaBackend::maybe_post_request_swap() {
+ if (!hybrid_mode_ || !moe_hybrid_ || swap_policy_.max_swaps_total <= 0) return;
+ if (!routing_stats_) return;
+
+ MoeHybridSwapPlan plan;
+ std::string err;
+ if (!build_moe_hybrid_swap_plan(moe_hybrid_->placement, *routing_stats_,
+ swap_policy_, plan, &err)) {
+ std::fprintf(stderr, "[laguna-hybrid] swap plan failed: %s\n", err.c_str());
+ return;
+ }
+ if (plan.actions.empty()) return;
+
+ // Rebuild storage with new placement
+ auto rebuilt = std::make_shared();
+ MoeHybridConfig swap_cfg = make_moe_hybrid_config(w_);
+ std::vector swap_descs((size_t)w_.n_layer);
+ for (int il = 0; il < w_.n_layer; ++il) {
+ swap_descs[(size_t)il] = make_moe_layer_desc(w_.layers[(size_t)il]);
+ }
+ if (!build_moe_hybrid_storage(swap_cfg, backend_,
+ plan.next_placement, swap_descs, *rebuilt, &err)) {
+ std::fprintf(stderr, "[laguna-hybrid] swap rebuild failed: %s\n", err.c_str());
+ return;
+ }
+ moe_hybrid_ = std::move(rebuilt);
+
+ // Save updated routing stats if configured
+ if (!routing_stats_out_path_.empty()) {
+ routing_stats_->save_csv(routing_stats_out_path_, &err);
+ }
+
+ std::printf("[laguna-hybrid] applied %zu swap actions at request boundary\n", plan.actions.size());
+ std::fflush(stdout);
+}
+
// ── Shutdown ────────────────────────────────────────────────────────────
void LagunaBackend::shutdown() {
diff --git a/server/src/laguna/laguna_backend.h b/server/src/laguna/laguna_backend.h
index afdaf8f63..6ebe9081c 100644
--- a/server/src/laguna/laguna_backend.h
+++ b/server/src/laguna/laguna_backend.h
@@ -10,11 +10,16 @@
#include "laguna_internal.h"
#include "placement/placement_config.h"
#include "qwen3_drafter.h"
+#include "../common/moe_hybrid_ffn_eval.h"
+#include "../common/moe_hybrid_storage.h"
+#include "../common/moe_hybrid_routing_stats.h"
+#include "../common/moe_hybrid_swap_manager.h"
#include "ggml.h"
#include "ggml-backend.h"
#include
+#include
#include
#include
#include
@@ -77,7 +82,23 @@ class LagunaBackend : public ModelBackend {
DrafterContext drafter_ctx_{};
bool drafter_loaded_ = false;
+ // ── Hybrid MoE mode (hot/cold expert split) ──
+ bool hybrid_mode_ = false;
+ std::shared_ptr moe_hybrid_;
+ std::shared_ptr routing_stats_;
+ std::string routing_stats_out_path_;
+ MoeHybridSwapPolicy swap_policy_;
+ bool hybrid_telemetry_ = false;
+
bool ensure_slot(int slot);
+
+ // Hybrid mode helpers
+ bool init_hybrid_mode();
+ GenerateResult generate_hybrid(const GenerateRequest & req, const DaemonIO & io);
+ bool hybrid_forward_one_token(int32_t tok, int kv_pos,
+ std::vector & act_cur,
+ int32_t & argmax_out);
+ void maybe_post_request_swap();
};
} // namespace dflash::common
diff --git a/server/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h
index 32c981300..132eadd86 100644
--- a/server/src/laguna/laguna_internal.h
+++ b/server/src/laguna/laguna_internal.h
@@ -20,6 +20,7 @@
// - Vocab = 100352. BOS = 2. EOS = {2, 24}. Pad = 9.
#pragma once
+#define DFLASH_LAGUNA_INTERNAL_H_INCLUDED
#include
#include
@@ -136,6 +137,10 @@ bool load_target_gguf_laguna(const std::string & path,
ggml_backend_t backend,
LagunaTargetWeights & out);
+// Partial loader. With plan.skip_expert_tensors=true this performs the
+// hybrid-MoE load: non-expert tensors go to GPU, expert tensors are kept
+// off-GPU (metadata/shapes stay valid for size queries). Also supports
+// layer-range partial loads via plan.layer_begin/layer_end/load_output.
bool load_target_gguf_laguna_partial(const std::string & path,
ggml_backend_t backend,
const TargetLoadPlan & plan,
diff --git a/server/src/laguna/laguna_target_loader.cpp b/server/src/laguna/laguna_target_loader.cpp
index cd9b617a2..2a91f91f7 100644
--- a/server/src/laguna/laguna_target_loader.cpp
+++ b/server/src/laguna/laguna_target_loader.cpp
@@ -60,6 +60,9 @@
namespace dflash::common {
+// fwd-decl: defined below at file scope, used by should_load_laguna_tensor
+static bool is_laguna_expert_tensor(const char * name);
+
namespace {
// Same Mmap shape as gguf_target_loader.cpp's local helper. Duplicated locally
@@ -157,6 +160,7 @@ bool should_load_laguna_tensor(const char * name, const TargetLoadPlan & plan) {
std::strcmp(name, "output.weight") == 0) {
return plan.load_output;
}
+ if (plan.skip_expert_tensors && is_laguna_expert_tensor(name)) return false;
int layer_id = -1;
if (parse_block_tensor_name(name, layer_id)) {
return layer_id >= plan.layer_begin && layer_id < plan.layer_end;
@@ -572,4 +576,18 @@ void free_laguna_target_weights(LagunaTargetWeights & w) {
w.output = nullptr;
}
+// ── Partial loader (hybrid mode) ────────────────────────────────────────
+//
+// Loads laguna GGUF but skips uploading expert tensor DATA to GPU.
+// Tensor metadata (shapes, offsets) is still parsed so that the hybrid
+// storage builder can use ggml_nbytes() to compute per-expert sizes.
+// Expert data will be loaded via mmap into the hot/cold split buffers.
+
+static bool is_laguna_expert_tensor(const char * name) {
+ // Expert tensors are: ffn_gate_exps, ffn_up_exps, ffn_down_exps
+ // (per-layer, named blk..ffn_{gate,up,down}_exps.weight)
+ return std::strstr(name, "ffn_gate_exps") != nullptr ||
+ std::strstr(name, "ffn_up_exps") != nullptr ||
+ std::strstr(name, "ffn_down_exps") != nullptr;
+}
} // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
index 67cdaef5b..2667680a9 100644
--- a/server/src/qwen35moe/qwen35moe_backend.cpp
+++ b/server/src/qwen35moe/qwen35moe_backend.cpp
@@ -1,5 +1,8 @@
#include "qwen35moe_backend.h"
+#include "../common/moe_hybrid_placement.h"
+#include "../common/moe_hybrid_types.h"
+#include "../common/moe_hybrid_types_impl.h"
#include "common/sampler.h"
#include "common/dflash_spec_decode.h"
#include "dflash_draft_graph.h"
@@ -44,8 +47,8 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
}
if (const char * stats_path = std::getenv("DFLASH_QWEN35MOE_RUNTIME_STATS_OUT")) {
- routing_stats_ = std::make_shared();
- if (!routing_stats_->init_from_weights(out)) {
+ routing_stats_ = std::make_shared();
+ if (!routing_stats_->init(out.n_layer, out.n_expert, out.n_expert_used)) {
set_last_error("qwen35moe runtime stats init failed");
return false;
}
@@ -54,7 +57,7 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
// Phase 2: Compute dynamic placement based on VRAM budget.
// Expert tensor metadata (ne/nb) is valid even without GPU allocation.
- Qwen35MoeExpertPlacement placement;
+ MoeHybridPlacement placement;
std::string placement_source;
std::string err;
@@ -139,8 +142,13 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
layer_file_data[(size_t)il].gate_up_exps = find_tensor_data("ffn_gate_up_exps");
}
- auto hybrid = std::make_shared();
- if (!build_qwen35moe_hybrid_storage_from_file(out, backend, placement, layer_file_data, *hybrid, &err)) {
+ auto hybrid = std::make_shared();
+ MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(out);
+ std::vector layer_descs((size_t)out.n_layer);
+ for (int il = 0; il < out.n_layer; ++il) {
+ layer_descs[(size_t)il] = make_moe_layer_desc(out.layers[(size_t)il]);
+ }
+ if (!build_moe_hybrid_storage_from_file(hybrid_cfg, backend, placement, layer_descs, layer_file_data, *hybrid, &err)) {
::munmap(mmap_addr, file_size);
gguf_free(gctx);
set_last_error(std::string("qwen35moe hybrid storage build failed: ") + err);
@@ -218,24 +226,29 @@ void Qwen35MoeBackend::maybe_post_request_swap() {
if (!target_weights().moe_hybrid || swap_policy_.max_swaps_total <= 0) return;
- Qwen35MoeSwapPlan plan;
+ MoeHybridSwapPlan plan;
std::string err;
- if (!build_qwen35moe_swap_plan(target_weights().moe_hybrid->placement, *routing_stats_,
+ if (!build_moe_hybrid_swap_plan(target_weights().moe_hybrid->placement, *routing_stats_,
swap_policy_, plan, &err)) {
std::fprintf(stderr, "[qwen35moe] swap plan failed: %s\n", err.c_str());
return;
}
if (plan.actions.empty()) return;
- auto rebuilt = std::make_shared();
- if (!build_qwen35moe_hybrid_storage(target_weights(), target_backend(),
- plan.next_placement, *rebuilt, &err)) {
+ auto rebuilt = std::make_shared();
+ MoeHybridConfig swap_cfg = make_moe_hybrid_config(target_weights());
+ std::vector swap_descs((size_t)target_weights().n_layer);
+ for (int il = 0; il < target_weights().n_layer; ++il) {
+ swap_descs[(size_t)il] = make_moe_layer_desc(target_weights().layers[(size_t)il]);
+ }
+ if (!build_moe_hybrid_storage(swap_cfg, target_backend(),
+ plan.next_placement, swap_descs, *rebuilt, &err)) {
std::fprintf(stderr, "[qwen35moe] swap rebuild failed: %s\n", err.c_str());
return;
}
target_weights().moe_hybrid = std::move(rebuilt);
if (!placement_out_path_.empty()) {
- if (!plan.next_placement.save_json(placement_out_path_, &err)) {
+ if (!plan.next_placement.save_json(placement_out_path_, "qwen35moe", &err)) {
std::fprintf(stderr, "[qwen35moe] failed to save next placement: %s\n", err.c_str());
}
}
@@ -259,7 +272,8 @@ bool Qwen35MoeBackend::ensure_pipe_state(int kv_start) {
if (pipe_state_ && pipe_state_->valid()) return true;
pipe_state_ = std::make_unique();
if (!init_pipelined_decode_state(*pipe_state_, target_backend(), target_weights(),
- target_cache(), kv_start, cfg_.kq_stride_pad)) {
+ target_cache(), *target_weights().moe_hybrid,
+ kv_start, cfg_.kq_stride_pad)) {
pipe_state_.reset();
return false;
}
@@ -274,6 +288,15 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
std::vector logits_buf((size_t)vocab);
std::vector act_cur((size_t)hidden);
+ // Telemetry accumulators for the full decode loop
+ using DecodeClock = std::chrono::steady_clock;
+ uint64_t tel_embed_us = 0;
+ uint64_t tel_layers_us = 0;
+ uint64_t tel_logits_us = 0;
+ uint64_t tel_sample_us = 0;
+ PipelinedDecodeTelemetry tel_layers_accum{};
+ int tel_n_tokens = 0;
+
// Persistent logits graph (built once, reused per token)
StepGraph logits_sg;
auto project_logits = [&]() -> bool {
@@ -297,7 +320,9 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
return false;
}
}
- ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+ // GPU→GPU: pipe act_cur directly into logits graph (no host bounce)
+ ggml_backend_tensor_copy_async(target_backend(), target_backend(),
+ pipe_state_->gpu_state.act_cur, logits_sg.hidden_input);
auto st = ggml_backend_graph_compute(target_backend(), logits_sg.gf);
if (st != GGML_STATUS_SUCCESS) return false;
ggml_backend_tensor_get(logits_sg.logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab);
@@ -312,7 +337,7 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
ggml_backend_tensor_get(target_step_graph().logits, logits_buf.data(),
prefill_logits_offset(), sizeof(float) * (size_t)vocab);
first_tok = sample_logits(logits_buf.data(), vocab, sampler_config(),
- out_tokens, sampler_rng_engine());
+ out_tokens, sampler_rng_engine());
} else {
first_tok = target_cache().last_tok;
}
@@ -329,30 +354,36 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
}
for (int step = 1; step < n_gen; ++step) {
+ const auto tok_t0 = DecodeClock::now();
+
int32_t tok = out_tokens.back();
if (!target_weights().embedder.embed(&tok, 1, act_cur.data())) {
return false;
}
- ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
- sizeof(float) * (size_t)hidden);
+ ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur,
+ act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+ const auto embed_done = DecodeClock::now();
+ PipelinedDecodeTelemetry tel;
if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(),
- target_cache(), *target_weights().moe_hybrid,
- committed, cfg_.kq_stride_pad, nullptr)) {
+ target_cache(), *target_weights().moe_hybrid,
+ committed, cfg_.kq_stride_pad,
+ hybrid_telemetry_ ? &tel : nullptr)) {
return false;
}
+ const auto layers_done = DecodeClock::now();
- ggml_backend_tensor_get(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
- sizeof(float) * (size_t)hidden);
+ // act_cur stays on GPU — project_logits reads it via GPU→GPU copy
if (!project_logits()) {
step_graph_destroy(logits_sg);
return false;
}
+ const auto logits_done = DecodeClock::now();
int32_t next_tok;
if (sampler_config().temp > 0) {
next_tok = sample_logits(logits_buf.data(), vocab, sampler_config(),
- out_tokens, sampler_rng_engine());
+ out_tokens, sampler_rng_engine());
} else {
next_tok = 0;
float best = logits_buf[0];
@@ -363,6 +394,48 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
}
}
}
+ const auto sample_done = DecodeClock::now();
+
+ if (hybrid_telemetry_) {
+ auto us = [](DecodeClock::time_point a, DecodeClock::time_point b) -> uint64_t {
+ return (uint64_t)std::chrono::duration_cast(b - a).count();
+ };
+ tel_embed_us += us(tok_t0, embed_done);
+ tel_layers_us += us(embed_done, layers_done);
+ tel_logits_us += us(layers_done, logits_done);
+ tel_sample_us += us(logits_done, sample_done);
+ tel_n_tokens++;
+ // Accumulate per-layer telemetry
+ tel_layers_accum.total_us += tel.total_us;
+ tel_layers_accum.prefn_graph_build_us += tel.prefn_graph_build_us;
+ tel_layers_accum.prefn_compute_us += tel.prefn_compute_us;
+ tel_layers_accum.routing_readback_us += tel.routing_readback_us;
+ tel_layers_accum.ffn_us += tel.ffn_us;
+ tel_layers_accum.ffn_allhot_us += tel.ffn_allhot_us;
+ tel_layers_accum.ffn_mixed_us += tel.ffn_mixed_us;
+ tel_layers_accum.gpu_idle_us += tel.gpu_idle_us;
+ tel_layers_accum.tensor_io_us += tel.tensor_io_us;
+ tel_layers_accum.combine_overhead_us += tel.combine_overhead_us;
+ tel_layers_accum.cold_cpu_us += tel.cold_cpu_us;
+ tel_layers_accum.cold_compute_us += tel.cold_compute_us;
+ tel_layers_accum.hot_graph_build_us += tel.hot_graph_build_us;
+ tel_layers_accum.ffn_post_get_us += tel.ffn_post_get_us;
+ tel_layers_accum.sync_wait_us += tel.sync_wait_us;
+ tel_layers_accum.allhot_layers += tel.allhot_layers;
+ tel_layers_accum.mixed_layers += tel.mixed_layers;
+ tel_layers_accum.total_layers += tel.total_layers;
+ tel_layers_accum.hot_graph_rebuilds += tel.hot_graph_rebuilds;
+ tel_layers_accum.routed_ffn_layers += tel.routed_ffn_layers;
+ tel_layers_accum.routed_prefn_us += tel.routed_prefn_us;
+ tel_layers_accum.routed_sync_us += tel.routed_sync_us;
+ tel_layers_accum.routed_readback_us += tel.routed_readback_us;
+ tel_layers_accum.routed_cpu_remap_us += tel.routed_cpu_remap_us;
+ tel_layers_accum.routed_ffn_dispatch_us += tel.routed_ffn_dispatch_us;
+ tel_layers_accum.routed_final_sync_us += tel.routed_final_sync_us;
+ tel_layers_accum.routed_cold_expert_hits += tel.routed_cold_expert_hits;
+ tel_layers_accum.routed_total_expert_slots += tel.routed_total_expert_slots;
+ }
+
out_tokens.push_back(next_tok);
io.emit(next_tok);
committed++;
@@ -371,6 +444,58 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen,
if (is_eos_tok(next_tok, target_weights())) break;
}
+ // ── Print decode telemetry ──
+ if (hybrid_telemetry_ && tel_n_tokens > 0) {
+ const double total_us = (double)(tel_embed_us + tel_layers_us + tel_logits_us + tel_sample_us);
+ std::printf("[qwen35moe-ar] === AR DECODE TELEMETRY (n_tokens=%d, %.1f tok/s) ===\n",
+ tel_n_tokens, tel_n_tokens / (total_us / 1e6));
+ std::printf(" per-token breakdown:\n");
+ std::printf(" embed=%.2fms layers=%.2fms logits=%.2fms sample=%.2fms\n",
+ tel_embed_us / 1000.0 / tel_n_tokens,
+ tel_layers_us / 1000.0 / tel_n_tokens,
+ tel_logits_us / 1000.0 / tel_n_tokens,
+ tel_sample_us / 1000.0 / tel_n_tokens);
+ std::printf(" time budget: embed=%.1f%% layers=%.1f%% logits=%.1f%% sample=%.1f%%\n",
+ 100.0 * tel_embed_us / total_us,
+ 100.0 * tel_layers_us / total_us,
+ 100.0 * tel_logits_us / total_us,
+ 100.0 * tel_sample_us / total_us);
+ // Routed path breakdown (the dominant path)
+ if (tel_layers_accum.routed_ffn_layers > 0) {
+ const int rl = tel_layers_accum.routed_ffn_layers;
+ std::printf(" routed FFN path (%d layer-evals, %d cold_hits / %d slots = %.1f%% cold):\n",
+ rl,
+ tel_layers_accum.routed_cold_expert_hits,
+ tel_layers_accum.routed_total_expert_slots,
+ tel_layers_accum.routed_total_expert_slots > 0
+ ? 100.0 * tel_layers_accum.routed_cold_expert_hits / tel_layers_accum.routed_total_expert_slots
+ : 0.0);
+ std::printf(" per-layer avg: prefn_dispatch=%.1fus sync_stall=%.1fus readback=%.1fus remap=%.1fus ffn_dispatch=%.1fus\n",
+ (double)tel_layers_accum.routed_prefn_us / rl,
+ (double)tel_layers_accum.routed_sync_us / rl,
+ (double)tel_layers_accum.routed_readback_us / rl,
+ (double)tel_layers_accum.routed_cpu_remap_us / rl,
+ (double)tel_layers_accum.routed_ffn_dispatch_us / rl);
+ std::printf(" total: sync_stall=%.1fms (%.1f%% of layers time)\n",
+ tel_layers_accum.routed_sync_us / 1000.0,
+ 100.0 * tel_layers_accum.routed_sync_us / (double)tel_layers_us);
+ std::printf(" final_sync=%.1fms (%.1f%% of layers time)\n",
+ tel_layers_accum.routed_final_sync_us / 1000.0,
+ 100.0 * tel_layers_accum.routed_final_sync_us / (double)tel_layers_us);
+ }
+ // Split path stats (if any)
+ if (tel_layers_accum.mixed_layers > 0) {
+ std::printf(" split path: mixed=%d layers, cold_cpu=%.1fms, ffn_mixed=%.1fms\n",
+ tel_layers_accum.mixed_layers,
+ tel_layers_accum.cold_cpu_us / 1000.0,
+ tel_layers_accum.ffn_mixed_us / 1000.0);
+ }
+ std::printf(" split path allhot=%d layers, hot_graph_rebuilds=%d\n",
+ tel_layers_accum.allhot_layers - tel_layers_accum.routed_ffn_layers,
+ tel_layers_accum.hot_graph_rebuilds);
+ std::fflush(stdout);
+ }
+
step_graph_destroy(logits_sg);
return true;
}
@@ -417,7 +542,7 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
const int n_layer = target_weights().n_layer;
uint64_t build_us_total = 0, compute_us_total = 0, readback_us_total = 0, ffn_us_total = 0;
- Qwen35MoeHybridFfnTelemetry ffn_tel_accum{};
+ MoeHybridFfnTelemetry ffn_tel_accum{};
StepGraph logits_sg; // Persistent logits graph (used by spec-decode branch)
@@ -426,7 +551,7 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
};
// Helper: compute logits from act_cur (persistent graph, built once)
- auto compute_logits = [&]() -> bool {
+ auto compute_logits = [&](ggml_tensor* gpu_src = nullptr) -> bool {
if (!logits_sg.ctx) {
// First call: build the logits graph
ggml_init_params ip{};
@@ -449,7 +574,13 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
return false;
}
}
- ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+ if (gpu_src) {
+ // GPU→GPU: pipe act_cur directly without host bounce
+ ggml_backend_tensor_copy_async(target_backend(), target_backend(),
+ gpu_src, logits_sg.hidden_input);
+ } else {
+ ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden);
+ }
auto st = ggml_backend_graph_compute(target_backend(), logits_sg.gf);
if (st != GGML_STATUS_SUCCESS) return false;
ggml_backend_tensor_get(logits_sg.logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab);
@@ -474,9 +605,12 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
}
// Process layer by layer, chunked within each layer
+ StepGraph prefill_sg; // persistent across layers to reuse GPU buffer
+ ggml_gallocr_t ffn_hot_alloc = nullptr;
+ ggml_gallocr_t ffn_cold_alloc = nullptr;
+
for (int il = 0; il < n_layer; ++il) {
auto & storage = target_weights().moe_hybrid->layers[(size_t)il];
- const auto & L = target_weights().layers[(size_t)il];
for (int chunk_start = 0; chunk_start < prompt_len; chunk_start += prefill_chunk) {
const int chunk_len = std::min(prefill_chunk, prompt_len - chunk_start);
@@ -485,12 +619,14 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (chunk_len > 1);
// Build pre-FFN graph for this chunk
- StepGraph prefill_sg;
+ step_graph_free(prefill_sg); // reset ctx/graph but keep gallocr buffer
if (!build_layer_prefn_step(prefill_sg, target_weights(), target_cache(), target_backend(),
il, /*kv_start=*/chunk_start, /*n_tokens=*/chunk_len,
with_mask, /*fa_window=*/0, cfg_.kq_stride_pad)) {
result.error = "prefill_build";
step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
cleanup_graphs();
return result;
}
@@ -531,6 +667,8 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
if (st != GGML_STATUS_SUCCESS) {
result.error = "prefill_compute";
step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
cleanup_graphs();
return result;
}
@@ -570,33 +708,43 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
}
}
- // Batched hybrid FFN for this chunk.
- // The routed-expert mul_mat_id MMQ kernel writes out of bounds on
- // Ampere when the per-call token count exceeds ~8: the expert token
- // distribution overshoots the destination tiles on the
- // need_check=false write path, silently corrupting neighbouring GPU
- // allocations during prefill and crashing with an illegal memory
- // access at a later decode sync (~4th request under the server).
- // Sub-batch the FFN to a safe width so the attention prefill can
- // stay at the full chunk size.
- std::vector ffn_batch_out((size_t)chunk_len * (size_t)hidden);
- constexpr int kFfnSafeBatch = 8;
- for (int fb = 0; fb < chunk_len; fb += kFfnSafeBatch) {
- const int fl = std::min(kFfnSafeBatch, chunk_len - fb);
- std::vector sub_out;
- if (!eval_qwen35moe_hybrid_ffn_batched(
- target_backend(), cpu_be, target_weights(), L, storage,
- chunk_post.data() + (size_t)fb * (size_t)hidden,
- chunk_selected.data() + (size_t)fb * (size_t)n_expert_used,
- chunk_weights.data() + (size_t)fb * (size_t)n_expert_used,
- fl, sub_out, &result.error)) {
- step_graph_destroy(prefill_sg);
- cleanup_graphs();
- return result;
+ // Hybrid FFN — skip batched path when cold experts exist (CUDA mul_mat_id bug on sm_75)
+ MoeHybridConfig chunk_cfg = make_moe_hybrid_config(target_weights());
+ MoeLayerDesc chunk_desc = make_moe_layer_desc(target_weights().layers[(size_t)il]);
+ std::vector ffn_batch_out;
+ bool ffn_ok = false;
+ if (storage.cold_expert_ids.empty()) {
+ // All experts hot — safe to use batched path
+ ffn_ok = eval_moe_hybrid_ffn_batched(
+ target_backend(), cpu_be, chunk_cfg, chunk_desc, storage,
+ chunk_post.data(),
+ chunk_selected.data(),
+ chunk_weights.data(),
+ chunk_len, ffn_batch_out, &result.error,
+ &ffn_hot_alloc, &ffn_cold_alloc);
+ }
+ if (!ffn_ok) {
+ // Per-token fallback (avoids sm_75 mul_mat_id assertion with cold experts)
+ result.error.clear();
+ ffn_batch_out.assign((size_t)hidden * (size_t)chunk_len, 0.0f);
+ std::vector single_out;
+ for (int ti = 0; ti < chunk_len; ++ti) {
+ const float * tok_post = chunk_post.data() + (size_t)ti * (size_t)hidden;
+ const int32_t * tok_sel = chunk_selected.data() + (size_t)ti * (size_t)n_expert_used;
+ const float * tok_wts = chunk_weights.data() + (size_t)ti * (size_t)n_expert_used;
+ if (!eval_moe_hybrid_ffn_single(
+ target_backend(), chunk_cfg, chunk_desc, storage, cpu_be,
+ tok_post, tok_sel, tok_wts, n_expert_used, single_out)) {
+ result.error = "prefill_ffn_single";
+ step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
+ cleanup_graphs();
+ return result;
+ }
+ std::memcpy(ffn_batch_out.data() + (size_t)ti * (size_t)hidden,
+ single_out.data(), sizeof(float) * (size_t)hidden);
}
- std::memcpy(ffn_batch_out.data() + (size_t)fb * (size_t)hidden,
- sub_out.data(),
- (size_t)fl * (size_t)hidden * sizeof(float));
}
// Combine FFN output + residual → embed_all for next layer
@@ -634,10 +782,11 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
}
const auto t4 = HybridClock::now();
ffn_us_total += elapsed_us(t3, t4);
-
- step_graph_destroy(prefill_sg);
}
}
+ step_graph_destroy(prefill_sg);
+ if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc);
+ if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc);
// Copy last token's output to act_cur for decode
std::memcpy(act_cur.data(), embed_all.data() + (size_t)(prompt_len - 1) * (size_t)hidden,
@@ -729,8 +878,9 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
cleanup_graphs();
return result;
}
- ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
- sizeof(float) * (size_t)hidden);
+ // Upload embedding async on compute stream
+ ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur,
+ act_cur.data(), 0, sizeof(float) * (size_t)hidden);
PipelinedDecodeTelemetry tel;
if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(),
@@ -749,14 +899,31 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
decode_tel_accum.ffn_us += tel.ffn_us;
decode_tel_accum.ffn_allhot_us += tel.ffn_allhot_us;
decode_tel_accum.ffn_mixed_us += tel.ffn_mixed_us;
+ decode_tel_accum.gpu_idle_us += tel.gpu_idle_us;
+ decode_tel_accum.tensor_io_us += tel.tensor_io_us;
+ decode_tel_accum.combine_overhead_us += tel.combine_overhead_us;
+ decode_tel_accum.cold_cpu_us += tel.cold_cpu_us;
+ decode_tel_accum.cold_compute_us += tel.cold_compute_us;
+ decode_tel_accum.hot_graph_build_us += tel.hot_graph_build_us;
+ decode_tel_accum.ffn_post_get_us += tel.ffn_post_get_us;
+ decode_tel_accum.sync_wait_us += tel.sync_wait_us;
decode_tel_accum.allhot_layers += tel.allhot_layers;
decode_tel_accum.mixed_layers += tel.mixed_layers;
decode_tel_accum.total_layers += tel.total_layers;
+ decode_tel_accum.hot_graph_rebuilds += tel.hot_graph_rebuilds;
+ decode_tel_accum.routed_ffn_layers += tel.routed_ffn_layers;
+ decode_tel_accum.routed_prefn_us += tel.routed_prefn_us;
+ decode_tel_accum.routed_sync_us += tel.routed_sync_us;
+ decode_tel_accum.routed_readback_us += tel.routed_readback_us;
+ decode_tel_accum.routed_cpu_remap_us += tel.routed_cpu_remap_us;
+ decode_tel_accum.routed_ffn_dispatch_us += tel.routed_ffn_dispatch_us;
+ decode_tel_accum.routed_final_sync_us += tel.routed_final_sync_us;
+ decode_tel_accum.routed_cold_expert_hits += tel.routed_cold_expert_hits;
+ decode_tel_accum.routed_total_expert_slots += tel.routed_total_expert_slots;
}
- ggml_backend_tensor_get(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
- sizeof(float) * (size_t)hidden);
- if (!compute_logits()) {
+ // act_cur stays on GPU — compute_logits reads it via GPU→GPU copy
+ if (!compute_logits(pipe_state_->gpu_state.act_cur)) {
result.error = "decode_logits";
cleanup_graphs();
return result;
@@ -793,12 +960,51 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req,
decode_tel_accum.ffn_mixed_us / 1000.0,
decode_tel_accum.allhot_layers,
decode_tel_accum.mixed_layers);
- if (n_dec > 0) {
+ std::printf(" GPU IDLE: tensor_io=%.1fms combine=%.1fms sync_wait=%.1fms\n",
+ decode_tel_accum.tensor_io_us / 1000.0,
+ decode_tel_accum.combine_overhead_us / 1000.0,
+ decode_tel_accum.sync_wait_us / 1000.0);
+ std::printf(" CPU TIME: cold_total=%.1fms cold_compute=%.1fms hot_graph_build=%.1fms ffn_post_get=%.1fms\n",
+ decode_tel_accum.cold_cpu_us / 1000.0,
+ decode_tel_accum.cold_compute_us / 1000.0,
+ decode_tel_accum.hot_graph_build_us / 1000.0,
+ decode_tel_accum.ffn_post_get_us / 1000.0);
+ std::printf(" hot_graph_rebuilds=%d routed_ffn_layers=%d\n",
+ decode_tel_accum.hot_graph_rebuilds,
+ decode_tel_accum.routed_ffn_layers);
+ // Routed path breakdown
+ if (decode_tel_accum.routed_ffn_layers > 0) {
+ const int rl = decode_tel_accum.routed_ffn_layers;
+ std::printf(" ROUTED PATH (%d layer-evals, %d cold / %d slots = %.1f%% cold):\n",
+ rl, decode_tel_accum.routed_cold_expert_hits,
+ decode_tel_accum.routed_total_expert_slots,
+ decode_tel_accum.routed_total_expert_slots > 0
+ ? 100.0 * decode_tel_accum.routed_cold_expert_hits / decode_tel_accum.routed_total_expert_slots
+ : 0.0);
+ std::printf(" per-layer: prefn=%.1fus sync=%.1fus readback=%.1fus remap=%.1fus ffn_dispatch=%.1fus\n",
+ (double)decode_tel_accum.routed_prefn_us / rl,
+ (double)decode_tel_accum.routed_sync_us / rl,
+ (double)decode_tel_accum.routed_readback_us / rl,
+ (double)decode_tel_accum.routed_cpu_remap_us / rl,
+ (double)decode_tel_accum.routed_ffn_dispatch_us / rl);
+ std::printf(" totals: sync_stall=%.1fms final_sync=%.1fms\n",
+ decode_tel_accum.routed_sync_us / 1000.0,
+ decode_tel_accum.routed_final_sync_us / 1000.0);
+ }
+ if (n_dec > 0 && decode_tel_accum.total_us > 0) {
+ const double gpu_compute_us = (double)(decode_tel_accum.prefn_compute_us + decode_tel_accum.ffn_us - decode_tel_accum.cold_cpu_us);
+ const double gpu_util_pct = 100.0 * gpu_compute_us / (double)decode_tel_accum.total_us;
std::printf(" per-token avg: prefn_build=%.2fms prefn_compute=%.2fms readback=%.2fms ffn=%.2fms\n",
decode_tel_accum.prefn_graph_build_us / 1000.0 / n_dec,
decode_tel_accum.prefn_compute_us / 1000.0 / n_dec,
decode_tel_accum.routing_readback_us / 1000.0 / n_dec,
decode_tel_accum.ffn_us / 1000.0 / n_dec);
+ std::printf(" per-token avg: tensor_io=%.2fms combine=%.2fms cold_cpu=%.2fms cold_compute=%.2fms\n",
+ decode_tel_accum.tensor_io_us / 1000.0 / n_dec,
+ decode_tel_accum.combine_overhead_us / 1000.0 / n_dec,
+ decode_tel_accum.cold_cpu_us / 1000.0 / n_dec,
+ decode_tel_accum.cold_compute_us / 1000.0 / n_dec);
+ std::printf(" estimated GPU utilization: %.1f%%\n", gpu_util_pct);
}
std::fflush(stdout);
}
@@ -871,8 +1077,8 @@ bool Qwen35MoeBackend::hybrid_forward_one_token(int32_t tok, int kv_pos,
// Ensure pipelined state
if (!ensure_pipe_state(kv_pos)) return false;
- // Upload to GPU-resident act_cur
- ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
+ // Upload to GPU-resident act_cur (async — compute stream ordering guarantees correctness)
+ ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur, act_cur.data(), 0,
sizeof(float) * (size_t)hidden);
// Run pipelined decode (all 40 layers with cached DeltaNet + hot/cold FFN)
@@ -1157,12 +1363,12 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen,
bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
ggml_backend_t backend,
const TargetWeights & w,
- Qwen35MoeExpertPlacement & out,
+ MoeHybridPlacement & out,
std::string * err) {
// Load hotness table or assume uniform hotness
- Qwen35MoeRoutingStats hotness;
+ MoeHybridRoutingStats hotness;
if (hotness_path && hotness_path[0]) {
- if (!Qwen35MoeRoutingStats::load_csv(std::string(hotness_path), hotness, err)) {
+ if (!MoeHybridRoutingStats::load_csv(std::string(hotness_path), hotness, err)) {
return false;
}
if (hotness.n_layer != w.n_layer || hotness.n_expert != w.n_expert) {
@@ -1277,7 +1483,7 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
}
// Build placement using greedy knapsack with byte budget
- if (!Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes(
+ if (!MoeHybridPlacement::build_from_stats_with_layer_bytes(
hotness, layer_expert_bytes, expert_budget,
/*min_hot_per_layer=*/std::min(w.n_expert_used, w.n_expert),
out, err)) {
diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h
index 9c9ccf8dd..7e29d8f07 100644
--- a/server/src/qwen35moe/qwen35moe_backend.h
+++ b/server/src/qwen35moe/qwen35moe_backend.h
@@ -4,11 +4,11 @@
#include "qwen35_backend.h"
#include "graph_builders.h"
-#include "qwen35moe_hybrid_ffn_eval.h"
-#include "qwen35moe_hybrid_storage.h"
#include "qwen35moe_pipelined_decode.h"
-#include "qwen35moe_routing_stats.h"
-#include "qwen35moe_swap_manager.h"
+#include "../common/moe_hybrid_ffn_eval.h"
+#include "../common/moe_hybrid_storage.h"
+#include "../common/moe_hybrid_routing_stats.h"
+#include "../common/moe_hybrid_swap_manager.h"
#include
#include
@@ -36,17 +36,17 @@ class Qwen35MoeBackend : public Qwen35Backend {
void after_target_compute(StepGraph & sg, int kv_start, int n_tokens) override;
private:
- std::shared_ptr routing_stats_;
+ std::shared_ptr routing_stats_;
std::string routing_stats_out_path_;
std::string placement_out_path_;
- Qwen35MoeSwapPolicy swap_policy_;
+ MoeHybridSwapPolicy swap_policy_;
bool hybrid_telemetry_ = false;
void maybe_post_request_swap();
bool load_dynamic_placement(const char * hotness_path,
ggml_backend_t backend,
const TargetWeights & w,
- Qwen35MoeExpertPlacement & out,
+ MoeHybridPlacement & out,
std::string * err);
// Hybrid speculative decode: draft tokens using DFlash draft model,
diff --git a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
deleted file mode 100644
index 55f824a03..000000000
--- a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
+++ /dev/null
@@ -1,219 +0,0 @@
-// Single-token hybrid qwen35moe FFN evaluation helpers.
-
-#pragma once
-
-#include "internal.h"
-#include "qwen35moe_hybrid_storage.h"
-
-#include "ggml-backend.h"
-
-#include
-#include
-#include
-
-namespace dflash::common {
-
-// GPU-resident residual combine graph: output = residual + hot_out + cold_correction.
-// Built once at decode start, reused every layer to keep act_cur on GPU.
-struct ResidualCombineGraph {
- ggml_context * ctx = nullptr;
- ggml_cgraph * gf = nullptr;
- ggml_gallocr_t alloc = nullptr;
- ggml_tensor * residual_in = nullptr; // [n_embd] F32 input
- ggml_tensor * hot_in = nullptr; // [n_embd] F32 input
- ggml_tensor * cold_in = nullptr; // [n_embd] F32 input (zeros when no cold)
- ggml_tensor * output = nullptr; // [n_embd] F32 output
-
- ResidualCombineGraph() = default;
- ~ResidualCombineGraph() { free(); }
- ResidualCombineGraph(const ResidualCombineGraph &) = delete;
- ResidualCombineGraph & operator=(const ResidualCombineGraph &) = delete;
- ResidualCombineGraph(ResidualCombineGraph && o) noexcept
- : ctx(o.ctx), gf(o.gf), alloc(o.alloc),
- residual_in(o.residual_in), hot_in(o.hot_in),
- cold_in(o.cold_in), output(o.output) {
- o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
- o.residual_in = nullptr; o.hot_in = nullptr;
- o.cold_in = nullptr; o.output = nullptr;
- }
- ResidualCombineGraph & operator=(ResidualCombineGraph && o) noexcept {
- if (this != &o) {
- free();
- ctx = o.ctx; gf = o.gf; alloc = o.alloc;
- residual_in = o.residual_in; hot_in = o.hot_in;
- cold_in = o.cold_in; output = o.output;
- o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
- o.residual_in = nullptr; o.hot_in = nullptr;
- o.cold_in = nullptr; o.output = nullptr;
- }
- return *this;
- }
- bool valid() const { return ctx && gf && alloc && output; }
- void free();
- void destroy();
-};
-
-// Build the residual combine graph on the given GPU backend.
-bool build_residual_combine_graph(ResidualCombineGraph & out, ggml_backend_t backend, int n_embd);
-
-// GPU-resident state for the decode loop: persistent act_cur + combine graph.
-struct GpuResidentState {
- ggml_context * ctx = nullptr;
- ggml_backend_buffer_t buf = nullptr;
- ggml_tensor * act_cur = nullptr; // [n_embd] F32 persistent GPU tensor
-
- ResidualCombineGraph combine;
-
- GpuResidentState() = default;
- ~GpuResidentState() { destroy(); }
- GpuResidentState(const GpuResidentState &) = delete;
- GpuResidentState & operator=(const GpuResidentState &) = delete;
- GpuResidentState(GpuResidentState && o) noexcept
- : ctx(o.ctx), buf(o.buf), act_cur(o.act_cur),
- combine(std::move(o.combine)) {
- o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
- }
- GpuResidentState & operator=(GpuResidentState && o) noexcept {
- if (this != &o) {
- destroy();
- ctx = o.ctx; buf = o.buf; act_cur = o.act_cur;
- combine = std::move(o.combine);
- o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
- }
- return *this;
- }
- bool valid() const { return ctx && buf && act_cur && combine.valid(); }
- void destroy();
-};
-
-// Initialize GPU-resident state for decode.
-bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd);
-
-struct Qwen35MoeHybridFfnTelemetry {
- uint64_t ffn_wall_us = 0;
- uint64_t partition_us = 0;
- uint64_t hot_us = 0;
- uint64_t cold_us = 0;
- uint64_t shared_us = 0;
- uint64_t combine_us = 0;
- int hot_selected = 0;
- int cold_selected = 0;
-};
-
-bool eval_qwen35moe_reference_ffn_single(
- ggml_backend_t gpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- const float * cur_host,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_selected,
- std::vector & out,
- std::string * err = nullptr);
-
-bool eval_qwen35moe_hybrid_ffn_single(
- ggml_backend_t gpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- Qwen35MoeHybridLayerStorage & storage,
- ggml_backend_t cpu_backend,
- const float * cur_host,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_selected,
- std::vector & out,
- Qwen35MoeHybridFfnTelemetry * telemetry = nullptr,
- std::string * err = nullptr);
-
-// Batched prefill FFN: processes n_tokens at once using the full GPU expert tensors.
-// Uses pre-computed routing (selected_ids, selected_weights) from the pre-FFN graph.
-// cur_host: [n_embd × n_tokens] post-norm hidden states
-// selected_ids: [n_expert_used × n_tokens] expert selections (global IDs)
-// selected_weights: [n_expert_used × n_tokens] routing weights
-// out: [n_embd × n_tokens] output (resized internally)
-bool eval_qwen35moe_batched_prefill_ffn(
- ggml_backend_t gpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- const float * cur_host,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_tokens,
- std::vector & out,
- std::string * err = nullptr);
-
-// Batched hybrid prefill FFN: processes n_tokens at once with hot experts on GPU
-// and cold experts on CPU concurrently. Uses pre-computed routing from the pre-FFN
-// graph. Falls back to eval_qwen35moe_batched_prefill_ffn when all selected experts
-// are hot.
-// cur_host: [n_embd × n_tokens] post-norm hidden states (row-major)
-// selected_ids: [n_expert_used × n_tokens] expert selections (global IDs)
-// selected_weights: [n_expert_used × n_tokens] routing weights
-// out: [n_embd × n_tokens] output (resized internally)
-bool eval_qwen35moe_hybrid_ffn_batched(
- ggml_backend_t gpu_backend,
- ggml_backend_t cpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- Qwen35MoeHybridLayerStorage & storage,
- const float * cur_host,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_tokens,
- std::vector & out,
- std::string * err = nullptr);
-
-// GPU-resident single-token hybrid FFN eval: keeps data on GPU, only reads
-// router IDs to CPU for hot/cold partitioning. Uses tensor_copy for GPU→GPU
-// transfers instead of round-tripping through host memory.
-// ffn_post_gpu: [n_embd] F32 on GPU — the post-attention-norm hidden state
-// ffn_residual_gpu: [n_embd] F32 on GPU — the pre-FFN residual
-// gpu_state: persistent GPU state with act_cur and combine graph
-// After call: gpu_state.act_cur holds the layer output on GPU.
-bool eval_qwen35moe_hybrid_ffn_gpu_resident(
- ggml_backend_t gpu_backend,
- const TargetWeights & w,
- const TargetLayer & L,
- Qwen35MoeHybridLayerStorage & storage,
- ggml_backend_t cpu_backend,
- ggml_tensor * ffn_post_gpu,
- ggml_tensor * ffn_residual_gpu,
- GpuResidentState & gpu_state,
- const int32_t * selected_ids,
- const float * selected_weights,
- int n_selected);
-
-// Build/rebuild cached hot FFN graph for a given number of hot experts.
-bool build_cached_hot_graph(
- CachedFfnGraph & out,
- ggml_backend_t backend,
- ggml_tensor * gate_tensor,
- ggml_tensor * up_tensor,
- ggml_tensor * down_tensor,
- ggml_tensor * gate_up_tensor,
- float gate_scale,
- float up_scale,
- float down_scale,
- float gate_up_scale,
- const TargetLayer & L,
- int n_embd,
- int n_ff_exp,
- int n_hot);
-
-// Build/rebuild cached cold FFN graph for a given number of cold experts.
-bool build_cached_cold_graph(
- CachedFfnGraph & out,
- ggml_backend_t cpu_backend,
- ggml_tensor * gate_tensor,
- ggml_tensor * up_tensor,
- ggml_tensor * down_tensor,
- ggml_tensor * gate_up_tensor,
- float gate_scale,
- float up_scale,
- float down_scale,
- float gate_up_scale,
- int n_embd,
- int n_ff_exp,
- int n_cold);
-
-} // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
index 78039e4ad..e89f9d057 100644
--- a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
+++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp
@@ -3,6 +3,8 @@
#include "qwen35moe_pipelined_decode.h"
+#include "../common/moe_hybrid_types_impl.h"
+
#include "ggml-alloc.h"
#include "ggml-backend.h"
@@ -31,6 +33,7 @@ void CachedPrefnGraph::free() {
moe_weights = nullptr;
}
+
// Build a cached pre-FFN graph for a DeltaNet layer.
// DeltaNet layers have no kv_start-dependent views — the graph structure is
// identical across tokens. We build once and reuse by updating inp_embed data.
@@ -96,6 +99,8 @@ static bool build_cached_deltanet_prefn(
void PipelinedDecodeState::destroy() {
for (auto & cpg : cached_prefn) cpg.free();
cached_prefn.clear();
+ for (auto & rff : cached_routed_ffn) rff.free();
+ cached_routed_ffn.clear();
gpu_state.destroy();
routing_ids_buf.clear();
routing_weights_buf.clear();
@@ -109,6 +114,7 @@ bool init_pipelined_decode_state(
ggml_backend_t backend,
const TargetWeights & w,
TargetCache & cache,
+ MoeHybridStorage & hybrid,
int kv_start,
int kq_stride_pad) {
@@ -129,27 +135,90 @@ bool init_pipelined_decode_state(
out.routing_weights_buf.resize((size_t)w.n_expert_used);
out.ffn_post_host_buf.resize((size_t)w.n_embd);
- // Build cached pre-FFN graphs for DeltaNet layers
+ // Check if routed FFN pipeline is disabled
+ const bool routed_disabled = (std::getenv("DFLASH_QWEN35MOE_NO_ROUTED") != nullptr);
+
+ // Cold experts are computed on the cold backend (CPU/Halo) by default.
+ // Set DFLASH_DROP_COLD=1 to skip cold computation (fast but lossy).
+ out.cold_compute = (std::getenv("DFLASH_DROP_COLD") == nullptr);
+
+ // Build cached pre-FFN graphs for all DeltaNet layers.
out.cached_prefn.resize((size_t)w.n_layer);
- int cached_count = 0;
+ int cached_prefn_count = 0;
for (int il = 0; il < w.n_layer; ++il) {
const bool is_attn = (((il + 1) % w.full_attention_interval) == 0);
if (!is_attn) {
- // DeltaNet layer: cache the graph
if (!build_cached_deltanet_prefn(
out.cached_prefn[(size_t)il], backend, w, cache, il, kv_start, kq_stride_pad)) {
std::fprintf(stderr, "[pipelined] failed to cache DeltaNet prefn for layer %d\n", il);
- // Non-fatal: will fall back to dynamic build for this layer
} else {
- cached_count++;
+ cached_prefn_count++;
}
}
- // Attention layers: cached_prefn[il] remains invalid (rebuilt per-token)
}
- out.cold_in_zeroed = true;
- // cold_in was already zeroed in init_gpu_resident_state
+ // Build cached routed FFN graphs for ALL layers (StreamMoE-inspired pipeline).
+ // Includes attention layers — eliminates expensive split-path FFN for mixed layers.
+ // Cold entries get weight=0 at runtime, contributing nothing to output.
+ out.cached_routed_ffn.resize((size_t)w.n_layer);
+ int routed_count = 0;
+ if (!routed_disabled) {
+ for (int il = 0; il < w.n_layer; ++il) {
+ if ((size_t)il >= hybrid.layers.size()) continue;
+
+ auto & storage = hybrid.layers[(size_t)il];
+ const TargetLayer & L = w.layers[(size_t)il];
+
+ if (!build_cached_hot_graph(
+ out.cached_routed_ffn[(size_t)il], backend,
+ storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
+ L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
+ make_moe_layer_desc(L), w.n_embd, w.n_ff_exp, w.n_expert_used)) {
+ // Non-fatal: fall back to split path for this layer
+ } else {
+ routed_count++;
+ }
+ }
+ }
+ std::fprintf(stderr, "[pipelined] cached %d prefn + %d routed FFN graphs%s\n",
+ cached_prefn_count, routed_count,
+ out.cold_compute ? "" : " (drop_cold=lossy)");
+
+ // Initialize fused cold FFN compute (bypasses ggml graph dispatch)
+ if (out.cold_compute) {
+ out.cold_ffn_compute = make_cpu_cold_ffn_compute(w.n_ff_exp);
+ out.cold_ffn_layers.resize((size_t)w.n_layer);
+ out.cold_output_buf.resize((size_t)w.n_embd);
+ for (int il = 0; il < w.n_layer && (size_t)il < hybrid.layers.size(); ++il) {
+ auto & storage = hybrid.layers[(size_t)il];
+ const TargetLayer & L = w.layers[(size_t)il];
+ auto & cl = out.cold_ffn_layers[(size_t)il];
+ cl.fused_gate_up = (storage.gate_up_cold != nullptr);
+ if (cl.fused_gate_up) {
+ cl.gate_up_data = storage.gate_up_cold ? storage.gate_up_cold->data : nullptr;
+ cl.gate_up_stride = storage.gate_up_cold ? storage.gate_up_cold->nb[2] : 0;
+ cl.gate_up_type = storage.gate_up_cold ? storage.gate_up_cold->type : GGML_TYPE_Q4_K;
+ cl.gate_up_scale = L.ffn_gate_up_exps_s;
+ } else {
+ cl.gate_data = storage.gate_cold ? storage.gate_cold->data : nullptr;
+ cl.up_data = storage.up_cold ? storage.up_cold->data : nullptr;
+ cl.gate_stride = storage.gate_cold ? storage.gate_cold->nb[2] : 0;
+ cl.up_stride = storage.up_cold ? storage.up_cold->nb[2] : 0;
+ cl.gate_type = storage.gate_cold ? storage.gate_cold->type : GGML_TYPE_Q4_K;
+ cl.up_type = storage.up_cold ? storage.up_cold->type : GGML_TYPE_Q4_K;
+ cl.gate_scale = L.ffn_gate_exps_s;
+ cl.up_scale = L.ffn_up_exps_s;
+ }
+ cl.down_data = storage.down_cold ? storage.down_cold->data : nullptr;
+ cl.down_stride = storage.down_cold ? storage.down_cold->nb[2] : 0;
+ cl.down_type = storage.down_cold ? storage.down_cold->type : GGML_TYPE_Q4_K;
+ cl.down_scale = L.ffn_down_exps_s;
+ }
+ std::fprintf(stderr, "[pipelined] cold FFN: fused kernel (bypasses ggml graph)\n");
+ }
+
+ out.cold_in_zeroed = true;
return true;
}
@@ -160,7 +229,7 @@ bool pipelined_decode_one_token(
ggml_backend_t backend,
const TargetWeights & w,
TargetCache & cache,
- Qwen35MoeHybridStorage & hybrid,
+ MoeHybridStorage & hybrid,
int kv_pos,
int kq_stride_pad,
PipelinedDecodeTelemetry * tel) {
@@ -171,16 +240,7 @@ bool pipelined_decode_one_token(
ggml_backend_t cpu_be = hybrid.cpu_backend;
if (tel) {
- tel->total_us = 0;
- tel->prefn_graph_build_us = 0;
- tel->prefn_compute_us = 0;
- tel->routing_readback_us = 0;
- tel->ffn_us = 0;
- tel->ffn_allhot_us = 0;
- tel->ffn_mixed_us = 0;
- tel->allhot_layers = 0;
- tel->mixed_layers = 0;
- tel->total_layers = 0;
+ *tel = PipelinedDecodeTelemetry{};
}
const auto tok_t0 = PipelineClock::now();
@@ -190,6 +250,158 @@ bool pipelined_decode_one_token(
const bool is_attn = (((il + 1) % state.full_attention_interval) == 0);
const auto prefn_build_t0 = PipelineClock::now();
+ // ══════════════════════════════════════════════════════════════════════
+ // ROUTED FFN FAST PATH (StreamMoE-inspired async pipeline):
+ // prefn(async) → sync → routing readback → rffn(async) + cold(CPU parallel) → combine(async)
+ // Handles both all-hot and mixed layers. Cold compute runs on CPU
+ // in parallel with GPU rffn — zero overhead when all experts are hot.
+ // ══════════════════════════════════════════════════════════════════════
+ if (!is_attn
+ && state.cached_prefn[(size_t)il].valid()
+ && state.cached_routed_ffn[(size_t)il].valid()) {
+
+ auto & cpg = state.cached_prefn[(size_t)il];
+ auto & rffn = state.cached_routed_ffn[(size_t)il];
+
+ // 1. Copy act_cur → prefn input (GPU→GPU async)
+ ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed);
+
+ if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now());
+
+ // 2. Run prefn graph (DeltaNet + router)
+ const auto prefn_compute_t0 = PipelineClock::now();
+ ggml_backend_graph_compute_async(backend, cpg.gf);
+
+ // 3. Sync to read routing decisions from prefn output
+ const auto sync_t0 = PipelineClock::now();
+ ggml_backend_synchronize(backend);
+ const auto sync_t1 = PipelineClock::now();
+
+ // Read routing decisions from GPU
+ int32_t global_ids[8];
+ float router_weights[8];
+ ggml_backend_tensor_get(cpg.moe_selected, global_ids, 0,
+ sizeof(int32_t) * (size_t)n_expert_used);
+ ggml_backend_tensor_get(cpg.moe_weights, router_weights, 0,
+ sizeof(float) * (size_t)n_expert_used);
+ const auto readback_t1 = PipelineClock::now();
+
+ // CPU-side local ID mapping + cold partition (trivial: 8 lookups)
+ auto & storage = hybrid.layers[(size_t)il];
+ int32_t local_ids[8];
+ float masked_weights[8];
+ int32_t cold_ids[8];
+ float cold_weights[8];
+ int n_cold = 0;
+ int layer_cold_hits = 0;
+ for (int i = 0; i < n_expert_used; ++i) {
+ int32_t gid = global_ids[i];
+ int32_t lid = (gid >= 0 && gid < (int)storage.hot_local_by_global.size())
+ ? storage.hot_local_by_global[(size_t)gid] : -1;
+ if (lid >= 0) {
+ local_ids[i] = lid;
+ masked_weights[i] = router_weights[i];
+ } else {
+ local_ids[i] = 0; // safe: maps to expert 0 (result zeroed by weight)
+ masked_weights[i] = 0.0f; // cold expert contributes nothing to hot path
+ layer_cold_hits++;
+ // Record for cold compute
+ if (state.cold_ffn_compute && gid >= 0 && gid < (int)storage.cold_local_by_global.size()) {
+ int32_t cold_local = storage.cold_local_by_global[(size_t)gid];
+ if (cold_local >= 0) {
+ cold_ids[n_cold] = cold_local;
+ cold_weights[n_cold] = router_weights[i];
+ n_cold++;
+ }
+ }
+ }
+ }
+ const bool has_cold_selected = (n_cold > 0);
+ const auto remap_t1 = PipelineClock::now();
+
+ // D2H ffn_post for cold compute (GPU already synced, data is ready)
+ if (has_cold_selected) {
+ ggml_backend_tensor_get(cpg.ffn_post, state.ffn_post_host_buf.data(), 0,
+ sizeof(float) * (size_t)n_embd);
+ }
+
+ // Upload pre-computed inputs to rffn graph (H→D async on compute stream)
+ ggml_backend_tensor_set_async(backend, rffn.ids, local_ids, 0,
+ sizeof(int32_t) * (size_t)n_expert_used);
+ ggml_backend_tensor_set_async(backend, rffn.weights, masked_weights, 0,
+ sizeof(float) * (size_t)n_expert_used);
+ // Copy ffn_post from prefn output → rffn input (GPU→GPU, already synced)
+ ggml_backend_tensor_copy_async(backend, backend, cpg.ffn_post, rffn.inp);
+
+ // 4. Copy residual to combine input (async)
+ ggml_backend_tensor_copy_async(backend, backend, cpg.ffn_residual, state.gpu_state.combine.residual_in);
+
+ // 5. Run routed FFN graph (async — mul_mat_id + shared expert)
+ ggml_backend_graph_compute_async(backend, rffn.gf);
+
+ // 6. Cold compute on CPU (parallel with GPU rffn above)
+ const auto cold_t0 = PipelineClock::now();
+ if (has_cold_selected) {
+ state.cold_ffn_compute->compute(
+ state.cold_ffn_layers[(size_t)il],
+ state.ffn_post_host_buf.data(),
+ cold_ids, cold_weights, n_cold,
+ n_embd, w.n_ff_exp,
+ state.cold_output_buf.data());
+ }
+ if (tel && has_cold_selected) tel->cold_compute_us += pipe_elapsed_us(cold_t0, PipelineClock::now());
+
+ // 7. Copy FFN output → combine.hot_in (async, ordered after FFN on GPU stream)
+ ggml_backend_tensor_copy_async(backend, backend, rffn.output, state.gpu_state.combine.hot_in);
+
+ // 8. Upload cold result or ensure cold_in is zero
+ if (has_cold_selected) {
+ ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in,
+ state.cold_output_buf.data(), 0,
+ sizeof(float) * (size_t)n_embd);
+ state.cold_in_zeroed = false;
+ } else if (!state.cold_in_zeroed) {
+ static float zeros[8192] = {};
+ ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0,
+ sizeof(float) * (size_t)n_embd);
+ state.cold_in_zeroed = true;
+ }
+
+ // 9. Run combine graph (async — adds residual + hot + cold)
+ ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf);
+
+ // 10. Copy combine output → act_cur for next layer (async)
+ ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur);
+
+ if (tel) {
+ tel->prefn_compute_us += pipe_elapsed_us(prefn_compute_t0, PipelineClock::now());
+ tel->routed_prefn_us += pipe_elapsed_us(prefn_compute_t0, sync_t0);
+ tel->routed_sync_us += pipe_elapsed_us(sync_t0, sync_t1);
+ tel->routed_readback_us += pipe_elapsed_us(sync_t1, readback_t1);
+ tel->routed_cpu_remap_us += pipe_elapsed_us(readback_t1, remap_t1);
+ tel->routed_ffn_dispatch_us += pipe_elapsed_us(remap_t1, PipelineClock::now());
+ tel->routed_cold_expert_hits += layer_cold_hits;
+ tel->routed_total_expert_slots += n_expert_used;
+ if (has_cold_selected) {
+ tel->mixed_layers++;
+ } else {
+ tel->allhot_layers++;
+ }
+ tel->total_layers++;
+ tel->routed_ffn_layers++;
+ }
+ continue;
+ }
+
+ // ══════════════════════════════════════════════════════════════════════
+ // SPLIT PATH: separate prefn + routing readback + FFN (original logic)
+ // Used for attention layers or layers without routed FFN graph.
+ // ══════════════════════════════════════════════════════════════════════
+
+ // Sync any pending async work before entering the split path
+ // (split path needs synchronous access to GPU data)
+ ggml_backend_synchronize(backend);
+
ggml_tensor * ffn_post_gpu = nullptr;
ggml_tensor * ffn_residual_gpu = nullptr;
ggml_tensor * moe_selected_tensor = nullptr;
@@ -203,11 +415,11 @@ bool pipelined_decode_one_token(
step_graph_destroy(dyn_sg);
return false;
}
- // Copy act_cur to graph input (GPU→GPU)
- ggml_backend_tensor_copy(state.gpu_state.act_cur, dyn_sg.inp_embed);
+ // Copy act_cur to graph input (GPU→GPU) — async on compute stream
+ ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, dyn_sg.inp_embed);
if (dyn_sg.positions) {
int32_t pos4[4] = {kv_pos, kv_pos, kv_pos, 0};
- ggml_backend_tensor_set(dyn_sg.positions, pos4, 0, sizeof(pos4));
+ ggml_backend_tensor_set_async(backend, dyn_sg.positions, pos4, 0, sizeof(pos4));
}
if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now());
@@ -228,7 +440,8 @@ bool pipelined_decode_one_token(
} else {
// DeltaNet layer: reuse cached graph, just update input
auto & cpg = state.cached_prefn[(size_t)il];
- ggml_backend_tensor_copy(state.gpu_state.act_cur, cpg.inp_embed);
+ // Async copy on compute stream — ordered before next graph_compute
+ ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed);
if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now());
@@ -244,19 +457,122 @@ bool pipelined_decode_one_token(
}
// ── Read routing decisions (tiny: 32 + 32 bytes) ──
+ // Use get_async + single sync instead of 2 separate sync tensor_gets.
+ // After graph_compute (SYNC) above, data is ready — just need D2H copy.
const auto routing_t0 = PipelineClock::now();
if (!moe_selected_tensor || !moe_weights_tensor) return false;
- ggml_backend_tensor_get(moe_selected_tensor, state.routing_ids_buf.data(), 0,
+ ggml_backend_tensor_get_async(backend, moe_selected_tensor, state.routing_ids_buf.data(), 0,
sizeof(int32_t) * (size_t)n_expert_used);
- ggml_backend_tensor_get(moe_weights_tensor, state.routing_weights_buf.data(), 0,
+ ggml_backend_tensor_get_async(backend, moe_weights_tensor, state.routing_weights_buf.data(), 0,
sizeof(float) * (size_t)n_expert_used);
+ ggml_backend_synchronize(backend);
if (tel) tel->routing_readback_us += pipe_elapsed_us(routing_t0, PipelineClock::now());
- // ── FFN: hot/cold partition + compute ──
+ // ── FFN: use routed FFN (cold-masking) if graph available, else split path ──
const auto ffn_t0 = PipelineClock::now();
auto & storage = hybrid.layers[(size_t)il];
const auto & L = w.layers[(size_t)il];
+ // Try routed FFN path for this layer (works for attention layers too)
+ // Handles cold experts inline — cold compute runs parallel with GPU rffn.
+ auto & rffn = state.cached_routed_ffn[(size_t)il];
+ if (rffn.valid()) {
+ // Partition hot/cold: remap global→local, zero cold weights for hot path
+ int32_t local_ids[8];
+ float masked_weights[8];
+ int32_t cold_ids[8];
+ float cold_weights[8];
+ int n_cold = 0;
+ int layer_cold_hits = 0;
+ for (int i = 0; i < n_expert_used; ++i) {
+ int32_t gid = state.routing_ids_buf[(size_t)i];
+ int32_t lid = (gid >= 0 && gid < (int)storage.hot_local_by_global.size())
+ ? storage.hot_local_by_global[(size_t)gid] : -1;
+ if (lid >= 0) {
+ local_ids[i] = lid;
+ masked_weights[i] = state.routing_weights_buf[(size_t)i];
+ } else {
+ local_ids[i] = 0;
+ masked_weights[i] = 0.0f;
+ layer_cold_hits++;
+ if (state.cold_ffn_compute && gid >= 0 && gid < (int)storage.cold_local_by_global.size()) {
+ int32_t cold_local = storage.cold_local_by_global[(size_t)gid];
+ if (cold_local >= 0) {
+ cold_ids[n_cold] = cold_local;
+ cold_weights[n_cold] = state.routing_weights_buf[(size_t)i];
+ n_cold++;
+ }
+ }
+ }
+ }
+ const bool has_cold_selected = (n_cold > 0);
+
+ // D2H ffn_post for cold compute (GPU already synced after routing readback)
+ if (has_cold_selected) {
+ ggml_backend_tensor_get(ffn_post_gpu, state.ffn_post_host_buf.data(), 0,
+ sizeof(float) * (size_t)n_embd);
+ }
+
+ // Upload IDs + weights, copy inputs, dispatch rffn (all async)
+ ggml_backend_tensor_set_async(backend, rffn.ids, local_ids, 0,
+ sizeof(int32_t) * (size_t)n_expert_used);
+ ggml_backend_tensor_set_async(backend, rffn.weights, masked_weights, 0,
+ sizeof(float) * (size_t)n_expert_used);
+ ggml_backend_tensor_copy_async(backend, backend, ffn_post_gpu, rffn.inp);
+ ggml_backend_tensor_copy_async(backend, backend, ffn_residual_gpu, state.gpu_state.combine.residual_in);
+ ggml_backend_graph_compute_async(backend, rffn.gf);
+
+ // Cold compute on CPU (parallel with GPU rffn above)
+ const auto cold_t0 = PipelineClock::now();
+ if (has_cold_selected) {
+ state.cold_ffn_compute->compute(
+ state.cold_ffn_layers[(size_t)il],
+ state.ffn_post_host_buf.data(),
+ cold_ids, cold_weights, n_cold,
+ n_embd, w.n_ff_exp,
+ state.cold_output_buf.data());
+ }
+ if (tel && has_cold_selected) tel->cold_compute_us += pipe_elapsed_us(cold_t0, PipelineClock::now());
+
+ // Copy hot result → combine input (async, ordered after rffn on GPU stream)
+ ggml_backend_tensor_copy_async(backend, backend, rffn.output, state.gpu_state.combine.hot_in);
+
+ // Upload cold result or ensure cold_in is zero
+ if (has_cold_selected) {
+ ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in,
+ state.cold_output_buf.data(), 0,
+ sizeof(float) * (size_t)n_embd);
+ state.cold_in_zeroed = false;
+ } else if (!state.cold_in_zeroed) {
+ static float zeros[8192] = {};
+ ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0,
+ sizeof(float) * (size_t)n_embd);
+ state.cold_in_zeroed = true;
+ }
+
+ ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf);
+ ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur);
+
+ if (tel) {
+ uint64_t ffn_layer_us = pipe_elapsed_us(ffn_t0, PipelineClock::now());
+ tel->ffn_us += ffn_layer_us;
+ tel->total_layers++;
+ tel->routed_ffn_layers++;
+ if (has_cold_selected) {
+ tel->mixed_layers++;
+ tel->ffn_mixed_us += ffn_layer_us;
+ } else {
+ tel->allhot_layers++;
+ tel->ffn_allhot_us += ffn_layer_us;
+ }
+ tel->routed_cold_expert_hits += layer_cold_hits;
+ tel->routed_total_expert_slots += n_expert_used;
+ }
+ continue;
+ }
+
+ // ── Fallback: full split path (no routed FFN graph for this layer) ──
+
// Partition into hot/cold (fast: just a lookup table scan, ~8 iterations)
int n_hot = 0, n_cold = 0;
int32_t hot_ids[8], cold_ids[8];
@@ -287,99 +603,125 @@ bool pipelined_decode_one_token(
// ── Read ffn_post to CPU NOW (before hot launch) ──
// The routing readback above already synced the GPU stream, so ffn_post
// is guaranteed ready. Reading it here avoids a sync AFTER hot launch.
+ const auto tensor_io_t0 = PipelineClock::now();
if (has_cold) {
ggml_backend_tensor_get(ffn_post_gpu, state.ffn_post_host_buf.data(), 0,
sizeof(float) * (size_t)n_embd);
}
+ if (tel) tel->ffn_post_get_us += pipe_elapsed_us(tensor_io_t0, PipelineClock::now());
+
- // ── GPU→GPU: copy residual to combine input ──
- ggml_backend_tensor_copy(ffn_residual_gpu, state.gpu_state.combine.residual_in);
+ // ── GPU→GPU: copy residual to combine input (async on compute stream) ──
+ ggml_backend_tensor_copy_async(backend, backend, ffn_residual_gpu, state.gpu_state.combine.residual_in);
// ── Prepare + launch hot graph (async — returns immediately) ──
bool hot_async_launched = false;
if (has_hot || has_shared) {
if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) {
+ const auto hbuild_t0 = PipelineClock::now();
build_cached_hot_graph(storage.hot_graph, backend,
storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot,
L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- L, n_embd, w.n_ff_exp, n_hot);
+ make_moe_layer_desc(L), n_embd, w.n_ff_exp, n_hot);
+ if (tel) { tel->hot_graph_build_us += pipe_elapsed_us(hbuild_t0, PipelineClock::now()); tel->hot_graph_rebuilds++; }
}
if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) {
- ggml_backend_tensor_copy(ffn_post_gpu, storage.hot_graph.inp);
+ // All setup on compute stream — no per-op cudaStreamSynchronize
+ ggml_backend_tensor_copy_async(backend, backend, ffn_post_gpu, storage.hot_graph.inp);
if (storage.hot_graph.ids && has_hot) {
- ggml_backend_tensor_set(storage.hot_graph.ids, hot_ids, 0,
- sizeof(int32_t) * (size_t)n_hot);
+ ggml_backend_tensor_set_async(backend, storage.hot_graph.ids, hot_ids, 0,
+ sizeof(int32_t) * (size_t)n_hot);
}
if (storage.hot_graph.weights && has_hot) {
- ggml_backend_tensor_set(storage.hot_graph.weights, hot_weights, 0,
- sizeof(float) * (size_t)n_hot);
+ ggml_backend_tensor_set_async(backend, storage.hot_graph.weights, hot_weights, 0,
+ sizeof(float) * (size_t)n_hot);
}
- // Launch hot GPU async — no sync until combine
+ // Launch hot GPU async — queued after copies on same stream
ggml_backend_graph_compute_async(backend, storage.hot_graph.gf);
hot_async_launched = true;
}
}
+ if (tel) tel->tensor_io_us += pipe_elapsed_us(tensor_io_t0, PipelineClock::now());
// ── Cold path: runs on CPU IN PARALLEL with hot GPU ──
+ const auto cold_t0 = PipelineClock::now();
if (has_cold) {
// ffn_post already read above (before hot launch) — no GPU sync here!
- if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) {
- build_cached_cold_graph(storage.cold_graph, cpu_be,
- storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
- L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
- n_embd, w.n_ff_exp, n_cold);
- }
- if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) {
- ggml_backend_tensor_set(storage.cold_graph.inp, state.ffn_post_host_buf.data(), 0,
- sizeof(float) * (size_t)n_embd);
- ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids, 0,
- sizeof(int32_t) * (size_t)n_cold);
- ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights, 0,
- sizeof(float) * (size_t)n_cold);
- // CPU cold compute — hot GPU runs concurrently on its stream
- auto cst = ggml_backend_graph_compute(cpu_be, storage.cold_graph.gf);
- if (cst != GGML_STATUS_SUCCESS) {
+ const auto cold_compute_t0 = PipelineClock::now();
+ if (state.cold_ffn_compute) {
+ // Fused kernel: bypass ggml graph dispatch entirely
+ state.cold_ffn_compute->compute(
+ state.cold_ffn_layers[(size_t)il],
+ state.ffn_post_host_buf.data(),
+ cold_ids,
+ cold_weights,
+ n_cold, n_embd, w.n_ff_exp,
+ state.cold_output_buf.data());
+ } else {
+ // Fallback: ggml cold graph (legacy path)
+ if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) {
+ build_cached_cold_graph(storage.cold_graph, cpu_be,
+ storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold,
+ L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s,
+ n_embd, w.n_ff_exp, n_cold);
+ }
+ if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) {
+ ggml_backend_tensor_set(storage.cold_graph.inp, state.ffn_post_host_buf.data(), 0,
+ sizeof(float) * (size_t)n_embd);
+ ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids, 0,
+ sizeof(int32_t) * (size_t)n_cold);
+ ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights, 0,
+ sizeof(float) * (size_t)n_cold);
+ auto cst = ggml_backend_graph_compute(cpu_be, storage.cold_graph.gf);
+ if (cst != GGML_STATUS_SUCCESS) {
+ if (hot_async_launched) ggml_backend_synchronize(backend);
+ return false;
+ }
+ } else {
if (hot_async_launched) ggml_backend_synchronize(backend);
return false;
}
- } else {
- if (hot_async_launched) ggml_backend_synchronize(backend);
- return false;
}
+ if (tel) tel->cold_compute_us += pipe_elapsed_us(cold_compute_t0, PipelineClock::now());
}
+ if (tel) tel->cold_cpu_us += pipe_elapsed_us(cold_t0, PipelineClock::now());
- // ── Sync hot GPU (only now — after cold CPU finished) ──
+ // ── Combine: queue on compute stream (no explicit sync needed) ──
+ const auto combine_t0 = PipelineClock::now();
if (hot_async_launched) {
- ggml_backend_synchronize(backend);
- ggml_backend_tensor_copy(storage.hot_graph.output, state.gpu_state.combine.hot_in);
+ ggml_backend_tensor_copy_async(backend, backend, storage.hot_graph.output, state.gpu_state.combine.hot_in);
} else {
float zeros[8192];
std::memset(zeros, 0, sizeof(float) * (size_t)n_embd);
- ggml_backend_tensor_set(state.gpu_state.combine.hot_in, zeros, 0,
- sizeof(float) * (size_t)n_embd);
+ ggml_backend_tensor_set_async(backend, state.gpu_state.combine.hot_in, zeros, 0,
+ sizeof(float) * (size_t)n_embd);
}
- // ── Upload cold result (or keep zeros) ──
if (has_cold) {
- ggml_backend_tensor_get(storage.cold_graph.output, state.ffn_post_host_buf.data(), 0,
- sizeof(float) * (size_t)n_embd);
- ggml_backend_tensor_set(state.gpu_state.combine.cold_in, state.ffn_post_host_buf.data(), 0,
- sizeof(float) * (size_t)n_embd);
+ const float * cold_result = state.cold_ffn_compute
+ ? state.cold_output_buf.data()
+ : nullptr;
+ if (!cold_result) {
+ // Legacy path: read from ggml tensor
+ ggml_backend_tensor_get(storage.cold_graph.output, state.ffn_post_host_buf.data(), 0,
+ sizeof(float) * (size_t)n_embd);
+ cold_result = state.ffn_post_host_buf.data();
+ }
+ ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, cold_result, 0,
+ sizeof(float) * (size_t)n_embd);
state.cold_in_zeroed = false;
} else if (!state.cold_in_zeroed) {
float zeros[8192];
std::memset(zeros, 0, sizeof(float) * (size_t)n_embd);
- ggml_backend_tensor_set(state.gpu_state.combine.cold_in, zeros, 0,
- sizeof(float) * (size_t)n_embd);
+ ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0,
+ sizeof(float) * (size_t)n_embd);
state.cold_in_zeroed = true;
}
- // ── Combine: output = residual + hot + cold ──
- auto cst = ggml_backend_graph_compute(backend, state.gpu_state.combine.gf);
- if (cst != GGML_STATUS_SUCCESS) return false;
+ ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf);
- // ── Copy combine output to persistent act_cur ──
- ggml_backend_tensor_copy(state.gpu_state.combine.output, state.gpu_state.act_cur);
+ ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur);
+ if (tel) tel->combine_overhead_us += pipe_elapsed_us(combine_t0, PipelineClock::now());
const auto ffn_t1 = PipelineClock::now();
if (tel) {
@@ -398,8 +740,17 @@ bool pipelined_decode_one_token(
step_graph_destroy(dyn_sg);
+ // Sync the compute stream before returning — caller needs act_cur on CPU.
+ // All async ops (combine + copy) from the last layer must complete.
+ const auto final_sync_t0 = PipelineClock::now();
+ ggml_backend_synchronize(backend);
+
if (tel) {
+ tel->routed_final_sync_us = pipe_elapsed_us(final_sync_t0, PipelineClock::now());
tel->total_us = pipe_elapsed_us(tok_t0, PipelineClock::now());
+ // GPU idle = time in tensor I/O + routing readback + combine overhead
+ // (these are all periods where GPU compute stream is idle)
+ tel->gpu_idle_us = tel->tensor_io_us + tel->routing_readback_us + tel->combine_overhead_us;
}
return true;
}
diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.h b/server/src/qwen35moe/qwen35moe_pipelined_decode.h
index bc03264eb..330d72f7b 100644
--- a/server/src/qwen35moe/qwen35moe_pipelined_decode.h
+++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.h
@@ -10,13 +10,15 @@
#pragma once
#include "internal.h"
-#include "qwen35moe_hybrid_ffn_eval.h"
-#include "qwen35moe_hybrid_storage.h"
+#include "../common/moe_hybrid_ffn_eval.h"
+#include "../common/moe_hybrid_storage.h"
+#include "../common/cold_ffn_compute.h"
#include "graph_builders.h"
#include "ggml-backend.h"
#include
+#include
#include
namespace dflash::common {
@@ -65,9 +67,30 @@ struct PipelinedDecodeTelemetry {
uint64_t ffn_us = 0;
uint64_t ffn_allhot_us = 0;
uint64_t ffn_mixed_us = 0;
+ // GPU utilization diagnosis: time the GPU is idle waiting for CPU
+ uint64_t gpu_idle_us = 0; // total GPU idle (tensor_io + combine_overhead + sync_wait)
+ uint64_t tensor_io_us = 0; // hot path setup: D2H readback + GPU copies + kernel launch
+ uint64_t combine_overhead_us = 0; // combine graph dispatch + copy
+ uint64_t cold_cpu_us = 0; // cold path total (graph build + ggml CPU compute)
+ uint64_t cold_compute_us = 0; // just ggml_backend_graph_compute(cpu_be) time
+ uint64_t hot_graph_build_us = 0; // hot graph rebuild (only when n_hot changes)
+ uint64_t ffn_post_get_us = 0; // D2H readback of ffn_post for cold path
+ uint64_t sync_wait_us = 0; // time in ggml_backend_synchronize (waiting for GPU)
int allhot_layers = 0;
int mixed_layers = 0;
int total_layers = 0;
+ int hot_graph_rebuilds = 0; // count of hot graph rebuilds
+ int routed_ffn_layers = 0; // layers handled by routed FFN (async pipeline)
+
+ // ── Routed path breakdown (StreamMoE fast path) ──
+ uint64_t routed_prefn_us = 0; // prefn graph compute (async dispatch + sync)
+ uint64_t routed_sync_us = 0; // GPU sync stall waiting for prefn
+ uint64_t routed_readback_us = 0; // D2H readback of routing IDs + weights
+ uint64_t routed_cpu_remap_us = 0; // CPU-side local ID mapping + cold masking
+ uint64_t routed_ffn_dispatch_us = 0;// FFN graph dispatch + combine (async)
+ uint64_t routed_final_sync_us = 0; // final sync at end of token (if measured)
+ int routed_cold_expert_hits = 0; // experts masked (weight=0) in routed path
+ int routed_total_expert_slots = 0; // total expert slots processed
};
// State for pipelined decode: holds cached DeltaNet pre-FFN graphs +
@@ -79,6 +102,10 @@ struct PipelinedDecodeState {
// Attention layers (every full_attention_interval-th) are nullptr (rebuilt each token)
std::vector cached_prefn;
+ // Cached routed FFN graphs for DeltaNet layers (layer index → graph)
+ // StreamMoE-inspired: reads routing from GPU, eliminates CPU sync.
+ std::vector cached_routed_ffn;
+
// Persistent host buffers (avoid per-layer allocation)
std::vector routing_ids_buf;
std::vector routing_weights_buf;
@@ -87,6 +114,16 @@ struct PipelinedDecodeState {
// Persistent zero buffer for cold_in (set once at init)
bool cold_in_zeroed = false;
+ // When true (default), cold experts are computed on the cold backend
+ // (CPU/Halo) instead of being dropped via cold-masking. Exact but slower.
+ // Set DFLASH_DROP_COLD=1 to disable (fast but lossy).
+ bool cold_compute = true;
+
+ // Fused cold FFN compute (bypasses ggml graph dispatch overhead)
+ std::unique_ptr cold_ffn_compute;
+ std::vector cold_ffn_layers; // per-layer cold weight metadata
+ std::vector cold_output_buf; // [n_embd] scratch for cold FFN output
+
// Tracking
int n_layer = 0;
int n_embd = 0;
@@ -100,10 +137,15 @@ struct PipelinedDecodeState {
PipelinedDecodeState(PipelinedDecodeState && o) noexcept
: gpu_state(std::move(o.gpu_state)),
cached_prefn(std::move(o.cached_prefn)),
+ cached_routed_ffn(std::move(o.cached_routed_ffn)),
routing_ids_buf(std::move(o.routing_ids_buf)),
routing_weights_buf(std::move(o.routing_weights_buf)),
ffn_post_host_buf(std::move(o.ffn_post_host_buf)),
cold_in_zeroed(o.cold_in_zeroed),
+ cold_compute(o.cold_compute),
+ cold_ffn_compute(std::move(o.cold_ffn_compute)),
+ cold_ffn_layers(std::move(o.cold_ffn_layers)),
+ cold_output_buf(std::move(o.cold_output_buf)),
n_layer(o.n_layer), n_embd(o.n_embd),
n_expert_used(o.n_expert_used),
full_attention_interval(o.full_attention_interval) {
@@ -114,10 +156,15 @@ struct PipelinedDecodeState {
destroy();
gpu_state = std::move(o.gpu_state);
cached_prefn = std::move(o.cached_prefn);
+ cached_routed_ffn = std::move(o.cached_routed_ffn);
routing_ids_buf = std::move(o.routing_ids_buf);
routing_weights_buf = std::move(o.routing_weights_buf);
ffn_post_host_buf = std::move(o.ffn_post_host_buf);
cold_in_zeroed = o.cold_in_zeroed;
+ cold_compute = o.cold_compute;
+ cold_ffn_compute = std::move(o.cold_ffn_compute);
+ cold_ffn_layers = std::move(o.cold_ffn_layers);
+ cold_output_buf = std::move(o.cold_output_buf);
n_layer = o.n_layer; n_embd = o.n_embd;
n_expert_used = o.n_expert_used;
full_attention_interval = o.full_attention_interval;
@@ -136,6 +183,7 @@ bool init_pipelined_decode_state(
ggml_backend_t backend,
const TargetWeights & w,
TargetCache & cache,
+ MoeHybridStorage & hybrid,
int kv_start, // initial KV position for graph caching
int kq_stride_pad);
@@ -147,7 +195,7 @@ bool pipelined_decode_one_token(
ggml_backend_t backend,
const TargetWeights & w,
TargetCache & cache,
- Qwen35MoeHybridStorage & hybrid,
+ MoeHybridStorage & hybrid,
int kv_pos, // current KV position
int kq_stride_pad,
PipelinedDecodeTelemetry * telemetry = nullptr);
diff --git a/server/src/qwen35moe/qwen35moe_swap_manager.h b/server/src/qwen35moe/qwen35moe_swap_manager.h
deleted file mode 100644
index 1acacedd9..000000000
--- a/server/src/qwen35moe/qwen35moe_swap_manager.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Request-boundary swap planning for qwen35moe expert placement.
-
-#pragma once
-
-#include "qwen35moe_expert_placement.h"
-
-#include
-#include
-#include
-
-namespace dflash::common {
-
-struct Qwen35MoeSwapAction {
- int layer_idx = -1;
- int evict_expert = -1;
- int promote_expert = -1;
- uint64_t evict_count = 0;
- uint64_t promote_count = 0;
-};
-
-struct Qwen35MoeSwapPlan {
- Qwen35MoeExpertPlacement next_placement;
- std::vector actions;
-};
-
-struct Qwen35MoeSwapPolicy {
- int max_swaps_total = 0; // 0 = no swaps
- uint64_t min_promote_gain = 1; // promoted expert count must exceed evicted by at least this amount
-};
-
-bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current,
- const Qwen35MoeRoutingStats & stats,
- const Qwen35MoeSwapPolicy & policy,
- Qwen35MoeSwapPlan & out,
- std::string * err = nullptr);
-
-} // namespace dflash::common
diff --git a/server/test/test_dflash.cpp b/server/test/test_dflash.cpp
index f2544f5b0..76685ac60 100644
--- a/server/test/test_dflash.cpp
+++ b/server/test/test_dflash.cpp
@@ -28,9 +28,11 @@
// qwen35 + DFlash + DDTree pipeline below.
#include "qwen35_daemon.h" // arch dispatch - single-GPU qwen35 daemon mode
#include "qwen35moe_daemon.h"
-#include "qwen35moe_hybrid_ffn_eval.h"
-#include "qwen35moe_hybrid_storage.h"
-#include "qwen35moe_expert_placement.h"
+#include "../src/common/moe_hybrid_ffn_eval.h"
+#include "../src/common/moe_hybrid_storage.h"
+#include "../src/common/moe_hybrid_placement.h"
+#include "../src/common/moe_hybrid_routing_stats.h"
+#include "../src/common/moe_hybrid_types_impl.h"
#include "qwen35moe_pipelined_decode.h"
#include "qwen35_layer_split.h" // multi-GPU layer-split daemon args
#include "layer_split_daemon_loop.h" // extracted layer-split daemon loop
@@ -1575,7 +1577,7 @@ int main(int argc, char ** argv) {
// Build placement stats that mark the router's default picks as COLD
// by giving them zero count (so they're placed cold), while giving
// all other experts count=1 (so the hottest N are picked as hot).
- Qwen35MoeRoutingStats biased_stats;
+ MoeHybridRoutingStats biased_stats;
biased_stats.n_layer = w.n_layer;
biased_stats.n_expert = w.n_expert;
biased_stats.n_expert_used = w.n_expert_used;
@@ -1592,9 +1594,9 @@ int main(int argc, char ** argv) {
}
std::printf(" forced %d default-route experts to cold for worst-case bench\n", forced_cold_count);
- Qwen35MoeExpertPlacement placement;
+ MoeHybridPlacement placement;
std::string place_err;
- if (!Qwen35MoeExpertPlacement::build_from_stats(
+ if (!MoeHybridPlacement::build_from_stats(
biased_stats, total_hot_budget,
/*min_hot_per_layer=*/std::min(w.n_expert_used, w.n_expert),
placement, &place_err)) {
@@ -1651,10 +1653,15 @@ int main(int argc, char ** argv) {
layer_file_data[(size_t)il].gate_up_exps = find_tensor_data("ffn_gate_up_exps");
}
- auto hybrid = std::make_shared();
+ auto hybrid = std::make_shared();
std::string hybrid_err;
- if (!build_qwen35moe_hybrid_storage_from_file(
- w, backend, placement, layer_file_data, *hybrid, &hybrid_err)) {
+ MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(w);
+ std::vector hybrid_descs((size_t)w.n_layer);
+ for (int il = 0; il < w.n_layer; ++il) {
+ hybrid_descs[(size_t)il] = make_moe_layer_desc(w.layers[(size_t)il]);
+ }
+ if (!build_moe_hybrid_storage_from_file(
+ hybrid_cfg, backend, placement, hybrid_descs, layer_file_data, *hybrid, &hybrid_err)) {
std::fprintf(stderr, "[time-breakdown] hybrid storage build failed: %s\n",
hybrid_err.c_str());
} else {
@@ -1712,8 +1719,8 @@ int main(int argc, char ** argv) {
sizeof(int32_t) * selected.size());
ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0,
sizeof(float) * weights_buf.size());
- eval_qwen35moe_hybrid_ffn_gpu_resident(
- backend, w, w.layers[(size_t)il],
+ eval_moe_hybrid_ffn_gpu_resident(
+ backend, make_moe_hybrid_config(w), make_moe_layer_desc(w.layers[(size_t)il]),
hybrid->layers[(size_t)il], cpu_be,
layer_sg.ffn_post, layer_sg.ffn_residual,
gpu_state,
@@ -1750,8 +1757,8 @@ int main(int argc, char ** argv) {
ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0,
sizeof(float) * weights_buf.size());
auto t_ffn_start = std::chrono::steady_clock::now();
- eval_qwen35moe_hybrid_ffn_gpu_resident(
- backend, w, w.layers[(size_t)il],
+ eval_moe_hybrid_ffn_gpu_resident(
+ backend, make_moe_hybrid_config(w), make_moe_layer_desc(w.layers[(size_t)il]),
hybrid->layers[(size_t)il], cpu_be,
layer_sg.ffn_post, layer_sg.ffn_residual,
gpu_state,
@@ -1811,7 +1818,7 @@ int main(int argc, char ** argv) {
// Init pipelined state
PipelinedDecodeState pipe_state;
- if (!init_pipelined_decode_state(pipe_state, backend, w, cache, ctx, g_kq_stride_pad)) {
+ if (!init_pipelined_decode_state(pipe_state, backend, w, cache, *hybrid, ctx, g_kq_stride_pad)) {
std::fprintf(stderr, "[time-breakdown] pipelined state init failed\n");
continue;
}
@@ -1882,24 +1889,24 @@ int main(int argc, char ** argv) {
std::printf("\n[time-breakdown] === PIPELINED realistic placement (uniform hot/cold) ===\n");
{
// Build uniform placement: hottest N experts per layer based on uniform counts
- Qwen35MoeRoutingStats uniform_stats;
+ MoeHybridRoutingStats uniform_stats;
uniform_stats.n_layer = w.n_layer;
uniform_stats.n_expert = w.n_expert;
uniform_stats.n_expert_used = w.n_expert_used;
uniform_stats.counts.assign((size_t)w.n_layer * (size_t)w.n_expert, 1);
uniform_stats.layer_totals.assign((size_t)w.n_layer, (uint64_t)w.n_expert);
- Qwen35MoeExpertPlacement uniform_placement;
+ MoeHybridPlacement uniform_placement;
std::string up_err;
- if (Qwen35MoeExpertPlacement::build_from_stats(
+ if (MoeHybridPlacement::build_from_stats(
uniform_stats, total_hot_budget,
std::min(w.n_expert_used, w.n_expert),
uniform_placement, &up_err)) {
// Rebuild hybrid storage with uniform placement
- auto hybrid_realistic = std::make_shared();
- if (build_qwen35moe_hybrid_storage_from_file(
- w, backend, uniform_placement, layer_file_data,
+ auto hybrid_realistic = std::make_shared();
+ if (build_moe_hybrid_storage_from_file(
+ hybrid_cfg, backend, uniform_placement, hybrid_descs, layer_file_data,
*hybrid_realistic, &up_err)) {
std::printf(" uniform placement: hot=%d cold=%d — expect ~60%% all-hot layers\n",
uniform_placement.total_hot,
@@ -1908,7 +1915,7 @@ int main(int argc, char ** argv) {
int ctx = 2000;
if (ctx + 1 <= max_ctx) {
PipelinedDecodeState pipe_state;
- if (init_pipelined_decode_state(pipe_state, backend, w, cache, ctx, g_kq_stride_pad)) {
+ if (init_pipelined_decode_state(pipe_state, backend, w, cache, *hybrid_realistic, ctx, g_kq_stride_pad)) {
std::vector act_cur_pipe((size_t)hidden, 0.0f);
ggml_backend_tensor_set(pipe_state.gpu_state.act_cur, act_cur_pipe.data(), 0,
sizeof(float) * (size_t)hidden);
diff --git a/server/test/test_qwen35moe_expert_placement.cpp b/server/test/test_qwen35moe_expert_placement.cpp
index c263d87df..05e1b92dd 100644
--- a/server/test/test_qwen35moe_expert_placement.cpp
+++ b/server/test/test_qwen35moe_expert_placement.cpp
@@ -1,4 +1,5 @@
-#include "qwen35moe_expert_placement.h"
+#include "../src/common/moe_hybrid_placement.h"
+#include "../src/common/moe_hybrid_routing_stats.h"
#include
#include
@@ -15,7 +16,7 @@ static void expect(bool cond, const char * msg) {
}
int main() {
- Qwen35MoeRoutingStats stats;
+ MoeHybridRoutingStats stats;
stats.n_layer = 2;
stats.n_expert = 4;
stats.n_expert_used = 2;
@@ -25,11 +26,11 @@ int main() {
};
stats.layer_totals = {280, 103};
- Qwen35MoeExpertPlacement placement;
+ MoeHybridPlacement placement;
std::string err;
- expect(Qwen35MoeExpertPlacement::build_from_stats(stats, /*total_hot_budget=*/4,
- /*min_hot_per_layer=*/1,
- placement, &err), err.c_str());
+ expect(MoeHybridPlacement::build_from_stats(stats, /*total_hot_budget=*/4,
+ /*min_hot_per_layer=*/1,
+ placement, &err), err.c_str());
expect(placement.n_layer == 2, "n_layer");
expect(placement.hot_counts.size() == 2, "hot_counts size");
expect(placement.hot_counts[0] == 3, "layer0 got extra hot slots");
@@ -41,17 +42,12 @@ int main() {
expect(placement.is_hot(1, 0), "layer1 expert0 hot");
expect(!placement.is_hot(1, 1), "layer1 expert1 cold");
- TargetWeights w;
- w.is_moe = true;
- w.n_layer = 2;
- w.n_expert = 4;
- w.n_expert_used = 2;
- expect(placement.matches(w), "placement matches weights");
-
- const auto tmp = std::filesystem::temp_directory_path() / "qwen35moe-placement-test.json";
- expect(placement.save_json(tmp.string(), &err), err.c_str());
- Qwen35MoeExpertPlacement loaded;
- expect(Qwen35MoeExpertPlacement::load_json(tmp.string(), loaded, &err), err.c_str());
+ expect(placement.matches(2, 4, 2), "placement matches dims");
+
+ const auto tmp = std::filesystem::temp_directory_path() / "moe-hybrid-placement-test.json";
+ expect(placement.save_json(tmp.string(), "moe_hybrid", &err), err.c_str());
+ MoeHybridPlacement loaded;
+ expect(MoeHybridPlacement::load_json(tmp.string(), loaded, &err), err.c_str());
expect(loaded.hot_counts == placement.hot_counts, "loaded hot counts");
expect(loaded.hot_expert_ids == placement.hot_expert_ids, "loaded hot ids");
std::filesystem::remove(tmp);
diff --git a/server/test/test_qwen35moe_routing_stats.cpp b/server/test/test_qwen35moe_routing_stats.cpp
index ea14ac630..cfdf87f07 100644
--- a/server/test/test_qwen35moe_routing_stats.cpp
+++ b/server/test/test_qwen35moe_routing_stats.cpp
@@ -1,4 +1,4 @@
-#include "qwen35moe_routing_stats.h"
+#include "../src/common/moe_hybrid_routing_stats.h"
#include
#include
@@ -16,15 +16,9 @@ static void expect(bool cond, const char * msg) {
}
int main() {
- TargetWeights w;
- w.is_moe = true;
- w.n_layer = 2;
- w.n_expert = 4;
- w.n_expert_used = 2;
-
- Qwen35MoeRoutingStats stats;
- expect(stats.init_from_weights(w), "init_from_weights");
- expect(stats.matches(w), "matches after init");
+ MoeHybridRoutingStats stats;
+ expect(stats.init(2, 4, 2), "init");
+ expect(stats.matches(2, 4, 2), "matches after init");
const int32_t layer0_a[] = {2, 1};
const int32_t layer0_b[] = {2, 3};
@@ -49,13 +43,13 @@ int main() {
expect(hot0.size() == 2, "hot size");
expect(hot0[0] == 2, "hot leader");
- const auto tmp = std::filesystem::temp_directory_path() / "qwen35moe-routing-stats-test.csv";
+ const auto tmp = std::filesystem::temp_directory_path() / "moe-hybrid-routing-stats-test.csv";
std::string err;
expect(stats.save_csv(tmp.string(), &err), err.c_str());
- Qwen35MoeRoutingStats loaded;
- expect(Qwen35MoeRoutingStats::load_csv(tmp.string(), loaded, &err), err.c_str());
- expect(loaded.matches(w), "loaded matches weights");
+ MoeHybridRoutingStats loaded;
+ expect(MoeHybridRoutingStats::load_csv(tmp.string(), loaded, &err), err.c_str());
+ expect(loaded.matches(2, 4, 2), "loaded matches dims");
expect(loaded.count(0, 2) == 2, "loaded count");
expect(loaded.layer_totals[1] == 2, "loaded total");
diff --git a/server/test/test_qwen35moe_swap_manager.cpp b/server/test/test_qwen35moe_swap_manager.cpp
index 6962da496..cbbf5e6ee 100644
--- a/server/test/test_qwen35moe_swap_manager.cpp
+++ b/server/test/test_qwen35moe_swap_manager.cpp
@@ -1,4 +1,6 @@
-#include "qwen35moe_swap_manager.h"
+#include "../src/common/moe_hybrid_swap_manager.h"
+#include "../src/common/moe_hybrid_placement.h"
+#include "../src/common/moe_hybrid_routing_stats.h"
#include
#include
@@ -13,7 +15,7 @@ static void expect(bool cond, const char * msg) {
}
int main() {
- Qwen35MoeRoutingStats stats;
+ MoeHybridRoutingStats stats;
stats.n_layer = 2;
stats.n_expert = 4;
stats.n_expert_used = 2;
@@ -23,7 +25,7 @@ int main() {
};
stats.layer_totals = {205, 194};
- Qwen35MoeExpertPlacement placement;
+ MoeHybridPlacement placement;
placement.n_layer = 2;
placement.n_expert = 4;
placement.n_expert_used = 2;
@@ -31,13 +33,13 @@ int main() {
placement.hot_counts = {1, 1};
placement.hot_expert_ids = {{1}, {0}};
- Qwen35MoeSwapPolicy policy;
+ MoeHybridSwapPolicy policy;
policy.max_swaps_total = 1;
policy.min_promote_gain = 5;
- Qwen35MoeSwapPlan plan;
+ MoeHybridSwapPlan plan;
std::string err;
- expect(build_qwen35moe_swap_plan(placement, stats, policy, plan, &err), err.c_str());
+ expect(build_moe_hybrid_swap_plan(placement, stats, policy, plan, &err), err.c_str());
expect(plan.actions.size() == 1, "one swap planned");
expect(plan.actions[0].layer_idx == 0, "layer0 swap");
expect(plan.actions[0].evict_expert == 1, "evict weakest hot");