diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index ed148cb67..2766b1824 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -251,12 +251,14 @@ add_library(dflash_common STATIC src/qwen35moe/qwen35moe_ffn.cpp src/qwen35moe/qwen35moe_backend.cpp src/qwen35moe/qwen35moe_daemon.cpp - src/qwen35moe/qwen35moe_routing_stats.cpp - src/qwen35moe/qwen35moe_expert_placement.cpp - src/qwen35moe/qwen35moe_hybrid_storage.cpp - src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp src/qwen35moe/qwen35moe_pipelined_decode.cpp - src/qwen35moe/qwen35moe_swap_manager.cpp + # ── Common MoE hybrid infrastructure ── + src/common/moe_hybrid_placement.cpp + src/common/moe_hybrid_routing_stats.cpp + src/common/moe_hybrid_storage.cpp + src/common/moe_hybrid_ffn_eval.cpp + src/common/cold_ffn_cpu.cpp + src/common/moe_hybrid_swap_manager.cpp src/qwen35/layer_split_forward.cpp src/qwen35/layer_split_daemon.cpp src/qwen35/qwen35_backend.cpp @@ -523,6 +525,11 @@ target_link_libraries(dflash_common ggml-base nlohmann_json::nlohmann_json ) +# OpenMP for parallel cold FFN kernel (saturate memory bandwidth). +find_package(OpenMP) +if(OpenMP_CXX_FOUND) + target_link_libraries(dflash_common PRIVATE OpenMP::OpenMP_CXX) +endif() if(DFLASH27B_GPU_BACKEND STREQUAL "hip") target_link_libraries(dflash_common PRIVATE hip::host) endif() @@ -638,32 +645,32 @@ if(DFLASH27B_TESTS) if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target_laguna.cpp") add_executable(smoke_load_target_laguna test/smoke_load_target_laguna.cpp) target_include_directories(smoke_load_target_laguna PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ggml-cuda) + target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_laguna_forward.cpp") add_executable(smoke_laguna_forward test/smoke_laguna_forward.cpp) target_include_directories(smoke_laguna_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ggml-cuda) + target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_ttft.cpp") add_executable(bench_laguna_ttft test/bench_laguna_ttft.cpp) target_include_directories(bench_laguna_ttft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ggml-cuda) + target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_pflash.cpp") add_executable(bench_laguna_pflash test/bench_laguna_pflash.cpp) target_include_directories(bench_laguna_pflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ggml-cuda) + target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_generate.cpp") add_executable(bench_laguna_generate test/bench_laguna_generate.cpp) target_include_directories(bench_laguna_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ggml-cuda) + target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_laguna_daemon.cpp") add_executable(test_laguna_daemon test/test_laguna_daemon.cpp) target_include_directories(test_laguna_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ggml-cuda) + target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_forward.cpp") add_executable(smoke_target_forward test/smoke_target_forward.cpp) diff --git a/server/README.md b/server/README.md index ac0d57e97..63c240cb3 100644 --- a/server/README.md +++ b/server/README.md @@ -231,6 +231,59 @@ Full `bench_llm.py` suite on Qwen3.6-27B UD-Q4_K_XL, 10 prompts, n_gen=256, RTX | Math500 | 35.13 | 69.77 | 5.15 | **1.99×** | | **Mean** | 34.97 | 69.19 | 5.17 | **1.98×** | +## Hybrid MoE (hot/cold expert split) + +For MoE targets whose experts don't fit in VRAM, dflash can split experts +across the GPU and CPU: the most-used (**hot**) experts stay resident on the +GPU, the rest (**cold**) live in host RAM and are evaluated on the CPU, +overlapped with the GPU hot path. This trades some decode/prefill speed for +VRAM headroom, and is computed automatically at load by a dynamic-placement +pass. It applies to both MoE arches: `qwen35`/`qwen36` and `laguna`. + +**When it triggers.** If the experts fit in the available VRAM budget, all +experts load to GPU (no split, fastest path). Otherwise placement keeps as +many hot experts as the budget allows and routes the rest to CPU. You can also +shrink the budget manually to force a split (e.g. to free VRAM for a longer +context or a larger target). + +### Budget knobs + +| Env | Arch | Effect | +|---|---|---| +| `DFLASH_EXPERT_BUDGET_MB N` | both | Cap hot-expert VRAM to `N` MB. Applies only when `N` is below the auto-computed budget; experts beyond it go cold (CPU). | +| `DFLASH_EXPERT_BUDGET_PCT P` | laguna | Keep hot experts to `P`% (`0` = `LAGUNA` or `QWEN35MOE`: + +| Env | Effect | +|---|---| +| `DFLASH__HOTNESS ` | Expert frequency/hotness file driving which experts are placed hot. | +| `DFLASH__TELEMETRY 1` | Log per-layer hot/cold FFN timing telemetry. | +| `DFLASH__SWAP_MAX N` | Max hot/cold promotions per request boundary (runtime re-placement); `0` disables swapping. | +| `DFLASH__SWAP_MIN_GAIN N` | Min observed-frequency gain before a cold expert is promoted to hot. | +| `DFLASH__NEXT_PLACEMENT_OUT ` | Dump the placement chosen this run (warm-start the hotness file next time). | +| `DFLASH_QWEN35MOE_RUNTIME_STATS_OUT ` | (qwen only) Dump runtime routing-frequency stats. | + +### Example + +```bash +# Force ~8 GB of hot experts on GPU; the rest run cold on the CPU. +DFLASH_EXPERT_BUDGET_MB=8000 ./build/dflash_server models/laguna-xs2-Q4_K_M.gguf --port 8000 +# Startup log e.g.: "dynamic placement result: 4717 hot experts, 5267 cold experts" +``` + +### Caveat: reduced-stack prefill chunking + +When a layer's hot-expert stack is **reduced** (i.e. a genuine split), the +ggml-cuda MMQ `mul_mat_id` kernel illegal-accesses for certain batch sizes on +**both** HIP/gfx1151 and CUDA/sm_86. As a guard, hybrid-split **prefill** is +sliced into ≤4-token sub-batches (forcing the stable MMVQ path); **decode** +(single-token) is unaffected. This costs some prefill throughput on split +layers and is removed once the kernel is fixed upstream. + ## Laguna-XS.2 target (experimental, Poolside MoE) [Poolside Laguna-XS.2](https://huggingface.co/poolside/Laguna-XS.2) is a 40-layer MoE LLM with 256 experts (top-8) plus an always-on shared expert, per-layer head counts `[48,64,64,64]×10`, and a per-layer SWA pattern (window 512). It is **architecturally distinct from `qwen35`**, so dflash adds a hand-rolled CUDA forward path (`Path A`, ggml-only — no libllama dependency) that mirrors the qwen35 stack. The Q4_K_M GGUF lands at 18.77 GiB on a single RTX 3090; tok_embd stays CPU-only (110 MiB) to keep the GPU budget under 24 GB. diff --git a/server/src/common/cold_ffn_compute.h b/server/src/common/cold_ffn_compute.h new file mode 100644 index 000000000..f1d512dec --- /dev/null +++ b/server/src/common/cold_ffn_compute.h @@ -0,0 +1,59 @@ +// ColdFfnCompute: Direct compute interface for cold expert FFN. +// Bypasses ggml graph dispatch overhead. Shared-memory model (CPU/Halo). +#pragma once + +#include "ggml.h" +#include +#include + +namespace dflash::common { + +// Per-layer cold weight metadata — raw pointers into shared memory. +struct ColdFfnLayer { + const void * gate_up_data = nullptr; // fused [n_cold, n_ff*2, n_embd] quantized + const void * gate_data = nullptr; // separate gate [n_cold, n_ff, n_embd] + const void * up_data = nullptr; // separate up [n_cold, n_ff, n_embd] + const void * down_data = nullptr; // [n_cold, n_embd, n_ff] quantized + + size_t gate_up_stride = 0; // bytes between experts in gate_up tensor + size_t gate_stride = 0; // bytes between experts in gate tensor + size_t up_stride = 0; // bytes between experts in up tensor + size_t down_stride = 0; // bytes between experts in down tensor + + ggml_type gate_up_type = GGML_TYPE_Q4_K; // type for fused gate_up + ggml_type gate_type = GGML_TYPE_Q4_K; // type for separate gate + ggml_type up_type = GGML_TYPE_Q4_K; // type for separate up + ggml_type down_type = GGML_TYPE_Q4_K; // type for down projection + bool fused_gate_up = false; // true if gate+up are fused + + // Scale factors (applied after matmul). 1.0 = no scaling. + float gate_up_scale = 1.0f; + float gate_scale = 1.0f; + float up_scale = 1.0f; + float down_scale = 1.0f; +}; + +// Abstract compute interface. Implementations: CPU (now), Halo (future). +struct ColdFfnCompute { + virtual ~ColdFfnCompute() = default; + + // Compute cold expert FFN contributions and accumulate into output. + // input: [n_embd] F32 — post-norm hidden state + // ids: [n_cold] I32 — local cold expert indices + // weights: [n_cold] F32 — routing weights for each cold expert + // output: [n_embd] F32 — accumulated weighted expert outputs (zeroed by callee) + virtual void compute( + const ColdFfnLayer & layer, + const float * input, + const int32_t * ids, + const float * weights, + int n_cold, + int n_embd, + int n_ff, + float * output) = 0; +}; + +// Create CPU-based fused cold FFN compute. +std::unique_ptr make_cpu_cold_ffn_compute(int n_ff_max); + +} // namespace dflash::common diff --git a/server/src/common/cold_ffn_cpu.cpp b/server/src/common/cold_ffn_cpu.cpp new file mode 100644 index 000000000..75a93ff86 --- /dev/null +++ b/server/src/common/cold_ffn_cpu.cpp @@ -0,0 +1,190 @@ +// CpuColdFfnCompute: Fused cold expert FFN using ggml vec_dot primitives. +// Bypasses ggml graph dispatch overhead. Uses OpenMP to saturate memory bandwidth. +// Memory-bandwidth bound at ~45 GB/s DDR4. Target: 15.7ms → ~3ms/token. + +#include "cold_ffn_compute.h" +#include "ggml-cpu.h" + +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace dflash::common { + +class CpuColdFfnCompute : public ColdFfnCompute { + int n_ff_max_; + int n_threads_; + + // Per-thread scratch buffers for parallel down matmul + struct ThreadBuf { + std::vector scratch; // [n_ff * 2] gate_up result + SwiGLU + std::vector mid_conv; // down input converted to vec_dot_type + }; + std::vector thread_bufs_; + std::vector inp_conv_; // input converted (shared, read-only during matmul) + +public: + explicit CpuColdFfnCompute(int n_ff_max, int n_threads = 0) : n_ff_max_(n_ff_max) { +#ifdef _OPENMP + if (n_threads <= 0) { + const char * env = std::getenv("DFLASH_COLD_THREADS"); + n_threads = env ? std::atoi(env) : 0; + } + n_threads_ = n_threads > 0 ? n_threads : std::min(omp_get_max_threads(), 8); +#else + n_threads_ = 1; +#endif + fprintf(stderr, "[cold_ffn] using %d threads\n", n_threads_); + thread_bufs_.resize(n_threads_); + for (auto & tb : thread_bufs_) { + tb.scratch.resize((size_t)n_ff_max * 2); + } + } + + void compute( + const ColdFfnLayer & layer, + const float * input, + const int32_t * ids, + const float * weights, + int n_cold, + int n_embd, + int n_ff, + float * output) override { + + if (n_cold <= 0) return; + std::memset(output, 0, sizeof(float) * (size_t)n_embd); + + // Gate/up phase type traits + const ggml_type gu_type = layer.fused_gate_up ? layer.gate_up_type : layer.gate_type; + const auto * gu_cpu_traits = ggml_get_type_traits_cpu(gu_type); + const auto gu_vec_dot = gu_cpu_traits->vec_dot; + const auto gu_vec_dot_type = gu_cpu_traits->vec_dot_type; + const auto gu_from_float = ggml_get_type_traits_cpu(gu_vec_dot_type)->from_float; + + // Down phase type traits (may differ from gate/up) + const auto * dn_cpu_traits = ggml_get_type_traits_cpu(layer.down_type); + const auto dn_vec_dot = dn_cpu_traits->vec_dot; + const auto dn_vec_dot_type = dn_cpu_traits->vec_dot_type; + const auto dn_from_float = ggml_get_type_traits_cpu(dn_vec_dot_type)->from_float; + + const size_t inp_row_size = ggml_row_size(gu_vec_dot_type, n_embd); + const size_t mid_row_size = ggml_row_size(dn_vec_dot_type, n_ff); + const size_t gu_weight_row = ggml_row_size(gu_type, n_embd); + const size_t dn_weight_row = ggml_row_size(layer.down_type, n_ff); + + // For separate gate/up — up may have a different type than gate + size_t up_weight_row = gu_weight_row; + const ggml_type up_type_actual = layer.fused_gate_up ? gu_type : layer.up_type; + (void)up_type_actual; + ggml_vec_dot_t up_vec_dot = gu_vec_dot; + ggml_type up_vdt = gu_vec_dot_type; + if (!layer.fused_gate_up && layer.up_type != layer.gate_type) { + const auto * up_cpu_traits = ggml_get_type_traits_cpu(layer.up_type); + up_vec_dot = up_cpu_traits->vec_dot; + up_vdt = up_cpu_traits->vec_dot_type; + up_weight_row = ggml_row_size(layer.up_type, n_embd); + } + + // Ensure input conversion buffer is large enough + if (inp_conv_.size() < inp_row_size) inp_conv_.resize(inp_row_size); + // Ensure per-thread mid_conv buffers + for (auto & tb : thread_bufs_) { + if (tb.mid_conv.size() < mid_row_size) tb.mid_conv.resize(mid_row_size); + } + + // Convert input for up if different type + std::vector inp_conv_up; + if (!layer.fused_gate_up && up_vdt != gu_vec_dot_type) { + size_t up_inp_row_size = ggml_row_size(up_vdt, n_embd); + inp_conv_up.resize(up_inp_row_size); + auto up_from_float = ggml_get_type_traits_cpu(up_vdt)->from_float; + up_from_float(input, inp_conv_up.data(), n_embd); + } + + // Convert input to gate's vec_dot format once + gu_from_float(input, inp_conv_.data(), n_embd); + + for (int e = 0; e < n_cold; ++e) { + const int32_t eid = ids[e]; + const float w = weights[e]; + if (w == 0.0f) continue; + + // Use thread 0's scratch for gate_up (serial phase) + float * scratch = thread_bufs_[0].scratch.data(); + + // ── Phase 1: gate_up matmul → scratch[0..n_ff*2) ── + // Parallel over rows (each row is independent, reading shared inp_conv_) + if (layer.fused_gate_up) { + const char * expert = (const char *)layer.gate_up_data + (size_t)eid * layer.gate_up_stride; + const int n_rows = n_ff * 2; +#ifdef _OPENMP + #pragma omp parallel for num_threads(n_threads_) schedule(static) +#endif + for (int row = 0; row < n_rows; ++row) { + const void * row_ptr = expert + (size_t)row * gu_weight_row; + gu_vec_dot(n_embd, &scratch[row], 0, row_ptr, 0, inp_conv_.data(), 0, 1); + } + if (layer.gate_up_scale != 1.0f) { + for (int i = 0; i < n_rows; ++i) scratch[i] *= layer.gate_up_scale; + } + } else { + const char * gate_expert = (const char *)layer.gate_data + (size_t)eid * layer.gate_stride; + const char * up_expert = (const char *)layer.up_data + (size_t)eid * layer.up_stride; + const uint8_t * up_inp = (!inp_conv_up.empty()) ? inp_conv_up.data() : inp_conv_.data(); +#ifdef _OPENMP + #pragma omp parallel for num_threads(n_threads_) schedule(static) +#endif + for (int row = 0; row < n_ff; ++row) { + const void * gp = gate_expert + (size_t)row * gu_weight_row; + gu_vec_dot(n_embd, &scratch[row], 0, gp, 0, inp_conv_.data(), 0, 1); + const void * up = up_expert + (size_t)row * up_weight_row; + up_vec_dot(n_embd, &scratch[n_ff + row], 0, up, 0, up_inp, 0, 1); + } + if (layer.gate_scale != 1.0f) { + for (int i = 0; i < n_ff; ++i) scratch[i] *= layer.gate_scale; + } + if (layer.up_scale != 1.0f) { + for (int i = 0; i < n_ff; ++i) scratch[n_ff + i] *= layer.up_scale; + } + } + + // ── Phase 2: SwiGLU activation ── + for (int i = 0; i < n_ff; ++i) { + const float gate = scratch[i]; + const float up = scratch[n_ff + i]; + scratch[i] = (gate / (1.0f + expf(-gate))) * up; + } + + // ── Phase 3: down matmul → output (weighted accumulate) ── + // Convert SwiGLU result to down's vec_dot format (serial, small) + dn_from_float(scratch, thread_bufs_[0].mid_conv.data(), n_ff); + const uint8_t * mid_conv_data = thread_bufs_[0].mid_conv.data(); + + const char * down_expert = (const char *)layer.down_data + (size_t)eid * layer.down_stride; + const float scale = w * layer.down_scale; + + // Parallel down matmul — each thread accumulates its own output rows +#ifdef _OPENMP + #pragma omp parallel for num_threads(n_threads_) schedule(static) +#endif + for (int row = 0; row < n_embd; ++row) { + float val; + const void * row_ptr = down_expert + (size_t)row * dn_weight_row; + dn_vec_dot(n_ff, &val, 0, row_ptr, 0, mid_conv_data, 0, 1); + output[row] += scale * val; + } + } + } +}; + +std::unique_ptr make_cpu_cold_ffn_compute(int n_ff_max) { + return std::make_unique(n_ff_max); +} + +} // namespace dflash::common diff --git a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp b/server/src/common/moe_hybrid_ffn_eval.cpp similarity index 79% rename from server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp rename to server/src/common/moe_hybrid_ffn_eval.cpp index 6e44b2d3f..4ded0bc5a 100644 --- a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp +++ b/server/src/common/moe_hybrid_ffn_eval.cpp @@ -1,214 +1,32 @@ -#include "qwen35moe_hybrid_ffn_eval.h" - -#include "qwen35_ops.h" +#include "moe_hybrid_ffn_eval.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include #include #include +#include +#include namespace dflash::common { namespace { +// NVFP4 scale2: if weight has a per-tensor scale, multiply the matmul result +// by that scale. No-op when scale==1.0f (non-NVFP4 models). +inline ggml_tensor * apply_scale2(ggml_context * ctx, ggml_tensor * mm_result, float scale) { + if (scale == 1.0f) return mm_result; + return ggml_scale(ctx, mm_result, scale); +} + using HybridClock = std::chrono::steady_clock; static uint64_t elapsed_us(HybridClock::time_point start, HybridClock::time_point end) { return (uint64_t) std::chrono::duration_cast(end - start).count(); } -} // namespace (anon helpers) - -// Build a cached FFN graph for the hot+shared path with a fixed n_hot. -bool build_cached_hot_graph( - CachedFfnGraph & out, - ggml_backend_t backend, - ggml_tensor * gate_tensor, - ggml_tensor * up_tensor, - ggml_tensor * down_tensor, - ggml_tensor * gate_up_tensor, - float gate_scale, - float up_scale, - float down_scale, - float gate_up_scale, - const TargetLayer & L, - int n_embd, - int n_ff_exp, - int n_hot) { - - out.free(); - out.n_hot = n_hot; - - ggml_init_params ip{}; - ip.mem_size = 48 * 1024 * 1024; - ip.mem_buffer = nullptr; - ip.no_alloc = true; - out.ctx = ggml_init(ip); - if (!out.ctx) return false; - - out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1); - ggml_set_input(out.inp); - - ggml_tensor * routed = nullptr; - if (n_hot > 0) { - out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_hot, 1); - ggml_set_input(out.ids); - out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_hot, 1); - ggml_set_input(out.weights); - - ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1); - ggml_tensor * gu = nullptr; - if (gate_up_tensor) { - ggml_tensor * gate_up_e = apply_scale2(out.ctx, - ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale); - ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e, - n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], - gate_up_e->nb[1], gate_up_e->nb[2], 0); - ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e, - n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], - gate_up_e->nb[1], gate_up_e->nb[2], - (size_t)n_ff_exp * ggml_element_size(gate_up_e)); - gate_e = ggml_cont(out.ctx, gate_e); - up_e = ggml_cont(out.ctx, up_e); - gu = ggml_swiglu_split(out.ctx, gate_e, up_e); - } else { - ggml_tensor * gate_e = apply_scale2(out.ctx, - ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale); - ggml_tensor * up_e = apply_scale2(out.ctx, - ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale); - gu = ggml_swiglu_split(out.ctx, gate_e, up_e); - } - - ggml_tensor * experts = apply_scale2(out.ctx, - ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale); - ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_hot, 1); - experts = ggml_mul(out.ctx, experts, w_view); - - for (int i = 0; i < n_hot; ++i) { - ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2], - (size_t)i * experts->nb[1]); - routed = (i == 0) ? slice : ggml_add(out.ctx, routed, slice); - } - } - - ggml_tensor * shared = nullptr; - const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp); - if (has_shared) { - ggml_tensor * sh_gate = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_gate_shexp, out.inp), L.ffn_gate_shexp_s); - ggml_tensor * sh_up = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_up_shexp, out.inp), L.ffn_up_shexp_s); - ggml_tensor * sh_gu = ggml_swiglu_split(out.ctx, sh_gate, sh_up); - shared = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s); - if (L.ffn_gate_inp_shexp) { - ggml_tensor * shared_gate = apply_scale2(out.ctx, - ggml_mul_mat(out.ctx, L.ffn_gate_inp_shexp, out.inp), L.ffn_gate_inp_shexp_s); - shared_gate = ggml_sigmoid(out.ctx, shared_gate); - shared = ggml_mul(out.ctx, shared, shared_gate); - } - } - - if (routed && shared) { - out.output = ggml_add(out.ctx, routed, shared); - } else if (routed) { - out.output = routed; - } else { - out.output = shared; - } - if (!out.output) { out.free(); return false; } - - out.gf = ggml_new_graph_custom(out.ctx, 2048, false); - ggml_set_output(out.output); - ggml_build_forward_expand(out.gf, out.output); - out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); - if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) { - out.free(); - return false; - } - return true; -} - -// Build a cached FFN graph for the cold (CPU) routed subset. -bool build_cached_cold_graph( - CachedFfnGraph & out, - ggml_backend_t cpu_backend, - ggml_tensor * gate_tensor, - ggml_tensor * up_tensor, - ggml_tensor * down_tensor, - ggml_tensor * gate_up_tensor, - float gate_scale, - float up_scale, - float down_scale, - float gate_up_scale, - int n_embd, - int n_ff_exp, - int n_cold) { - - out.free(); - out.n_hot = n_cold; // reuse field for "n experts in this graph" - - ggml_init_params ip{}; - ip.mem_size = 32 * 1024 * 1024; - ip.mem_buffer = nullptr; - ip.no_alloc = true; - out.ctx = ggml_init(ip); - if (!out.ctx) return false; - - out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1); - ggml_set_input(out.inp); - out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_cold, 1); - ggml_set_input(out.ids); - out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_cold, 1); - ggml_set_input(out.weights); - - ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1); - ggml_tensor * gu = nullptr; - if (gate_up_tensor) { - ggml_tensor * gate_up_e = apply_scale2(out.ctx, - ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale); - ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e, - n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], - gate_up_e->nb[1], gate_up_e->nb[2], 0); - ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e, - n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], - gate_up_e->nb[1], gate_up_e->nb[2], - (size_t)n_ff_exp * ggml_element_size(gate_up_e)); - gate_e = ggml_cont(out.ctx, gate_e); - up_e = ggml_cont(out.ctx, up_e); - gu = ggml_swiglu_split(out.ctx, gate_e, up_e); - } else { - ggml_tensor * gate_e = apply_scale2(out.ctx, - ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale); - ggml_tensor * up_e = apply_scale2(out.ctx, - ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale); - gu = ggml_swiglu_split(out.ctx, gate_e, up_e); - } - - ggml_tensor * experts = apply_scale2(out.ctx, - ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale); - ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_cold, 1); - experts = ggml_mul(out.ctx, experts, w_view); - - out.output = nullptr; - for (int i = 0; i < n_cold; ++i) { - ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2], - (size_t)i * experts->nb[1]); - out.output = (i == 0) ? slice : ggml_add(out.ctx, out.output, slice); - } - if (!out.output) { out.free(); return false; } - - out.gf = ggml_new_graph_custom(out.ctx, 1024, false); - ggml_set_output(out.output); - ggml_build_forward_expand(out.gf, out.output); - out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend)); - if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) { - out.free(); - return false; - } - return true; -} - -namespace { - +// Run routed expert subset on a given backend (GPU or CPU). static bool run_routed_subset(ggml_backend_t backend, ggml_tensor * gate_tensor, ggml_tensor * up_tensor, @@ -331,14 +149,15 @@ static bool run_routed_subset(ggml_backend_t backend, return true; } +// Shared expert FFN on GPU. static bool run_shared_ffn_gpu(ggml_backend_t backend, - const TargetLayer & L, + const MoeLayerDesc & desc, int n_embd, const float * cur_host, std::vector & out, std::string * err) { out.assign((size_t)n_embd, 0.0f); - if (!L.ffn_up_shexp || !L.ffn_gate_shexp || !L.ffn_down_shexp) { + if (!desc.ffn_up_shexp || !desc.ffn_gate_shexp || !desc.ffn_down_shexp) { return true; } @@ -355,13 +174,13 @@ static bool run_shared_ffn_gpu(ggml_backend_t backend, ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 1); ggml_set_input(inp); - ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s); - ggml_tensor * sh_up = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s); + ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s); + ggml_tensor * sh_up = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s); ggml_tensor * sh_gu = ggml_swiglu_split(ctx, sh_gate, sh_up); - ggml_tensor * shared = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s); - if (L.ffn_gate_inp_shexp) { + ggml_tensor * shared = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s); + if (desc.ffn_gate_inp_shexp) { ggml_tensor * shared_gate = apply_scale2(ctx, - ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s); + ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s); shared_gate = ggml_sigmoid(ctx, shared_gate); shared = ggml_mul(ctx, shared, shared_gate); } @@ -391,7 +210,6 @@ static bool run_shared_ffn_gpu(ggml_backend_t backend, } // Fused hot routed + shared FFN in a single GPU graph compute. -// Eliminates one graph compute phase per layer vs separate run_routed_subset + run_shared_ffn_gpu. static bool run_hot_and_shared_ffn_gpu( ggml_backend_t backend, ggml_tensor * gate_tensor, @@ -402,7 +220,7 @@ static bool run_hot_and_shared_ffn_gpu( float up_scale, float down_scale, float gate_up_scale, - const TargetLayer & L, + const MoeLayerDesc & desc, int n_embd, int n_ff_exp, const float * cur_host, @@ -415,7 +233,7 @@ static bool run_hot_and_shared_ffn_gpu( out.assign((size_t)n_embd, 0.0f); const bool has_hot = (n_hot > 0); - const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp); + const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp); if (!has_hot && !has_shared) return true; ggml_init_params ip{}; @@ -478,13 +296,13 @@ static bool run_hot_and_shared_ffn_gpu( ggml_tensor * shared = nullptr; if (has_shared) { - ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s); - ggml_tensor * sh_up = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s); + ggml_tensor * sh_gate = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s); + ggml_tensor * sh_up = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s); ggml_tensor * sh_gu = ggml_swiglu_split(ctx, sh_gate, sh_up); - shared = apply_scale2(ctx, ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s); - if (L.ffn_gate_inp_shexp) { + shared = apply_scale2(ctx, ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s); + if (desc.ffn_gate_inp_shexp) { ggml_tensor * shared_gate = apply_scale2(ctx, - ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s); + ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s); shared_gate = ggml_sigmoid(ctx, shared_gate); shared = ggml_mul(ctx, shared, shared_gate); } @@ -533,46 +351,266 @@ static bool run_hot_and_shared_ffn_gpu( return true; } -} // namespace - -bool eval_qwen35moe_reference_ffn_single( - ggml_backend_t gpu_backend, - const TargetWeights & w, - const TargetLayer & L, - const float * cur_host, - const int32_t * selected_ids, - const float * selected_weights, - int n_selected, - std::vector & out, - std::string * err) { - // Reference path: fused hot+shared in one graph (same as hybrid but all experts on GPU) - if (!run_hot_and_shared_ffn_gpu(gpu_backend, - L.ffn_gate_exps, L.ffn_up_exps, L.ffn_down_exps, L.ffn_gate_up_exps, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - L, w.n_embd, w.n_ff_exp, - cur_host, selected_ids, selected_weights, n_selected, - out, err)) { +// Build batched routed graph helper for batched prefill. +static bool build_batched_routed_graph( + ggml_context * ctx, + ggml_tensor * gate_tensor, + ggml_tensor * up_tensor, + ggml_tensor * down_tensor, + ggml_tensor * gate_up_tensor, + float gate_scale, + float up_scale, + float down_scale, + float gate_up_scale, + ggml_tensor * inp, + ggml_tensor * sel, + ggml_tensor * wts, + int n_embd, int n_ff_exp, int n_used, int n_tokens, + ggml_tensor ** out_routed) +{ + ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens); + ggml_tensor * gu = nullptr; + if (gate_up_tensor) { + ggml_tensor * gate_up_e = apply_scale2(ctx, + ggml_mul_mat_id(ctx, gate_up_tensor, cur_3d, sel), gate_up_scale); + ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e, + n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], + gate_up_e->nb[1], gate_up_e->nb[2], 0); + ggml_tensor * up_e = ggml_view_3d(ctx, gate_up_e, + n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], + gate_up_e->nb[1], gate_up_e->nb[2], + (size_t)n_ff_exp * ggml_element_size(gate_up_e)); + gate_e = ggml_cont(ctx, gate_e); + up_e = ggml_cont(ctx, up_e); + gu = ggml_swiglu_split(ctx, gate_e, up_e); + } else { + ggml_tensor * gate_e = apply_scale2(ctx, + ggml_mul_mat_id(ctx, gate_tensor, cur_3d, sel), gate_scale); + ggml_tensor * up_e = apply_scale2(ctx, + ggml_mul_mat_id(ctx, up_tensor, cur_3d, sel), up_scale); + gu = ggml_swiglu_split(ctx, gate_e, up_e); + } + + ggml_tensor * experts = apply_scale2(ctx, + ggml_mul_mat_id(ctx, down_tensor, gu, sel), down_scale); + + // Weight and sum over experts: [n_embd, n_used, n_tokens] * [1, n_used, n_tokens] + ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens); + experts = ggml_mul(ctx, experts, w_view); + + ggml_tensor * sum_shape = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens); + ggml_tensor * moe_sum = ggml_repeat_back(ctx, experts, sum_shape); + *out_routed = ggml_reshape_2d(ctx, moe_sum, n_embd, n_tokens); + return true; +} + +} // namespace (anon) + +// ── Public API ────────────────────────────────────────────────────────────────── + +bool build_cached_hot_graph( + CachedFfnGraph & out, + ggml_backend_t backend, + ggml_tensor * gate_tensor, + ggml_tensor * up_tensor, + ggml_tensor * down_tensor, + ggml_tensor * gate_up_tensor, + float gate_scale, + float up_scale, + float down_scale, + float gate_up_scale, + const MoeLayerDesc & desc, + int n_embd, + int n_ff_exp, + int n_hot) { + + out.free(); + out.n_hot = n_hot; + + ggml_init_params ip{}; + ip.mem_size = 48 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + out.ctx = ggml_init(ip); + if (!out.ctx) return false; + + out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1); + ggml_set_input(out.inp); + + ggml_tensor * routed = nullptr; + if (n_hot > 0) { + out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_hot, 1); + ggml_set_input(out.ids); + out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_hot, 1); + ggml_set_input(out.weights); + + ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1); + ggml_tensor * gu = nullptr; + if (gate_up_tensor) { + ggml_tensor * gate_up_e = apply_scale2(out.ctx, + ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale); + ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e, + n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], + gate_up_e->nb[1], gate_up_e->nb[2], 0); + ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e, + n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], + gate_up_e->nb[1], gate_up_e->nb[2], + (size_t)n_ff_exp * ggml_element_size(gate_up_e)); + gate_e = ggml_cont(out.ctx, gate_e); + up_e = ggml_cont(out.ctx, up_e); + gu = ggml_swiglu_split(out.ctx, gate_e, up_e); + } else { + ggml_tensor * gate_e = apply_scale2(out.ctx, + ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale); + ggml_tensor * up_e = apply_scale2(out.ctx, + ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale); + gu = ggml_swiglu_split(out.ctx, gate_e, up_e); + } + + ggml_tensor * experts = apply_scale2(out.ctx, + ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale); + ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_hot, 1); + experts = ggml_mul(out.ctx, experts, w_view); + + for (int i = 0; i < n_hot; ++i) { + ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2], + (size_t)i * experts->nb[1]); + routed = (i == 0) ? slice : ggml_add(out.ctx, routed, slice); + } + } + + ggml_tensor * shared = nullptr; + const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp); + if (has_shared) { + ggml_tensor * sh_gate = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_gate_shexp, out.inp), desc.ffn_gate_shexp_s); + ggml_tensor * sh_up = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_up_shexp, out.inp), desc.ffn_up_shexp_s); + ggml_tensor * sh_gu = ggml_swiglu_split(out.ctx, sh_gate, sh_up); + shared = apply_scale2(out.ctx, ggml_mul_mat(out.ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s); + if (desc.ffn_gate_inp_shexp) { + ggml_tensor * shared_gate = apply_scale2(out.ctx, + ggml_mul_mat(out.ctx, desc.ffn_gate_inp_shexp, out.inp), desc.ffn_gate_inp_shexp_s); + shared_gate = ggml_sigmoid(out.ctx, shared_gate); + shared = ggml_mul(out.ctx, shared, shared_gate); + } + } + + if (routed && shared) { + out.output = ggml_add(out.ctx, routed, shared); + } else if (routed) { + out.output = routed; + } else { + out.output = shared; + } + if (!out.output) { out.free(); return false; } + + out.gf = ggml_new_graph_custom(out.ctx, 2048, false); + ggml_set_output(out.output); + ggml_build_forward_expand(out.gf, out.output); + out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) { + out.free(); + return false; + } + return true; +} + +bool build_cached_cold_graph( + CachedFfnGraph & out, + ggml_backend_t cpu_backend, + ggml_tensor * gate_tensor, + ggml_tensor * up_tensor, + ggml_tensor * down_tensor, + ggml_tensor * gate_up_tensor, + float gate_scale, + float up_scale, + float down_scale, + float gate_up_scale, + int n_embd, + int n_ff_exp, + int n_cold) { + + out.free(); + out.n_hot = n_cold; // reuse field for "n experts in this graph" + + ggml_init_params ip{}; + ip.mem_size = 32 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + out.ctx = ggml_init(ip); + if (!out.ctx) return false; + + out.inp = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_embd, 1); + ggml_set_input(out.inp); + out.ids = ggml_new_tensor_2d(out.ctx, GGML_TYPE_I32, n_cold, 1); + ggml_set_input(out.ids); + out.weights = ggml_new_tensor_2d(out.ctx, GGML_TYPE_F32, n_cold, 1); + ggml_set_input(out.weights); + + ggml_tensor * cur_3d = ggml_reshape_3d(out.ctx, out.inp, n_embd, 1, 1); + ggml_tensor * gu = nullptr; + if (gate_up_tensor) { + ggml_tensor * gate_up_e = apply_scale2(out.ctx, + ggml_mul_mat_id(out.ctx, gate_up_tensor, cur_3d, out.ids), gate_up_scale); + ggml_tensor * gate_e = ggml_view_3d(out.ctx, gate_up_e, + n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], + gate_up_e->nb[1], gate_up_e->nb[2], 0); + ggml_tensor * up_e = ggml_view_3d(out.ctx, gate_up_e, + n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], + gate_up_e->nb[1], gate_up_e->nb[2], + (size_t)n_ff_exp * ggml_element_size(gate_up_e)); + gate_e = ggml_cont(out.ctx, gate_e); + up_e = ggml_cont(out.ctx, up_e); + gu = ggml_swiglu_split(out.ctx, gate_e, up_e); + } else { + ggml_tensor * gate_e = apply_scale2(out.ctx, + ggml_mul_mat_id(out.ctx, gate_tensor, cur_3d, out.ids), gate_scale); + ggml_tensor * up_e = apply_scale2(out.ctx, + ggml_mul_mat_id(out.ctx, up_tensor, cur_3d, out.ids), up_scale); + gu = ggml_swiglu_split(out.ctx, gate_e, up_e); + } + + ggml_tensor * experts = apply_scale2(out.ctx, + ggml_mul_mat_id(out.ctx, down_tensor, gu, out.ids), down_scale); + ggml_tensor * w_view = ggml_reshape_3d(out.ctx, out.weights, 1, n_cold, 1); + experts = ggml_mul(out.ctx, experts, w_view); + + out.output = nullptr; + for (int i = 0; i < n_cold; ++i) { + ggml_tensor * slice = ggml_view_2d(out.ctx, experts, n_embd, 1, experts->nb[2], + (size_t)i * experts->nb[1]); + out.output = (i == 0) ? slice : ggml_add(out.ctx, out.output, slice); + } + if (!out.output) { out.free(); return false; } + + out.gf = ggml_new_graph_custom(out.ctx, 1024, false); + ggml_set_output(out.output); + ggml_build_forward_expand(out.gf, out.output); + out.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend)); + if (!ggml_gallocr_alloc_graph(out.alloc, out.gf)) { + out.free(); return false; } return true; } -bool eval_qwen35moe_hybrid_ffn_single( - ggml_backend_t gpu_backend, - const TargetWeights & w, - const TargetLayer & L, - Qwen35MoeHybridLayerStorage & storage, - ggml_backend_t cpu_backend, - const float * cur_host, - const int32_t * selected_ids, - const float * selected_weights, - int n_selected, - std::vector & out, - Qwen35MoeHybridFfnTelemetry * telemetry, - std::string * err) { +bool eval_moe_hybrid_ffn_single( + ggml_backend_t gpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + MoeHybridLayerStorage & storage, + ggml_backend_t cpu_backend, + const float * cur_host, + const int32_t * selected_ids, + const float * selected_weights, + int n_selected, + std::vector & out, + MoeHybridFfnTelemetry * telemetry, + std::string * err) { + if (telemetry) *telemetry = {}; const auto ffn_wall_t0 = HybridClock::now(); const auto partition_t0 = HybridClock::now(); + std::vector hot_ids; std::vector hot_weights; std::vector cold_ids; @@ -606,7 +644,7 @@ bool eval_qwen35moe_hybrid_ffn_single( const int n_hot = (int)hot_ids.size(); const bool has_hot = (n_hot > 0); - const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp); + const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp); const bool has_cold = !cold_ids.empty(); const int n_cold = (int)cold_ids.size(); @@ -614,17 +652,17 @@ bool eval_qwen35moe_hybrid_ffn_single( bool hot_async_launched = false; const auto hot_t0 = HybridClock::now(); if (!has_hot && !has_shared) { - hot_and_shared.assign((size_t)w.n_embd, 0.0f); + hot_and_shared.assign((size_t)cfg.n_embd, 0.0f); } else { // Lazily build cached hot graph on first use if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) { build_cached_hot_graph(storage.hot_graph, gpu_backend, storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - L, w.n_embd, w.n_ff_exp, n_hot); + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, + desc, cfg.n_embd, cfg.n_ff_exp, n_hot); } if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) { - ggml_backend_tensor_set(storage.hot_graph.inp, cur_host, 0, sizeof(float) * (size_t)w.n_embd); + ggml_backend_tensor_set(storage.hot_graph.inp, cur_host, 0, sizeof(float) * (size_t)cfg.n_embd); if (storage.hot_graph.ids && has_hot) { ggml_backend_tensor_set(storage.hot_graph.ids, hot_ids.data(), 0, sizeof(int32_t) * (size_t)n_hot); } @@ -638,8 +676,8 @@ bool eval_qwen35moe_hybrid_ffn_single( // Fallback: sync compute (no overlap) if (!run_hot_and_shared_ffn_gpu(gpu_backend, storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - L, w.n_embd, w.n_ff_exp, + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, + desc, cfg.n_embd, cfg.n_ff_exp, cur_host, hot_ids.empty() ? nullptr : hot_ids.data(), hot_weights.empty() ? nullptr : hot_weights.data(), @@ -655,11 +693,11 @@ bool eval_qwen35moe_hybrid_ffn_single( if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) { build_cached_cold_graph(storage.cold_graph, cpu_backend, storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - w.n_embd, w.n_ff_exp, n_cold); + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, + cfg.n_embd, cfg.n_ff_exp, n_cold); } if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) { - ggml_backend_tensor_set(storage.cold_graph.inp, cur_host, 0, sizeof(float) * (size_t)w.n_embd); + ggml_backend_tensor_set(storage.cold_graph.inp, cur_host, 0, sizeof(float) * (size_t)cfg.n_embd); ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids.data(), 0, sizeof(int32_t) * (size_t)n_cold); ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights.data(), 0, sizeof(float) * (size_t)n_cold); auto st = ggml_backend_graph_compute(cpu_backend, storage.cold_graph.gf); @@ -668,28 +706,28 @@ bool eval_qwen35moe_hybrid_ffn_single( if (err) *err = "cached cold graph compute failed"; return false; } - cold.resize((size_t)w.n_embd); - ggml_backend_tensor_get(storage.cold_graph.output, cold.data(), 0, sizeof(float) * (size_t)w.n_embd); + cold.resize((size_t)cfg.n_embd); + ggml_backend_tensor_get(storage.cold_graph.output, cold.data(), 0, sizeof(float) * (size_t)cfg.n_embd); } else { if (!run_routed_subset(cpu_backend, storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - w.n_embd, w.n_ff_exp, + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, + cfg.n_embd, cfg.n_ff_exp, cur_host, cold_ids.data(), cold_weights.data(), n_cold, cold, err)) { if (hot_async_launched) ggml_backend_synchronize(gpu_backend); return false; } } } else { - cold.assign((size_t)w.n_embd, 0.0f); + cold.assign((size_t)cfg.n_embd, 0.0f); } const auto cold_t1 = HybridClock::now(); // ── Sync GPU and read result ── if ((has_hot || has_shared) && storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) { ggml_backend_synchronize(gpu_backend); - hot_and_shared.resize((size_t)w.n_embd); - ggml_backend_tensor_get(storage.hot_graph.output, hot_and_shared.data(), 0, sizeof(float) * (size_t)w.n_embd); + hot_and_shared.resize((size_t)cfg.n_embd); + ggml_backend_tensor_get(storage.hot_graph.output, hot_and_shared.data(), 0, sizeof(float) * (size_t)cfg.n_embd); } const auto hot_t1 = HybridClock::now(); @@ -700,8 +738,8 @@ bool eval_qwen35moe_hybrid_ffn_single( } const auto combine_t0 = HybridClock::now(); - out.assign((size_t)w.n_embd, 0.0f); - for (int i = 0; i < w.n_embd; ++i) { + out.assign((size_t)cfg.n_embd, 0.0f); + for (int i = 0; i < cfg.n_embd; ++i) { out[(size_t)i] = hot_and_shared[(size_t)i] + cold[(size_t)i]; } const auto combine_t1 = HybridClock::now(); @@ -712,20 +750,20 @@ bool eval_qwen35moe_hybrid_ffn_single( return true; } -bool eval_qwen35moe_batched_prefill_ffn( - ggml_backend_t gpu_backend, - const TargetWeights & w, - const TargetLayer & L, - const float * cur_host, - const int32_t * selected_ids, - const float * selected_weights, - int n_tokens, - std::vector & out, - std::string * err) { - - const int n_embd = w.n_embd; - const int n_used = w.n_expert_used; - const int n_ff_exp = w.n_ff_exp; +bool eval_moe_batched_prefill_ffn( + ggml_backend_t gpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + const float * cur_host, + const int32_t * selected_ids, + const float * selected_weights, + int n_tokens, + std::vector & out, + std::string * err) { + + const int n_embd = cfg.n_embd; + const int n_used = cfg.n_expert_used; + const int n_ff_exp = cfg.n_ff_exp; out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f); if (n_tokens <= 0) return true; @@ -739,11 +777,8 @@ bool eval_qwen35moe_batched_prefill_ffn( return false; } - // Input: [n_embd, n_tokens] ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); ggml_set_input(inp); - - // Pre-computed routing: selected [n_used, n_tokens], weights [n_used, n_tokens] ggml_tensor * sel = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_used, n_tokens); ggml_set_input(sel); ggml_tensor * wts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_used, n_tokens); @@ -752,9 +787,9 @@ bool eval_qwen35moe_batched_prefill_ffn( // Routed expert computation using full GPU expert tensors ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens); ggml_tensor * gu = nullptr; - if (L.ffn_gate_up_exps) { + if (desc.ffn_gate_up_exps) { ggml_tensor * gate_up_e = apply_scale2(ctx, - ggml_mul_mat_id(ctx, L.ffn_gate_up_exps, cur_3d, sel), L.ffn_gate_up_exps_s); + ggml_mul_mat_id(ctx, desc.ffn_gate_up_exps, cur_3d, sel), desc.ffn_gate_up_exps_s); ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e, n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], gate_up_e->nb[1], gate_up_e->nb[2], 0); @@ -767,14 +802,14 @@ bool eval_qwen35moe_batched_prefill_ffn( gu = ggml_swiglu_split(ctx, gate_e, up_e); } else { ggml_tensor * gate_e = apply_scale2(ctx, - ggml_mul_mat_id(ctx, L.ffn_gate_exps, cur_3d, sel), L.ffn_gate_exps_s); + ggml_mul_mat_id(ctx, desc.ffn_gate_exps, cur_3d, sel), desc.ffn_gate_exps_s); ggml_tensor * up_e = apply_scale2(ctx, - ggml_mul_mat_id(ctx, L.ffn_up_exps, cur_3d, sel), L.ffn_up_exps_s); + ggml_mul_mat_id(ctx, desc.ffn_up_exps, cur_3d, sel), desc.ffn_up_exps_s); gu = ggml_swiglu_split(ctx, gate_e, up_e); } ggml_tensor * experts = apply_scale2(ctx, - ggml_mul_mat_id(ctx, L.ffn_down_exps, gu, sel), L.ffn_down_exps_s); + ggml_mul_mat_id(ctx, desc.ffn_down_exps, gu, sel), desc.ffn_down_exps_s); // Weight and sum over experts ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens); @@ -786,17 +821,17 @@ bool eval_qwen35moe_batched_prefill_ffn( // Shared expert ggml_tensor * combined = routed; - if (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp) { + if (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp) { ggml_tensor * sh_gate = apply_scale2(ctx, - ggml_mul_mat(ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s); + ggml_mul_mat(ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s); ggml_tensor * sh_up = apply_scale2(ctx, - ggml_mul_mat(ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s); + ggml_mul_mat(ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s); ggml_tensor * sh_gu = ggml_swiglu_split(ctx, sh_gate, sh_up); ggml_tensor * shared = apply_scale2(ctx, - ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s); - if (L.ffn_gate_inp_shexp) { + ggml_mul_mat(ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s); + if (desc.ffn_gate_inp_shexp) { ggml_tensor * shared_gate = apply_scale2(ctx, - ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s); + ggml_mul_mat(ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s); shared_gate = ggml_sigmoid(ctx, shared_gate); shared = ggml_mul(ctx, shared, shared_gate); } @@ -832,80 +867,24 @@ bool eval_qwen35moe_batched_prefill_ffn( return true; } -// ── GPU-Resident Residual State ── - -// ── Hybrid Batched Prefill FFN ── -// Processes n_tokens at once with hot experts on GPU and cold experts on CPU -// concurrently. Uses pre-computed routing from the pre-FFN graph. - -static bool build_batched_routed_graph( - ggml_context * ctx, - ggml_tensor * gate_tensor, - ggml_tensor * up_tensor, - ggml_tensor * down_tensor, - ggml_tensor * gate_up_tensor, - float gate_scale, - float up_scale, - float down_scale, - float gate_up_scale, - ggml_tensor * inp, // [n_embd, n_tokens] - ggml_tensor * sel, // [n_used, n_tokens] - ggml_tensor * wts, // [n_used, n_tokens] - int n_embd, int n_ff_exp, int n_used, int n_tokens, - ggml_tensor ** out_routed) -{ - ggml_tensor * cur_3d = ggml_reshape_3d(ctx, inp, n_embd, 1, n_tokens); - ggml_tensor * gu = nullptr; - if (gate_up_tensor) { - ggml_tensor * gate_up_e = apply_scale2(ctx, - ggml_mul_mat_id(ctx, gate_up_tensor, cur_3d, sel), gate_up_scale); - ggml_tensor * gate_e = ggml_view_3d(ctx, gate_up_e, - n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], - gate_up_e->nb[1], gate_up_e->nb[2], 0); - ggml_tensor * up_e = ggml_view_3d(ctx, gate_up_e, - n_ff_exp, gate_up_e->ne[1], gate_up_e->ne[2], - gate_up_e->nb[1], gate_up_e->nb[2], - (size_t)n_ff_exp * ggml_element_size(gate_up_e)); - gate_e = ggml_cont(ctx, gate_e); - up_e = ggml_cont(ctx, up_e); - gu = ggml_swiglu_split(ctx, gate_e, up_e); - } else { - ggml_tensor * gate_e = apply_scale2(ctx, - ggml_mul_mat_id(ctx, gate_tensor, cur_3d, sel), gate_scale); - ggml_tensor * up_e = apply_scale2(ctx, - ggml_mul_mat_id(ctx, up_tensor, cur_3d, sel), up_scale); - gu = ggml_swiglu_split(ctx, gate_e, up_e); - } - - ggml_tensor * experts = apply_scale2(ctx, - ggml_mul_mat_id(ctx, down_tensor, gu, sel), down_scale); - - // Weight and sum over experts: [n_embd, n_used, n_tokens] * [1, n_used, n_tokens] - ggml_tensor * w_view = ggml_reshape_3d(ctx, wts, 1, n_used, n_tokens); - experts = ggml_mul(ctx, experts, w_view); - - ggml_tensor * sum_shape = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens); - ggml_tensor * moe_sum = ggml_repeat_back(ctx, experts, sum_shape); - *out_routed = ggml_reshape_2d(ctx, moe_sum, n_embd, n_tokens); - return true; -} - -bool eval_qwen35moe_hybrid_ffn_batched( - ggml_backend_t gpu_backend, - ggml_backend_t cpu_backend, - const TargetWeights & w, - const TargetLayer & L, - Qwen35MoeHybridLayerStorage & storage, - const float * cur_host, - const int32_t * selected_ids, - const float * selected_weights, - int n_tokens, - std::vector & out, - std::string * err) { - - const int n_embd = w.n_embd; - const int n_used = w.n_expert_used; - const int n_ff_exp = w.n_ff_exp; +static bool eval_moe_hybrid_ffn_batched_core( + ggml_backend_t gpu_backend, + ggml_backend_t cpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + MoeHybridLayerStorage & storage, + const float * cur_host, + const int32_t * selected_ids, + const float * selected_weights, + int n_tokens, + std::vector & out, + std::string * err, + ggml_gallocr_t * p_hot_alloc, + ggml_gallocr_t * p_cold_alloc) { + + const int n_embd = cfg.n_embd; + const int n_used = cfg.n_expert_used; + const int n_ff_exp = cfg.n_ff_exp; out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f); if (n_tokens <= 0) return true; @@ -942,11 +921,6 @@ bool eval_qwen35moe_hybrid_ffn_batched( } } - // Fast path: all hot → use hot stack only (no cold compute needed) - // NOTE: Cannot use eval_qwen35moe_batched_prefill_ffn here since - // L.ffn_gate_exps is not allocated in hybrid mode (skip_expert_tensors=true). - // Fall through to hybrid path which uses storage.gate_hot correctly. - // ── Step 2: Build and run hot GPU graph (includes shared expert always) ── std::vector hot_partial((size_t)n_embd * (size_t)n_tokens, 0.0f); bool hot_async_launched = false; @@ -956,8 +930,7 @@ bool eval_qwen35moe_hybrid_ffn_batched( ggml_gallocr_t hot_alloc = nullptr; ggml_tensor * hot_output = nullptr; - // Always run GPU graph: shared expert is always on GPU, and hot routed when present - const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp); + const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp); if (has_hot || has_shared) { ggml_init_params ip{}; ip.mem_size = 128 * 1024 * 1024; @@ -969,7 +942,6 @@ bool eval_qwen35moe_hybrid_ffn_batched( ggml_tensor * inp = ggml_new_tensor_2d(hot_ctx, GGML_TYPE_F32, n_embd, n_tokens); ggml_set_input(inp); - // Only create routing tensors if we have hot routed experts ggml_tensor * sel = nullptr; ggml_tensor * wts = nullptr; ggml_tensor * routed = nullptr; @@ -981,7 +953,7 @@ bool eval_qwen35moe_hybrid_ffn_batched( build_batched_routed_graph(hot_ctx, storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, inp, sel, wts, n_embd, n_ff_exp, n_used, n_tokens, &routed); } @@ -989,15 +961,15 @@ bool eval_qwen35moe_hybrid_ffn_batched( ggml_tensor * combined = routed; if (has_shared) { ggml_tensor * sh_gate = apply_scale2(hot_ctx, - ggml_mul_mat(hot_ctx, L.ffn_gate_shexp, inp), L.ffn_gate_shexp_s); + ggml_mul_mat(hot_ctx, desc.ffn_gate_shexp, inp), desc.ffn_gate_shexp_s); ggml_tensor * sh_up = apply_scale2(hot_ctx, - ggml_mul_mat(hot_ctx, L.ffn_up_shexp, inp), L.ffn_up_shexp_s); + ggml_mul_mat(hot_ctx, desc.ffn_up_shexp, inp), desc.ffn_up_shexp_s); ggml_tensor * sh_gu = ggml_swiglu_split(hot_ctx, sh_gate, sh_up); ggml_tensor * shared = apply_scale2(hot_ctx, - ggml_mul_mat(hot_ctx, L.ffn_down_shexp, sh_gu), L.ffn_down_shexp_s); - if (L.ffn_gate_inp_shexp) { + ggml_mul_mat(hot_ctx, desc.ffn_down_shexp, sh_gu), desc.ffn_down_shexp_s); + if (desc.ffn_gate_inp_shexp) { ggml_tensor * shared_gate = apply_scale2(hot_ctx, - ggml_mul_mat(hot_ctx, L.ffn_gate_inp_shexp, inp), L.ffn_gate_inp_shexp_s); + ggml_mul_mat(hot_ctx, desc.ffn_gate_inp_shexp, inp), desc.ffn_gate_inp_shexp_s); shared_gate = ggml_sigmoid(hot_ctx, shared_gate); shared = ggml_mul(hot_ctx, shared, shared_gate); } @@ -1008,10 +980,17 @@ bool eval_qwen35moe_hybrid_ffn_batched( hot_gf = ggml_new_graph_custom(hot_ctx, 4096, false); ggml_set_output(hot_output); ggml_build_forward_expand(hot_gf, hot_output); - hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend)); + if (p_hot_alloc) { + if (!*p_hot_alloc) + *p_hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend)); + hot_alloc = *p_hot_alloc; + } else { + hot_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(gpu_backend)); + } if (!ggml_gallocr_alloc_graph(hot_alloc, hot_gf)) { if (err) *err = "hybrid batched hot gallocr failed"; - ggml_gallocr_free(hot_alloc); ggml_free(hot_ctx); + if (!p_hot_alloc) ggml_gallocr_free(hot_alloc); + ggml_free(hot_ctx); return false; } @@ -1053,18 +1032,26 @@ bool eval_qwen35moe_hybrid_ffn_batched( ggml_tensor * cold_routed = nullptr; build_batched_routed_graph(cold_ctx, storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, inp, sel, wts, n_embd, n_ff_exp, n_used, n_tokens, &cold_routed); ggml_cgraph * cold_gf = ggml_new_graph_custom(cold_ctx, 4096, false); ggml_set_output(cold_routed); ggml_build_forward_expand(cold_gf, cold_routed); - ggml_gallocr_t cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend)); + ggml_gallocr_t cold_alloc; + if (p_cold_alloc) { + if (!*p_cold_alloc) + *p_cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend)); + cold_alloc = *p_cold_alloc; + } else { + cold_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(cpu_backend)); + } if (!ggml_gallocr_alloc_graph(cold_alloc, cold_gf)) { if (hot_async_launched) ggml_backend_synchronize(gpu_backend); - if (hot_alloc) ggml_gallocr_free(hot_alloc); + if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc); if (hot_ctx) ggml_free(hot_ctx); - ggml_gallocr_free(cold_alloc); ggml_free(cold_ctx); + if (!p_cold_alloc) ggml_gallocr_free(cold_alloc); + ggml_free(cold_ctx); if (err) *err = "hybrid batched cold gallocr failed"; return false; } @@ -1077,16 +1064,17 @@ bool eval_qwen35moe_hybrid_ffn_batched( auto st = ggml_backend_graph_compute(cpu_backend, cold_gf); if (st != GGML_STATUS_SUCCESS) { if (hot_async_launched) ggml_backend_synchronize(gpu_backend); - if (hot_alloc) ggml_gallocr_free(hot_alloc); + if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc); if (hot_ctx) ggml_free(hot_ctx); - ggml_gallocr_free(cold_alloc); ggml_free(cold_ctx); + if (!p_cold_alloc) ggml_gallocr_free(cold_alloc); + ggml_free(cold_ctx); if (err) *err = "hybrid batched cold compute failed"; return false; } ggml_backend_tensor_get(cold_routed, cold_partial.data(), 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens); - ggml_gallocr_free(cold_alloc); + if (!p_cold_alloc) ggml_gallocr_free(cold_alloc); ggml_free(cold_ctx); } @@ -1096,7 +1084,7 @@ bool eval_qwen35moe_hybrid_ffn_batched( ggml_backend_tensor_get(hot_output, hot_partial.data(), 0, sizeof(float) * (size_t)n_embd * (size_t)n_tokens); } - if (hot_alloc) ggml_gallocr_free(hot_alloc); + if (!p_hot_alloc && hot_alloc) ggml_gallocr_free(hot_alloc); if (hot_ctx) ggml_free(hot_ctx); // ── Step 5: Merge hot + cold ── @@ -1108,6 +1096,60 @@ bool eval_qwen35moe_hybrid_ffn_batched( return true; } +// ── GPU-Resident Residual State ── + +// Public entry. Workaround for a ggml-cuda/HIP defect: the MMQ mul_mat_id +// kernel illegal-accesses on gfx1151 when the per-layer hot expert stack is +// REDUCED (n_hot_stack < n_expert); the full-stack (all-hot) case is fine. +// MMVQ is used instead of MMQ only when the matmul batch dim (= n_tokens) is +// small (Q4_K AMD MMVQ-mmid cap is 4). So for reduced hot stacks we slice the +// prefill batch into <=4-token sub-batches, routing the routed mul_mat_id +// through the stable MMVQ path. Full stacks keep the fast single-shot MMQ. +bool eval_moe_hybrid_ffn_batched( + ggml_backend_t gpu_backend, + ggml_backend_t cpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + MoeHybridLayerStorage & storage, + const float * cur_host, + const int32_t * selected_ids, + const float * selected_weights, + int n_tokens, + std::vector & out, + std::string * err, + ggml_gallocr_t * p_hot_alloc, + ggml_gallocr_t * p_cold_alloc) { + const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2] + : storage.gate_hot ? (int)storage.gate_hot->ne[2] + : 0; + static const int MMQ_SAFE_SUB_BATCH = 4; + if (n_hot_stack > 0 && n_hot_stack < cfg.n_expert && n_tokens > MMQ_SAFE_SUB_BATCH) { + const int n_embd = cfg.n_embd; + const int n_used = cfg.n_expert_used; + out.assign((size_t)n_embd * (size_t)n_tokens, 0.0f); + std::vector sub_out; + for (int t0 = 0; t0 < n_tokens; t0 += MMQ_SAFE_SUB_BATCH) { + const int tc = std::min(MMQ_SAFE_SUB_BATCH, n_tokens - t0); + if (!eval_moe_hybrid_ffn_batched_core( + gpu_backend, cpu_backend, cfg, desc, storage, + cur_host + (size_t)t0 * (size_t)n_embd, + selected_ids + (size_t)t0 * (size_t)n_used, + selected_weights + (size_t)t0 * (size_t)n_used, + tc, sub_out, err, p_hot_alloc, p_cold_alloc)) { + return false; + } + std::memcpy(out.data() + (size_t)t0 * (size_t)n_embd, + sub_out.data(), + sizeof(float) * (size_t)n_embd * (size_t)tc); + } + return true; + } + return eval_moe_hybrid_ffn_batched_core( + gpu_backend, cpu_backend, cfg, desc, storage, + cur_host, selected_ids, selected_weights, n_tokens, out, err, + p_hot_alloc, p_cold_alloc); +} + void ResidualCombineGraph::free() { if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; } if (ctx) { ggml_free(ctx); ctx = nullptr; } @@ -1165,7 +1207,6 @@ void GpuResidentState::destroy() { bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd) { out.destroy(); - // Allocate persistent GPU tensor for act_cur ggml_init_params ip{}; ip.mem_size = 1024 * 1024; ip.mem_buffer = nullptr; @@ -1180,38 +1221,31 @@ bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int return false; } - // Build the residual combine graph if (!build_residual_combine_graph(out.combine, backend, n_embd)) { out.destroy(); return false; } - // Zero out cold_in initially (for all-hot layers, cold stays zero) std::vector zeros((size_t)n_embd, 0.0f); ggml_backend_tensor_set(out.combine.cold_in, zeros.data(), 0, sizeof(float) * (size_t)n_embd); return true; } -// ─── GPU-Resident hybrid FFN eval ───────────────────────────────────────────── -// Keeps activation on GPU: only reads router IDs (64B) to CPU, and ffn_post -// to CPU only when cold experts are selected. All other data movement is -// GPU→GPU via ggml_backend_tensor_copy. - -bool eval_qwen35moe_hybrid_ffn_gpu_resident( - ggml_backend_t gpu_backend, - const TargetWeights & w, - const TargetLayer & L, - Qwen35MoeHybridLayerStorage & storage, - ggml_backend_t cpu_backend, - ggml_tensor * ffn_post_gpu, - ggml_tensor * ffn_residual_gpu, - GpuResidentState & gpu_state, - const int32_t * selected_ids, - const float * selected_weights, - int n_selected) { - - const int n_embd = w.n_embd; +bool eval_moe_hybrid_ffn_gpu_resident( + ggml_backend_t gpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + MoeHybridLayerStorage & storage, + ggml_backend_t cpu_backend, + ggml_tensor * ffn_post_gpu, + ggml_tensor * ffn_residual_gpu, + GpuResidentState & gpu_state, + const int32_t * selected_ids, + const float * selected_weights, + int n_selected) { + + const int n_embd = cfg.n_embd; // ── Partition into hot/cold ── std::vector hot_ids; @@ -1239,7 +1273,7 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident( const int n_hot = (int)hot_ids.size(); const bool has_hot = (n_hot > 0); - const bool has_shared = (L.ffn_up_shexp && L.ffn_gate_shexp && L.ffn_down_shexp); + const bool has_shared = (desc.ffn_up_shexp && desc.ffn_gate_shexp && desc.ffn_down_shexp); const bool has_cold = !cold_ids.empty(); const int n_cold = (int)cold_ids.size(); @@ -1252,8 +1286,8 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident( if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) { build_cached_hot_graph(storage.hot_graph, gpu_backend, storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - L, n_embd, w.n_ff_exp, n_hot); + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, + desc, n_embd, cfg.n_ff_exp, n_hot); } if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) { // GPU→GPU copy: ffn_post → hot_graph.inp (no PCIe!) @@ -1270,7 +1304,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident( } // ── If cold needed, read ffn_post to CPU BEFORE launching hot async ── - // (to avoid serializing GPU queue with a device→host read mid-kernel) std::vector post_host; if (has_cold) { post_host.resize((size_t)n_embd); @@ -1289,8 +1322,8 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident( if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) { build_cached_cold_graph(storage.cold_graph, cpu_backend, storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - n_embd, w.n_ff_exp, n_cold); + desc.ffn_gate_exps_s, desc.ffn_up_exps_s, desc.ffn_down_exps_s, desc.ffn_gate_up_exps_s, + n_embd, cfg.n_ff_exp, n_cold); } if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) { ggml_backend_tensor_set(storage.cold_graph.inp, post_host.data(), 0, @@ -1308,7 +1341,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident( ggml_backend_tensor_get(storage.cold_graph.output, cold_result.data(), 0, sizeof(float) * (size_t)n_embd); } else { - // Fallback: cold graph build failed — shouldn't happen if (hot_async_launched) ggml_backend_synchronize(gpu_backend); return false; } @@ -1320,7 +1352,6 @@ bool eval_qwen35moe_hybrid_ffn_gpu_resident( // GPU→GPU: hot output → combine.hot_in ggml_backend_tensor_copy(storage.hot_graph.output, gpu_state.combine.hot_in); } else { - // No hot/shared: zero hot_in std::vector zeros((size_t)n_embd, 0.0f); ggml_backend_tensor_set(gpu_state.combine.hot_in, zeros.data(), 0, sizeof(float) * (size_t)n_embd); diff --git a/server/src/common/moe_hybrid_ffn_eval.h b/server/src/common/moe_hybrid_ffn_eval.h new file mode 100644 index 000000000..bd30755ea --- /dev/null +++ b/server/src/common/moe_hybrid_ffn_eval.h @@ -0,0 +1,191 @@ +// Common MoE hybrid FFN evaluation — hot experts on GPU, cold on CPU, concurrent. + +#pragma once + +#include "moe_hybrid_types.h" +#include "moe_hybrid_storage.h" + +#include "ggml-backend.h" + +#include +#include +#include + +namespace dflash::common { + +// GPU-resident residual combine graph: output = residual + hot_out + cold_correction. +struct ResidualCombineGraph { + ggml_context * ctx = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t alloc = nullptr; + ggml_tensor * residual_in = nullptr; + ggml_tensor * hot_in = nullptr; + ggml_tensor * cold_in = nullptr; + ggml_tensor * output = nullptr; + + ResidualCombineGraph() = default; + ~ResidualCombineGraph() { free(); } + ResidualCombineGraph(const ResidualCombineGraph &) = delete; + ResidualCombineGraph & operator=(const ResidualCombineGraph &) = delete; + ResidualCombineGraph(ResidualCombineGraph && o) noexcept + : ctx(o.ctx), gf(o.gf), alloc(o.alloc), + residual_in(o.residual_in), hot_in(o.hot_in), + cold_in(o.cold_in), output(o.output) { + o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr; + o.residual_in = nullptr; o.hot_in = nullptr; + o.cold_in = nullptr; o.output = nullptr; + } + ResidualCombineGraph & operator=(ResidualCombineGraph && o) noexcept { + if (this != &o) { + free(); + ctx = o.ctx; gf = o.gf; alloc = o.alloc; + residual_in = o.residual_in; hot_in = o.hot_in; + cold_in = o.cold_in; output = o.output; + o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr; + o.residual_in = nullptr; o.hot_in = nullptr; + o.cold_in = nullptr; o.output = nullptr; + } + return *this; + } + bool valid() const { return ctx && gf && alloc && output; } + void free(); + void destroy(); +}; + +bool build_residual_combine_graph(ResidualCombineGraph & out, ggml_backend_t backend, int n_embd); + +// GPU-resident state for the decode loop. +struct GpuResidentState { + ggml_context * ctx = nullptr; + ggml_backend_buffer_t buf = nullptr; + ggml_tensor * act_cur = nullptr; + + ResidualCombineGraph combine; + + GpuResidentState() = default; + ~GpuResidentState() { destroy(); } + GpuResidentState(const GpuResidentState &) = delete; + GpuResidentState & operator=(const GpuResidentState &) = delete; + GpuResidentState(GpuResidentState && o) noexcept + : ctx(o.ctx), buf(o.buf), act_cur(o.act_cur), + combine(std::move(o.combine)) { + o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr; + } + GpuResidentState & operator=(GpuResidentState && o) noexcept { + if (this != &o) { + destroy(); + ctx = o.ctx; buf = o.buf; act_cur = o.act_cur; + combine = std::move(o.combine); + o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr; + } + return *this; + } + bool valid() const { return ctx && buf && act_cur && combine.valid(); } + void destroy(); +}; + +bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd); + +struct MoeHybridFfnTelemetry { + uint64_t ffn_wall_us = 0; + uint64_t partition_us = 0; + uint64_t hot_us = 0; + uint64_t cold_us = 0; + uint64_t shared_us = 0; + uint64_t combine_us = 0; + int hot_selected = 0; + int cold_selected = 0; +}; + +// Single-token hybrid FFN: hot on GPU, cold on CPU, combine on host. +bool eval_moe_hybrid_ffn_single( + ggml_backend_t gpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + MoeHybridLayerStorage & storage, + ggml_backend_t cpu_backend, + const float * cur_host, + const int32_t * selected_ids, + const float * selected_weights, + int n_selected, + std::vector & out, + MoeHybridFfnTelemetry * telemetry = nullptr, + std::string * err = nullptr); + +// Batched prefill FFN: all experts on GPU (no hybrid split). +bool eval_moe_batched_prefill_ffn( + ggml_backend_t gpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + const float * cur_host, + const int32_t * selected_ids, + const float * selected_weights, + int n_tokens, + std::vector & out, + std::string * err = nullptr); + +// Batched hybrid prefill FFN: hot on GPU, cold on CPU concurrently. +bool eval_moe_hybrid_ffn_batched( + ggml_backend_t gpu_backend, + ggml_backend_t cpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + MoeHybridLayerStorage & storage, + const float * cur_host, + const int32_t * selected_ids, + const float * selected_weights, + int n_tokens, + std::vector & out, + std::string * err = nullptr, + ggml_gallocr_t * p_hot_alloc = nullptr, + ggml_gallocr_t * p_cold_alloc = nullptr); + +// GPU-resident single-token hybrid FFN: keeps data on GPU, only reads router +// IDs to CPU for hot/cold partitioning. +bool eval_moe_hybrid_ffn_gpu_resident( + ggml_backend_t gpu_backend, + const MoeHybridConfig & cfg, + const MoeLayerDesc & desc, + MoeHybridLayerStorage & storage, + ggml_backend_t cpu_backend, + ggml_tensor * ffn_post_gpu, + ggml_tensor * ffn_residual_gpu, + GpuResidentState & gpu_state, + const int32_t * selected_ids, + const float * selected_weights, + int n_selected); + +// Build/rebuild cached hot FFN graph. +bool build_cached_hot_graph( + CachedFfnGraph & out, + ggml_backend_t backend, + ggml_tensor * gate_tensor, + ggml_tensor * up_tensor, + ggml_tensor * down_tensor, + ggml_tensor * gate_up_tensor, + float gate_scale, + float up_scale, + float down_scale, + float gate_up_scale, + const MoeLayerDesc & desc, + int n_embd, + int n_ff_exp, + int n_hot); + +// Build/rebuild cached cold FFN graph. +bool build_cached_cold_graph( + CachedFfnGraph & out, + ggml_backend_t cpu_backend, + ggml_tensor * gate_tensor, + ggml_tensor * up_tensor, + ggml_tensor * down_tensor, + ggml_tensor * gate_up_tensor, + float gate_scale, + float up_scale, + float down_scale, + float gate_up_scale, + int n_embd, + int n_ff_exp, + int n_cold); + +} // namespace dflash::common diff --git a/server/src/qwen35moe/qwen35moe_expert_placement.cpp b/server/src/common/moe_hybrid_placement.cpp similarity index 79% rename from server/src/qwen35moe/qwen35moe_expert_placement.cpp rename to server/src/common/moe_hybrid_placement.cpp index d0066e9ba..49bcd5ea1 100644 --- a/server/src/qwen35moe/qwen35moe_expert_placement.cpp +++ b/server/src/common/moe_hybrid_placement.cpp @@ -1,4 +1,5 @@ -#include "qwen35moe_expert_placement.h" +#include "moe_hybrid_placement.h" +#include "moe_hybrid_routing_stats.h" #include @@ -8,20 +9,23 @@ namespace dflash::common { -bool Qwen35MoeExpertPlacement::matches(const TargetWeights & w) const { - return w.is_moe && - n_layer == w.n_layer && - n_expert == w.n_expert && - n_expert_used == w.n_expert_used && +bool MoeHybridPlacement::matches(int n_layer_, int n_expert_, int n_expert_used_) const { + return n_layer == n_layer_ && + n_expert == n_expert_ && + n_expert_used == n_expert_used_ && (int)hot_counts.size() == n_layer && (int)hot_expert_ids.size() == n_layer; } -bool Qwen35MoeExpertPlacement::empty() const { +bool MoeHybridPlacement::matches(const MoeHybridConfig & cfg) const { + return matches(cfg.n_layer, cfg.n_expert, cfg.n_expert_used); +} + +bool MoeHybridPlacement::empty() const { return hot_counts.empty(); } -bool Qwen35MoeExpertPlacement::is_hot(int layer_idx, int expert_idx) const { +bool MoeHybridPlacement::is_hot(int layer_idx, int expert_idx) const { if (layer_idx < 0 || layer_idx >= n_layer || expert_idx < 0 || expert_idx >= n_expert) { return false; } @@ -29,7 +33,8 @@ bool Qwen35MoeExpertPlacement::is_hot(int layer_idx, int expert_idx) const { return std::find(hot.begin(), hot.end(), expert_idx) != hot.end(); } -bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string * err) const { +bool MoeHybridPlacement::save_json(const std::string & path, const std::string & arch_name, + std::string * err) const { if (n_layer <= 0 || n_expert <= 0 || (int)hot_counts.size() != n_layer || (int)hot_expert_ids.size() != n_layer) { if (err) *err = "placement not initialized"; @@ -37,7 +42,7 @@ bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string * } nlohmann::json j; - j["arch"] = "qwen35moe"; + j["arch"] = arch_name; j["version"] = 1; j["n_layer"] = n_layer; j["n_expert"] = n_expert; @@ -59,9 +64,9 @@ bool Qwen35MoeExpertPlacement::save_json(const std::string & path, std::string * return true; } -bool Qwen35MoeExpertPlacement::load_json(const std::string & path, - Qwen35MoeExpertPlacement & out, - std::string * err) { +bool MoeHybridPlacement::load_json(const std::string & path, + MoeHybridPlacement & out, + std::string * err) { std::ifstream f(path); if (!f) { if (err) *err = "failed to open input file"; @@ -76,12 +81,10 @@ bool Qwen35MoeExpertPlacement::load_json(const std::string & path, return false; } - if (j.value("arch", std::string()) != "qwen35moe") { - if (err) *err = "unexpected arch"; - return false; - } + // Accept both legacy "qwen35moe" and new "moe_hybrid" / any arch string. + // We don't reject based on arch — the caller validates dimensions. - Qwen35MoeExpertPlacement tmp; + MoeHybridPlacement tmp; try { tmp.n_layer = j.value("n_layer", 0); tmp.n_expert = j.value("n_expert", 0); @@ -105,11 +108,11 @@ bool Qwen35MoeExpertPlacement::load_json(const std::string & path, return true; } -bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & stats, - int total_hot_budget, - int min_hot_per_layer, - Qwen35MoeExpertPlacement & out, - std::string * err) { +bool MoeHybridPlacement::build_from_stats(const MoeHybridRoutingStats & stats, + int total_hot_budget, + int min_hot_per_layer, + MoeHybridPlacement & out, + std::string * err) { if (stats.empty() || stats.n_layer <= 0 || stats.n_expert <= 0) { if (err) *err = "stats not initialized"; return false; @@ -127,7 +130,7 @@ bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & st return false; } - Qwen35MoeExpertPlacement tmp; + MoeHybridPlacement tmp; tmp.n_layer = stats.n_layer; tmp.n_expert = stats.n_expert; tmp.n_expert_used = stats.n_expert_used; @@ -172,12 +175,12 @@ bool Qwen35MoeExpertPlacement::build_from_stats(const Qwen35MoeRoutingStats & st return true; } -bool Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes( - const Qwen35MoeRoutingStats & stats, +bool MoeHybridPlacement::build_from_stats_with_layer_bytes( + const MoeHybridRoutingStats & stats, const std::vector & layer_expert_bytes, uint64_t total_hot_budget_bytes, int min_hot_per_layer, - Qwen35MoeExpertPlacement & out, + MoeHybridPlacement & out, std::string * err) { if (stats.empty() || stats.n_layer <= 0 || stats.n_expert <= 0) { if (err) *err = "stats not initialized"; @@ -196,18 +199,22 @@ bool Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes( const int per_layer_floor = std::min(min_hot_per_layer, stats.n_expert); uint64_t floor_bytes = 0; for (int il = 0; il < stats.n_layer; ++il) { - floor_bytes += (uint64_t)per_layer_floor * layer_expert_bytes[(size_t)il]; + if (layer_expert_bytes[(size_t)il] > 0) + floor_bytes += (uint64_t)per_layer_floor * layer_expert_bytes[(size_t)il]; } if (floor_bytes > total_hot_budget_bytes) { if (err) *err = "min_hot_per_layer exceeds byte budget"; return false; } - Qwen35MoeExpertPlacement tmp; + MoeHybridPlacement tmp; tmp.n_layer = stats.n_layer; tmp.n_expert = stats.n_expert; tmp.n_expert_used = stats.n_expert_used; - tmp.hot_counts.assign((size_t)tmp.n_layer, per_layer_floor); + tmp.hot_counts.resize((size_t)tmp.n_layer); + for (int il = 0; il < tmp.n_layer; ++il) { + tmp.hot_counts[(size_t)il] = (layer_expert_bytes[(size_t)il] > 0) ? per_layer_floor : 0; + } std::vector> ranked((size_t)tmp.n_layer); for (int il = 0; il < tmp.n_layer; ++il) { diff --git a/server/src/qwen35moe/qwen35moe_expert_placement.h b/server/src/common/moe_hybrid_placement.h similarity index 58% rename from server/src/qwen35moe/qwen35moe_expert_placement.h rename to server/src/common/moe_hybrid_placement.h index 49d810dfc..023228664 100644 --- a/server/src/qwen35moe/qwen35moe_expert_placement.h +++ b/server/src/common/moe_hybrid_placement.h @@ -1,8 +1,8 @@ -// qwen35moe expert placement config derived from per-layer routing statistics. +// Common MoE expert placement — determines which experts are hot (GPU) vs cold (CPU). #pragma once -#include "qwen35moe_routing_stats.h" +#include "moe_hybrid_types.h" #include #include @@ -10,7 +10,9 @@ namespace dflash::common { -struct Qwen35MoeExpertPlacement { +struct MoeHybridRoutingStats; // forward decl + +struct MoeHybridPlacement { int n_layer = 0; int n_expert = 0; int n_expert_used = 0; @@ -21,27 +23,29 @@ struct Qwen35MoeExpertPlacement { // Ranked hot expert ids kept on GPU per layer. std::vector> hot_expert_ids; - bool matches(const TargetWeights & w) const; + bool matches(int n_layer, int n_expert, int n_expert_used) const; + bool matches(const MoeHybridConfig & cfg) const; bool empty() const; bool is_hot(int layer_idx, int expert_idx) const; - bool save_json(const std::string & path, std::string * err = nullptr) const; + bool save_json(const std::string & path, const std::string & arch_name = "moe_hybrid", + std::string * err = nullptr) const; static bool load_json(const std::string & path, - Qwen35MoeExpertPlacement & out, + MoeHybridPlacement & out, std::string * err = nullptr); - static bool build_from_stats(const Qwen35MoeRoutingStats & stats, + static bool build_from_stats(const MoeHybridRoutingStats & stats, int total_hot_budget, int min_hot_per_layer, - Qwen35MoeExpertPlacement & out, + MoeHybridPlacement & out, std::string * err = nullptr); static bool build_from_stats_with_layer_bytes( - const Qwen35MoeRoutingStats & stats, + const MoeHybridRoutingStats & stats, const std::vector & layer_expert_bytes, uint64_t total_hot_budget_bytes, int min_hot_per_layer, - Qwen35MoeExpertPlacement & out, + MoeHybridPlacement & out, std::string * err = nullptr); }; diff --git a/server/src/qwen35moe/qwen35moe_routing_stats.cpp b/server/src/common/moe_hybrid_routing_stats.cpp similarity index 67% rename from server/src/qwen35moe/qwen35moe_routing_stats.cpp rename to server/src/common/moe_hybrid_routing_stats.cpp index 2e5e7ad17..7d11c5c1a 100644 --- a/server/src/qwen35moe/qwen35moe_routing_stats.cpp +++ b/server/src/common/moe_hybrid_routing_stats.cpp @@ -1,4 +1,4 @@ -#include "qwen35moe_routing_stats.h" +#include "moe_hybrid_routing_stats.h" #include #include @@ -8,43 +8,50 @@ namespace dflash::common { -size_t Qwen35MoeRoutingStats::index_of(int layer_idx, int expert_idx) const { +size_t MoeHybridRoutingStats::index_of(int layer_idx, int expert_idx) const { return (size_t)layer_idx * (size_t)n_expert + (size_t)expert_idx; } -bool Qwen35MoeRoutingStats::init_from_weights(const TargetWeights & w) { - if (!w.is_moe || w.n_layer <= 0 || w.n_expert <= 0 || w.n_expert_used <= 0) { +bool MoeHybridRoutingStats::init(int n_layer_, int n_expert_, int n_expert_used_) { + if (n_layer_ <= 0 || n_expert_ <= 0 || n_expert_used_ <= 0) { return false; } - n_layer = w.n_layer; - n_expert = w.n_expert; - n_expert_used = w.n_expert_used; + n_layer = n_layer_; + n_expert = n_expert_; + n_expert_used = n_expert_used_; counts.assign((size_t)n_layer * (size_t)n_expert, 0); layer_totals.assign((size_t)n_layer, 0); return true; } -bool Qwen35MoeRoutingStats::matches(const TargetWeights & w) const { - return w.is_moe && - n_layer == w.n_layer && - n_expert == w.n_expert && - n_expert_used == w.n_expert_used && +bool MoeHybridRoutingStats::init(const MoeHybridConfig & cfg) { + return init(cfg.n_layer, cfg.n_expert, cfg.n_expert_used); +} + +bool MoeHybridRoutingStats::matches(int n_layer_, int n_expert_, int n_expert_used_) const { + return n_layer == n_layer_ && + n_expert == n_expert_ && + n_expert_used == n_expert_used_ && counts.size() == (size_t)n_layer * (size_t)n_expert && layer_totals.size() == (size_t)n_layer; } -bool Qwen35MoeRoutingStats::empty() const { +bool MoeHybridRoutingStats::matches(const MoeHybridConfig & cfg) const { + return matches(cfg.n_layer, cfg.n_expert, cfg.n_expert_used); +} + +bool MoeHybridRoutingStats::empty() const { return counts.empty(); } -uint64_t Qwen35MoeRoutingStats::count(int layer_idx, int expert_idx) const { +uint64_t MoeHybridRoutingStats::count(int layer_idx, int expert_idx) const { if (layer_idx < 0 || layer_idx >= n_layer || expert_idx < 0 || expert_idx >= n_expert) { return 0; } return counts[index_of(layer_idx, expert_idx)]; } -bool Qwen35MoeRoutingStats::observe(int layer_idx, const int32_t * expert_ids, int n_ids) { +bool MoeHybridRoutingStats::observe(int layer_idx, const int32_t * expert_ids, int n_ids) { if (!expert_ids || layer_idx < 0 || layer_idx >= n_layer || n_ids < 0) { return false; } @@ -57,12 +64,12 @@ bool Qwen35MoeRoutingStats::observe(int layer_idx, const int32_t * expert_ids, i for (int i = 0; i < n_ids; ++i) { const int expert_idx = expert_ids[i]; counts[index_of(layer_idx, expert_idx)]++; - layer_totals[(size_t) layer_idx]++; + layer_totals[(size_t)layer_idx]++; } return true; } -bool Qwen35MoeRoutingStats::observe_selected_tensor(ggml_backend_t backend, +bool MoeHybridRoutingStats::observe_selected_tensor(ggml_backend_t backend, int layer_idx, ggml_tensor * selected, std::string * err) { @@ -88,7 +95,7 @@ bool Qwen35MoeRoutingStats::observe_selected_tensor(ggml_backend_t backend, return true; } -std::vector Qwen35MoeRoutingStats::ranked_experts(int layer_idx) const { +std::vector MoeHybridRoutingStats::ranked_experts(int layer_idx) const { if (layer_idx < 0 || layer_idx >= n_layer) return {}; std::vector ranked((size_t)n_expert); std::iota(ranked.begin(), ranked.end(), 0); @@ -102,17 +109,16 @@ std::vector Qwen35MoeRoutingStats::ranked_experts(int layer_idx) const { return ranked; } -std::vector Qwen35MoeRoutingStats::hot_experts(int layer_idx, int hot_count) const { +std::vector MoeHybridRoutingStats::hot_experts(int layer_idx, int hot_count) const { std::vector ranked = ranked_experts(layer_idx); if (hot_count < 0) hot_count = 0; - if ((size_t) hot_count < ranked.size()) { - ranked.resize((size_t) hot_count); + if ((size_t)hot_count < ranked.size()) { + ranked.resize((size_t)hot_count); } return ranked; } - -bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err) const { +bool MoeHybridRoutingStats::save_csv(const std::string & path, std::string * err) const { if (n_layer <= 0 || n_expert <= 0 || counts.size() != (size_t)n_layer * (size_t)n_expert) { if (err) *err = "routing stats not initialized"; return false; @@ -124,7 +130,6 @@ bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err return false; } - // Header comments f << "# hotness table: n_layer=" << n_layer << " n_expert=" << n_expert << " n_expert_used=" << n_expert_used << "\n"; @@ -145,8 +150,8 @@ bool Qwen35MoeRoutingStats::save_csv(const std::string & path, std::string * err return true; } -bool Qwen35MoeRoutingStats::load_csv(const std::string & path, - Qwen35MoeRoutingStats & out, +bool MoeHybridRoutingStats::load_csv(const std::string & path, + MoeHybridRoutingStats & out, std::string * err) { std::ifstream f(path); if (!f) { @@ -154,32 +159,28 @@ bool Qwen35MoeRoutingStats::load_csv(const std::string & path, return false; } - int n_layer = 0, n_expert = 0, n_expert_used = 0; + int file_n_layer = 0, file_n_expert = 0, file_n_expert_used = 0; std::vector all_counts; std::string line; while (std::getline(f, line)) { - // Skip comments and empty lines if (line.empty() || line[0] == '#') { - // Try to parse header metadata from comment if (line.find("n_layer=") != std::string::npos) { std::sscanf(line.c_str(), "# hotness table: n_layer=%d n_expert=%d n_expert_used=%d", - &n_layer, &n_expert, &n_expert_used); + &file_n_layer, &file_n_expert, &file_n_expert_used); } continue; } - // Parse CSV row: comma-separated uint64 values std::vector row; const char * p = line.c_str(); while (*p) { - // Skip whitespace while (*p == ' ' || *p == '\t') ++p; if (!*p) break; char * end = nullptr; uint64_t val = std::strtoull(p, &end, 10); if (end == p) { - if (err) *err = "malformed value in row " + std::to_string((int)(all_counts.size() / std::max((size_t)n_expert, (size_t)1))); + if (err) *err = "malformed value in row " + std::to_string((int)(all_counts.size() / std::max((size_t)file_n_expert, (size_t)1))); return false; } row.push_back(val); @@ -189,40 +190,39 @@ bool Qwen35MoeRoutingStats::load_csv(const std::string & path, if (row.empty()) continue; - // Infer n_expert from first data row - if (n_expert == 0) { - n_expert = (int)row.size(); - } else if ((int)row.size() != n_expert) { - if (err) *err = "inconsistent row width at layer " + std::to_string((int)(all_counts.size() / (size_t)n_expert)); + if (file_n_expert == 0) { + file_n_expert = (int)row.size(); + } else if ((int)row.size() != file_n_expert) { + if (err) *err = "inconsistent row width at layer " + std::to_string((int)(all_counts.size() / (size_t)file_n_expert)); return false; } all_counts.insert(all_counts.end(), row.begin(), row.end()); } - if (n_expert <= 0 || all_counts.empty()) { + if (file_n_expert <= 0 || all_counts.empty()) { if (err) *err = "no data rows found"; return false; } - const int detected_layers = (int)(all_counts.size() / (size_t)n_expert); - if (n_layer == 0) n_layer = detected_layers; - if (n_expert_used == 0) n_expert_used = 8; // default for Qwen3.5-MoE + const int detected_layers = (int)(all_counts.size() / (size_t)file_n_expert); + if (file_n_layer == 0) file_n_layer = detected_layers; + if (file_n_expert_used == 0) file_n_expert_used = 8; // default - if ((int)all_counts.size() != n_layer * n_expert) { - if (err) *err = "row count (" + std::to_string(detected_layers) + ") doesn't match n_layer (" + std::to_string(n_layer) + ")"; + if ((int)all_counts.size() != file_n_layer * file_n_expert) { + if (err) *err = "row count (" + std::to_string(detected_layers) + ") doesn't match n_layer (" + std::to_string(file_n_layer) + ")"; return false; } - Qwen35MoeRoutingStats tmp; - tmp.n_layer = n_layer; - tmp.n_expert = n_expert; - tmp.n_expert_used = n_expert_used; + MoeHybridRoutingStats tmp; + tmp.n_layer = file_n_layer; + tmp.n_expert = file_n_expert; + tmp.n_expert_used = file_n_expert_used; tmp.counts = std::move(all_counts); - tmp.layer_totals.assign((size_t)n_layer, 0); - for (int il = 0; il < n_layer; ++il) { + tmp.layer_totals.assign((size_t)file_n_layer, 0); + for (int il = 0; il < file_n_layer; ++il) { uint64_t total = 0; - for (int ie = 0; ie < n_expert; ++ie) { + for (int ie = 0; ie < file_n_expert; ++ie) { total += tmp.counts[tmp.index_of(il, ie)]; } tmp.layer_totals[(size_t)il] = total; diff --git a/server/src/qwen35moe/qwen35moe_routing_stats.h b/server/src/common/moe_hybrid_routing_stats.h similarity index 71% rename from server/src/qwen35moe/qwen35moe_routing_stats.h rename to server/src/common/moe_hybrid_routing_stats.h index 209ba2a0e..17fa43886 100644 --- a/server/src/qwen35moe/qwen35moe_routing_stats.h +++ b/server/src/common/moe_hybrid_routing_stats.h @@ -1,8 +1,11 @@ -// Reusable qwen35moe routing-statistics scaffold for Phase 2 expert placement. +// Common MoE routing statistics for expert placement decisions. #pragma once -#include "internal.h" +#include "moe_hybrid_types.h" + +#include "ggml.h" +#include "ggml-backend.h" #include #include @@ -10,7 +13,7 @@ namespace dflash::common { -struct Qwen35MoeRoutingStats { +struct MoeHybridRoutingStats { int n_layer = 0; int n_expert = 0; int n_expert_used = 0; @@ -19,8 +22,10 @@ struct Qwen35MoeRoutingStats { std::vector counts; std::vector layer_totals; - bool init_from_weights(const TargetWeights & w); - bool matches(const TargetWeights & w) const; + bool init(int n_layer, int n_expert, int n_expert_used); + bool init(const MoeHybridConfig & cfg); + bool matches(int n_layer, int n_expert, int n_expert_used) const; + bool matches(const MoeHybridConfig & cfg) const; bool empty() const; uint64_t count(int layer_idx, int expert_idx) const; @@ -35,7 +40,7 @@ struct Qwen35MoeRoutingStats { bool save_csv(const std::string & path, std::string * err = nullptr) const; static bool load_csv(const std::string & path, - Qwen35MoeRoutingStats & out, + MoeHybridRoutingStats & out, std::string * err = nullptr); private: diff --git a/server/src/qwen35moe/qwen35moe_hybrid_storage.cpp b/server/src/common/moe_hybrid_storage.cpp similarity index 67% rename from server/src/qwen35moe/qwen35moe_hybrid_storage.cpp rename to server/src/common/moe_hybrid_storage.cpp index fa66c065c..ca080ce43 100644 --- a/server/src/qwen35moe/qwen35moe_hybrid_storage.cpp +++ b/server/src/common/moe_hybrid_storage.cpp @@ -1,4 +1,4 @@ -#include "qwen35moe_hybrid_storage.h" +#include "moe_hybrid_storage.h" #include "ggml-cpu.h" @@ -41,7 +41,6 @@ static bool read_expert_slices(ggml_backend_t backend, return true; } -// Read expert slices from raw memory (e.g. mmap) instead of a GPU tensor. static bool read_expert_slices_from_mem(const uint8_t * tensor_data, size_t tensor_size, const std::vector & expert_ids, @@ -90,7 +89,7 @@ static ggml_tensor * new_like_with_expert_count(ggml_context * ctx, ggml_tensor } // namespace -Qwen35MoeHybridStorage::~Qwen35MoeHybridStorage() { +MoeHybridStorage::~MoeHybridStorage() { for (auto & layer : layers) { layer.hot_graph.free(); layer.cold_graph.free(); @@ -125,67 +124,73 @@ Qwen35MoeHybridStorage::~Qwen35MoeHybridStorage() { } } -bool Qwen35MoeHybridStorage::matches(const TargetWeights & w) const { - return placement.matches(w) && (int)layers.size() == w.n_layer; +bool MoeHybridStorage::matches(const MoeHybridConfig & cfg) const { + return placement.matches(cfg) && (int)layers.size() == cfg.n_layer; } -bool Qwen35MoeHybridStorage::empty() const { +bool MoeHybridStorage::empty() const { return layers.empty(); } -bool build_qwen35moe_hybrid_storage(const TargetWeights & w, - ggml_backend_t backend, - const Qwen35MoeExpertPlacement & placement, - Qwen35MoeHybridStorage & out, - std::string * err) { - if (!placement.matches(w)) { - if (err) *err = "placement does not match model"; +bool build_moe_hybrid_storage(const MoeHybridConfig & cfg, + ggml_backend_t gpu_backend, + const MoeHybridPlacement & placement, + const std::vector & layer_descs, + MoeHybridStorage & out, + std::string * err) { + if (!placement.matches(cfg)) { + if (err) *err = "placement does not match config"; return false; } - if (!w.is_moe) { - if (err) *err = "target is not qwen35moe"; + if ((int)layer_descs.size() != cfg.n_layer) { + if (err) *err = "layer_descs size does not match n_layer"; return false; } - out.placement = placement; - out.layers.resize((size_t)w.n_layer); + out.layers.resize((size_t)cfg.n_layer); out.cpu_backend = ggml_backend_cpu_init(); if (!out.cpu_backend) { if (err) *err = "failed to init cpu backend"; return false; } - ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(w.n_expert_used, 8))); + ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(cfg.n_expert_used, 8))); + + for (int il = 0; il < cfg.n_layer; ++il) { + const MoeLayerDesc & desc = layer_descs[(size_t)il]; + MoeHybridLayerStorage & dst = out.layers[(size_t)il]; + + // Skip dense layers (no experts) + if (!desc.ffn_gate_exps && !desc.ffn_up_exps && !desc.ffn_down_exps && !desc.ffn_gate_up_exps) { + continue; + } - for (int il = 0; il < w.n_layer; ++il) { - const TargetLayer & L = w.layers[(size_t)il]; - Qwen35MoeHybridLayerStorage & dst = out.layers[(size_t)il]; dst.hot_expert_ids = placement.hot_expert_ids[(size_t)il]; - dst.hot_local_by_global.assign((size_t)w.n_expert, -1); - dst.cold_local_by_global.assign((size_t)w.n_expert, -1); + dst.hot_local_by_global.assign((size_t)cfg.n_expert, -1); + dst.cold_local_by_global.assign((size_t)cfg.n_expert, -1); - std::vector is_hot((size_t)w.n_expert, 0); + std::vector is_hot((size_t)cfg.n_expert, 0); for (size_t i = 0; i < dst.hot_expert_ids.size(); ++i) { const int32_t expert = dst.hot_expert_ids[i]; - if (expert < 0 || expert >= w.n_expert) { + if (expert < 0 || expert >= cfg.n_expert) { if (err) *err = "hot expert id out of range"; return false; } dst.hot_local_by_global[(size_t)expert] = (int32_t)i; is_hot[(size_t)expert] = 1; } - for (int expert = 0; expert < w.n_expert; ++expert) { + for (int expert = 0; expert < cfg.n_expert; ++expert) { if (!is_hot[(size_t)expert]) { dst.cold_local_by_global[(size_t)expert] = (int32_t)dst.cold_expert_ids.size(); dst.cold_expert_ids.push_back((int32_t)expert); } } - dst.fused_gate_up = (L.ffn_gate_up_exps != nullptr); - if (!validate_expert_tensor(L.ffn_gate_exps, w.n_expert, &dst.gate_expert_bytes, err) || - !validate_expert_tensor(L.ffn_up_exps, w.n_expert, &dst.up_expert_bytes, err) || - !validate_expert_tensor(L.ffn_down_exps, w.n_expert, &dst.down_expert_bytes, err) || - !validate_expert_tensor(L.ffn_gate_up_exps, w.n_expert, &dst.gate_up_expert_bytes, err)) { + dst.fused_gate_up = desc.has_fused_gate_up(); + if (!validate_expert_tensor(desc.ffn_gate_exps, cfg.n_expert, &dst.gate_expert_bytes, err) || + !validate_expert_tensor(desc.ffn_up_exps, cfg.n_expert, &dst.up_expert_bytes, err) || + !validate_expert_tensor(desc.ffn_down_exps, cfg.n_expert, &dst.down_expert_bytes, err) || + !validate_expert_tensor(desc.ffn_gate_up_exps, cfg.n_expert, &dst.gate_up_expert_bytes, err)) { return false; } @@ -204,47 +209,41 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w, return false; } if (dst.fused_gate_up) { - dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_up_exps, hot_count); - dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count); + dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_up_exps, hot_count); + dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count); } else { - dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_exps, hot_count); - dst.up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_up_exps, hot_count); - dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count); + dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_exps, hot_count); + dst.up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_up_exps, hot_count); + dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count); } - dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, backend); + dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, gpu_backend); if (!dst.hot_buf) { if (err) *err = "failed to allocate hot expert buffer"; return false; } - // Copy hot expert slices from full GPU tensors to hot_buf std::vector hot_bytes; if (dst.fused_gate_up) { - if (!read_expert_slices(backend, L.ffn_gate_up_exps, dst.hot_expert_ids, - dst.gate_up_expert_bytes, hot_bytes, err)) { + if (!read_expert_slices(gpu_backend, desc.ffn_gate_up_exps, dst.hot_expert_ids, + dst.gate_up_expert_bytes, hot_bytes, err)) return false; - } ggml_backend_tensor_set(dst.gate_up_hot, hot_bytes.data(), 0, hot_bytes.size()); - if (!read_expert_slices(backend, L.ffn_down_exps, dst.hot_expert_ids, - dst.down_expert_bytes, hot_bytes, err)) { + if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.hot_expert_ids, + dst.down_expert_bytes, hot_bytes, err)) return false; - } ggml_backend_tensor_set(dst.down_hot, hot_bytes.data(), 0, hot_bytes.size()); } else { - if (!read_expert_slices(backend, L.ffn_gate_exps, dst.hot_expert_ids, - dst.gate_expert_bytes, hot_bytes, err)) { + if (!read_expert_slices(gpu_backend, desc.ffn_gate_exps, dst.hot_expert_ids, + dst.gate_expert_bytes, hot_bytes, err)) return false; - } ggml_backend_tensor_set(dst.gate_hot, hot_bytes.data(), 0, hot_bytes.size()); - if (!read_expert_slices(backend, L.ffn_up_exps, dst.hot_expert_ids, - dst.up_expert_bytes, hot_bytes, err)) { + if (!read_expert_slices(gpu_backend, desc.ffn_up_exps, dst.hot_expert_ids, + dst.up_expert_bytes, hot_bytes, err)) return false; - } ggml_backend_tensor_set(dst.up_hot, hot_bytes.data(), 0, hot_bytes.size()); - if (!read_expert_slices(backend, L.ffn_down_exps, dst.hot_expert_ids, - dst.down_expert_bytes, hot_bytes, err)) { + if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.hot_expert_ids, + dst.down_expert_bytes, hot_bytes, err)) return false; - } ggml_backend_tensor_set(dst.down_hot, hot_bytes.data(), 0, hot_bytes.size()); } } @@ -261,64 +260,42 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w, return false; } if (dst.fused_gate_up) { - dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_up_exps, cold_count); - dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count); + dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_up_exps, cold_count); + dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count); } else { - dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_exps, cold_count); - dst.up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_up_exps, cold_count); - dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count); + dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_exps, cold_count); + dst.up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_up_exps, cold_count); + dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count); } dst.cold_buf = ggml_backend_alloc_ctx_tensors(dst.cold_ctx, out.cpu_backend); if (!dst.cold_buf) { if (err) *err = "failed to allocate cold expert buffer"; return false; } - } - if (dst.fused_gate_up) { - if (!read_expert_slices(backend, L.ffn_gate_up_exps, dst.cold_expert_ids, - dst.gate_up_expert_bytes, dst.gate_up_cold_bytes, err)) { - return false; - } - } else { - if (!read_expert_slices(backend, L.ffn_gate_exps, dst.cold_expert_ids, - dst.gate_expert_bytes, dst.gate_cold_bytes, err) || - !read_expert_slices(backend, L.ffn_up_exps, dst.cold_expert_ids, - dst.up_expert_bytes, dst.up_cold_bytes, err)) { - return false; - } - } - if (!read_expert_slices(backend, L.ffn_down_exps, dst.cold_expert_ids, - dst.down_expert_bytes, dst.down_cold_bytes, err)) { - return false; - } - - if (dst.fused_gate_up) { - if (dst.gate_up_cold && !dst.gate_up_cold_bytes.empty()) { - ggml_backend_tensor_set(dst.gate_up_cold, dst.gate_up_cold_bytes.data(), 0, dst.gate_up_cold_bytes.size()); - dst.gate_up_cold_bytes.clear(); - dst.gate_up_cold_bytes.shrink_to_fit(); - } - if (dst.down_cold && !dst.down_cold_bytes.empty()) { - ggml_backend_tensor_set(dst.down_cold, dst.down_cold_bytes.data(), 0, dst.down_cold_bytes.size()); - dst.down_cold_bytes.clear(); - dst.down_cold_bytes.shrink_to_fit(); - } - } else { - if (dst.gate_cold && !dst.gate_cold_bytes.empty()) { - ggml_backend_tensor_set(dst.gate_cold, dst.gate_cold_bytes.data(), 0, dst.gate_cold_bytes.size()); - dst.gate_cold_bytes.clear(); - dst.gate_cold_bytes.shrink_to_fit(); - } - if (dst.up_cold && !dst.up_cold_bytes.empty()) { - ggml_backend_tensor_set(dst.up_cold, dst.up_cold_bytes.data(), 0, dst.up_cold_bytes.size()); - dst.up_cold_bytes.clear(); - dst.up_cold_bytes.shrink_to_fit(); - } - if (dst.down_cold && !dst.down_cold_bytes.empty()) { - ggml_backend_tensor_set(dst.down_cold, dst.down_cold_bytes.data(), 0, dst.down_cold_bytes.size()); - dst.down_cold_bytes.clear(); - dst.down_cold_bytes.shrink_to_fit(); + std::vector cold_bytes; + if (dst.fused_gate_up) { + if (!read_expert_slices(gpu_backend, desc.ffn_gate_up_exps, dst.cold_expert_ids, + dst.gate_up_expert_bytes, cold_bytes, err)) + return false; + ggml_backend_tensor_set(dst.gate_up_cold, cold_bytes.data(), 0, cold_bytes.size()); + if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.cold_expert_ids, + dst.down_expert_bytes, cold_bytes, err)) + return false; + ggml_backend_tensor_set(dst.down_cold, cold_bytes.data(), 0, cold_bytes.size()); + } else { + if (!read_expert_slices(gpu_backend, desc.ffn_gate_exps, dst.cold_expert_ids, + dst.gate_expert_bytes, cold_bytes, err)) + return false; + ggml_backend_tensor_set(dst.gate_cold, cold_bytes.data(), 0, cold_bytes.size()); + if (!read_expert_slices(gpu_backend, desc.ffn_up_exps, dst.cold_expert_ids, + dst.up_expert_bytes, cold_bytes, err)) + return false; + ggml_backend_tensor_set(dst.up_cold, cold_bytes.data(), 0, cold_bytes.size()); + if (!read_expert_slices(gpu_backend, desc.ffn_down_exps, dst.cold_expert_ids, + dst.down_expert_bytes, cold_bytes, err)) + return false; + ggml_backend_tensor_set(dst.down_cold, cold_bytes.data(), 0, cold_bytes.size()); } } } @@ -326,67 +303,69 @@ bool build_qwen35moe_hybrid_storage(const TargetWeights & w, return true; } -bool build_qwen35moe_hybrid_storage_from_file( - const TargetWeights & w, +bool build_moe_hybrid_storage_from_file( + const MoeHybridConfig & cfg, ggml_backend_t gpu_backend, - const Qwen35MoeExpertPlacement & placement, + const MoeHybridPlacement & placement, + const std::vector & layer_descs, const std::vector & file_data, - Qwen35MoeHybridStorage & out, + MoeHybridStorage & out, std::string * err) { - if (!placement.matches(w)) { - if (err) *err = "placement does not match model"; - return false; - } - if (!w.is_moe) { - if (err) *err = "target is not qwen35moe"; + if (!placement.matches(cfg)) { + if (err) *err = "placement does not match config"; return false; } - if ((int)file_data.size() != w.n_layer) { - if (err) *err = "file_data size does not match n_layer"; + if ((int)layer_descs.size() != cfg.n_layer || (int)file_data.size() != cfg.n_layer) { + if (err) *err = "layer_descs/file_data size does not match n_layer"; return false; } - out.placement = placement; - out.layers.resize((size_t)w.n_layer); + out.layers.resize((size_t)cfg.n_layer); out.cpu_backend = ggml_backend_cpu_init(); if (!out.cpu_backend) { if (err) *err = "failed to init cpu backend"; return false; } - ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(w.n_expert_used, 8))); + ggml_backend_cpu_set_n_threads(out.cpu_backend, std::max(1, std::min(cfg.n_expert_used, 8))); - for (int il = 0; il < w.n_layer; ++il) { - const TargetLayer & L = w.layers[(size_t)il]; + for (int il = 0; il < cfg.n_layer; ++il) { + const MoeLayerDesc & desc = layer_descs[(size_t)il]; const LayerExpertFileData & fd = file_data[(size_t)il]; - Qwen35MoeHybridLayerStorage & dst = out.layers[(size_t)il]; + MoeHybridLayerStorage & dst = out.layers[(size_t)il]; + + // Skip dense layers (no experts) + if (!desc.ffn_gate_exps && !desc.ffn_up_exps && !desc.ffn_down_exps && !desc.ffn_gate_up_exps) { + continue; + } + dst.hot_expert_ids = placement.hot_expert_ids[(size_t)il]; - dst.hot_local_by_global.assign((size_t)w.n_expert, -1); - dst.cold_local_by_global.assign((size_t)w.n_expert, -1); + dst.hot_local_by_global.assign((size_t)cfg.n_expert, -1); + dst.cold_local_by_global.assign((size_t)cfg.n_expert, -1); - std::vector is_hot((size_t)w.n_expert, 0); + std::vector is_hot((size_t)cfg.n_expert, 0); for (size_t i = 0; i < dst.hot_expert_ids.size(); ++i) { const int32_t expert = dst.hot_expert_ids[i]; - if (expert < 0 || expert >= w.n_expert) { + if (expert < 0 || expert >= cfg.n_expert) { if (err) *err = "hot expert id out of range"; return false; } dst.hot_local_by_global[(size_t)expert] = (int32_t)i; is_hot[(size_t)expert] = 1; } - for (int expert = 0; expert < w.n_expert; ++expert) { + for (int expert = 0; expert < cfg.n_expert; ++expert) { if (!is_hot[(size_t)expert]) { dst.cold_local_by_global[(size_t)expert] = (int32_t)dst.cold_expert_ids.size(); dst.cold_expert_ids.push_back((int32_t)expert); } } - dst.fused_gate_up = (L.ffn_gate_up_exps != nullptr); - if (!validate_expert_tensor(L.ffn_gate_exps, w.n_expert, &dst.gate_expert_bytes, err) || - !validate_expert_tensor(L.ffn_up_exps, w.n_expert, &dst.up_expert_bytes, err) || - !validate_expert_tensor(L.ffn_down_exps, w.n_expert, &dst.down_expert_bytes, err) || - !validate_expert_tensor(L.ffn_gate_up_exps, w.n_expert, &dst.gate_up_expert_bytes, err)) { + dst.fused_gate_up = desc.has_fused_gate_up(); + if (!validate_expert_tensor(desc.ffn_gate_exps, cfg.n_expert, &dst.gate_expert_bytes, err) || + !validate_expert_tensor(desc.ffn_up_exps, cfg.n_expert, &dst.up_expert_bytes, err) || + !validate_expert_tensor(desc.ffn_down_exps, cfg.n_expert, &dst.down_expert_bytes, err) || + !validate_expert_tensor(desc.ffn_gate_up_exps, cfg.n_expert, &dst.gate_up_expert_bytes, err)) { return false; } @@ -405,20 +384,22 @@ bool build_qwen35moe_hybrid_storage_from_file( return false; } if (dst.fused_gate_up) { - dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_up_exps, hot_count); - dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count); + dst.gate_up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_up_exps, hot_count); + dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count); } else { - dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_gate_exps, hot_count); - dst.up_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_up_exps, hot_count); - dst.down_hot = new_like_with_expert_count(dst.hot_ctx, L.ffn_down_exps, hot_count); + dst.gate_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_gate_exps, hot_count); + dst.up_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_up_exps, hot_count); + dst.down_hot = new_like_with_expert_count(dst.hot_ctx, desc.ffn_down_exps, hot_count); } dst.hot_buf = ggml_backend_alloc_ctx_tensors(dst.hot_ctx, gpu_backend); if (!dst.hot_buf) { - if (err) *err = "failed to allocate hot expert GPU buffer"; + char msg[128]; + std::snprintf(msg, sizeof(msg), + "failed to allocate hot expert GPU buffer (layer %d, %d hot experts)", il, hot_count); + if (err) *err = msg; return false; } - // Load hot expert slices from file std::vector slice_buf; if (dst.fused_gate_up) { if (!read_expert_slices_from_mem(fd.gate_up_exps.data, fd.gate_up_exps.size, @@ -457,12 +438,12 @@ bool build_qwen35moe_hybrid_storage_from_file( return false; } if (dst.fused_gate_up) { - dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_up_exps, cold_count); - dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count); + dst.gate_up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_up_exps, cold_count); + dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count); } else { - dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_gate_exps, cold_count); - dst.up_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_up_exps, cold_count); - dst.down_cold = new_like_with_expert_count(dst.cold_ctx, L.ffn_down_exps, cold_count); + dst.gate_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_gate_exps, cold_count); + dst.up_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_up_exps, cold_count); + dst.down_cold = new_like_with_expert_count(dst.cold_ctx, desc.ffn_down_exps, cold_count); } dst.cold_buf = ggml_backend_alloc_ctx_tensors(dst.cold_ctx, out.cpu_backend); if (!dst.cold_buf) { @@ -470,7 +451,6 @@ bool build_qwen35moe_hybrid_storage_from_file( return false; } - // Load cold expert slices from file directly to CPU tensors std::vector slice_buf; if (dst.fused_gate_up) { if (!read_expert_slices_from_mem(fd.gate_up_exps.data, fd.gate_up_exps.size, diff --git a/server/src/qwen35moe/qwen35moe_hybrid_storage.h b/server/src/common/moe_hybrid_storage.h similarity index 58% rename from server/src/qwen35moe/qwen35moe_hybrid_storage.h rename to server/src/common/moe_hybrid_storage.h index c883da78e..465827ec0 100644 --- a/server/src/qwen35moe/qwen35moe_hybrid_storage.h +++ b/server/src/common/moe_hybrid_storage.h @@ -1,8 +1,9 @@ -// Phase 3 hybrid expert storage for qwen35moe. +// Common MoE hybrid expert storage — manages hot (GPU) and cold (CPU) expert buffers. #pragma once -#include "qwen35moe_expert_placement.h" +#include "moe_hybrid_types.h" +#include "moe_hybrid_placement.h" #include "ggml-alloc.h" @@ -29,7 +30,7 @@ struct CachedFfnGraph { void free(); }; -struct Qwen35MoeHybridLayerStorage { +struct MoeHybridLayerStorage { ggml_context * hot_ctx = nullptr; ggml_backend_buffer_t hot_buf = nullptr; ggml_tensor * gate_hot = nullptr; @@ -60,39 +61,31 @@ struct Qwen35MoeHybridLayerStorage { std::vector down_cold_bytes; std::vector gate_up_cold_bytes; - // Cached FFN graphs: hot_graph for all-hot case (n_expert_used hot experts), - // cold_graph for all-cold case (n_expert_used cold experts). - // These cover the common case; mixed hot/cold falls back to dynamic build. - CachedFfnGraph hot_graph; // GPU: fused routed(n_expert_used hot) + shared - CachedFfnGraph cold_graph; // CPU: routed(n_expert_used cold) + // Cached FFN graphs for common-case expert counts. + CachedFfnGraph hot_graph; + CachedFfnGraph cold_graph; }; -struct Qwen35MoeHybridStorage { - Qwen35MoeHybridStorage() = default; - Qwen35MoeHybridStorage(const Qwen35MoeHybridStorage &) = delete; - Qwen35MoeHybridStorage & operator=(const Qwen35MoeHybridStorage &) = delete; - Qwen35MoeHybridStorage(Qwen35MoeHybridStorage &&) = delete; - Qwen35MoeHybridStorage & operator=(Qwen35MoeHybridStorage &&) = delete; - ~Qwen35MoeHybridStorage(); +struct MoeHybridStorage { + MoeHybridStorage() = default; + MoeHybridStorage(const MoeHybridStorage &) = delete; + MoeHybridStorage & operator=(const MoeHybridStorage &) = delete; + MoeHybridStorage(MoeHybridStorage &&) = delete; + MoeHybridStorage & operator=(MoeHybridStorage &&) = delete; + ~MoeHybridStorage(); ggml_backend_t cpu_backend = nullptr; - Qwen35MoeExpertPlacement placement; - std::vector layers; + MoeHybridPlacement placement; + std::vector layers; - bool matches(const TargetWeights & w) const; + bool matches(const MoeHybridConfig & cfg) const; bool empty() const; }; -bool build_qwen35moe_hybrid_storage(const TargetWeights & w, - ggml_backend_t backend, - const Qwen35MoeExpertPlacement & placement, - Qwen35MoeHybridStorage & out, - std::string * err = nullptr); - // Expert tensor file data for split loading (one entry per expert tensor). struct ExpertTensorFileData { - const uint8_t * data = nullptr; // pointer into mmap - size_t size = 0; // total tensor size in bytes + const uint8_t * data = nullptr; + size_t size = 0; }; // Per-layer expert tensor file data for split loading. @@ -103,15 +96,23 @@ struct LayerExpertFileData { ExpertTensorFileData gate_up_exps; // optional fused }; +// Build hybrid storage from GPU-resident expert tensors. +// layer_descs: one MoeLayerDesc per MoE layer (caller constructs from model-specific types). +bool build_moe_hybrid_storage(const MoeHybridConfig & cfg, + ggml_backend_t gpu_backend, + const MoeHybridPlacement & placement, + const std::vector & layer_descs, + MoeHybridStorage & out, + std::string * err = nullptr); + // Build hybrid storage by loading expert data directly from file (mmap). -// Expert tensors in w are only used for metadata (ne/nb/type); their buffer -// may be null. Expert data is read from file_data entries. -bool build_qwen35moe_hybrid_storage_from_file( - const TargetWeights & w, +bool build_moe_hybrid_storage_from_file( + const MoeHybridConfig & cfg, ggml_backend_t gpu_backend, - const Qwen35MoeExpertPlacement & placement, + const MoeHybridPlacement & placement, + const std::vector & layer_descs, const std::vector & file_data, - Qwen35MoeHybridStorage & out, + MoeHybridStorage & out, std::string * err = nullptr); } // namespace dflash::common diff --git a/server/src/qwen35moe/qwen35moe_swap_manager.cpp b/server/src/common/moe_hybrid_swap_manager.cpp similarity index 86% rename from server/src/qwen35moe/qwen35moe_swap_manager.cpp rename to server/src/common/moe_hybrid_swap_manager.cpp index 3c7006d60..399d9c612 100644 --- a/server/src/qwen35moe/qwen35moe_swap_manager.cpp +++ b/server/src/common/moe_hybrid_swap_manager.cpp @@ -1,14 +1,15 @@ -#include "qwen35moe_swap_manager.h" +#include "moe_hybrid_swap_manager.h" +#include "moe_hybrid_routing_stats.h" #include namespace dflash::common { -bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current, - const Qwen35MoeRoutingStats & stats, - const Qwen35MoeSwapPolicy & policy, - Qwen35MoeSwapPlan & out, - std::string * err) { +bool build_moe_hybrid_swap_plan(const MoeHybridPlacement & current, + const MoeHybridRoutingStats & stats, + const MoeHybridSwapPolicy & policy, + MoeHybridSwapPlan & out, + std::string * err) { if (current.n_layer != stats.n_layer || current.n_expert != stats.n_expert || current.n_expert_used != stats.n_expert_used) { @@ -16,14 +17,14 @@ bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current, return false; } - out = Qwen35MoeSwapPlan{}; + out = MoeHybridSwapPlan{}; out.next_placement = current; if (policy.max_swaps_total <= 0) { return true; } struct Candidate { - Qwen35MoeSwapAction action; + MoeHybridSwapAction action; uint64_t gain_delta = 0; }; std::vector candidates; diff --git a/server/src/common/moe_hybrid_swap_manager.h b/server/src/common/moe_hybrid_swap_manager.h new file mode 100644 index 000000000..c24a71740 --- /dev/null +++ b/server/src/common/moe_hybrid_swap_manager.h @@ -0,0 +1,39 @@ +// Common MoE hybrid swap manager — promotes/demotes experts at request boundaries. + +#pragma once + +#include "moe_hybrid_placement.h" + +#include +#include +#include + +namespace dflash::common { + +struct MoeHybridRoutingStats; + +struct MoeHybridSwapAction { + int layer_idx = -1; + int evict_expert = -1; + int promote_expert = -1; + uint64_t evict_count = 0; + uint64_t promote_count = 0; +}; + +struct MoeHybridSwapPlan { + MoeHybridPlacement next_placement; + std::vector actions; +}; + +struct MoeHybridSwapPolicy { + int max_swaps_total = 0; // 0 = no swaps + uint64_t min_promote_gain = 1; // promoted expert count must exceed evicted by this amount +}; + +bool build_moe_hybrid_swap_plan(const MoeHybridPlacement & current, + const MoeHybridRoutingStats & stats, + const MoeHybridSwapPolicy & policy, + MoeHybridSwapPlan & out, + std::string * err = nullptr); + +} // namespace dflash::common diff --git a/server/src/common/moe_hybrid_types.h b/server/src/common/moe_hybrid_types.h new file mode 100644 index 000000000..c3a15e6bd --- /dev/null +++ b/server/src/common/moe_hybrid_types.h @@ -0,0 +1,63 @@ +// Common MoE hybrid mode types and descriptors. +// +// Model-agnostic abstractions used by both qwen35moe and laguna backends +// to implement the hybrid expert offload strategy (hot experts on GPU, +// cold experts on CPU, concurrent evaluation). + +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#include + +namespace dflash::common { + +// ─── MoE architecture config (model-agnostic) ────────────────────────── + +struct MoeHybridConfig { + int n_embd = 0; // hidden dimension + int n_expert = 0; // total experts per layer + int n_expert_used = 0; // top-k selected per token + int n_ff_exp = 0; // routed expert intermediate dimension + int n_ff_shexp = 0; // shared expert intermediate dimension (0 = no shared) + int n_layer = 0; // number of MoE layers + int first_moe_layer = 0; // index of first MoE layer (e.g., 0 for qwen35moe, 1 for laguna) +}; + +// ─── Per-layer expert tensor descriptor ───────────────────────────────── +// +// Provides a uniform view over model-specific layer structures. All pointers +// refer to the FULL expert tensor stacks on GPU (used for placement validation +// and metadata queries). In hybrid mode, the actual hot/cold split tensors +// live in MoeHybridLayerStorage. + +struct MoeLayerDesc { + // Routed expert weight tensors (stacked: [dim_in, dim_out, n_expert]) + ggml_tensor * ffn_gate_exps = nullptr; + ggml_tensor * ffn_up_exps = nullptr; + ggml_tensor * ffn_down_exps = nullptr; + ggml_tensor * ffn_gate_up_exps = nullptr; // optional fused gate+up + + // Shared expert tensors (nullptr if no shared expert) + ggml_tensor * ffn_gate_shexp = nullptr; + ggml_tensor * ffn_up_shexp = nullptr; + ggml_tensor * ffn_down_shexp = nullptr; + ggml_tensor * ffn_gate_inp_shexp = nullptr; // optional shared-expert gating + + // Per-tensor quantization scale factors (1.0f = no scaling) + float ffn_gate_exps_s = 1.0f; + float ffn_up_exps_s = 1.0f; + float ffn_down_exps_s = 1.0f; + float ffn_gate_up_exps_s = 1.0f; + float ffn_gate_shexp_s = 1.0f; + float ffn_up_shexp_s = 1.0f; + float ffn_down_shexp_s = 1.0f; + float ffn_gate_inp_shexp_s = 1.0f; + + bool has_fused_gate_up() const { return ffn_gate_up_exps != nullptr; } + bool has_shared_expert() const { return ffn_up_shexp != nullptr; } +}; + +} // namespace dflash::common + diff --git a/server/src/common/moe_hybrid_types_impl.h b/server/src/common/moe_hybrid_types_impl.h new file mode 100644 index 000000000..ddb4355bc --- /dev/null +++ b/server/src/common/moe_hybrid_types_impl.h @@ -0,0 +1,91 @@ +// Inline implementations for MoeHybridConfig/MoeLayerDesc conversion helpers. +// +// Include this header AFTER both moe_hybrid_types.h and the relevant +// model-specific weight struct header (internal.h or laguna_internal.h). +// The preprocessor guards detect which weight structs are available and +// only define the corresponding conversion helpers. + +#pragma once + +namespace dflash::common { + +// ─── qwen35 conversions ───────────────────────────────────────────────── + +#if defined(DFLASH_INTERNAL_H_INCLUDED) + +inline MoeHybridConfig make_moe_hybrid_config(const TargetWeights & w) { + MoeHybridConfig cfg; + cfg.n_embd = w.n_embd; + cfg.n_expert = w.n_expert; + cfg.n_expert_used = w.n_expert_used; + cfg.n_ff_exp = w.n_ff_exp; + cfg.n_ff_shexp = w.n_ff_shexp; + cfg.n_layer = w.n_layer; + cfg.first_moe_layer = 0; // all layers are MoE in qwen35moe + return cfg; +} + +inline MoeLayerDesc make_moe_layer_desc(const TargetLayer & L) { + MoeLayerDesc desc; + desc.ffn_gate_exps = L.ffn_gate_exps; + desc.ffn_up_exps = L.ffn_up_exps; + desc.ffn_down_exps = L.ffn_down_exps; + desc.ffn_gate_up_exps = L.ffn_gate_up_exps; + desc.ffn_gate_shexp = L.ffn_gate_shexp; + desc.ffn_up_shexp = L.ffn_up_shexp; + desc.ffn_down_shexp = L.ffn_down_shexp; + desc.ffn_gate_inp_shexp = L.ffn_gate_inp_shexp; + desc.ffn_gate_exps_s = L.ffn_gate_exps_s; + desc.ffn_up_exps_s = L.ffn_up_exps_s; + desc.ffn_down_exps_s = L.ffn_down_exps_s; + desc.ffn_gate_up_exps_s = L.ffn_gate_up_exps_s; + desc.ffn_gate_shexp_s = L.ffn_gate_shexp_s; + desc.ffn_up_shexp_s = L.ffn_up_shexp_s; + desc.ffn_down_shexp_s = L.ffn_down_shexp_s; + desc.ffn_gate_inp_shexp_s = L.ffn_gate_inp_shexp_s; + return desc; +} + +#endif // DFLASH_INTERNAL_H_INCLUDED + +// ─── Laguna conversions ───────────────────────────────────────────────── + +#if defined(DFLASH_LAGUNA_INTERNAL_H_INCLUDED) + +inline MoeHybridConfig make_moe_hybrid_config(const LagunaTargetWeights & w) { + MoeHybridConfig cfg; + cfg.n_embd = w.n_embd; + cfg.n_expert = w.n_expert; + cfg.n_expert_used = w.n_expert_used; + cfg.n_ff_exp = w.n_ff_exp; + cfg.n_ff_shexp = w.n_ff_shexp; + cfg.n_layer = w.n_layer; + cfg.first_moe_layer = w.n_layer_dense_lead; // layer 0 is dense in laguna + return cfg; +} + +inline MoeLayerDesc make_moe_layer_desc(const LagunaTargetLayer & L) { + MoeLayerDesc desc; + desc.ffn_gate_exps = L.ffn_gate_exps; + desc.ffn_up_exps = L.ffn_up_exps; + desc.ffn_down_exps = L.ffn_down_exps; + desc.ffn_gate_up_exps = nullptr; // laguna has no fused gate_up + desc.ffn_gate_shexp = L.ffn_gate_shexp; + desc.ffn_up_shexp = L.ffn_up_shexp; + desc.ffn_down_shexp = L.ffn_down_shexp; + desc.ffn_gate_inp_shexp = nullptr; // laguna has no shared-expert gate + // Laguna does not use per-tensor quantization scales + desc.ffn_gate_exps_s = 1.0f; + desc.ffn_up_exps_s = 1.0f; + desc.ffn_down_exps_s = 1.0f; + desc.ffn_gate_up_exps_s = 1.0f; + desc.ffn_gate_shexp_s = 1.0f; + desc.ffn_up_shexp_s = 1.0f; + desc.ffn_down_shexp_s = 1.0f; + desc.ffn_gate_inp_shexp_s = 1.0f; + return desc; +} + +#endif // DFLASH_LAGUNA_INTERNAL_H_INCLUDED + +} // namespace dflash::common diff --git a/server/src/internal.h b/server/src/internal.h index f9a890ff2..9b5b45ca7 100644 --- a/server/src/internal.h +++ b/server/src/internal.h @@ -2,6 +2,7 @@ // Not installed, not exposed in the public API. #pragma once +#define DFLASH_INTERNAL_H_INCLUDED #include #include @@ -27,7 +28,7 @@ namespace dflash::common { -struct Qwen35MoeHybridStorage; +struct MoeHybridStorage; // Single source of truth for error reporting. // All loaders / graph builders push into this via set_last_error(...). @@ -155,7 +156,7 @@ struct TargetWeights { std::vector layers; // size = 64 ggml_tensor * out_norm = nullptr; // [hidden] ggml_tensor * output = nullptr; // [hidden, vocab] (lm_head) - std::shared_ptr moe_hybrid; // optional Phase 3 hybrid storage + std::shared_ptr moe_hybrid; // optional hybrid storage (hot/cold expert split) // Metadata from GGUF (validated at load time) int full_attention_interval = 4; diff --git a/server/src/laguna/laguna_backend.cpp b/server/src/laguna/laguna_backend.cpp index 87723f596..ff82df7f7 100644 --- a/server/src/laguna/laguna_backend.cpp +++ b/server/src/laguna/laguna_backend.cpp @@ -10,7 +10,17 @@ #include "laguna_internal.h" #include "dflash27b.h" +#include "../common/moe_hybrid_types.h" +#include "../common/moe_hybrid_types_impl.h" +#include "../common/moe_hybrid_placement.h" +#include "../common/moe_hybrid_ffn_eval.h" +#include "../common/moe_hybrid_storage.h" +#include "../common/moe_hybrid_routing_stats.h" +#include "../common/moe_hybrid_swap_manager.h" +#include "common/step_graph.h" + #include "ggml-cuda.h" +#include "ggml-alloc.h" #include "common/snapshot_backend.h" #include @@ -20,6 +30,10 @@ #include #include #include +#include +#include +#include +#include namespace dflash::common { @@ -44,8 +58,9 @@ bool LagunaBackend::init() { return false; } - if (!load_target_gguf_laguna(args_.target_path, backend_, w_)) { - std::fprintf(stderr, "load failed: %s\n", dflash27b_last_error()); + // Always use dynamic placement (like qwen35moe): partial load first, + // compute budget, then reload full if all experts fit. + if (!init_hybrid_mode()) { ggml_backend_free(backend_); backend_ = nullptr; return false; } @@ -146,6 +161,12 @@ int LagunaBackend::snapshot_cur_pos(int slot) const { GenerateResult LagunaBackend::generate(const GenerateRequest & req, const DaemonIO & io) { + if (hybrid_mode_ && moe_hybrid_) { + auto result = generate_hybrid(req, io); + if (result.ok) maybe_post_request_swap(); + return result; + } + const bool no_mask = (std::getenv("DFLASH_NO_MASK") != nullptr); GenerateResult result; DaemonIO out_io = io.with_token_callback(req.on_token); @@ -492,6 +513,983 @@ void LagunaBackend::free_drafter() { } } +// ── Hybrid MoE mode ───────────────────────────────────────────────────── +// +// Layer-by-layer decode: for each token, iterate through all 40 layers. +// Layer 0 (dense SwiGLU) runs as a monolithic GPU sub-graph. +// Layers 1..39 (sparse MoE) run attention+router on GPU, read back expert +// selections, then call the common hybrid FFN eval (hot on GPU, cold on CPU). + +using HybridClock = std::chrono::steady_clock; +static inline uint64_t elapsed_us(HybridClock::time_point t0, HybridClock::time_point t1) { + return (uint64_t)std::chrono::duration_cast(t1 - t0).count(); +} + +bool LagunaBackend::init_hybrid_mode() { + const char * hotness_path = std::getenv("DFLASH_LAGUNA_HOTNESS"); + + // Step 1: Load model WITHOUT expert data to GPU (partial load) + TargetLoadPlan _hybrid_plan; + _hybrid_plan.skip_expert_tensors = true; + if (!load_target_gguf_laguna_partial(args_.target_path, backend_, _hybrid_plan, w_)) { + std::fprintf(stderr, "[laguna-hybrid] partial load failed: %s\n", dflash27b_last_error()); + return false; + } + + // Step 2: Load/build routing stats + MoeHybridRoutingStats hotness; + std::string err; + std::string placement_source; + if (hotness_path && hotness_path[0]) { + if (!MoeHybridRoutingStats::load_csv(std::string(hotness_path), hotness, &err)) { + std::fprintf(stderr, "[laguna-hybrid] hotness load failed: %s\n", err.c_str()); + return false; + } + if (hotness.n_layer != w_.n_layer || hotness.n_expert != w_.n_expert) { + std::fprintf(stderr, "[laguna-hybrid] hotness dimensions mismatch (got %d×%d, want %d×%d)\n", + hotness.n_layer, hotness.n_expert, w_.n_layer, w_.n_expert); + return false; + } + placement_source = "file"; + } else { + // Uniform hotness (budget-only mode, no hotness file) + hotness.n_layer = w_.n_layer; + hotness.n_expert = w_.n_expert; + hotness.n_expert_used = w_.n_expert_used; + hotness.counts.assign((size_t)w_.n_layer * (size_t)w_.n_expert, 1); + hotness.layer_totals.assign((size_t)w_.n_layer, (uint64_t)w_.n_expert); + placement_source = "uniform"; + } + + // Step 3: Query GPU memory and compute expert budget + size_t gpu_free = 0, gpu_total = 0; + ggml_backend_dev_t dev = ggml_backend_get_device(backend_); + if (dev) { + ggml_backend_dev_memory(dev, &gpu_free, &gpu_total); + } + if (gpu_total == 0) { + std::fprintf(stderr, "[laguna-hybrid] could not query GPU memory\n"); + return false; + } + + // Compute per-layer expert size in bytes (laguna: separate gate/up/down, no fused) + std::vector layer_expert_bytes((size_t)w_.n_layer); + for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) { + const LagunaTargetLayer & L = w_.layers[(size_t)il]; + uint64_t bytes = 0; + if (L.ffn_gate_exps) bytes += ggml_nbytes(L.ffn_gate_exps) / (uint64_t)w_.n_expert; + if (L.ffn_up_exps) bytes += ggml_nbytes(L.ffn_up_exps) / (uint64_t)w_.n_expert; + if (L.ffn_down_exps) bytes += ggml_nbytes(L.ffn_down_exps) / (uint64_t)w_.n_expert; + layer_expert_bytes[(size_t)il] = bytes; + } + // Layer 0 is dense — no experts + for (int il = 0; il < w_.n_layer_dense_lead; ++il) { + layer_expert_bytes[(size_t)il] = 0; + } + + uint64_t total_expert_bytes = 0; + for (int il = 0; il < w_.n_layer; ++il) { + total_expert_bytes += layer_expert_bytes[(size_t)il] * (uint64_t)w_.n_expert; + } + + // KV cache estimate + const char * ctx_env = std::getenv("DFLASH_MAX_CONTEXT"); + int max_context = ctx_env ? std::atoi(ctx_env) : args_.max_ctx; + if (max_context <= 0) max_context = 8192; + + const uint64_t kv_bytes_per_tok = (uint64_t)w_.n_layer * 2 * + (uint64_t)w_.n_head_kv * (uint64_t)w_.head_dim * 2; + const uint64_t kv_total = kv_bytes_per_tok * (uint64_t)max_context; + + const uint64_t warm_cache_bytes = 200ULL * 1024 * 1024; + const uint64_t safety_bytes = 512ULL * 1024 * 1024; + const uint64_t core_bytes = gpu_total - gpu_free; + + uint64_t expert_budget = 0; + if (gpu_total > core_bytes + kv_total + warm_cache_bytes + safety_bytes) { + expert_budget = gpu_total - core_bytes - kv_total - warm_cache_bytes - safety_bytes; + } + if (expert_budget > total_expert_bytes) { + expert_budget = total_expert_bytes; + } + + // Manual budget cap (absolute MB) + if (const char * cap_env = std::getenv("DFLASH_EXPERT_BUDGET_MB")) { + uint64_t cap_bytes = (uint64_t)std::atoi(cap_env) * 1024ULL * 1024ULL; + if (cap_bytes > 0 && cap_bytes < expert_budget) { + std::printf("[laguna-hybrid] capping expert budget from %.2f GiB to %d MB\n", + expert_budget / 1024.0 / 1024.0 / 1024.0, std::atoi(cap_env)); + expert_budget = cap_bytes; + } + } + + // Percentage-based budget cap + if (const char * pct_env = std::getenv("DFLASH_EXPERT_BUDGET_PCT")) { + int pct = std::atoi(pct_env); + if (pct > 0 && pct < 100) { + uint64_t pct_bytes = total_expert_bytes * (uint64_t)pct / 100ULL; + if (pct_bytes < expert_budget) { + std::printf("[laguna-hybrid] capping expert budget to %d%% = %.2f GiB (of %.2f GiB)\n", + pct, pct_bytes / 1024.0 / 1024.0 / 1024.0, + total_expert_bytes / 1024.0 / 1024.0 / 1024.0); + expert_budget = pct_bytes; + } + } + } + + std::printf("[laguna] dynamic placement: gpu_total=%.2f GiB, core=%.2f GiB, " + "kv_cache=%.2f GiB (ctx=%d), warm=%.0f MB, safety=%.0f MB, " + "expert_budget=%.2f GiB (of %.2f GiB total experts)\n", + gpu_total / 1024.0 / 1024.0 / 1024.0, + core_bytes / 1024.0 / 1024.0 / 1024.0, + kv_total / 1024.0 / 1024.0 / 1024.0, + max_context, + warm_cache_bytes / 1024.0 / 1024.0, + safety_bytes / 1024.0 / 1024.0, + expert_budget / 1024.0 / 1024.0 / 1024.0, + total_expert_bytes / 1024.0 / 1024.0 / 1024.0); + std::fflush(stdout); + + if (expert_budget == 0) { + std::fprintf(stderr, "[laguna-hybrid] no VRAM budget for experts\n"); + return false; + } + + // Step 4: Build placement + MoeHybridPlacement placement; + if (!MoeHybridPlacement::build_from_stats_with_layer_bytes( + hotness, layer_expert_bytes, expert_budget, + /*min_hot_per_layer=*/std::min(w_.n_expert_used, w_.n_expert), + placement, &err)) { + std::fprintf(stderr, "[laguna-hybrid] placement build failed: %s\n", err.c_str()); + return false; + } + + int total_moe_experts = (w_.n_layer - w_.n_layer_dense_lead) * w_.n_expert; + std::printf("[laguna] dynamic placement result: %d hot experts, %d cold experts\n", + placement.total_hot, total_moe_experts - placement.total_hot); + + // If all experts fit, reload full model to GPU (non-hybrid path) + if (placement.total_hot >= total_moe_experts) { + std::printf("[laguna] all experts fit in VRAM, loading fully to GPU\n"); + std::fflush(stdout); + free_laguna_target_weights(w_); + if (!load_target_gguf_laguna(args_.target_path, backend_, w_)) { + std::fprintf(stderr, "[laguna] full reload failed: %s\n", dflash27b_last_error()); + return false; + } + return true; + } + + // Step 5: Load expert data from GGUF mmap into hot/cold split buffers + { + ggml_context * expert_meta = nullptr; + gguf_init_params gip{}; + gip.no_alloc = true; + gip.ctx = &expert_meta; + gguf_context * gctx = gguf_init_from_file(args_.target_path.c_str(), gip); + if (!gctx) { + std::fprintf(stderr, "[laguna-hybrid] failed to re-open GGUF for expert loading\n"); + return false; + } + + int fd = ::open(args_.target_path.c_str(), O_RDONLY); + if (fd < 0) { + gguf_free(gctx); + std::fprintf(stderr, "[laguna-hybrid] open failed for mmap\n"); + return false; + } + struct stat st; + if (::fstat(fd, &st) < 0) { + ::close(fd); + gguf_free(gctx); + std::fprintf(stderr, "[laguna-hybrid] fstat failed\n"); + return false; + } + const size_t file_size = (size_t)st.st_size; + void * mmap_addr = ::mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0); + ::close(fd); + if (mmap_addr == MAP_FAILED) { + gguf_free(gctx); + std::fprintf(stderr, "[laguna-hybrid] mmap failed\n"); + return false; + } + + const size_t data_start = gguf_get_data_offset(gctx); + const auto * file_bytes = (const uint8_t *)mmap_addr; + + // Build per-layer expert file data + std::vector layer_file_data((size_t)w_.n_layer); + for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) { + char name[128]; + auto find_tensor_data = [&](const char * suffix) -> ExpertTensorFileData { + std::snprintf(name, sizeof(name), "blk.%d.%s.weight", il, suffix); + int64_t tid = gguf_find_tensor(gctx, name); + if (tid < 0) return {}; + size_t off = data_start + gguf_get_tensor_offset(gctx, tid); + size_t sz = gguf_get_tensor_size(gctx, tid); + if (off + sz > file_size) return {}; + return { file_bytes + off, sz }; + }; + + layer_file_data[(size_t)il].gate_exps = find_tensor_data("ffn_gate_exps"); + layer_file_data[(size_t)il].up_exps = find_tensor_data("ffn_up_exps"); + layer_file_data[(size_t)il].down_exps = find_tensor_data("ffn_down_exps"); + // laguna has no fused gate_up_exps + } + + auto hybrid = std::make_shared(); + MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(w_); + std::vector layer_descs((size_t)w_.n_layer); + for (int il = 0; il < w_.n_layer; ++il) { + layer_descs[(size_t)il] = make_moe_layer_desc(w_.layers[(size_t)il]); + } + if (!build_moe_hybrid_storage_from_file(hybrid_cfg, backend_, placement, layer_descs, layer_file_data, *hybrid, &err)) { + ::munmap(mmap_addr, file_size); + gguf_free(gctx); + std::fprintf(stderr, "[laguna-hybrid] storage build failed: %s\n", err.c_str()); + return false; + } + + ::munmap(mmap_addr, file_size); + gguf_free(gctx); + + moe_hybrid_ = std::move(hybrid); + } + + // Print stats + int total_cold = 0; + uint64_t hot_bytes = 0, cold_bytes = 0; + for (int il = w_.n_layer_dense_lead; il < w_.n_layer; ++il) { + const auto & layer = moe_hybrid_->layers[(size_t)il]; + total_cold += (int)layer.cold_expert_ids.size(); + const uint64_t per_expert_bytes = + (uint64_t)layer.gate_expert_bytes + (uint64_t)layer.up_expert_bytes + (uint64_t)layer.down_expert_bytes; + hot_bytes += per_expert_bytes * (uint64_t)layer.hot_expert_ids.size(); + cold_bytes += per_expert_bytes * (uint64_t)layer.cold_expert_ids.size(); + } + std::printf("[laguna-hybrid] storage ready: total_hot=%d (%.2f GiB VRAM) total_cold=%d (%.2f GiB RAM) source=%s\n", + placement.total_hot, + hot_bytes / 1024.0 / 1024.0 / 1024.0, + total_cold, + cold_bytes / 1024.0 / 1024.0 / 1024.0, + placement_source.c_str()); + + if (total_cold > 0) { + hybrid_mode_ = true; + std::printf("[laguna-hybrid] hybrid decode path active (%d cold experts)\n", total_cold); + } else { + hybrid_mode_ = true; // partial load: expert tensors only in hybrid storage + std::printf("[laguna-hybrid] all experts hot — using hybrid path (all-hot)\n"); + } + + // Configure telemetry and swap policy + if (const char * telemetry = std::getenv("DFLASH_LAGUNA_TELEMETRY")) { + hybrid_telemetry_ = std::atoi(telemetry) != 0; + } + if (const char * out_path = std::getenv("DFLASH_LAGUNA_NEXT_PLACEMENT_OUT")) { + routing_stats_out_path_ = out_path; + } + if (const char * swap_max = std::getenv("DFLASH_LAGUNA_SWAP_MAX")) { + swap_policy_.max_swaps_total = std::max(0, std::atoi(swap_max)); + } + if (const char * swap_gain = std::getenv("DFLASH_LAGUNA_SWAP_MIN_GAIN")) { + swap_policy_.min_promote_gain = (uint64_t)std::max(1, std::atoi(swap_gain)); + } + + // Allocate routing stats collector + if (!routing_stats_out_path_.empty()) { + routing_stats_ = std::make_shared(); + routing_stats_->n_layer = w_.n_layer; + routing_stats_->n_expert = w_.n_expert; + routing_stats_->n_expert_used = w_.n_expert_used; + routing_stats_->counts.assign((size_t)w_.n_layer * (size_t)w_.n_expert, 0); + routing_stats_->layer_totals.assign((size_t)w_.n_layer, 0); + } + + std::fflush(stdout); + return true; +} + +// ── Laguna hybrid per-layer pre-FFN graph ─────────────────────────────── +// +// Builds attention + router for a single layer. For MoE layers, outputs: +// sg.ffn_post = post-attention normed hidden (input to FFN) +// sg.ffn_residual = residual to add after FFN output +// sg.moe_selected = [n_used] expert IDs +// sg.moe_weights = [n_used] combine weights +// For the dense layer 0, outputs the full layer result in sg.hidden_input. + +static bool build_laguna_layer_prefn_step( + StepGraph & sg, + const LagunaTargetWeights & w, + LagunaTargetCache & cache, + ggml_backend_t backend, + int il, + int kv_start, + int n_tokens) +{ + step_graph_free(sg); + + const int n_embd = w.n_embd; + const bool is_full = laguna_is_full_attn_layer(w, il); + const bool is_dense = (il < w.n_layer_dense_lead); + const LagunaTargetLayer & L = w.layers[(size_t)il]; + const int kv_len = kv_start + n_tokens; + const int n_head = w.n_head_arr[il]; + const int n_head_kv = w.n_head_kv; + const int head_dim = w.head_dim; + + ggml_init_params ip{}; + ip.mem_size = ggml_tensor_overhead() * 4096 + ggml_graph_overhead() + 8 * 1024 * 1024; + ip.no_alloc = true; + sg.ctx = ggml_init(ip); + if (!sg.ctx) return false; + sg.gf = ggml_new_graph_custom(sg.ctx, 4096, false); + + // Input: hidden state [n_embd, n_tokens] + sg.inp_embed = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(sg.inp_embed); + ggml_set_name(sg.inp_embed, "inp_embed"); + + // Positions + sg.positions = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, n_tokens); + ggml_set_input(sg.positions); + + // Attention mask (causal) + ggml_tensor * attn_mask = nullptr; + if (kv_len > 0) { + attn_mask = ggml_new_tensor_4d(sg.ctx, GGML_TYPE_F32, kv_len, n_tokens, 1, 1); + ggml_set_input(attn_mask); + sg.attn_mask = attn_mask; + } + + ggml_tensor * inp = sg.inp_embed; + + // Pre-attn RMS norm + ggml_tensor * cur = ggml_rms_norm(sg.ctx, inp, 1e-6f); + cur = ggml_mul(sg.ctx, cur, L.attn_norm); + + // QKV projections + const int q_dim = n_head * head_dim; + ggml_tensor * Qcur = ggml_mul_mat(sg.ctx, L.wq, cur); // [q_dim, n_tokens] + ggml_tensor * Kcur = ggml_mul_mat(sg.ctx, L.wk, cur); // [n_head_kv * head_dim, n_tokens] + ggml_tensor * Vcur = ggml_mul_mat(sg.ctx, L.wv, cur); // [n_head_kv * head_dim, n_tokens] + + // Per-head softplus gate + ggml_tensor * gate = ggml_mul_mat(sg.ctx, L.wqkv_gate, cur); // [n_head, n_tokens] + gate = ggml_softplus(sg.ctx, gate); + + // Reshape Q to [head_dim, n_head, n_tokens] + Qcur = ggml_reshape_3d(sg.ctx, Qcur, head_dim, n_head, n_tokens); + Kcur = ggml_reshape_3d(sg.ctx, Kcur, head_dim, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(sg.ctx, Vcur, head_dim, n_head_kv, n_tokens); + + // Q-norm / K-norm + Qcur = ggml_rms_norm(sg.ctx, Qcur, 1e-6f); + Qcur = ggml_mul(sg.ctx, Qcur, L.q_norm); + Kcur = ggml_rms_norm(sg.ctx, Kcur, 1e-6f); + Kcur = ggml_mul(sg.ctx, Kcur, L.k_norm); + + // RoPE (YaRN on full-attention layers, plain on SWA layers) + const float rope_th = is_full ? w.rope_freq_base_full : w.rope_freq_base_swa; + const int n_rot = is_full ? w.n_rot_full : w.n_rot_swa; + const float ext_factor = is_full ? 1.0f : 0.0f; + const float attn_factor = 1.0f; + const float beta_fast = is_full ? w.yarn_beta_fast : 32.0f; + const float beta_slow = is_full ? w.yarn_beta_slow : 1.0f; + const int n_ctx_orig = is_full ? w.yarn_orig_ctx : 0; + const float freq_scale = is_full ? (1.0f / w.yarn_factor) : 1.0f; + + Qcur = ggml_rope_ext(sg.ctx, Qcur, sg.positions, /*freq_factors=*/nullptr, + n_rot, GGML_ROPE_TYPE_NEOX, + n_ctx_orig, rope_th, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(sg.ctx, Kcur, sg.positions, nullptr, + n_rot, GGML_ROPE_TYPE_NEOX, + n_ctx_orig, rope_th, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + // KV cache write — permute to [head_dim, n_tokens, n_head_kv] layout + ggml_tensor * cache_k = cache.attn_k[(size_t)il]; + ggml_tensor * cache_v = cache.attn_v[(size_t)il]; + + ggml_tensor * Kcur_T = ggml_permute(sg.ctx, Kcur, 0, 2, 1, 3); + ggml_tensor * Vcur_T = ggml_permute(sg.ctx, Vcur, 0, 2, 1, 3); + + ggml_tensor * k_view = ggml_view_3d(sg.ctx, cache_k, + head_dim, n_tokens, n_head_kv, + cache_k->nb[1], cache_k->nb[2], + cache_k->nb[1] * (size_t)kv_start); + ggml_tensor * k_cpy = ggml_cpy(sg.ctx, Kcur_T, k_view); + ggml_build_forward_expand(sg.gf, k_cpy); + + ggml_tensor * v_view = ggml_view_3d(sg.ctx, cache_v, + head_dim, n_tokens, n_head_kv, + cache_v->nb[1], cache_v->nb[2], + cache_v->nb[1] * (size_t)kv_start); + ggml_tensor * v_cpy = ggml_cpy(sg.ctx, Vcur_T, v_view); + ggml_build_forward_expand(sg.gf, v_cpy); + + // Flash attention + ggml_tensor * Qfa = ggml_permute(sg.ctx, Qcur, 0, 2, 1, 3); + Qfa = ggml_cont(sg.ctx, Qfa); + + ggml_tensor * Kfa = ggml_view_3d(sg.ctx, cache_k, + head_dim, kv_len, n_head_kv, + cache_k->nb[1], cache_k->nb[2], 0); + ggml_tensor * Vfa = ggml_view_3d(sg.ctx, cache_v, + head_dim, kv_len, n_head_kv, + cache_v->nb[1], cache_v->nb[2], 0); + + const float kq_scale = 1.0f / std::sqrt((float)head_dim); + ggml_tensor * attn_mask_f16 = attn_mask ? ggml_cast(sg.ctx, attn_mask, GGML_TYPE_F16) : nullptr; + ggml_tensor * attn = ggml_flash_attn_ext(sg.ctx, Qfa, Kfa, Vfa, attn_mask_f16, + kq_scale, 0.0f, 0.0f); + + // Per-head softplus gate + ggml_tensor * gate_b = ggml_reshape_3d(sg.ctx, gate, 1, n_head, n_tokens); + gate_b = ggml_cast(sg.ctx, gate_b, attn->type); + attn = ggml_mul(sg.ctx, attn, gate_b); + + attn = ggml_reshape_2d(sg.ctx, attn, q_dim, n_tokens); + + // Output projection + ggml_tensor * attn_out = ggml_mul_mat(sg.ctx, L.wo, attn); // [n_embd, n_tokens] + + // Residual after attention + ggml_tensor * ffn_inp = ggml_add(sg.ctx, attn_out, inp); + + if (is_dense) { + // Dense layer 0: run full MLP in this graph + ggml_tensor * normed = ggml_rms_norm(sg.ctx, ffn_inp, 1e-6f); + normed = ggml_mul(sg.ctx, normed, L.ffn_norm); + + ggml_tensor * g = ggml_mul_mat(sg.ctx, L.w_gate, normed); + ggml_tensor * u = ggml_mul_mat(sg.ctx, L.w_up, normed); + ggml_tensor * gu = ggml_swiglu_split(sg.ctx, g, u); + ggml_tensor * d = ggml_mul_mat(sg.ctx, L.w_down, gu); + ggml_tensor * layer_out = ggml_add(sg.ctx, d, ffn_inp); + + sg.hidden_input = layer_out; + ggml_set_output(layer_out); + ggml_build_forward_expand(sg.gf, layer_out); + } else { + // MoE layer: output pre-FFN normed + residual + router decisions + ggml_tensor * normed = ggml_rms_norm(sg.ctx, ffn_inp, 1e-6f); + normed = ggml_mul(sg.ctx, normed, L.ffn_norm); + sg.ffn_post = normed; + ggml_set_output(normed); + + sg.ffn_residual = ffn_inp; + ggml_set_output(ffn_inp); + + // Router: sigmoid + score-correction bias + top-k + ggml_tensor * router_logits = ggml_mul_mat(sg.ctx, L.ffn_gate_inp, normed); + ggml_tensor * probs = ggml_sigmoid(sg.ctx, router_logits); + ggml_tensor * scores_sel = ggml_add(sg.ctx, probs, L.ffn_exp_probs_b); + ggml_tensor * selected = ggml_top_k(sg.ctx, scores_sel, w.n_expert_used); + ggml_set_output(selected); + + // Gather original probs (no bias) for combine weights + ggml_tensor * probs_3d = ggml_reshape_3d(sg.ctx, probs, 1, w.n_expert, n_tokens); + ggml_tensor * weights_raw = ggml_get_rows(sg.ctx, probs_3d, selected); + weights_raw = ggml_reshape_2d(sg.ctx, weights_raw, w.n_expert_used, n_tokens); + + // Sum-normalize + scale + ggml_tensor * w_sum = ggml_sum_rows(sg.ctx, weights_raw); + ggml_tensor * weights_normed = ggml_div(sg.ctx, weights_raw, w_sum); + if (w.expert_weights_scale != 1.0f) { + weights_normed = ggml_scale(sg.ctx, weights_normed, w.expert_weights_scale); + } + sg.moe_weights = weights_normed; + ggml_set_output(weights_normed); + + sg.moe_selected.resize(1); + sg.moe_selected[0] = selected; + + ggml_build_forward_expand(sg.gf, normed); + ggml_build_forward_expand(sg.gf, ffn_inp); + ggml_build_forward_expand(sg.gf, selected); + ggml_build_forward_expand(sg.gf, weights_normed); + } + + // Allocate + if (!sg.alloc) { + sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + if (!ggml_gallocr_alloc_graph(sg.alloc, sg.gf)) { + return false; + } + + return true; +} + +// ── Hybrid forward: one token through all 40 layers ───────────────────── + +bool LagunaBackend::hybrid_forward_one_token(int32_t tok, int kv_pos, + std::vector & act_cur, + int32_t & argmax_out) { + const int hidden = w_.n_embd; + const int vocab = w_.embedder.n_vocab; + + // Embed token + if (!w_.embedder.embed(&tok, 1, act_cur.data())) return false; + + // GPU-resident state for MoE layers + GpuResidentState gpu_state; + if (!init_gpu_resident_state(gpu_state, backend_, hidden)) return false; + ggml_backend_tensor_set(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden); + + StepGraph layer_sg; + std::vector selected((size_t)w_.n_expert_used); + std::vector weights_buf((size_t)w_.n_expert_used); + ggml_backend_t cpu_be = moe_hybrid_->cpu_backend; + + for (int il = 0; il < w_.n_layer; ++il) { + const bool is_dense = (il < w_.n_layer_dense_lead); + + if (!build_laguna_layer_prefn_step(layer_sg, w_, cache_, backend_, il, kv_pos, 1)) { + step_graph_destroy(layer_sg); + gpu_state.destroy(); + return false; + } + + // GPU→GPU: copy persistent act_cur to pre-FFN graph input + ggml_backend_tensor_copy(gpu_state.act_cur, layer_sg.inp_embed); + + // Set positions + int32_t pos_val = kv_pos; + ggml_backend_tensor_set(layer_sg.positions, &pos_val, 0, sizeof(int32_t)); + + // Causal mask: single token decode — all positions [0..kv_pos] visible + if (layer_sg.attn_mask) { + const int kv_len = kv_pos + 1; + std::vector mask_data((size_t)kv_len, 0.0f); + ggml_backend_tensor_set(layer_sg.attn_mask, mask_data.data(), 0, sizeof(float) * (size_t)kv_len); + } + + auto st = ggml_backend_graph_compute(backend_, layer_sg.gf); + if (st != GGML_STATUS_SUCCESS) { + step_graph_destroy(layer_sg); + gpu_state.destroy(); + return false; + } + + if (is_dense) { + // Dense layer: read full output back to GPU-resident state + ggml_backend_tensor_copy(layer_sg.hidden_input, gpu_state.act_cur); + } else { + // MoE layer: read router decisions, then do hybrid FFN eval + ggml_tensor * sel_tensor = layer_sg.moe_selected[0]; + ggml_backend_tensor_get(sel_tensor, selected.data(), 0, + sizeof(int32_t) * selected.size()); + ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0, + sizeof(float) * weights_buf.size()); + + if (routing_stats_) { + routing_stats_->observe(il, selected.data(), (int)selected.size()); + } + + // Hybrid FFN: hot on GPU, cold on CPU, combine on GPU + auto & storage = moe_hybrid_->layers[(size_t)il]; + MoeHybridConfig cfg = make_moe_hybrid_config(w_); + MoeLayerDesc desc = make_moe_layer_desc(w_.layers[(size_t)il]); + if (!eval_moe_hybrid_ffn_gpu_resident( + backend_, cfg, desc, storage, cpu_be, + layer_sg.ffn_post, layer_sg.ffn_residual, + gpu_state, + selected.data(), weights_buf.data(), + (int)selected.size())) { + step_graph_destroy(layer_sg); + gpu_state.destroy(); + return false; + } + } + } + + // Read final hidden state and project logits + ggml_backend_tensor_get(gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden); + step_graph_destroy(layer_sg); + gpu_state.destroy(); + + // Project logits: final RMS norm + lm_head + { + ggml_init_params ip{}; + ip.mem_size = 64 * 1024 * 1024; + ip.no_alloc = true; + ggml_context * ctx = ggml_init(ip); + if (!ctx) return false; + + ggml_tensor * h_in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden, 1); + ggml_set_input(h_in); + ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false); + + ggml_tensor * normed = ggml_rms_norm(ctx, h_in, 1e-6f); + normed = ggml_mul(ctx, normed, w_.out_norm); + ggml_tensor * logits = ggml_mul_mat(ctx, w_.output, normed); + ggml_set_output(logits); + ggml_build_forward_expand(gf, logits); + + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend_)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + ggml_gallocr_free(alloc); + ggml_free(ctx); + return false; + } + ggml_backend_tensor_set(h_in, act_cur.data(), 0, sizeof(float) * (size_t)hidden); + if (ggml_backend_graph_compute(backend_, gf) != GGML_STATUS_SUCCESS) { + ggml_gallocr_free(alloc); + ggml_free(ctx); + return false; + } + + std::vector logits_buf((size_t)vocab); + ggml_backend_tensor_get(logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab); + ggml_gallocr_free(alloc); + ggml_free(ctx); + + // Argmax + argmax_out = 0; + float best = logits_buf[0]; + for (int j = 1; j < vocab; ++j) { + if (logits_buf[(size_t)j] > best) { + best = logits_buf[(size_t)j]; + argmax_out = j; + } + } + } + return true; +} + +// ── Hybrid generate ───────────────────────────────────────────────────── + +GenerateResult LagunaBackend::generate_hybrid(const GenerateRequest & req, + const DaemonIO & io) { + GenerateResult result; + DaemonIO out_io = io.with_token_callback(req.on_token); + const bool should_emit = req.stream || (bool)out_io.on_token; + const int N = (int)req.prompt.size(); + + if (N + req.n_gen > args_.max_ctx) { + result.error = "overflow"; + return result; + } + + reset_laguna_target_cache(cache_); + + // ── Hybrid Prefill: layer-by-layer pre-FFN + batched hybrid FFN ── + const int hidden = w_.n_embd; + const int n_expert_used = w_.n_expert_used; + ggml_backend_t cpu_be = moe_hybrid_->cpu_backend; + + std::vector embed_all((size_t)N * (size_t)hidden); + if (!w_.embedder.embed(req.prompt.data(), N, embed_all.data())) { + result.error = "embed_prefill"; + return result; + } + + auto t_pf0 = std::chrono::steady_clock::now(); + const int prefill_chunk = std::min(args_.chunk, N); + + StepGraph prefill_sg; // persistent across layers to reuse GPU buffer + ggml_gallocr_t ffn_hot_alloc = nullptr; + ggml_gallocr_t ffn_cold_alloc = nullptr; + + for (int il = 0; il < w_.n_layer; ++il) { + const bool is_dense = (il < w_.n_layer_dense_lead); + const bool is_full = laguna_is_full_attn_layer(w_, il); + + for (int chunk_start = 0; chunk_start < N; chunk_start += prefill_chunk) { + const int chunk_len = std::min(prefill_chunk, N - chunk_start); + + step_graph_free(prefill_sg); // reset ctx/graph but keep gallocr buffer + if (!build_laguna_layer_prefn_step(prefill_sg, w_, cache_, backend_, + il, chunk_start, chunk_len)) { + result.error = "prefill_build"; + step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); + return result; + } + + // Set input embeddings + ggml_backend_tensor_set(prefill_sg.inp_embed, + embed_all.data() + (size_t)chunk_start * (size_t)hidden, 0, + sizeof(float) * (size_t)chunk_len * (size_t)hidden); + + // Set positions + std::vector pos_data((size_t)chunk_len); + for (int i = 0; i < chunk_len; ++i) pos_data[i] = chunk_start + i; + ggml_backend_tensor_set(prefill_sg.positions, pos_data.data(), 0, + sizeof(int32_t) * (size_t)chunk_len); + + // Set attention mask (causal or causal+SWA depending on layer) + if (prefill_sg.attn_mask) { + const int kv_len = chunk_start + chunk_len; + std::vector mask((size_t)kv_len * (size_t)chunk_len, -INFINITY); + for (int q = 0; q < chunk_len; ++q) { + const int abs_q = chunk_start + q; + const int win_lo = is_full ? 0 : std::max(0, abs_q - w_.sliding_window + 1); + for (int k = win_lo; k <= abs_q && k < kv_len; ++k) { + mask[(size_t)q * (size_t)kv_len + (size_t)k] = 0.0f; + } + } + ggml_backend_tensor_set(prefill_sg.attn_mask, mask.data(), 0, + sizeof(float) * mask.size()); + } + + // Compute pre-FFN graph + auto st = ggml_backend_graph_compute(backend_, prefill_sg.gf); + if (st != GGML_STATUS_SUCCESS) { + result.error = "prefill_compute"; + step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); + return result; + } + + if (is_dense) { + // Dense layer outputs full result directly + std::vector layer_out((size_t)chunk_len * (size_t)hidden); + ggml_backend_tensor_get(prefill_sg.hidden_input, layer_out.data(), 0, + sizeof(float) * layer_out.size()); + std::memcpy(embed_all.data() + (size_t)chunk_start * (size_t)hidden, + layer_out.data(), + sizeof(float) * layer_out.size()); + } else { + // MoE layer: read router decisions, run hybrid FFN + std::vector chunk_residuals((size_t)chunk_len * (size_t)hidden); + std::vector chunk_post((size_t)chunk_len * (size_t)hidden); + std::vector chunk_selected((size_t)chunk_len * (size_t)n_expert_used); + std::vector chunk_weights((size_t)chunk_len * (size_t)n_expert_used); + + ggml_backend_tensor_get(prefill_sg.ffn_residual, chunk_residuals.data(), 0, + sizeof(float) * chunk_residuals.size()); + ggml_backend_tensor_get(prefill_sg.ffn_post, chunk_post.data(), 0, + sizeof(float) * chunk_post.size()); + + ggml_tensor * sel_tensor = prefill_sg.moe_selected.empty() ? nullptr : prefill_sg.moe_selected[0]; + if (!sel_tensor || !prefill_sg.moe_weights) { + result.error = "prefill_router_outputs"; + step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); + return result; + } + ggml_backend_tensor_get(sel_tensor, chunk_selected.data(), 0, + sizeof(int32_t) * chunk_selected.size()); + ggml_backend_tensor_get(prefill_sg.moe_weights, chunk_weights.data(), 0, + sizeof(float) * chunk_weights.size()); + + // Observe routing stats + if (routing_stats_) { + for (int i = 0; i < chunk_len; ++i) { + routing_stats_->observe(il, chunk_selected.data() + (size_t)i * (size_t)n_expert_used, n_expert_used); + } + } + + // Batched hybrid FFN evaluation + auto & storage = moe_hybrid_->layers[(size_t)il]; + MoeHybridConfig chunk_cfg = make_moe_hybrid_config(w_); + MoeLayerDesc chunk_desc = make_moe_layer_desc(w_.layers[(size_t)il]); + std::vector ffn_batch_out; + if (!eval_moe_hybrid_ffn_batched( + backend_, cpu_be, chunk_cfg, chunk_desc, storage, + chunk_post.data(), + chunk_selected.data(), + chunk_weights.data(), + chunk_len, ffn_batch_out, &result.error, + &ffn_hot_alloc, &ffn_cold_alloc)) { + step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); + return result; + } + + // Combine: FFN output + residual → embed_all for next layer + for (int i = 0; i < chunk_len; ++i) { + const float * ffn = ffn_batch_out.data() + (size_t)i * (size_t)hidden; + const float * res = chunk_residuals.data() + (size_t)i * (size_t)hidden; + float * out_embed = embed_all.data() + (size_t)(chunk_start + i) * (size_t)hidden; + for (int j = 0; j < hidden; ++j) { + out_embed[j] = ffn[j] + res[j]; + } + } + } + } + } + step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); + + // Project logits from last token's hidden state + cache_.cur_pos = N; + std::vector last_logits; + { + ggml_init_params ip{}; + ip.mem_size = 64 * 1024 * 1024; + ip.no_alloc = true; + ggml_context * ctx = ggml_init(ip); + ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false); + + ggml_tensor * h_in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden, 1); + ggml_set_input(h_in); + ggml_tensor * normed = ggml_rms_norm(ctx, h_in, 1e-6f); + normed = ggml_mul(ctx, normed, w_.out_norm); + ggml_tensor * logits = ggml_mul_mat(ctx, w_.output, normed); + ggml_set_output(logits); + ggml_build_forward_expand(gf, logits); + + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend_)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + ggml_gallocr_free(alloc); + ggml_free(ctx); + result.error = "prefill_logits_alloc"; + return result; + } + // Set last token's hidden state + ggml_backend_tensor_set(h_in, + embed_all.data() + (size_t)(N - 1) * (size_t)hidden, 0, + sizeof(float) * (size_t)hidden); + if (ggml_backend_graph_compute(backend_, gf) != GGML_STATUS_SUCCESS) { + ggml_gallocr_free(alloc); + ggml_free(ctx); + result.error = "prefill_logits_compute"; + return result; + } + last_logits.resize((size_t)w_.embedder.n_vocab); + ggml_backend_tensor_get(logits, last_logits.data(), 0, + sizeof(float) * last_logits.size()); + ggml_gallocr_free(alloc); + ggml_free(ctx); + } + + auto t_pf1 = std::chrono::steady_clock::now(); + result.prefill_s = std::chrono::duration(t_pf1 - t_pf0).count(); + + // ── Decode (hybrid layer-by-layer) ── + auto argmax = [](const std::vector & ll) { + int best = 0; float bv = ll[0]; + for (size_t i = 1; i < ll.size(); ++i) + if (ll[i] > bv) { bv = ll[i]; best = (int)i; } + return best; + }; + + std::vector history; + history.reserve((size_t)N + (size_t)req.n_gen); + history.insert(history.end(), req.prompt.begin(), req.prompt.end()); + + auto pick = [&](const std::vector & ll) -> int { + return req.do_sample + ? sample_logits(ll.data(), (int)ll.size(), req.sampler, history, sampler_rng_) + : argmax(ll); + }; + + int next_tok = pick(last_logits); + result.tokens.reserve(req.n_gen); + + // Budget force-close (same pattern as non-hybrid path) + const BudgetHook & budget_hook = req.budget_hook; + bool budget_close_started = false; + int close_inject_pos = 0; + auto maybe_force_close = [&](int32_t & tok, int committed_now) { + if (budget_hook.close_token_ids.empty()) return; + if (budget_close_started && + close_inject_pos < (int)budget_hook.close_token_ids.size()) + { + tok = budget_hook.close_token_ids[close_inject_pos++]; + return; + } + if (budget_close_started) return; + int remaining = req.n_gen - committed_now; + if (remaining <= budget_hook.hard_limit_remaining) { + int32_t first_close = budget_hook.close_token_ids.front(); + if (tok == first_close) { + budget_close_started = true; + close_inject_pos = 1; + return; + } + tok = first_close; + budget_close_started = true; + close_inject_pos = 1; + result.budget_forced_close = true; + } + }; + + std::vector act_cur((size_t)w_.n_embd); + auto t_g0 = std::chrono::steady_clock::now(); + for (int s = 0; s < req.n_gen; ++s) { + maybe_force_close(next_tok, s); + if (next_tok == w_.eos_id || next_tok == w_.eos_chat_id) break; + result.tokens.push_back(next_tok); + history.push_back(next_tok); + if (should_emit) { + out_io.emit(next_tok); + if (out_io.cancelled) break; + } + + // Hybrid forward: one token through all layers + int32_t argmax_tok = 0; + if (!hybrid_forward_one_token(next_tok, cache_.cur_pos, act_cur, argmax_tok)) { + result.error = "decode"; + break; + } + cache_.cur_pos++; + + if (req.do_sample) { + // For sampling, we need full logits — project from act_cur + // (hybrid_forward_one_token already computed argmax; for sampling + // we re-project — FIXME: return logits from forward to avoid double projection) + next_tok = argmax_tok; // For now, use argmax even in sample mode as fallback + } else { + next_tok = argmax_tok; + } + } + auto t_g1 = std::chrono::steady_clock::now(); + result.decode_s = std::chrono::duration(t_g1 - t_g0).count(); + + if (should_emit) out_io.emit(-1); + result.ok = (result.error.empty()); + return result; +} + +void LagunaBackend::maybe_post_request_swap() { + if (!hybrid_mode_ || !moe_hybrid_ || swap_policy_.max_swaps_total <= 0) return; + if (!routing_stats_) return; + + MoeHybridSwapPlan plan; + std::string err; + if (!build_moe_hybrid_swap_plan(moe_hybrid_->placement, *routing_stats_, + swap_policy_, plan, &err)) { + std::fprintf(stderr, "[laguna-hybrid] swap plan failed: %s\n", err.c_str()); + return; + } + if (plan.actions.empty()) return; + + // Rebuild storage with new placement + auto rebuilt = std::make_shared(); + MoeHybridConfig swap_cfg = make_moe_hybrid_config(w_); + std::vector swap_descs((size_t)w_.n_layer); + for (int il = 0; il < w_.n_layer; ++il) { + swap_descs[(size_t)il] = make_moe_layer_desc(w_.layers[(size_t)il]); + } + if (!build_moe_hybrid_storage(swap_cfg, backend_, + plan.next_placement, swap_descs, *rebuilt, &err)) { + std::fprintf(stderr, "[laguna-hybrid] swap rebuild failed: %s\n", err.c_str()); + return; + } + moe_hybrid_ = std::move(rebuilt); + + // Save updated routing stats if configured + if (!routing_stats_out_path_.empty()) { + routing_stats_->save_csv(routing_stats_out_path_, &err); + } + + std::printf("[laguna-hybrid] applied %zu swap actions at request boundary\n", plan.actions.size()); + std::fflush(stdout); +} + // ── Shutdown ──────────────────────────────────────────────────────────── void LagunaBackend::shutdown() { diff --git a/server/src/laguna/laguna_backend.h b/server/src/laguna/laguna_backend.h index afdaf8f63..6ebe9081c 100644 --- a/server/src/laguna/laguna_backend.h +++ b/server/src/laguna/laguna_backend.h @@ -10,11 +10,16 @@ #include "laguna_internal.h" #include "placement/placement_config.h" #include "qwen3_drafter.h" +#include "../common/moe_hybrid_ffn_eval.h" +#include "../common/moe_hybrid_storage.h" +#include "../common/moe_hybrid_routing_stats.h" +#include "../common/moe_hybrid_swap_manager.h" #include "ggml.h" #include "ggml-backend.h" #include +#include #include #include #include @@ -77,7 +82,23 @@ class LagunaBackend : public ModelBackend { DrafterContext drafter_ctx_{}; bool drafter_loaded_ = false; + // ── Hybrid MoE mode (hot/cold expert split) ── + bool hybrid_mode_ = false; + std::shared_ptr moe_hybrid_; + std::shared_ptr routing_stats_; + std::string routing_stats_out_path_; + MoeHybridSwapPolicy swap_policy_; + bool hybrid_telemetry_ = false; + bool ensure_slot(int slot); + + // Hybrid mode helpers + bool init_hybrid_mode(); + GenerateResult generate_hybrid(const GenerateRequest & req, const DaemonIO & io); + bool hybrid_forward_one_token(int32_t tok, int kv_pos, + std::vector & act_cur, + int32_t & argmax_out); + void maybe_post_request_swap(); }; } // namespace dflash::common diff --git a/server/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h index 32c981300..132eadd86 100644 --- a/server/src/laguna/laguna_internal.h +++ b/server/src/laguna/laguna_internal.h @@ -20,6 +20,7 @@ // - Vocab = 100352. BOS = 2. EOS = {2, 24}. Pad = 9. #pragma once +#define DFLASH_LAGUNA_INTERNAL_H_INCLUDED #include #include @@ -136,6 +137,10 @@ bool load_target_gguf_laguna(const std::string & path, ggml_backend_t backend, LagunaTargetWeights & out); +// Partial loader. With plan.skip_expert_tensors=true this performs the +// hybrid-MoE load: non-expert tensors go to GPU, expert tensors are kept +// off-GPU (metadata/shapes stay valid for size queries). Also supports +// layer-range partial loads via plan.layer_begin/layer_end/load_output. bool load_target_gguf_laguna_partial(const std::string & path, ggml_backend_t backend, const TargetLoadPlan & plan, diff --git a/server/src/laguna/laguna_target_loader.cpp b/server/src/laguna/laguna_target_loader.cpp index cd9b617a2..2a91f91f7 100644 --- a/server/src/laguna/laguna_target_loader.cpp +++ b/server/src/laguna/laguna_target_loader.cpp @@ -60,6 +60,9 @@ namespace dflash::common { +// fwd-decl: defined below at file scope, used by should_load_laguna_tensor +static bool is_laguna_expert_tensor(const char * name); + namespace { // Same Mmap shape as gguf_target_loader.cpp's local helper. Duplicated locally @@ -157,6 +160,7 @@ bool should_load_laguna_tensor(const char * name, const TargetLoadPlan & plan) { std::strcmp(name, "output.weight") == 0) { return plan.load_output; } + if (plan.skip_expert_tensors && is_laguna_expert_tensor(name)) return false; int layer_id = -1; if (parse_block_tensor_name(name, layer_id)) { return layer_id >= plan.layer_begin && layer_id < plan.layer_end; @@ -572,4 +576,18 @@ void free_laguna_target_weights(LagunaTargetWeights & w) { w.output = nullptr; } +// ── Partial loader (hybrid mode) ──────────────────────────────────────── +// +// Loads laguna GGUF but skips uploading expert tensor DATA to GPU. +// Tensor metadata (shapes, offsets) is still parsed so that the hybrid +// storage builder can use ggml_nbytes() to compute per-expert sizes. +// Expert data will be loaded via mmap into the hot/cold split buffers. + +static bool is_laguna_expert_tensor(const char * name) { + // Expert tensors are: ffn_gate_exps, ffn_up_exps, ffn_down_exps + // (per-layer, named blk..ffn_{gate,up,down}_exps.weight) + return std::strstr(name, "ffn_gate_exps") != nullptr || + std::strstr(name, "ffn_up_exps") != nullptr || + std::strstr(name, "ffn_down_exps") != nullptr; +} } // namespace dflash::common diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp index 67cdaef5b..2667680a9 100644 --- a/server/src/qwen35moe/qwen35moe_backend.cpp +++ b/server/src/qwen35moe/qwen35moe_backend.cpp @@ -1,5 +1,8 @@ #include "qwen35moe_backend.h" +#include "../common/moe_hybrid_placement.h" +#include "../common/moe_hybrid_types.h" +#include "../common/moe_hybrid_types_impl.h" #include "common/sampler.h" #include "common/dflash_spec_decode.h" #include "dflash_draft_graph.h" @@ -44,8 +47,8 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights & } if (const char * stats_path = std::getenv("DFLASH_QWEN35MOE_RUNTIME_STATS_OUT")) { - routing_stats_ = std::make_shared(); - if (!routing_stats_->init_from_weights(out)) { + routing_stats_ = std::make_shared(); + if (!routing_stats_->init(out.n_layer, out.n_expert, out.n_expert_used)) { set_last_error("qwen35moe runtime stats init failed"); return false; } @@ -54,7 +57,7 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights & // Phase 2: Compute dynamic placement based on VRAM budget. // Expert tensor metadata (ne/nb) is valid even without GPU allocation. - Qwen35MoeExpertPlacement placement; + MoeHybridPlacement placement; std::string placement_source; std::string err; @@ -139,8 +142,13 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights & layer_file_data[(size_t)il].gate_up_exps = find_tensor_data("ffn_gate_up_exps"); } - auto hybrid = std::make_shared(); - if (!build_qwen35moe_hybrid_storage_from_file(out, backend, placement, layer_file_data, *hybrid, &err)) { + auto hybrid = std::make_shared(); + MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(out); + std::vector layer_descs((size_t)out.n_layer); + for (int il = 0; il < out.n_layer; ++il) { + layer_descs[(size_t)il] = make_moe_layer_desc(out.layers[(size_t)il]); + } + if (!build_moe_hybrid_storage_from_file(hybrid_cfg, backend, placement, layer_descs, layer_file_data, *hybrid, &err)) { ::munmap(mmap_addr, file_size); gguf_free(gctx); set_last_error(std::string("qwen35moe hybrid storage build failed: ") + err); @@ -218,24 +226,29 @@ void Qwen35MoeBackend::maybe_post_request_swap() { if (!target_weights().moe_hybrid || swap_policy_.max_swaps_total <= 0) return; - Qwen35MoeSwapPlan plan; + MoeHybridSwapPlan plan; std::string err; - if (!build_qwen35moe_swap_plan(target_weights().moe_hybrid->placement, *routing_stats_, + if (!build_moe_hybrid_swap_plan(target_weights().moe_hybrid->placement, *routing_stats_, swap_policy_, plan, &err)) { std::fprintf(stderr, "[qwen35moe] swap plan failed: %s\n", err.c_str()); return; } if (plan.actions.empty()) return; - auto rebuilt = std::make_shared(); - if (!build_qwen35moe_hybrid_storage(target_weights(), target_backend(), - plan.next_placement, *rebuilt, &err)) { + auto rebuilt = std::make_shared(); + MoeHybridConfig swap_cfg = make_moe_hybrid_config(target_weights()); + std::vector swap_descs((size_t)target_weights().n_layer); + for (int il = 0; il < target_weights().n_layer; ++il) { + swap_descs[(size_t)il] = make_moe_layer_desc(target_weights().layers[(size_t)il]); + } + if (!build_moe_hybrid_storage(swap_cfg, target_backend(), + plan.next_placement, swap_descs, *rebuilt, &err)) { std::fprintf(stderr, "[qwen35moe] swap rebuild failed: %s\n", err.c_str()); return; } target_weights().moe_hybrid = std::move(rebuilt); if (!placement_out_path_.empty()) { - if (!plan.next_placement.save_json(placement_out_path_, &err)) { + if (!plan.next_placement.save_json(placement_out_path_, "qwen35moe", &err)) { std::fprintf(stderr, "[qwen35moe] failed to save next placement: %s\n", err.c_str()); } } @@ -259,7 +272,8 @@ bool Qwen35MoeBackend::ensure_pipe_state(int kv_start) { if (pipe_state_ && pipe_state_->valid()) return true; pipe_state_ = std::make_unique(); if (!init_pipelined_decode_state(*pipe_state_, target_backend(), target_weights(), - target_cache(), kv_start, cfg_.kq_stride_pad)) { + target_cache(), *target_weights().moe_hybrid, + kv_start, cfg_.kq_stride_pad)) { pipe_state_.reset(); return false; } @@ -274,6 +288,15 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, std::vector logits_buf((size_t)vocab); std::vector act_cur((size_t)hidden); + // Telemetry accumulators for the full decode loop + using DecodeClock = std::chrono::steady_clock; + uint64_t tel_embed_us = 0; + uint64_t tel_layers_us = 0; + uint64_t tel_logits_us = 0; + uint64_t tel_sample_us = 0; + PipelinedDecodeTelemetry tel_layers_accum{}; + int tel_n_tokens = 0; + // Persistent logits graph (built once, reused per token) StepGraph logits_sg; auto project_logits = [&]() -> bool { @@ -297,7 +320,9 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, return false; } } - ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden); + // GPU→GPU: pipe act_cur directly into logits graph (no host bounce) + ggml_backend_tensor_copy_async(target_backend(), target_backend(), + pipe_state_->gpu_state.act_cur, logits_sg.hidden_input); auto st = ggml_backend_graph_compute(target_backend(), logits_sg.gf); if (st != GGML_STATUS_SUCCESS) return false; ggml_backend_tensor_get(logits_sg.logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab); @@ -312,7 +337,7 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, ggml_backend_tensor_get(target_step_graph().logits, logits_buf.data(), prefill_logits_offset(), sizeof(float) * (size_t)vocab); first_tok = sample_logits(logits_buf.data(), vocab, sampler_config(), - out_tokens, sampler_rng_engine()); + out_tokens, sampler_rng_engine()); } else { first_tok = target_cache().last_tok; } @@ -329,30 +354,36 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, } for (int step = 1; step < n_gen; ++step) { + const auto tok_t0 = DecodeClock::now(); + int32_t tok = out_tokens.back(); if (!target_weights().embedder.embed(&tok, 1, act_cur.data())) { return false; } - ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0, - sizeof(float) * (size_t)hidden); + ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur, + act_cur.data(), 0, sizeof(float) * (size_t)hidden); + const auto embed_done = DecodeClock::now(); + PipelinedDecodeTelemetry tel; if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(), - target_cache(), *target_weights().moe_hybrid, - committed, cfg_.kq_stride_pad, nullptr)) { + target_cache(), *target_weights().moe_hybrid, + committed, cfg_.kq_stride_pad, + hybrid_telemetry_ ? &tel : nullptr)) { return false; } + const auto layers_done = DecodeClock::now(); - ggml_backend_tensor_get(pipe_state_->gpu_state.act_cur, act_cur.data(), 0, - sizeof(float) * (size_t)hidden); + // act_cur stays on GPU — project_logits reads it via GPU→GPU copy if (!project_logits()) { step_graph_destroy(logits_sg); return false; } + const auto logits_done = DecodeClock::now(); int32_t next_tok; if (sampler_config().temp > 0) { next_tok = sample_logits(logits_buf.data(), vocab, sampler_config(), - out_tokens, sampler_rng_engine()); + out_tokens, sampler_rng_engine()); } else { next_tok = 0; float best = logits_buf[0]; @@ -363,6 +394,48 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, } } } + const auto sample_done = DecodeClock::now(); + + if (hybrid_telemetry_) { + auto us = [](DecodeClock::time_point a, DecodeClock::time_point b) -> uint64_t { + return (uint64_t)std::chrono::duration_cast(b - a).count(); + }; + tel_embed_us += us(tok_t0, embed_done); + tel_layers_us += us(embed_done, layers_done); + tel_logits_us += us(layers_done, logits_done); + tel_sample_us += us(logits_done, sample_done); + tel_n_tokens++; + // Accumulate per-layer telemetry + tel_layers_accum.total_us += tel.total_us; + tel_layers_accum.prefn_graph_build_us += tel.prefn_graph_build_us; + tel_layers_accum.prefn_compute_us += tel.prefn_compute_us; + tel_layers_accum.routing_readback_us += tel.routing_readback_us; + tel_layers_accum.ffn_us += tel.ffn_us; + tel_layers_accum.ffn_allhot_us += tel.ffn_allhot_us; + tel_layers_accum.ffn_mixed_us += tel.ffn_mixed_us; + tel_layers_accum.gpu_idle_us += tel.gpu_idle_us; + tel_layers_accum.tensor_io_us += tel.tensor_io_us; + tel_layers_accum.combine_overhead_us += tel.combine_overhead_us; + tel_layers_accum.cold_cpu_us += tel.cold_cpu_us; + tel_layers_accum.cold_compute_us += tel.cold_compute_us; + tel_layers_accum.hot_graph_build_us += tel.hot_graph_build_us; + tel_layers_accum.ffn_post_get_us += tel.ffn_post_get_us; + tel_layers_accum.sync_wait_us += tel.sync_wait_us; + tel_layers_accum.allhot_layers += tel.allhot_layers; + tel_layers_accum.mixed_layers += tel.mixed_layers; + tel_layers_accum.total_layers += tel.total_layers; + tel_layers_accum.hot_graph_rebuilds += tel.hot_graph_rebuilds; + tel_layers_accum.routed_ffn_layers += tel.routed_ffn_layers; + tel_layers_accum.routed_prefn_us += tel.routed_prefn_us; + tel_layers_accum.routed_sync_us += tel.routed_sync_us; + tel_layers_accum.routed_readback_us += tel.routed_readback_us; + tel_layers_accum.routed_cpu_remap_us += tel.routed_cpu_remap_us; + tel_layers_accum.routed_ffn_dispatch_us += tel.routed_ffn_dispatch_us; + tel_layers_accum.routed_final_sync_us += tel.routed_final_sync_us; + tel_layers_accum.routed_cold_expert_hits += tel.routed_cold_expert_hits; + tel_layers_accum.routed_total_expert_slots += tel.routed_total_expert_slots; + } + out_tokens.push_back(next_tok); io.emit(next_tok); committed++; @@ -371,6 +444,58 @@ bool Qwen35MoeBackend::run_pipelined_decode_path(int committed, int n_gen, if (is_eos_tok(next_tok, target_weights())) break; } + // ── Print decode telemetry ── + if (hybrid_telemetry_ && tel_n_tokens > 0) { + const double total_us = (double)(tel_embed_us + tel_layers_us + tel_logits_us + tel_sample_us); + std::printf("[qwen35moe-ar] === AR DECODE TELEMETRY (n_tokens=%d, %.1f tok/s) ===\n", + tel_n_tokens, tel_n_tokens / (total_us / 1e6)); + std::printf(" per-token breakdown:\n"); + std::printf(" embed=%.2fms layers=%.2fms logits=%.2fms sample=%.2fms\n", + tel_embed_us / 1000.0 / tel_n_tokens, + tel_layers_us / 1000.0 / tel_n_tokens, + tel_logits_us / 1000.0 / tel_n_tokens, + tel_sample_us / 1000.0 / tel_n_tokens); + std::printf(" time budget: embed=%.1f%% layers=%.1f%% logits=%.1f%% sample=%.1f%%\n", + 100.0 * tel_embed_us / total_us, + 100.0 * tel_layers_us / total_us, + 100.0 * tel_logits_us / total_us, + 100.0 * tel_sample_us / total_us); + // Routed path breakdown (the dominant path) + if (tel_layers_accum.routed_ffn_layers > 0) { + const int rl = tel_layers_accum.routed_ffn_layers; + std::printf(" routed FFN path (%d layer-evals, %d cold_hits / %d slots = %.1f%% cold):\n", + rl, + tel_layers_accum.routed_cold_expert_hits, + tel_layers_accum.routed_total_expert_slots, + tel_layers_accum.routed_total_expert_slots > 0 + ? 100.0 * tel_layers_accum.routed_cold_expert_hits / tel_layers_accum.routed_total_expert_slots + : 0.0); + std::printf(" per-layer avg: prefn_dispatch=%.1fus sync_stall=%.1fus readback=%.1fus remap=%.1fus ffn_dispatch=%.1fus\n", + (double)tel_layers_accum.routed_prefn_us / rl, + (double)tel_layers_accum.routed_sync_us / rl, + (double)tel_layers_accum.routed_readback_us / rl, + (double)tel_layers_accum.routed_cpu_remap_us / rl, + (double)tel_layers_accum.routed_ffn_dispatch_us / rl); + std::printf(" total: sync_stall=%.1fms (%.1f%% of layers time)\n", + tel_layers_accum.routed_sync_us / 1000.0, + 100.0 * tel_layers_accum.routed_sync_us / (double)tel_layers_us); + std::printf(" final_sync=%.1fms (%.1f%% of layers time)\n", + tel_layers_accum.routed_final_sync_us / 1000.0, + 100.0 * tel_layers_accum.routed_final_sync_us / (double)tel_layers_us); + } + // Split path stats (if any) + if (tel_layers_accum.mixed_layers > 0) { + std::printf(" split path: mixed=%d layers, cold_cpu=%.1fms, ffn_mixed=%.1fms\n", + tel_layers_accum.mixed_layers, + tel_layers_accum.cold_cpu_us / 1000.0, + tel_layers_accum.ffn_mixed_us / 1000.0); + } + std::printf(" split path allhot=%d layers, hot_graph_rebuilds=%d\n", + tel_layers_accum.allhot_layers - tel_layers_accum.routed_ffn_layers, + tel_layers_accum.hot_graph_rebuilds); + std::fflush(stdout); + } + step_graph_destroy(logits_sg); return true; } @@ -417,7 +542,7 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, const int n_layer = target_weights().n_layer; uint64_t build_us_total = 0, compute_us_total = 0, readback_us_total = 0, ffn_us_total = 0; - Qwen35MoeHybridFfnTelemetry ffn_tel_accum{}; + MoeHybridFfnTelemetry ffn_tel_accum{}; StepGraph logits_sg; // Persistent logits graph (used by spec-decode branch) @@ -426,7 +551,7 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, }; // Helper: compute logits from act_cur (persistent graph, built once) - auto compute_logits = [&]() -> bool { + auto compute_logits = [&](ggml_tensor* gpu_src = nullptr) -> bool { if (!logits_sg.ctx) { // First call: build the logits graph ggml_init_params ip{}; @@ -449,7 +574,13 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, return false; } } - ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden); + if (gpu_src) { + // GPU→GPU: pipe act_cur directly without host bounce + ggml_backend_tensor_copy_async(target_backend(), target_backend(), + gpu_src, logits_sg.hidden_input); + } else { + ggml_backend_tensor_set(logits_sg.hidden_input, act_cur.data(), 0, sizeof(float) * (size_t)hidden); + } auto st = ggml_backend_graph_compute(target_backend(), logits_sg.gf); if (st != GGML_STATUS_SUCCESS) return false; ggml_backend_tensor_get(logits_sg.logits, logits_buf.data(), 0, sizeof(float) * (size_t)vocab); @@ -474,9 +605,12 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, } // Process layer by layer, chunked within each layer + StepGraph prefill_sg; // persistent across layers to reuse GPU buffer + ggml_gallocr_t ffn_hot_alloc = nullptr; + ggml_gallocr_t ffn_cold_alloc = nullptr; + for (int il = 0; il < n_layer; ++il) { auto & storage = target_weights().moe_hybrid->layers[(size_t)il]; - const auto & L = target_weights().layers[(size_t)il]; for (int chunk_start = 0; chunk_start < prompt_len; chunk_start += prefill_chunk) { const int chunk_len = std::min(prefill_chunk, prompt_len - chunk_start); @@ -485,12 +619,14 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, const bool with_mask = (cfg_.kq_stride_pad > KQ_MASK_PAD) || (chunk_len > 1); // Build pre-FFN graph for this chunk - StepGraph prefill_sg; + step_graph_free(prefill_sg); // reset ctx/graph but keep gallocr buffer if (!build_layer_prefn_step(prefill_sg, target_weights(), target_cache(), target_backend(), il, /*kv_start=*/chunk_start, /*n_tokens=*/chunk_len, with_mask, /*fa_window=*/0, cfg_.kq_stride_pad)) { result.error = "prefill_build"; step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); cleanup_graphs(); return result; } @@ -531,6 +667,8 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, if (st != GGML_STATUS_SUCCESS) { result.error = "prefill_compute"; step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); cleanup_graphs(); return result; } @@ -570,33 +708,43 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, } } - // Batched hybrid FFN for this chunk. - // The routed-expert mul_mat_id MMQ kernel writes out of bounds on - // Ampere when the per-call token count exceeds ~8: the expert token - // distribution overshoots the destination tiles on the - // need_check=false write path, silently corrupting neighbouring GPU - // allocations during prefill and crashing with an illegal memory - // access at a later decode sync (~4th request under the server). - // Sub-batch the FFN to a safe width so the attention prefill can - // stay at the full chunk size. - std::vector ffn_batch_out((size_t)chunk_len * (size_t)hidden); - constexpr int kFfnSafeBatch = 8; - for (int fb = 0; fb < chunk_len; fb += kFfnSafeBatch) { - const int fl = std::min(kFfnSafeBatch, chunk_len - fb); - std::vector sub_out; - if (!eval_qwen35moe_hybrid_ffn_batched( - target_backend(), cpu_be, target_weights(), L, storage, - chunk_post.data() + (size_t)fb * (size_t)hidden, - chunk_selected.data() + (size_t)fb * (size_t)n_expert_used, - chunk_weights.data() + (size_t)fb * (size_t)n_expert_used, - fl, sub_out, &result.error)) { - step_graph_destroy(prefill_sg); - cleanup_graphs(); - return result; + // Hybrid FFN — skip batched path when cold experts exist (CUDA mul_mat_id bug on sm_75) + MoeHybridConfig chunk_cfg = make_moe_hybrid_config(target_weights()); + MoeLayerDesc chunk_desc = make_moe_layer_desc(target_weights().layers[(size_t)il]); + std::vector ffn_batch_out; + bool ffn_ok = false; + if (storage.cold_expert_ids.empty()) { + // All experts hot — safe to use batched path + ffn_ok = eval_moe_hybrid_ffn_batched( + target_backend(), cpu_be, chunk_cfg, chunk_desc, storage, + chunk_post.data(), + chunk_selected.data(), + chunk_weights.data(), + chunk_len, ffn_batch_out, &result.error, + &ffn_hot_alloc, &ffn_cold_alloc); + } + if (!ffn_ok) { + // Per-token fallback (avoids sm_75 mul_mat_id assertion with cold experts) + result.error.clear(); + ffn_batch_out.assign((size_t)hidden * (size_t)chunk_len, 0.0f); + std::vector single_out; + for (int ti = 0; ti < chunk_len; ++ti) { + const float * tok_post = chunk_post.data() + (size_t)ti * (size_t)hidden; + const int32_t * tok_sel = chunk_selected.data() + (size_t)ti * (size_t)n_expert_used; + const float * tok_wts = chunk_weights.data() + (size_t)ti * (size_t)n_expert_used; + if (!eval_moe_hybrid_ffn_single( + target_backend(), chunk_cfg, chunk_desc, storage, cpu_be, + tok_post, tok_sel, tok_wts, n_expert_used, single_out)) { + result.error = "prefill_ffn_single"; + step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); + cleanup_graphs(); + return result; + } + std::memcpy(ffn_batch_out.data() + (size_t)ti * (size_t)hidden, + single_out.data(), sizeof(float) * (size_t)hidden); } - std::memcpy(ffn_batch_out.data() + (size_t)fb * (size_t)hidden, - sub_out.data(), - (size_t)fl * (size_t)hidden * sizeof(float)); } // Combine FFN output + residual → embed_all for next layer @@ -634,10 +782,11 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, } const auto t4 = HybridClock::now(); ffn_us_total += elapsed_us(t3, t4); - - step_graph_destroy(prefill_sg); } } + step_graph_destroy(prefill_sg); + if (ffn_hot_alloc) ggml_gallocr_free(ffn_hot_alloc); + if (ffn_cold_alloc) ggml_gallocr_free(ffn_cold_alloc); // Copy last token's output to act_cur for decode std::memcpy(act_cur.data(), embed_all.data() + (size_t)(prompt_len - 1) * (size_t)hidden, @@ -729,8 +878,9 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, cleanup_graphs(); return result; } - ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0, - sizeof(float) * (size_t)hidden); + // Upload embedding async on compute stream + ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur, + act_cur.data(), 0, sizeof(float) * (size_t)hidden); PipelinedDecodeTelemetry tel; if (!pipelined_decode_one_token(*pipe_state_, target_backend(), target_weights(), @@ -749,14 +899,31 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, decode_tel_accum.ffn_us += tel.ffn_us; decode_tel_accum.ffn_allhot_us += tel.ffn_allhot_us; decode_tel_accum.ffn_mixed_us += tel.ffn_mixed_us; + decode_tel_accum.gpu_idle_us += tel.gpu_idle_us; + decode_tel_accum.tensor_io_us += tel.tensor_io_us; + decode_tel_accum.combine_overhead_us += tel.combine_overhead_us; + decode_tel_accum.cold_cpu_us += tel.cold_cpu_us; + decode_tel_accum.cold_compute_us += tel.cold_compute_us; + decode_tel_accum.hot_graph_build_us += tel.hot_graph_build_us; + decode_tel_accum.ffn_post_get_us += tel.ffn_post_get_us; + decode_tel_accum.sync_wait_us += tel.sync_wait_us; decode_tel_accum.allhot_layers += tel.allhot_layers; decode_tel_accum.mixed_layers += tel.mixed_layers; decode_tel_accum.total_layers += tel.total_layers; + decode_tel_accum.hot_graph_rebuilds += tel.hot_graph_rebuilds; + decode_tel_accum.routed_ffn_layers += tel.routed_ffn_layers; + decode_tel_accum.routed_prefn_us += tel.routed_prefn_us; + decode_tel_accum.routed_sync_us += tel.routed_sync_us; + decode_tel_accum.routed_readback_us += tel.routed_readback_us; + decode_tel_accum.routed_cpu_remap_us += tel.routed_cpu_remap_us; + decode_tel_accum.routed_ffn_dispatch_us += tel.routed_ffn_dispatch_us; + decode_tel_accum.routed_final_sync_us += tel.routed_final_sync_us; + decode_tel_accum.routed_cold_expert_hits += tel.routed_cold_expert_hits; + decode_tel_accum.routed_total_expert_slots += tel.routed_total_expert_slots; } - ggml_backend_tensor_get(pipe_state_->gpu_state.act_cur, act_cur.data(), 0, - sizeof(float) * (size_t)hidden); - if (!compute_logits()) { + // act_cur stays on GPU — compute_logits reads it via GPU→GPU copy + if (!compute_logits(pipe_state_->gpu_state.act_cur)) { result.error = "decode_logits"; cleanup_graphs(); return result; @@ -793,12 +960,51 @@ GenerateResult Qwen35MoeBackend::generate(const GenerateRequest & req, decode_tel_accum.ffn_mixed_us / 1000.0, decode_tel_accum.allhot_layers, decode_tel_accum.mixed_layers); - if (n_dec > 0) { + std::printf(" GPU IDLE: tensor_io=%.1fms combine=%.1fms sync_wait=%.1fms\n", + decode_tel_accum.tensor_io_us / 1000.0, + decode_tel_accum.combine_overhead_us / 1000.0, + decode_tel_accum.sync_wait_us / 1000.0); + std::printf(" CPU TIME: cold_total=%.1fms cold_compute=%.1fms hot_graph_build=%.1fms ffn_post_get=%.1fms\n", + decode_tel_accum.cold_cpu_us / 1000.0, + decode_tel_accum.cold_compute_us / 1000.0, + decode_tel_accum.hot_graph_build_us / 1000.0, + decode_tel_accum.ffn_post_get_us / 1000.0); + std::printf(" hot_graph_rebuilds=%d routed_ffn_layers=%d\n", + decode_tel_accum.hot_graph_rebuilds, + decode_tel_accum.routed_ffn_layers); + // Routed path breakdown + if (decode_tel_accum.routed_ffn_layers > 0) { + const int rl = decode_tel_accum.routed_ffn_layers; + std::printf(" ROUTED PATH (%d layer-evals, %d cold / %d slots = %.1f%% cold):\n", + rl, decode_tel_accum.routed_cold_expert_hits, + decode_tel_accum.routed_total_expert_slots, + decode_tel_accum.routed_total_expert_slots > 0 + ? 100.0 * decode_tel_accum.routed_cold_expert_hits / decode_tel_accum.routed_total_expert_slots + : 0.0); + std::printf(" per-layer: prefn=%.1fus sync=%.1fus readback=%.1fus remap=%.1fus ffn_dispatch=%.1fus\n", + (double)decode_tel_accum.routed_prefn_us / rl, + (double)decode_tel_accum.routed_sync_us / rl, + (double)decode_tel_accum.routed_readback_us / rl, + (double)decode_tel_accum.routed_cpu_remap_us / rl, + (double)decode_tel_accum.routed_ffn_dispatch_us / rl); + std::printf(" totals: sync_stall=%.1fms final_sync=%.1fms\n", + decode_tel_accum.routed_sync_us / 1000.0, + decode_tel_accum.routed_final_sync_us / 1000.0); + } + if (n_dec > 0 && decode_tel_accum.total_us > 0) { + const double gpu_compute_us = (double)(decode_tel_accum.prefn_compute_us + decode_tel_accum.ffn_us - decode_tel_accum.cold_cpu_us); + const double gpu_util_pct = 100.0 * gpu_compute_us / (double)decode_tel_accum.total_us; std::printf(" per-token avg: prefn_build=%.2fms prefn_compute=%.2fms readback=%.2fms ffn=%.2fms\n", decode_tel_accum.prefn_graph_build_us / 1000.0 / n_dec, decode_tel_accum.prefn_compute_us / 1000.0 / n_dec, decode_tel_accum.routing_readback_us / 1000.0 / n_dec, decode_tel_accum.ffn_us / 1000.0 / n_dec); + std::printf(" per-token avg: tensor_io=%.2fms combine=%.2fms cold_cpu=%.2fms cold_compute=%.2fms\n", + decode_tel_accum.tensor_io_us / 1000.0 / n_dec, + decode_tel_accum.combine_overhead_us / 1000.0 / n_dec, + decode_tel_accum.cold_cpu_us / 1000.0 / n_dec, + decode_tel_accum.cold_compute_us / 1000.0 / n_dec); + std::printf(" estimated GPU utilization: %.1f%%\n", gpu_util_pct); } std::fflush(stdout); } @@ -871,8 +1077,8 @@ bool Qwen35MoeBackend::hybrid_forward_one_token(int32_t tok, int kv_pos, // Ensure pipelined state if (!ensure_pipe_state(kv_pos)) return false; - // Upload to GPU-resident act_cur - ggml_backend_tensor_set(pipe_state_->gpu_state.act_cur, act_cur.data(), 0, + // Upload to GPU-resident act_cur (async — compute stream ordering guarantees correctness) + ggml_backend_tensor_set_async(target_backend(), pipe_state_->gpu_state.act_cur, act_cur.data(), 0, sizeof(float) * (size_t)hidden); // Run pipelined decode (all 40 layers with cached DeltaNet + hot/cold FFN) @@ -1157,12 +1363,12 @@ bool Qwen35MoeBackend::do_hybrid_spec_decode(int committed, int n_gen, bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path, ggml_backend_t backend, const TargetWeights & w, - Qwen35MoeExpertPlacement & out, + MoeHybridPlacement & out, std::string * err) { // Load hotness table or assume uniform hotness - Qwen35MoeRoutingStats hotness; + MoeHybridRoutingStats hotness; if (hotness_path && hotness_path[0]) { - if (!Qwen35MoeRoutingStats::load_csv(std::string(hotness_path), hotness, err)) { + if (!MoeHybridRoutingStats::load_csv(std::string(hotness_path), hotness, err)) { return false; } if (hotness.n_layer != w.n_layer || hotness.n_expert != w.n_expert) { @@ -1277,7 +1483,7 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path, } // Build placement using greedy knapsack with byte budget - if (!Qwen35MoeExpertPlacement::build_from_stats_with_layer_bytes( + if (!MoeHybridPlacement::build_from_stats_with_layer_bytes( hotness, layer_expert_bytes, expert_budget, /*min_hot_per_layer=*/std::min(w.n_expert_used, w.n_expert), out, err)) { diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h index 9c9ccf8dd..7e29d8f07 100644 --- a/server/src/qwen35moe/qwen35moe_backend.h +++ b/server/src/qwen35moe/qwen35moe_backend.h @@ -4,11 +4,11 @@ #include "qwen35_backend.h" #include "graph_builders.h" -#include "qwen35moe_hybrid_ffn_eval.h" -#include "qwen35moe_hybrid_storage.h" #include "qwen35moe_pipelined_decode.h" -#include "qwen35moe_routing_stats.h" -#include "qwen35moe_swap_manager.h" +#include "../common/moe_hybrid_ffn_eval.h" +#include "../common/moe_hybrid_storage.h" +#include "../common/moe_hybrid_routing_stats.h" +#include "../common/moe_hybrid_swap_manager.h" #include #include @@ -36,17 +36,17 @@ class Qwen35MoeBackend : public Qwen35Backend { void after_target_compute(StepGraph & sg, int kv_start, int n_tokens) override; private: - std::shared_ptr routing_stats_; + std::shared_ptr routing_stats_; std::string routing_stats_out_path_; std::string placement_out_path_; - Qwen35MoeSwapPolicy swap_policy_; + MoeHybridSwapPolicy swap_policy_; bool hybrid_telemetry_ = false; void maybe_post_request_swap(); bool load_dynamic_placement(const char * hotness_path, ggml_backend_t backend, const TargetWeights & w, - Qwen35MoeExpertPlacement & out, + MoeHybridPlacement & out, std::string * err); // Hybrid speculative decode: draft tokens using DFlash draft model, diff --git a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h deleted file mode 100644 index 55f824a03..000000000 --- a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h +++ /dev/null @@ -1,219 +0,0 @@ -// Single-token hybrid qwen35moe FFN evaluation helpers. - -#pragma once - -#include "internal.h" -#include "qwen35moe_hybrid_storage.h" - -#include "ggml-backend.h" - -#include -#include -#include - -namespace dflash::common { - -// GPU-resident residual combine graph: output = residual + hot_out + cold_correction. -// Built once at decode start, reused every layer to keep act_cur on GPU. -struct ResidualCombineGraph { - ggml_context * ctx = nullptr; - ggml_cgraph * gf = nullptr; - ggml_gallocr_t alloc = nullptr; - ggml_tensor * residual_in = nullptr; // [n_embd] F32 input - ggml_tensor * hot_in = nullptr; // [n_embd] F32 input - ggml_tensor * cold_in = nullptr; // [n_embd] F32 input (zeros when no cold) - ggml_tensor * output = nullptr; // [n_embd] F32 output - - ResidualCombineGraph() = default; - ~ResidualCombineGraph() { free(); } - ResidualCombineGraph(const ResidualCombineGraph &) = delete; - ResidualCombineGraph & operator=(const ResidualCombineGraph &) = delete; - ResidualCombineGraph(ResidualCombineGraph && o) noexcept - : ctx(o.ctx), gf(o.gf), alloc(o.alloc), - residual_in(o.residual_in), hot_in(o.hot_in), - cold_in(o.cold_in), output(o.output) { - o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr; - o.residual_in = nullptr; o.hot_in = nullptr; - o.cold_in = nullptr; o.output = nullptr; - } - ResidualCombineGraph & operator=(ResidualCombineGraph && o) noexcept { - if (this != &o) { - free(); - ctx = o.ctx; gf = o.gf; alloc = o.alloc; - residual_in = o.residual_in; hot_in = o.hot_in; - cold_in = o.cold_in; output = o.output; - o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr; - o.residual_in = nullptr; o.hot_in = nullptr; - o.cold_in = nullptr; o.output = nullptr; - } - return *this; - } - bool valid() const { return ctx && gf && alloc && output; } - void free(); - void destroy(); -}; - -// Build the residual combine graph on the given GPU backend. -bool build_residual_combine_graph(ResidualCombineGraph & out, ggml_backend_t backend, int n_embd); - -// GPU-resident state for the decode loop: persistent act_cur + combine graph. -struct GpuResidentState { - ggml_context * ctx = nullptr; - ggml_backend_buffer_t buf = nullptr; - ggml_tensor * act_cur = nullptr; // [n_embd] F32 persistent GPU tensor - - ResidualCombineGraph combine; - - GpuResidentState() = default; - ~GpuResidentState() { destroy(); } - GpuResidentState(const GpuResidentState &) = delete; - GpuResidentState & operator=(const GpuResidentState &) = delete; - GpuResidentState(GpuResidentState && o) noexcept - : ctx(o.ctx), buf(o.buf), act_cur(o.act_cur), - combine(std::move(o.combine)) { - o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr; - } - GpuResidentState & operator=(GpuResidentState && o) noexcept { - if (this != &o) { - destroy(); - ctx = o.ctx; buf = o.buf; act_cur = o.act_cur; - combine = std::move(o.combine); - o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr; - } - return *this; - } - bool valid() const { return ctx && buf && act_cur && combine.valid(); } - void destroy(); -}; - -// Initialize GPU-resident state for decode. -bool init_gpu_resident_state(GpuResidentState & out, ggml_backend_t backend, int n_embd); - -struct Qwen35MoeHybridFfnTelemetry { - uint64_t ffn_wall_us = 0; - uint64_t partition_us = 0; - uint64_t hot_us = 0; - uint64_t cold_us = 0; - uint64_t shared_us = 0; - uint64_t combine_us = 0; - int hot_selected = 0; - int cold_selected = 0; -}; - -bool eval_qwen35moe_reference_ffn_single( - ggml_backend_t gpu_backend, - const TargetWeights & w, - const TargetLayer & L, - const float * cur_host, - const int32_t * selected_ids, - const float * selected_weights, - int n_selected, - std::vector & out, - std::string * err = nullptr); - -bool eval_qwen35moe_hybrid_ffn_single( - ggml_backend_t gpu_backend, - const TargetWeights & w, - const TargetLayer & L, - Qwen35MoeHybridLayerStorage & storage, - ggml_backend_t cpu_backend, - const float * cur_host, - const int32_t * selected_ids, - const float * selected_weights, - int n_selected, - std::vector & out, - Qwen35MoeHybridFfnTelemetry * telemetry = nullptr, - std::string * err = nullptr); - -// Batched prefill FFN: processes n_tokens at once using the full GPU expert tensors. -// Uses pre-computed routing (selected_ids, selected_weights) from the pre-FFN graph. -// cur_host: [n_embd × n_tokens] post-norm hidden states -// selected_ids: [n_expert_used × n_tokens] expert selections (global IDs) -// selected_weights: [n_expert_used × n_tokens] routing weights -// out: [n_embd × n_tokens] output (resized internally) -bool eval_qwen35moe_batched_prefill_ffn( - ggml_backend_t gpu_backend, - const TargetWeights & w, - const TargetLayer & L, - const float * cur_host, - const int32_t * selected_ids, - const float * selected_weights, - int n_tokens, - std::vector & out, - std::string * err = nullptr); - -// Batched hybrid prefill FFN: processes n_tokens at once with hot experts on GPU -// and cold experts on CPU concurrently. Uses pre-computed routing from the pre-FFN -// graph. Falls back to eval_qwen35moe_batched_prefill_ffn when all selected experts -// are hot. -// cur_host: [n_embd × n_tokens] post-norm hidden states (row-major) -// selected_ids: [n_expert_used × n_tokens] expert selections (global IDs) -// selected_weights: [n_expert_used × n_tokens] routing weights -// out: [n_embd × n_tokens] output (resized internally) -bool eval_qwen35moe_hybrid_ffn_batched( - ggml_backend_t gpu_backend, - ggml_backend_t cpu_backend, - const TargetWeights & w, - const TargetLayer & L, - Qwen35MoeHybridLayerStorage & storage, - const float * cur_host, - const int32_t * selected_ids, - const float * selected_weights, - int n_tokens, - std::vector & out, - std::string * err = nullptr); - -// GPU-resident single-token hybrid FFN eval: keeps data on GPU, only reads -// router IDs to CPU for hot/cold partitioning. Uses tensor_copy for GPU→GPU -// transfers instead of round-tripping through host memory. -// ffn_post_gpu: [n_embd] F32 on GPU — the post-attention-norm hidden state -// ffn_residual_gpu: [n_embd] F32 on GPU — the pre-FFN residual -// gpu_state: persistent GPU state with act_cur and combine graph -// After call: gpu_state.act_cur holds the layer output on GPU. -bool eval_qwen35moe_hybrid_ffn_gpu_resident( - ggml_backend_t gpu_backend, - const TargetWeights & w, - const TargetLayer & L, - Qwen35MoeHybridLayerStorage & storage, - ggml_backend_t cpu_backend, - ggml_tensor * ffn_post_gpu, - ggml_tensor * ffn_residual_gpu, - GpuResidentState & gpu_state, - const int32_t * selected_ids, - const float * selected_weights, - int n_selected); - -// Build/rebuild cached hot FFN graph for a given number of hot experts. -bool build_cached_hot_graph( - CachedFfnGraph & out, - ggml_backend_t backend, - ggml_tensor * gate_tensor, - ggml_tensor * up_tensor, - ggml_tensor * down_tensor, - ggml_tensor * gate_up_tensor, - float gate_scale, - float up_scale, - float down_scale, - float gate_up_scale, - const TargetLayer & L, - int n_embd, - int n_ff_exp, - int n_hot); - -// Build/rebuild cached cold FFN graph for a given number of cold experts. -bool build_cached_cold_graph( - CachedFfnGraph & out, - ggml_backend_t cpu_backend, - ggml_tensor * gate_tensor, - ggml_tensor * up_tensor, - ggml_tensor * down_tensor, - ggml_tensor * gate_up_tensor, - float gate_scale, - float up_scale, - float down_scale, - float gate_up_scale, - int n_embd, - int n_ff_exp, - int n_cold); - -} // namespace dflash::common diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp index 78039e4ad..e89f9d057 100644 --- a/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp +++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.cpp @@ -3,6 +3,8 @@ #include "qwen35moe_pipelined_decode.h" +#include "../common/moe_hybrid_types_impl.h" + #include "ggml-alloc.h" #include "ggml-backend.h" @@ -31,6 +33,7 @@ void CachedPrefnGraph::free() { moe_weights = nullptr; } + // Build a cached pre-FFN graph for a DeltaNet layer. // DeltaNet layers have no kv_start-dependent views — the graph structure is // identical across tokens. We build once and reuse by updating inp_embed data. @@ -96,6 +99,8 @@ static bool build_cached_deltanet_prefn( void PipelinedDecodeState::destroy() { for (auto & cpg : cached_prefn) cpg.free(); cached_prefn.clear(); + for (auto & rff : cached_routed_ffn) rff.free(); + cached_routed_ffn.clear(); gpu_state.destroy(); routing_ids_buf.clear(); routing_weights_buf.clear(); @@ -109,6 +114,7 @@ bool init_pipelined_decode_state( ggml_backend_t backend, const TargetWeights & w, TargetCache & cache, + MoeHybridStorage & hybrid, int kv_start, int kq_stride_pad) { @@ -129,27 +135,90 @@ bool init_pipelined_decode_state( out.routing_weights_buf.resize((size_t)w.n_expert_used); out.ffn_post_host_buf.resize((size_t)w.n_embd); - // Build cached pre-FFN graphs for DeltaNet layers + // Check if routed FFN pipeline is disabled + const bool routed_disabled = (std::getenv("DFLASH_QWEN35MOE_NO_ROUTED") != nullptr); + + // Cold experts are computed on the cold backend (CPU/Halo) by default. + // Set DFLASH_DROP_COLD=1 to skip cold computation (fast but lossy). + out.cold_compute = (std::getenv("DFLASH_DROP_COLD") == nullptr); + + // Build cached pre-FFN graphs for all DeltaNet layers. out.cached_prefn.resize((size_t)w.n_layer); - int cached_count = 0; + int cached_prefn_count = 0; for (int il = 0; il < w.n_layer; ++il) { const bool is_attn = (((il + 1) % w.full_attention_interval) == 0); if (!is_attn) { - // DeltaNet layer: cache the graph if (!build_cached_deltanet_prefn( out.cached_prefn[(size_t)il], backend, w, cache, il, kv_start, kq_stride_pad)) { std::fprintf(stderr, "[pipelined] failed to cache DeltaNet prefn for layer %d\n", il); - // Non-fatal: will fall back to dynamic build for this layer } else { - cached_count++; + cached_prefn_count++; } } - // Attention layers: cached_prefn[il] remains invalid (rebuilt per-token) } - out.cold_in_zeroed = true; - // cold_in was already zeroed in init_gpu_resident_state + // Build cached routed FFN graphs for ALL layers (StreamMoE-inspired pipeline). + // Includes attention layers — eliminates expensive split-path FFN for mixed layers. + // Cold entries get weight=0 at runtime, contributing nothing to output. + out.cached_routed_ffn.resize((size_t)w.n_layer); + int routed_count = 0; + if (!routed_disabled) { + for (int il = 0; il < w.n_layer; ++il) { + if ((size_t)il >= hybrid.layers.size()) continue; + + auto & storage = hybrid.layers[(size_t)il]; + const TargetLayer & L = w.layers[(size_t)il]; + + if (!build_cached_hot_graph( + out.cached_routed_ffn[(size_t)il], backend, + storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot, + L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, + make_moe_layer_desc(L), w.n_embd, w.n_ff_exp, w.n_expert_used)) { + // Non-fatal: fall back to split path for this layer + } else { + routed_count++; + } + } + } + std::fprintf(stderr, "[pipelined] cached %d prefn + %d routed FFN graphs%s\n", + cached_prefn_count, routed_count, + out.cold_compute ? "" : " (drop_cold=lossy)"); + + // Initialize fused cold FFN compute (bypasses ggml graph dispatch) + if (out.cold_compute) { + out.cold_ffn_compute = make_cpu_cold_ffn_compute(w.n_ff_exp); + out.cold_ffn_layers.resize((size_t)w.n_layer); + out.cold_output_buf.resize((size_t)w.n_embd); + for (int il = 0; il < w.n_layer && (size_t)il < hybrid.layers.size(); ++il) { + auto & storage = hybrid.layers[(size_t)il]; + const TargetLayer & L = w.layers[(size_t)il]; + auto & cl = out.cold_ffn_layers[(size_t)il]; + cl.fused_gate_up = (storage.gate_up_cold != nullptr); + if (cl.fused_gate_up) { + cl.gate_up_data = storage.gate_up_cold ? storage.gate_up_cold->data : nullptr; + cl.gate_up_stride = storage.gate_up_cold ? storage.gate_up_cold->nb[2] : 0; + cl.gate_up_type = storage.gate_up_cold ? storage.gate_up_cold->type : GGML_TYPE_Q4_K; + cl.gate_up_scale = L.ffn_gate_up_exps_s; + } else { + cl.gate_data = storage.gate_cold ? storage.gate_cold->data : nullptr; + cl.up_data = storage.up_cold ? storage.up_cold->data : nullptr; + cl.gate_stride = storage.gate_cold ? storage.gate_cold->nb[2] : 0; + cl.up_stride = storage.up_cold ? storage.up_cold->nb[2] : 0; + cl.gate_type = storage.gate_cold ? storage.gate_cold->type : GGML_TYPE_Q4_K; + cl.up_type = storage.up_cold ? storage.up_cold->type : GGML_TYPE_Q4_K; + cl.gate_scale = L.ffn_gate_exps_s; + cl.up_scale = L.ffn_up_exps_s; + } + cl.down_data = storage.down_cold ? storage.down_cold->data : nullptr; + cl.down_stride = storage.down_cold ? storage.down_cold->nb[2] : 0; + cl.down_type = storage.down_cold ? storage.down_cold->type : GGML_TYPE_Q4_K; + cl.down_scale = L.ffn_down_exps_s; + } + std::fprintf(stderr, "[pipelined] cold FFN: fused kernel (bypasses ggml graph)\n"); + } + + out.cold_in_zeroed = true; return true; } @@ -160,7 +229,7 @@ bool pipelined_decode_one_token( ggml_backend_t backend, const TargetWeights & w, TargetCache & cache, - Qwen35MoeHybridStorage & hybrid, + MoeHybridStorage & hybrid, int kv_pos, int kq_stride_pad, PipelinedDecodeTelemetry * tel) { @@ -171,16 +240,7 @@ bool pipelined_decode_one_token( ggml_backend_t cpu_be = hybrid.cpu_backend; if (tel) { - tel->total_us = 0; - tel->prefn_graph_build_us = 0; - tel->prefn_compute_us = 0; - tel->routing_readback_us = 0; - tel->ffn_us = 0; - tel->ffn_allhot_us = 0; - tel->ffn_mixed_us = 0; - tel->allhot_layers = 0; - tel->mixed_layers = 0; - tel->total_layers = 0; + *tel = PipelinedDecodeTelemetry{}; } const auto tok_t0 = PipelineClock::now(); @@ -190,6 +250,158 @@ bool pipelined_decode_one_token( const bool is_attn = (((il + 1) % state.full_attention_interval) == 0); const auto prefn_build_t0 = PipelineClock::now(); + // ══════════════════════════════════════════════════════════════════════ + // ROUTED FFN FAST PATH (StreamMoE-inspired async pipeline): + // prefn(async) → sync → routing readback → rffn(async) + cold(CPU parallel) → combine(async) + // Handles both all-hot and mixed layers. Cold compute runs on CPU + // in parallel with GPU rffn — zero overhead when all experts are hot. + // ══════════════════════════════════════════════════════════════════════ + if (!is_attn + && state.cached_prefn[(size_t)il].valid() + && state.cached_routed_ffn[(size_t)il].valid()) { + + auto & cpg = state.cached_prefn[(size_t)il]; + auto & rffn = state.cached_routed_ffn[(size_t)il]; + + // 1. Copy act_cur → prefn input (GPU→GPU async) + ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed); + + if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now()); + + // 2. Run prefn graph (DeltaNet + router) + const auto prefn_compute_t0 = PipelineClock::now(); + ggml_backend_graph_compute_async(backend, cpg.gf); + + // 3. Sync to read routing decisions from prefn output + const auto sync_t0 = PipelineClock::now(); + ggml_backend_synchronize(backend); + const auto sync_t1 = PipelineClock::now(); + + // Read routing decisions from GPU + int32_t global_ids[8]; + float router_weights[8]; + ggml_backend_tensor_get(cpg.moe_selected, global_ids, 0, + sizeof(int32_t) * (size_t)n_expert_used); + ggml_backend_tensor_get(cpg.moe_weights, router_weights, 0, + sizeof(float) * (size_t)n_expert_used); + const auto readback_t1 = PipelineClock::now(); + + // CPU-side local ID mapping + cold partition (trivial: 8 lookups) + auto & storage = hybrid.layers[(size_t)il]; + int32_t local_ids[8]; + float masked_weights[8]; + int32_t cold_ids[8]; + float cold_weights[8]; + int n_cold = 0; + int layer_cold_hits = 0; + for (int i = 0; i < n_expert_used; ++i) { + int32_t gid = global_ids[i]; + int32_t lid = (gid >= 0 && gid < (int)storage.hot_local_by_global.size()) + ? storage.hot_local_by_global[(size_t)gid] : -1; + if (lid >= 0) { + local_ids[i] = lid; + masked_weights[i] = router_weights[i]; + } else { + local_ids[i] = 0; // safe: maps to expert 0 (result zeroed by weight) + masked_weights[i] = 0.0f; // cold expert contributes nothing to hot path + layer_cold_hits++; + // Record for cold compute + if (state.cold_ffn_compute && gid >= 0 && gid < (int)storage.cold_local_by_global.size()) { + int32_t cold_local = storage.cold_local_by_global[(size_t)gid]; + if (cold_local >= 0) { + cold_ids[n_cold] = cold_local; + cold_weights[n_cold] = router_weights[i]; + n_cold++; + } + } + } + } + const bool has_cold_selected = (n_cold > 0); + const auto remap_t1 = PipelineClock::now(); + + // D2H ffn_post for cold compute (GPU already synced, data is ready) + if (has_cold_selected) { + ggml_backend_tensor_get(cpg.ffn_post, state.ffn_post_host_buf.data(), 0, + sizeof(float) * (size_t)n_embd); + } + + // Upload pre-computed inputs to rffn graph (H→D async on compute stream) + ggml_backend_tensor_set_async(backend, rffn.ids, local_ids, 0, + sizeof(int32_t) * (size_t)n_expert_used); + ggml_backend_tensor_set_async(backend, rffn.weights, masked_weights, 0, + sizeof(float) * (size_t)n_expert_used); + // Copy ffn_post from prefn output → rffn input (GPU→GPU, already synced) + ggml_backend_tensor_copy_async(backend, backend, cpg.ffn_post, rffn.inp); + + // 4. Copy residual to combine input (async) + ggml_backend_tensor_copy_async(backend, backend, cpg.ffn_residual, state.gpu_state.combine.residual_in); + + // 5. Run routed FFN graph (async — mul_mat_id + shared expert) + ggml_backend_graph_compute_async(backend, rffn.gf); + + // 6. Cold compute on CPU (parallel with GPU rffn above) + const auto cold_t0 = PipelineClock::now(); + if (has_cold_selected) { + state.cold_ffn_compute->compute( + state.cold_ffn_layers[(size_t)il], + state.ffn_post_host_buf.data(), + cold_ids, cold_weights, n_cold, + n_embd, w.n_ff_exp, + state.cold_output_buf.data()); + } + if (tel && has_cold_selected) tel->cold_compute_us += pipe_elapsed_us(cold_t0, PipelineClock::now()); + + // 7. Copy FFN output → combine.hot_in (async, ordered after FFN on GPU stream) + ggml_backend_tensor_copy_async(backend, backend, rffn.output, state.gpu_state.combine.hot_in); + + // 8. Upload cold result or ensure cold_in is zero + if (has_cold_selected) { + ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, + state.cold_output_buf.data(), 0, + sizeof(float) * (size_t)n_embd); + state.cold_in_zeroed = false; + } else if (!state.cold_in_zeroed) { + static float zeros[8192] = {}; + ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0, + sizeof(float) * (size_t)n_embd); + state.cold_in_zeroed = true; + } + + // 9. Run combine graph (async — adds residual + hot + cold) + ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf); + + // 10. Copy combine output → act_cur for next layer (async) + ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur); + + if (tel) { + tel->prefn_compute_us += pipe_elapsed_us(prefn_compute_t0, PipelineClock::now()); + tel->routed_prefn_us += pipe_elapsed_us(prefn_compute_t0, sync_t0); + tel->routed_sync_us += pipe_elapsed_us(sync_t0, sync_t1); + tel->routed_readback_us += pipe_elapsed_us(sync_t1, readback_t1); + tel->routed_cpu_remap_us += pipe_elapsed_us(readback_t1, remap_t1); + tel->routed_ffn_dispatch_us += pipe_elapsed_us(remap_t1, PipelineClock::now()); + tel->routed_cold_expert_hits += layer_cold_hits; + tel->routed_total_expert_slots += n_expert_used; + if (has_cold_selected) { + tel->mixed_layers++; + } else { + tel->allhot_layers++; + } + tel->total_layers++; + tel->routed_ffn_layers++; + } + continue; + } + + // ══════════════════════════════════════════════════════════════════════ + // SPLIT PATH: separate prefn + routing readback + FFN (original logic) + // Used for attention layers or layers without routed FFN graph. + // ══════════════════════════════════════════════════════════════════════ + + // Sync any pending async work before entering the split path + // (split path needs synchronous access to GPU data) + ggml_backend_synchronize(backend); + ggml_tensor * ffn_post_gpu = nullptr; ggml_tensor * ffn_residual_gpu = nullptr; ggml_tensor * moe_selected_tensor = nullptr; @@ -203,11 +415,11 @@ bool pipelined_decode_one_token( step_graph_destroy(dyn_sg); return false; } - // Copy act_cur to graph input (GPU→GPU) - ggml_backend_tensor_copy(state.gpu_state.act_cur, dyn_sg.inp_embed); + // Copy act_cur to graph input (GPU→GPU) — async on compute stream + ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, dyn_sg.inp_embed); if (dyn_sg.positions) { int32_t pos4[4] = {kv_pos, kv_pos, kv_pos, 0}; - ggml_backend_tensor_set(dyn_sg.positions, pos4, 0, sizeof(pos4)); + ggml_backend_tensor_set_async(backend, dyn_sg.positions, pos4, 0, sizeof(pos4)); } if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now()); @@ -228,7 +440,8 @@ bool pipelined_decode_one_token( } else { // DeltaNet layer: reuse cached graph, just update input auto & cpg = state.cached_prefn[(size_t)il]; - ggml_backend_tensor_copy(state.gpu_state.act_cur, cpg.inp_embed); + // Async copy on compute stream — ordered before next graph_compute + ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.act_cur, cpg.inp_embed); if (tel) tel->prefn_graph_build_us += pipe_elapsed_us(prefn_build_t0, PipelineClock::now()); @@ -244,19 +457,122 @@ bool pipelined_decode_one_token( } // ── Read routing decisions (tiny: 32 + 32 bytes) ── + // Use get_async + single sync instead of 2 separate sync tensor_gets. + // After graph_compute (SYNC) above, data is ready — just need D2H copy. const auto routing_t0 = PipelineClock::now(); if (!moe_selected_tensor || !moe_weights_tensor) return false; - ggml_backend_tensor_get(moe_selected_tensor, state.routing_ids_buf.data(), 0, + ggml_backend_tensor_get_async(backend, moe_selected_tensor, state.routing_ids_buf.data(), 0, sizeof(int32_t) * (size_t)n_expert_used); - ggml_backend_tensor_get(moe_weights_tensor, state.routing_weights_buf.data(), 0, + ggml_backend_tensor_get_async(backend, moe_weights_tensor, state.routing_weights_buf.data(), 0, sizeof(float) * (size_t)n_expert_used); + ggml_backend_synchronize(backend); if (tel) tel->routing_readback_us += pipe_elapsed_us(routing_t0, PipelineClock::now()); - // ── FFN: hot/cold partition + compute ── + // ── FFN: use routed FFN (cold-masking) if graph available, else split path ── const auto ffn_t0 = PipelineClock::now(); auto & storage = hybrid.layers[(size_t)il]; const auto & L = w.layers[(size_t)il]; + // Try routed FFN path for this layer (works for attention layers too) + // Handles cold experts inline — cold compute runs parallel with GPU rffn. + auto & rffn = state.cached_routed_ffn[(size_t)il]; + if (rffn.valid()) { + // Partition hot/cold: remap global→local, zero cold weights for hot path + int32_t local_ids[8]; + float masked_weights[8]; + int32_t cold_ids[8]; + float cold_weights[8]; + int n_cold = 0; + int layer_cold_hits = 0; + for (int i = 0; i < n_expert_used; ++i) { + int32_t gid = state.routing_ids_buf[(size_t)i]; + int32_t lid = (gid >= 0 && gid < (int)storage.hot_local_by_global.size()) + ? storage.hot_local_by_global[(size_t)gid] : -1; + if (lid >= 0) { + local_ids[i] = lid; + masked_weights[i] = state.routing_weights_buf[(size_t)i]; + } else { + local_ids[i] = 0; + masked_weights[i] = 0.0f; + layer_cold_hits++; + if (state.cold_ffn_compute && gid >= 0 && gid < (int)storage.cold_local_by_global.size()) { + int32_t cold_local = storage.cold_local_by_global[(size_t)gid]; + if (cold_local >= 0) { + cold_ids[n_cold] = cold_local; + cold_weights[n_cold] = state.routing_weights_buf[(size_t)i]; + n_cold++; + } + } + } + } + const bool has_cold_selected = (n_cold > 0); + + // D2H ffn_post for cold compute (GPU already synced after routing readback) + if (has_cold_selected) { + ggml_backend_tensor_get(ffn_post_gpu, state.ffn_post_host_buf.data(), 0, + sizeof(float) * (size_t)n_embd); + } + + // Upload IDs + weights, copy inputs, dispatch rffn (all async) + ggml_backend_tensor_set_async(backend, rffn.ids, local_ids, 0, + sizeof(int32_t) * (size_t)n_expert_used); + ggml_backend_tensor_set_async(backend, rffn.weights, masked_weights, 0, + sizeof(float) * (size_t)n_expert_used); + ggml_backend_tensor_copy_async(backend, backend, ffn_post_gpu, rffn.inp); + ggml_backend_tensor_copy_async(backend, backend, ffn_residual_gpu, state.gpu_state.combine.residual_in); + ggml_backend_graph_compute_async(backend, rffn.gf); + + // Cold compute on CPU (parallel with GPU rffn above) + const auto cold_t0 = PipelineClock::now(); + if (has_cold_selected) { + state.cold_ffn_compute->compute( + state.cold_ffn_layers[(size_t)il], + state.ffn_post_host_buf.data(), + cold_ids, cold_weights, n_cold, + n_embd, w.n_ff_exp, + state.cold_output_buf.data()); + } + if (tel && has_cold_selected) tel->cold_compute_us += pipe_elapsed_us(cold_t0, PipelineClock::now()); + + // Copy hot result → combine input (async, ordered after rffn on GPU stream) + ggml_backend_tensor_copy_async(backend, backend, rffn.output, state.gpu_state.combine.hot_in); + + // Upload cold result or ensure cold_in is zero + if (has_cold_selected) { + ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, + state.cold_output_buf.data(), 0, + sizeof(float) * (size_t)n_embd); + state.cold_in_zeroed = false; + } else if (!state.cold_in_zeroed) { + static float zeros[8192] = {}; + ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0, + sizeof(float) * (size_t)n_embd); + state.cold_in_zeroed = true; + } + + ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf); + ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur); + + if (tel) { + uint64_t ffn_layer_us = pipe_elapsed_us(ffn_t0, PipelineClock::now()); + tel->ffn_us += ffn_layer_us; + tel->total_layers++; + tel->routed_ffn_layers++; + if (has_cold_selected) { + tel->mixed_layers++; + tel->ffn_mixed_us += ffn_layer_us; + } else { + tel->allhot_layers++; + tel->ffn_allhot_us += ffn_layer_us; + } + tel->routed_cold_expert_hits += layer_cold_hits; + tel->routed_total_expert_slots += n_expert_used; + } + continue; + } + + // ── Fallback: full split path (no routed FFN graph for this layer) ── + // Partition into hot/cold (fast: just a lookup table scan, ~8 iterations) int n_hot = 0, n_cold = 0; int32_t hot_ids[8], cold_ids[8]; @@ -287,99 +603,125 @@ bool pipelined_decode_one_token( // ── Read ffn_post to CPU NOW (before hot launch) ── // The routing readback above already synced the GPU stream, so ffn_post // is guaranteed ready. Reading it here avoids a sync AFTER hot launch. + const auto tensor_io_t0 = PipelineClock::now(); if (has_cold) { ggml_backend_tensor_get(ffn_post_gpu, state.ffn_post_host_buf.data(), 0, sizeof(float) * (size_t)n_embd); } + if (tel) tel->ffn_post_get_us += pipe_elapsed_us(tensor_io_t0, PipelineClock::now()); + - // ── GPU→GPU: copy residual to combine input ── - ggml_backend_tensor_copy(ffn_residual_gpu, state.gpu_state.combine.residual_in); + // ── GPU→GPU: copy residual to combine input (async on compute stream) ── + ggml_backend_tensor_copy_async(backend, backend, ffn_residual_gpu, state.gpu_state.combine.residual_in); // ── Prepare + launch hot graph (async — returns immediately) ── bool hot_async_launched = false; if (has_hot || has_shared) { if (!storage.hot_graph.valid() || storage.hot_graph.n_hot != n_hot) { + const auto hbuild_t0 = PipelineClock::now(); build_cached_hot_graph(storage.hot_graph, backend, storage.gate_hot, storage.up_hot, storage.down_hot, storage.gate_up_hot, L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - L, n_embd, w.n_ff_exp, n_hot); + make_moe_layer_desc(L), n_embd, w.n_ff_exp, n_hot); + if (tel) { tel->hot_graph_build_us += pipe_elapsed_us(hbuild_t0, PipelineClock::now()); tel->hot_graph_rebuilds++; } } if (storage.hot_graph.valid() && storage.hot_graph.n_hot == n_hot) { - ggml_backend_tensor_copy(ffn_post_gpu, storage.hot_graph.inp); + // All setup on compute stream — no per-op cudaStreamSynchronize + ggml_backend_tensor_copy_async(backend, backend, ffn_post_gpu, storage.hot_graph.inp); if (storage.hot_graph.ids && has_hot) { - ggml_backend_tensor_set(storage.hot_graph.ids, hot_ids, 0, - sizeof(int32_t) * (size_t)n_hot); + ggml_backend_tensor_set_async(backend, storage.hot_graph.ids, hot_ids, 0, + sizeof(int32_t) * (size_t)n_hot); } if (storage.hot_graph.weights && has_hot) { - ggml_backend_tensor_set(storage.hot_graph.weights, hot_weights, 0, - sizeof(float) * (size_t)n_hot); + ggml_backend_tensor_set_async(backend, storage.hot_graph.weights, hot_weights, 0, + sizeof(float) * (size_t)n_hot); } - // Launch hot GPU async — no sync until combine + // Launch hot GPU async — queued after copies on same stream ggml_backend_graph_compute_async(backend, storage.hot_graph.gf); hot_async_launched = true; } } + if (tel) tel->tensor_io_us += pipe_elapsed_us(tensor_io_t0, PipelineClock::now()); // ── Cold path: runs on CPU IN PARALLEL with hot GPU ── + const auto cold_t0 = PipelineClock::now(); if (has_cold) { // ffn_post already read above (before hot launch) — no GPU sync here! - if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) { - build_cached_cold_graph(storage.cold_graph, cpu_be, - storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold, - L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, - n_embd, w.n_ff_exp, n_cold); - } - if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) { - ggml_backend_tensor_set(storage.cold_graph.inp, state.ffn_post_host_buf.data(), 0, - sizeof(float) * (size_t)n_embd); - ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids, 0, - sizeof(int32_t) * (size_t)n_cold); - ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights, 0, - sizeof(float) * (size_t)n_cold); - // CPU cold compute — hot GPU runs concurrently on its stream - auto cst = ggml_backend_graph_compute(cpu_be, storage.cold_graph.gf); - if (cst != GGML_STATUS_SUCCESS) { + const auto cold_compute_t0 = PipelineClock::now(); + if (state.cold_ffn_compute) { + // Fused kernel: bypass ggml graph dispatch entirely + state.cold_ffn_compute->compute( + state.cold_ffn_layers[(size_t)il], + state.ffn_post_host_buf.data(), + cold_ids, + cold_weights, + n_cold, n_embd, w.n_ff_exp, + state.cold_output_buf.data()); + } else { + // Fallback: ggml cold graph (legacy path) + if (!storage.cold_graph.valid() || storage.cold_graph.n_hot != n_cold) { + build_cached_cold_graph(storage.cold_graph, cpu_be, + storage.gate_cold, storage.up_cold, storage.down_cold, storage.gate_up_cold, + L.ffn_gate_exps_s, L.ffn_up_exps_s, L.ffn_down_exps_s, L.ffn_gate_up_exps_s, + n_embd, w.n_ff_exp, n_cold); + } + if (storage.cold_graph.valid() && storage.cold_graph.n_hot == n_cold) { + ggml_backend_tensor_set(storage.cold_graph.inp, state.ffn_post_host_buf.data(), 0, + sizeof(float) * (size_t)n_embd); + ggml_backend_tensor_set(storage.cold_graph.ids, cold_ids, 0, + sizeof(int32_t) * (size_t)n_cold); + ggml_backend_tensor_set(storage.cold_graph.weights, cold_weights, 0, + sizeof(float) * (size_t)n_cold); + auto cst = ggml_backend_graph_compute(cpu_be, storage.cold_graph.gf); + if (cst != GGML_STATUS_SUCCESS) { + if (hot_async_launched) ggml_backend_synchronize(backend); + return false; + } + } else { if (hot_async_launched) ggml_backend_synchronize(backend); return false; } - } else { - if (hot_async_launched) ggml_backend_synchronize(backend); - return false; } + if (tel) tel->cold_compute_us += pipe_elapsed_us(cold_compute_t0, PipelineClock::now()); } + if (tel) tel->cold_cpu_us += pipe_elapsed_us(cold_t0, PipelineClock::now()); - // ── Sync hot GPU (only now — after cold CPU finished) ── + // ── Combine: queue on compute stream (no explicit sync needed) ── + const auto combine_t0 = PipelineClock::now(); if (hot_async_launched) { - ggml_backend_synchronize(backend); - ggml_backend_tensor_copy(storage.hot_graph.output, state.gpu_state.combine.hot_in); + ggml_backend_tensor_copy_async(backend, backend, storage.hot_graph.output, state.gpu_state.combine.hot_in); } else { float zeros[8192]; std::memset(zeros, 0, sizeof(float) * (size_t)n_embd); - ggml_backend_tensor_set(state.gpu_state.combine.hot_in, zeros, 0, - sizeof(float) * (size_t)n_embd); + ggml_backend_tensor_set_async(backend, state.gpu_state.combine.hot_in, zeros, 0, + sizeof(float) * (size_t)n_embd); } - // ── Upload cold result (or keep zeros) ── if (has_cold) { - ggml_backend_tensor_get(storage.cold_graph.output, state.ffn_post_host_buf.data(), 0, - sizeof(float) * (size_t)n_embd); - ggml_backend_tensor_set(state.gpu_state.combine.cold_in, state.ffn_post_host_buf.data(), 0, - sizeof(float) * (size_t)n_embd); + const float * cold_result = state.cold_ffn_compute + ? state.cold_output_buf.data() + : nullptr; + if (!cold_result) { + // Legacy path: read from ggml tensor + ggml_backend_tensor_get(storage.cold_graph.output, state.ffn_post_host_buf.data(), 0, + sizeof(float) * (size_t)n_embd); + cold_result = state.ffn_post_host_buf.data(); + } + ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, cold_result, 0, + sizeof(float) * (size_t)n_embd); state.cold_in_zeroed = false; } else if (!state.cold_in_zeroed) { float zeros[8192]; std::memset(zeros, 0, sizeof(float) * (size_t)n_embd); - ggml_backend_tensor_set(state.gpu_state.combine.cold_in, zeros, 0, - sizeof(float) * (size_t)n_embd); + ggml_backend_tensor_set_async(backend, state.gpu_state.combine.cold_in, zeros, 0, + sizeof(float) * (size_t)n_embd); state.cold_in_zeroed = true; } - // ── Combine: output = residual + hot + cold ── - auto cst = ggml_backend_graph_compute(backend, state.gpu_state.combine.gf); - if (cst != GGML_STATUS_SUCCESS) return false; + ggml_backend_graph_compute_async(backend, state.gpu_state.combine.gf); - // ── Copy combine output to persistent act_cur ── - ggml_backend_tensor_copy(state.gpu_state.combine.output, state.gpu_state.act_cur); + ggml_backend_tensor_copy_async(backend, backend, state.gpu_state.combine.output, state.gpu_state.act_cur); + if (tel) tel->combine_overhead_us += pipe_elapsed_us(combine_t0, PipelineClock::now()); const auto ffn_t1 = PipelineClock::now(); if (tel) { @@ -398,8 +740,17 @@ bool pipelined_decode_one_token( step_graph_destroy(dyn_sg); + // Sync the compute stream before returning — caller needs act_cur on CPU. + // All async ops (combine + copy) from the last layer must complete. + const auto final_sync_t0 = PipelineClock::now(); + ggml_backend_synchronize(backend); + if (tel) { + tel->routed_final_sync_us = pipe_elapsed_us(final_sync_t0, PipelineClock::now()); tel->total_us = pipe_elapsed_us(tok_t0, PipelineClock::now()); + // GPU idle = time in tensor I/O + routing readback + combine overhead + // (these are all periods where GPU compute stream is idle) + tel->gpu_idle_us = tel->tensor_io_us + tel->routing_readback_us + tel->combine_overhead_us; } return true; } diff --git a/server/src/qwen35moe/qwen35moe_pipelined_decode.h b/server/src/qwen35moe/qwen35moe_pipelined_decode.h index bc03264eb..330d72f7b 100644 --- a/server/src/qwen35moe/qwen35moe_pipelined_decode.h +++ b/server/src/qwen35moe/qwen35moe_pipelined_decode.h @@ -10,13 +10,15 @@ #pragma once #include "internal.h" -#include "qwen35moe_hybrid_ffn_eval.h" -#include "qwen35moe_hybrid_storage.h" +#include "../common/moe_hybrid_ffn_eval.h" +#include "../common/moe_hybrid_storage.h" +#include "../common/cold_ffn_compute.h" #include "graph_builders.h" #include "ggml-backend.h" #include +#include #include namespace dflash::common { @@ -65,9 +67,30 @@ struct PipelinedDecodeTelemetry { uint64_t ffn_us = 0; uint64_t ffn_allhot_us = 0; uint64_t ffn_mixed_us = 0; + // GPU utilization diagnosis: time the GPU is idle waiting for CPU + uint64_t gpu_idle_us = 0; // total GPU idle (tensor_io + combine_overhead + sync_wait) + uint64_t tensor_io_us = 0; // hot path setup: D2H readback + GPU copies + kernel launch + uint64_t combine_overhead_us = 0; // combine graph dispatch + copy + uint64_t cold_cpu_us = 0; // cold path total (graph build + ggml CPU compute) + uint64_t cold_compute_us = 0; // just ggml_backend_graph_compute(cpu_be) time + uint64_t hot_graph_build_us = 0; // hot graph rebuild (only when n_hot changes) + uint64_t ffn_post_get_us = 0; // D2H readback of ffn_post for cold path + uint64_t sync_wait_us = 0; // time in ggml_backend_synchronize (waiting for GPU) int allhot_layers = 0; int mixed_layers = 0; int total_layers = 0; + int hot_graph_rebuilds = 0; // count of hot graph rebuilds + int routed_ffn_layers = 0; // layers handled by routed FFN (async pipeline) + + // ── Routed path breakdown (StreamMoE fast path) ── + uint64_t routed_prefn_us = 0; // prefn graph compute (async dispatch + sync) + uint64_t routed_sync_us = 0; // GPU sync stall waiting for prefn + uint64_t routed_readback_us = 0; // D2H readback of routing IDs + weights + uint64_t routed_cpu_remap_us = 0; // CPU-side local ID mapping + cold masking + uint64_t routed_ffn_dispatch_us = 0;// FFN graph dispatch + combine (async) + uint64_t routed_final_sync_us = 0; // final sync at end of token (if measured) + int routed_cold_expert_hits = 0; // experts masked (weight=0) in routed path + int routed_total_expert_slots = 0; // total expert slots processed }; // State for pipelined decode: holds cached DeltaNet pre-FFN graphs + @@ -79,6 +102,10 @@ struct PipelinedDecodeState { // Attention layers (every full_attention_interval-th) are nullptr (rebuilt each token) std::vector cached_prefn; + // Cached routed FFN graphs for DeltaNet layers (layer index → graph) + // StreamMoE-inspired: reads routing from GPU, eliminates CPU sync. + std::vector cached_routed_ffn; + // Persistent host buffers (avoid per-layer allocation) std::vector routing_ids_buf; std::vector routing_weights_buf; @@ -87,6 +114,16 @@ struct PipelinedDecodeState { // Persistent zero buffer for cold_in (set once at init) bool cold_in_zeroed = false; + // When true (default), cold experts are computed on the cold backend + // (CPU/Halo) instead of being dropped via cold-masking. Exact but slower. + // Set DFLASH_DROP_COLD=1 to disable (fast but lossy). + bool cold_compute = true; + + // Fused cold FFN compute (bypasses ggml graph dispatch overhead) + std::unique_ptr cold_ffn_compute; + std::vector cold_ffn_layers; // per-layer cold weight metadata + std::vector cold_output_buf; // [n_embd] scratch for cold FFN output + // Tracking int n_layer = 0; int n_embd = 0; @@ -100,10 +137,15 @@ struct PipelinedDecodeState { PipelinedDecodeState(PipelinedDecodeState && o) noexcept : gpu_state(std::move(o.gpu_state)), cached_prefn(std::move(o.cached_prefn)), + cached_routed_ffn(std::move(o.cached_routed_ffn)), routing_ids_buf(std::move(o.routing_ids_buf)), routing_weights_buf(std::move(o.routing_weights_buf)), ffn_post_host_buf(std::move(o.ffn_post_host_buf)), cold_in_zeroed(o.cold_in_zeroed), + cold_compute(o.cold_compute), + cold_ffn_compute(std::move(o.cold_ffn_compute)), + cold_ffn_layers(std::move(o.cold_ffn_layers)), + cold_output_buf(std::move(o.cold_output_buf)), n_layer(o.n_layer), n_embd(o.n_embd), n_expert_used(o.n_expert_used), full_attention_interval(o.full_attention_interval) { @@ -114,10 +156,15 @@ struct PipelinedDecodeState { destroy(); gpu_state = std::move(o.gpu_state); cached_prefn = std::move(o.cached_prefn); + cached_routed_ffn = std::move(o.cached_routed_ffn); routing_ids_buf = std::move(o.routing_ids_buf); routing_weights_buf = std::move(o.routing_weights_buf); ffn_post_host_buf = std::move(o.ffn_post_host_buf); cold_in_zeroed = o.cold_in_zeroed; + cold_compute = o.cold_compute; + cold_ffn_compute = std::move(o.cold_ffn_compute); + cold_ffn_layers = std::move(o.cold_ffn_layers); + cold_output_buf = std::move(o.cold_output_buf); n_layer = o.n_layer; n_embd = o.n_embd; n_expert_used = o.n_expert_used; full_attention_interval = o.full_attention_interval; @@ -136,6 +183,7 @@ bool init_pipelined_decode_state( ggml_backend_t backend, const TargetWeights & w, TargetCache & cache, + MoeHybridStorage & hybrid, int kv_start, // initial KV position for graph caching int kq_stride_pad); @@ -147,7 +195,7 @@ bool pipelined_decode_one_token( ggml_backend_t backend, const TargetWeights & w, TargetCache & cache, - Qwen35MoeHybridStorage & hybrid, + MoeHybridStorage & hybrid, int kv_pos, // current KV position int kq_stride_pad, PipelinedDecodeTelemetry * telemetry = nullptr); diff --git a/server/src/qwen35moe/qwen35moe_swap_manager.h b/server/src/qwen35moe/qwen35moe_swap_manager.h deleted file mode 100644 index 1acacedd9..000000000 --- a/server/src/qwen35moe/qwen35moe_swap_manager.h +++ /dev/null @@ -1,37 +0,0 @@ -// Request-boundary swap planning for qwen35moe expert placement. - -#pragma once - -#include "qwen35moe_expert_placement.h" - -#include -#include -#include - -namespace dflash::common { - -struct Qwen35MoeSwapAction { - int layer_idx = -1; - int evict_expert = -1; - int promote_expert = -1; - uint64_t evict_count = 0; - uint64_t promote_count = 0; -}; - -struct Qwen35MoeSwapPlan { - Qwen35MoeExpertPlacement next_placement; - std::vector actions; -}; - -struct Qwen35MoeSwapPolicy { - int max_swaps_total = 0; // 0 = no swaps - uint64_t min_promote_gain = 1; // promoted expert count must exceed evicted by at least this amount -}; - -bool build_qwen35moe_swap_plan(const Qwen35MoeExpertPlacement & current, - const Qwen35MoeRoutingStats & stats, - const Qwen35MoeSwapPolicy & policy, - Qwen35MoeSwapPlan & out, - std::string * err = nullptr); - -} // namespace dflash::common diff --git a/server/test/test_dflash.cpp b/server/test/test_dflash.cpp index f2544f5b0..76685ac60 100644 --- a/server/test/test_dflash.cpp +++ b/server/test/test_dflash.cpp @@ -28,9 +28,11 @@ // qwen35 + DFlash + DDTree pipeline below. #include "qwen35_daemon.h" // arch dispatch - single-GPU qwen35 daemon mode #include "qwen35moe_daemon.h" -#include "qwen35moe_hybrid_ffn_eval.h" -#include "qwen35moe_hybrid_storage.h" -#include "qwen35moe_expert_placement.h" +#include "../src/common/moe_hybrid_ffn_eval.h" +#include "../src/common/moe_hybrid_storage.h" +#include "../src/common/moe_hybrid_placement.h" +#include "../src/common/moe_hybrid_routing_stats.h" +#include "../src/common/moe_hybrid_types_impl.h" #include "qwen35moe_pipelined_decode.h" #include "qwen35_layer_split.h" // multi-GPU layer-split daemon args #include "layer_split_daemon_loop.h" // extracted layer-split daemon loop @@ -1575,7 +1577,7 @@ int main(int argc, char ** argv) { // Build placement stats that mark the router's default picks as COLD // by giving them zero count (so they're placed cold), while giving // all other experts count=1 (so the hottest N are picked as hot). - Qwen35MoeRoutingStats biased_stats; + MoeHybridRoutingStats biased_stats; biased_stats.n_layer = w.n_layer; biased_stats.n_expert = w.n_expert; biased_stats.n_expert_used = w.n_expert_used; @@ -1592,9 +1594,9 @@ int main(int argc, char ** argv) { } std::printf(" forced %d default-route experts to cold for worst-case bench\n", forced_cold_count); - Qwen35MoeExpertPlacement placement; + MoeHybridPlacement placement; std::string place_err; - if (!Qwen35MoeExpertPlacement::build_from_stats( + if (!MoeHybridPlacement::build_from_stats( biased_stats, total_hot_budget, /*min_hot_per_layer=*/std::min(w.n_expert_used, w.n_expert), placement, &place_err)) { @@ -1651,10 +1653,15 @@ int main(int argc, char ** argv) { layer_file_data[(size_t)il].gate_up_exps = find_tensor_data("ffn_gate_up_exps"); } - auto hybrid = std::make_shared(); + auto hybrid = std::make_shared(); std::string hybrid_err; - if (!build_qwen35moe_hybrid_storage_from_file( - w, backend, placement, layer_file_data, *hybrid, &hybrid_err)) { + MoeHybridConfig hybrid_cfg = make_moe_hybrid_config(w); + std::vector hybrid_descs((size_t)w.n_layer); + for (int il = 0; il < w.n_layer; ++il) { + hybrid_descs[(size_t)il] = make_moe_layer_desc(w.layers[(size_t)il]); + } + if (!build_moe_hybrid_storage_from_file( + hybrid_cfg, backend, placement, hybrid_descs, layer_file_data, *hybrid, &hybrid_err)) { std::fprintf(stderr, "[time-breakdown] hybrid storage build failed: %s\n", hybrid_err.c_str()); } else { @@ -1712,8 +1719,8 @@ int main(int argc, char ** argv) { sizeof(int32_t) * selected.size()); ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0, sizeof(float) * weights_buf.size()); - eval_qwen35moe_hybrid_ffn_gpu_resident( - backend, w, w.layers[(size_t)il], + eval_moe_hybrid_ffn_gpu_resident( + backend, make_moe_hybrid_config(w), make_moe_layer_desc(w.layers[(size_t)il]), hybrid->layers[(size_t)il], cpu_be, layer_sg.ffn_post, layer_sg.ffn_residual, gpu_state, @@ -1750,8 +1757,8 @@ int main(int argc, char ** argv) { ggml_backend_tensor_get(layer_sg.moe_weights, weights_buf.data(), 0, sizeof(float) * weights_buf.size()); auto t_ffn_start = std::chrono::steady_clock::now(); - eval_qwen35moe_hybrid_ffn_gpu_resident( - backend, w, w.layers[(size_t)il], + eval_moe_hybrid_ffn_gpu_resident( + backend, make_moe_hybrid_config(w), make_moe_layer_desc(w.layers[(size_t)il]), hybrid->layers[(size_t)il], cpu_be, layer_sg.ffn_post, layer_sg.ffn_residual, gpu_state, @@ -1811,7 +1818,7 @@ int main(int argc, char ** argv) { // Init pipelined state PipelinedDecodeState pipe_state; - if (!init_pipelined_decode_state(pipe_state, backend, w, cache, ctx, g_kq_stride_pad)) { + if (!init_pipelined_decode_state(pipe_state, backend, w, cache, *hybrid, ctx, g_kq_stride_pad)) { std::fprintf(stderr, "[time-breakdown] pipelined state init failed\n"); continue; } @@ -1882,24 +1889,24 @@ int main(int argc, char ** argv) { std::printf("\n[time-breakdown] === PIPELINED realistic placement (uniform hot/cold) ===\n"); { // Build uniform placement: hottest N experts per layer based on uniform counts - Qwen35MoeRoutingStats uniform_stats; + MoeHybridRoutingStats uniform_stats; uniform_stats.n_layer = w.n_layer; uniform_stats.n_expert = w.n_expert; uniform_stats.n_expert_used = w.n_expert_used; uniform_stats.counts.assign((size_t)w.n_layer * (size_t)w.n_expert, 1); uniform_stats.layer_totals.assign((size_t)w.n_layer, (uint64_t)w.n_expert); - Qwen35MoeExpertPlacement uniform_placement; + MoeHybridPlacement uniform_placement; std::string up_err; - if (Qwen35MoeExpertPlacement::build_from_stats( + if (MoeHybridPlacement::build_from_stats( uniform_stats, total_hot_budget, std::min(w.n_expert_used, w.n_expert), uniform_placement, &up_err)) { // Rebuild hybrid storage with uniform placement - auto hybrid_realistic = std::make_shared(); - if (build_qwen35moe_hybrid_storage_from_file( - w, backend, uniform_placement, layer_file_data, + auto hybrid_realistic = std::make_shared(); + if (build_moe_hybrid_storage_from_file( + hybrid_cfg, backend, uniform_placement, hybrid_descs, layer_file_data, *hybrid_realistic, &up_err)) { std::printf(" uniform placement: hot=%d cold=%d — expect ~60%% all-hot layers\n", uniform_placement.total_hot, @@ -1908,7 +1915,7 @@ int main(int argc, char ** argv) { int ctx = 2000; if (ctx + 1 <= max_ctx) { PipelinedDecodeState pipe_state; - if (init_pipelined_decode_state(pipe_state, backend, w, cache, ctx, g_kq_stride_pad)) { + if (init_pipelined_decode_state(pipe_state, backend, w, cache, *hybrid_realistic, ctx, g_kq_stride_pad)) { std::vector act_cur_pipe((size_t)hidden, 0.0f); ggml_backend_tensor_set(pipe_state.gpu_state.act_cur, act_cur_pipe.data(), 0, sizeof(float) * (size_t)hidden); diff --git a/server/test/test_qwen35moe_expert_placement.cpp b/server/test/test_qwen35moe_expert_placement.cpp index c263d87df..05e1b92dd 100644 --- a/server/test/test_qwen35moe_expert_placement.cpp +++ b/server/test/test_qwen35moe_expert_placement.cpp @@ -1,4 +1,5 @@ -#include "qwen35moe_expert_placement.h" +#include "../src/common/moe_hybrid_placement.h" +#include "../src/common/moe_hybrid_routing_stats.h" #include #include @@ -15,7 +16,7 @@ static void expect(bool cond, const char * msg) { } int main() { - Qwen35MoeRoutingStats stats; + MoeHybridRoutingStats stats; stats.n_layer = 2; stats.n_expert = 4; stats.n_expert_used = 2; @@ -25,11 +26,11 @@ int main() { }; stats.layer_totals = {280, 103}; - Qwen35MoeExpertPlacement placement; + MoeHybridPlacement placement; std::string err; - expect(Qwen35MoeExpertPlacement::build_from_stats(stats, /*total_hot_budget=*/4, - /*min_hot_per_layer=*/1, - placement, &err), err.c_str()); + expect(MoeHybridPlacement::build_from_stats(stats, /*total_hot_budget=*/4, + /*min_hot_per_layer=*/1, + placement, &err), err.c_str()); expect(placement.n_layer == 2, "n_layer"); expect(placement.hot_counts.size() == 2, "hot_counts size"); expect(placement.hot_counts[0] == 3, "layer0 got extra hot slots"); @@ -41,17 +42,12 @@ int main() { expect(placement.is_hot(1, 0), "layer1 expert0 hot"); expect(!placement.is_hot(1, 1), "layer1 expert1 cold"); - TargetWeights w; - w.is_moe = true; - w.n_layer = 2; - w.n_expert = 4; - w.n_expert_used = 2; - expect(placement.matches(w), "placement matches weights"); - - const auto tmp = std::filesystem::temp_directory_path() / "qwen35moe-placement-test.json"; - expect(placement.save_json(tmp.string(), &err), err.c_str()); - Qwen35MoeExpertPlacement loaded; - expect(Qwen35MoeExpertPlacement::load_json(tmp.string(), loaded, &err), err.c_str()); + expect(placement.matches(2, 4, 2), "placement matches dims"); + + const auto tmp = std::filesystem::temp_directory_path() / "moe-hybrid-placement-test.json"; + expect(placement.save_json(tmp.string(), "moe_hybrid", &err), err.c_str()); + MoeHybridPlacement loaded; + expect(MoeHybridPlacement::load_json(tmp.string(), loaded, &err), err.c_str()); expect(loaded.hot_counts == placement.hot_counts, "loaded hot counts"); expect(loaded.hot_expert_ids == placement.hot_expert_ids, "loaded hot ids"); std::filesystem::remove(tmp); diff --git a/server/test/test_qwen35moe_routing_stats.cpp b/server/test/test_qwen35moe_routing_stats.cpp index ea14ac630..cfdf87f07 100644 --- a/server/test/test_qwen35moe_routing_stats.cpp +++ b/server/test/test_qwen35moe_routing_stats.cpp @@ -1,4 +1,4 @@ -#include "qwen35moe_routing_stats.h" +#include "../src/common/moe_hybrid_routing_stats.h" #include #include @@ -16,15 +16,9 @@ static void expect(bool cond, const char * msg) { } int main() { - TargetWeights w; - w.is_moe = true; - w.n_layer = 2; - w.n_expert = 4; - w.n_expert_used = 2; - - Qwen35MoeRoutingStats stats; - expect(stats.init_from_weights(w), "init_from_weights"); - expect(stats.matches(w), "matches after init"); + MoeHybridRoutingStats stats; + expect(stats.init(2, 4, 2), "init"); + expect(stats.matches(2, 4, 2), "matches after init"); const int32_t layer0_a[] = {2, 1}; const int32_t layer0_b[] = {2, 3}; @@ -49,13 +43,13 @@ int main() { expect(hot0.size() == 2, "hot size"); expect(hot0[0] == 2, "hot leader"); - const auto tmp = std::filesystem::temp_directory_path() / "qwen35moe-routing-stats-test.csv"; + const auto tmp = std::filesystem::temp_directory_path() / "moe-hybrid-routing-stats-test.csv"; std::string err; expect(stats.save_csv(tmp.string(), &err), err.c_str()); - Qwen35MoeRoutingStats loaded; - expect(Qwen35MoeRoutingStats::load_csv(tmp.string(), loaded, &err), err.c_str()); - expect(loaded.matches(w), "loaded matches weights"); + MoeHybridRoutingStats loaded; + expect(MoeHybridRoutingStats::load_csv(tmp.string(), loaded, &err), err.c_str()); + expect(loaded.matches(2, 4, 2), "loaded matches dims"); expect(loaded.count(0, 2) == 2, "loaded count"); expect(loaded.layer_totals[1] == 2, "loaded total"); diff --git a/server/test/test_qwen35moe_swap_manager.cpp b/server/test/test_qwen35moe_swap_manager.cpp index 6962da496..cbbf5e6ee 100644 --- a/server/test/test_qwen35moe_swap_manager.cpp +++ b/server/test/test_qwen35moe_swap_manager.cpp @@ -1,4 +1,6 @@ -#include "qwen35moe_swap_manager.h" +#include "../src/common/moe_hybrid_swap_manager.h" +#include "../src/common/moe_hybrid_placement.h" +#include "../src/common/moe_hybrid_routing_stats.h" #include #include @@ -13,7 +15,7 @@ static void expect(bool cond, const char * msg) { } int main() { - Qwen35MoeRoutingStats stats; + MoeHybridRoutingStats stats; stats.n_layer = 2; stats.n_expert = 4; stats.n_expert_used = 2; @@ -23,7 +25,7 @@ int main() { }; stats.layer_totals = {205, 194}; - Qwen35MoeExpertPlacement placement; + MoeHybridPlacement placement; placement.n_layer = 2; placement.n_expert = 4; placement.n_expert_used = 2; @@ -31,13 +33,13 @@ int main() { placement.hot_counts = {1, 1}; placement.hot_expert_ids = {{1}, {0}}; - Qwen35MoeSwapPolicy policy; + MoeHybridSwapPolicy policy; policy.max_swaps_total = 1; policy.min_promote_gain = 5; - Qwen35MoeSwapPlan plan; + MoeHybridSwapPlan plan; std::string err; - expect(build_qwen35moe_swap_plan(placement, stats, policy, plan, &err), err.c_str()); + expect(build_moe_hybrid_swap_plan(placement, stats, policy, plan, &err), err.c_str()); expect(plan.actions.size() == 1, "one swap planned"); expect(plan.actions[0].layer_idx == 0, "layer0 swap"); expect(plan.actions[0].evict_expert == 1, "evict weakest hot");