Luce-Org · howard0su · May 27, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -254,6 +254,7 @@ add_library(dflash_common STATIC
     src/qwen35moe/qwen35moe_expert_placement.cpp
     src/qwen35moe/qwen35moe_hybrid_storage.cpp
     src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
+    src/qwen35moe/qwen35moe_pipelined_decode.cpp
     src/qwen35moe/qwen35moe_swap_manager.cpp
     src/qwen35/layer_split_forward.cpp
     src/qwen35/layer_split_daemon.cpp

diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h
@@ -6,6 +6,7 @@
 #include "graph_builders.h"
 #include "qwen35moe_hybrid_ffn_eval.h"
 #include "qwen35moe_hybrid_storage.h"
+#include "qwen35moe_pipelined_decode.h"
 #include "qwen35moe_routing_stats.h"
 #include "qwen35moe_swap_manager.h"
 
@@ -24,7 +25,7 @@ class Qwen35MoeBackend : public Qwen35Backend {
     GenerateResult restore_and_generate(int slot,
                                         const GenerateRequest & req,
                                         const DaemonIO & io) override;
-    bool supports_dflash_spec_decode() const override { return !hybrid_mode_; }
+    bool supports_dflash_spec_decode() const override { return !target_weights().moe_hybrid; }
 
 protected:
     bool load_target_model(ggml_backend_t backend, TargetWeights & out) override;
@@ -35,13 +36,10 @@ class Qwen35MoeBackend : public Qwen35Backend {
     void after_target_compute(StepGraph & sg, int kv_start, int n_tokens) override;
 
 private:
-    bool hybrid_mode_ = false;
     std::shared_ptr<Qwen35MoeRoutingStats> routing_stats_;
     std::string routing_stats_out_path_;
     std::string placement_out_path_;
     Qwen35MoeSwapPolicy swap_policy_;
-    uint64_t last_hot_selected_ = 0;
-    uint64_t last_cold_selected_ = 0;
     bool hybrid_telemetry_ = false;
 
     void maybe_post_request_swap();
@@ -62,6 +60,15 @@ class Qwen35MoeBackend : public Qwen35Backend {
     bool hybrid_forward_one_token(int32_t tok, int kv_pos,
                                   std::vector<float> & act_cur,
                                   int32_t & argmax_out);
+
+    // Pipelined decode: uses cached DeltaNet graphs + optimized FFN loop
+    bool run_pipelined_decode_path(int committed, int n_gen,
+                                   std::vector<int32_t> & out_tokens,
+                                   const DaemonIO & io);
+
+    // Persistent pipelined state (initialized once, reused across requests)
+    std::unique_ptr<struct PipelinedDecodeState> pipe_state_;
+    bool ensure_pipe_state(int kv_start);
 };
 
 }  // namespace dflash::common
diff --git a/server/src/qwen35moe/qwen35moe_ffn.cpp b/server/src/qwen35moe/qwen35moe_ffn.cpp
@@ -2,6 +2,8 @@
 
 #include "qwen35_ops.h"
 
+#include <cmath>
+
 namespace dflash::common {
 
 Qwen35MoeRouterOutputs build_qwen35moe_router(
@@ -31,11 +33,16 @@ Qwen35MoeRouterOutputs build_qwen35moe_router(
     ggml_tensor * weights  = ggml_get_rows(ctx, probs_3d, selected);
     weights = ggml_reshape_2d(ctx, weights, n_used, n_tokens);
 
-    if (w.expert_gating_func == 2) {
+    // Always normalize selected expert weights by their sum (matches
+    // llama.cpp's norm_w=true for qwen35moe). Without this, top-k softmax
+    // weights sum to much less than 1.0, causing systematically underscaled
+    // FFN output.
+    {
         ggml_tensor * w_sum = ggml_sum_rows(ctx, weights);
+        w_sum = ggml_clamp(ctx, w_sum, 6.103515625e-5f, INFINITY);
         weights = ggml_div(ctx, weights, w_sum);
     }
-    if (w.expert_weights_scale != 1.0f) {
+    if (w.expert_weights_scale != 0.0f && w.expert_weights_scale != 1.0f) {
         weights = ggml_scale(ctx, weights, w.expert_weights_scale);
     }
 

diff --git a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
@@ -910,12 +910,17 @@ bool eval_qwen35moe_hybrid_ffn_batched(
     if (n_tokens <= 0) return true;
 
     // ── Step 1: Partition routing into hot and cold ──
-    // Use dummy expert ID 0 with weight 0.0 for "other device" slots
-    // (ggml_mul_mat_id does not handle -1 IDs safely)
+    // Dummy slots use weight 0.0 and are distributed evenly across all experts
+    // to avoid pathological routing imbalance that triggers OOB in MMQ stream-k.
     const int total_slots = n_used * n_tokens;
-    std::vector<int32_t> hot_sel(total_slots, 0);
+    const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
+                          : storage.gate_hot    ? (int)storage.gate_hot->ne[2]
+                          : 1;
+    std::vector<int32_t> hot_sel(total_slots);
+    for (int i = 0; i < total_slots; ++i) hot_sel[i] = i % n_hot_stack;
     std::vector<float>   hot_wts(total_slots, 0.0f);
-    std::vector<int32_t> cold_sel(total_slots, 0);
+    std::vector<int32_t> cold_sel(total_slots);
+    for (int i = 0; i < total_slots; ++i) cold_sel[i] = i % std::max(1, (int)(storage.down_cold ? storage.down_cold->ne[2] : 1));
     std::vector<float>   cold_wts(total_slots, 0.0f);
     bool has_hot = false, has_cold = false;
 

diff --git a/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
@@ -24,6 +24,30 @@ struct ResidualCombineGraph {
     ggml_tensor * cold_in = nullptr;       // [n_embd] F32 input (zeros when no cold)
     ggml_tensor * output = nullptr;        // [n_embd] F32 output
 
+    ResidualCombineGraph() = default;
+    ~ResidualCombineGraph() { free(); }
+    ResidualCombineGraph(const ResidualCombineGraph &) = delete;
+    ResidualCombineGraph & operator=(const ResidualCombineGraph &) = delete;
+    ResidualCombineGraph(ResidualCombineGraph && o) noexcept
+        : ctx(o.ctx), gf(o.gf), alloc(o.alloc),
+          residual_in(o.residual_in), hot_in(o.hot_in),
+          cold_in(o.cold_in), output(o.output) {
+        o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
+        o.residual_in = nullptr; o.hot_in = nullptr;
+        o.cold_in = nullptr; o.output = nullptr;
+    }
+    ResidualCombineGraph & operator=(ResidualCombineGraph && o) noexcept {
+        if (this != &o) {
+            free();
+            ctx = o.ctx; gf = o.gf; alloc = o.alloc;
+            residual_in = o.residual_in; hot_in = o.hot_in;
+            cold_in = o.cold_in; output = o.output;
+            o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
+            o.residual_in = nullptr; o.hot_in = nullptr;
+            o.cold_in = nullptr; o.output = nullptr;
+        }
+        return *this;
+    }
     bool valid() const { return ctx && gf && alloc && output; }
     void free();
     void destroy();
@@ -40,6 +64,24 @@ struct GpuResidentState {
 
     ResidualCombineGraph combine;
 
+    GpuResidentState() = default;
+    ~GpuResidentState() { destroy(); }
+    GpuResidentState(const GpuResidentState &) = delete;
+    GpuResidentState & operator=(const GpuResidentState &) = delete;
+    GpuResidentState(GpuResidentState && o) noexcept
+        : ctx(o.ctx), buf(o.buf), act_cur(o.act_cur),
+          combine(std::move(o.combine)) {
+        o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
+    }
+    GpuResidentState & operator=(GpuResidentState && o) noexcept {
+        if (this != &o) {
+            destroy();
+            ctx = o.ctx; buf = o.buf; act_cur = o.act_cur;
+            combine = std::move(o.combine);
+            o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
+        }
+        return *this;
+    }
     bool valid() const { return ctx && buf && act_cur && combine.valid(); }
     void destroy();
 };