Skip to content
1 change: 1 addition & 0 deletions server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ add_library(dflash_common STATIC
src/qwen35moe/qwen35moe_expert_placement.cpp
src/qwen35moe/qwen35moe_hybrid_storage.cpp
src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
src/qwen35moe/qwen35moe_pipelined_decode.cpp
src/qwen35moe/qwen35moe_swap_manager.cpp
src/qwen35/layer_split_forward.cpp
src/qwen35/layer_split_daemon.cpp
Expand Down
602 changes: 204 additions & 398 deletions server/src/qwen35moe/qwen35moe_backend.cpp

Large diffs are not rendered by default.

15 changes: 11 additions & 4 deletions server/src/qwen35moe/qwen35moe_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "graph_builders.h"
#include "qwen35moe_hybrid_ffn_eval.h"
#include "qwen35moe_hybrid_storage.h"
#include "qwen35moe_pipelined_decode.h"
#include "qwen35moe_routing_stats.h"
#include "qwen35moe_swap_manager.h"

Expand All @@ -24,7 +25,7 @@ class Qwen35MoeBackend : public Qwen35Backend {
GenerateResult restore_and_generate(int slot,
const GenerateRequest & req,
const DaemonIO & io) override;
bool supports_dflash_spec_decode() const override { return !hybrid_mode_; }
bool supports_dflash_spec_decode() const override { return !target_weights().moe_hybrid; }

protected:
bool load_target_model(ggml_backend_t backend, TargetWeights & out) override;
Expand All @@ -35,13 +36,10 @@ class Qwen35MoeBackend : public Qwen35Backend {
void after_target_compute(StepGraph & sg, int kv_start, int n_tokens) override;

private:
bool hybrid_mode_ = false;
std::shared_ptr<Qwen35MoeRoutingStats> routing_stats_;
std::string routing_stats_out_path_;
std::string placement_out_path_;
Qwen35MoeSwapPolicy swap_policy_;
uint64_t last_hot_selected_ = 0;
uint64_t last_cold_selected_ = 0;
bool hybrid_telemetry_ = false;

void maybe_post_request_swap();
Expand All @@ -62,6 +60,15 @@ class Qwen35MoeBackend : public Qwen35Backend {
bool hybrid_forward_one_token(int32_t tok, int kv_pos,
std::vector<float> & act_cur,
int32_t & argmax_out);

// Pipelined decode: uses cached DeltaNet graphs + optimized FFN loop
bool run_pipelined_decode_path(int committed, int n_gen,
std::vector<int32_t> & out_tokens,
const DaemonIO & io);

// Persistent pipelined state (initialized once, reused across requests)
std::unique_ptr<struct PipelinedDecodeState> pipe_state_;
bool ensure_pipe_state(int kv_start);
};

} // namespace dflash::common
11 changes: 9 additions & 2 deletions server/src/qwen35moe/qwen35moe_ffn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include "qwen35_ops.h"

#include <cmath>

namespace dflash::common {

Qwen35MoeRouterOutputs build_qwen35moe_router(
Expand Down Expand Up @@ -31,11 +33,16 @@ Qwen35MoeRouterOutputs build_qwen35moe_router(
ggml_tensor * weights = ggml_get_rows(ctx, probs_3d, selected);
weights = ggml_reshape_2d(ctx, weights, n_used, n_tokens);

if (w.expert_gating_func == 2) {
// Always normalize selected expert weights by their sum (matches
// llama.cpp's norm_w=true for qwen35moe). Without this, top-k softmax
// weights sum to much less than 1.0, causing systematically underscaled
// FFN output.
{
ggml_tensor * w_sum = ggml_sum_rows(ctx, weights);
w_sum = ggml_clamp(ctx, w_sum, 6.103515625e-5f, INFINITY);
weights = ggml_div(ctx, weights, w_sum);
}
if (w.expert_weights_scale != 1.0f) {
if (w.expert_weights_scale != 0.0f && w.expert_weights_scale != 1.0f) {
Comment thread
howard0su marked this conversation as resolved.
weights = ggml_scale(ctx, weights, w.expert_weights_scale);
}

Expand Down
13 changes: 9 additions & 4 deletions server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -910,12 +910,17 @@ bool eval_qwen35moe_hybrid_ffn_batched(
if (n_tokens <= 0) return true;

// ── Step 1: Partition routing into hot and cold ──
// Use dummy expert ID 0 with weight 0.0 for "other device" slots
// (ggml_mul_mat_id does not handle -1 IDs safely)
// Dummy slots use weight 0.0 and are distributed evenly across all experts
// to avoid pathological routing imbalance that triggers OOB in MMQ stream-k.
const int total_slots = n_used * n_tokens;
std::vector<int32_t> hot_sel(total_slots, 0);
const int n_hot_stack = storage.gate_up_hot ? (int)storage.gate_up_hot->ne[2]
: storage.gate_hot ? (int)storage.gate_hot->ne[2]
: 1;
std::vector<int32_t> hot_sel(total_slots);
for (int i = 0; i < total_slots; ++i) hot_sel[i] = i % n_hot_stack;
std::vector<float> hot_wts(total_slots, 0.0f);
std::vector<int32_t> cold_sel(total_slots, 0);
std::vector<int32_t> cold_sel(total_slots);
for (int i = 0; i < total_slots; ++i) cold_sel[i] = i % std::max(1, (int)(storage.down_cold ? storage.down_cold->ne[2] : 1));
std::vector<float> cold_wts(total_slots, 0.0f);
bool has_hot = false, has_cold = false;

Expand Down
42 changes: 42 additions & 0 deletions server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,30 @@ struct ResidualCombineGraph {
ggml_tensor * cold_in = nullptr; // [n_embd] F32 input (zeros when no cold)
ggml_tensor * output = nullptr; // [n_embd] F32 output

ResidualCombineGraph() = default;
~ResidualCombineGraph() { free(); }
ResidualCombineGraph(const ResidualCombineGraph &) = delete;
ResidualCombineGraph & operator=(const ResidualCombineGraph &) = delete;
ResidualCombineGraph(ResidualCombineGraph && o) noexcept
: ctx(o.ctx), gf(o.gf), alloc(o.alloc),
residual_in(o.residual_in), hot_in(o.hot_in),
cold_in(o.cold_in), output(o.output) {
o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
o.residual_in = nullptr; o.hot_in = nullptr;
o.cold_in = nullptr; o.output = nullptr;
}
ResidualCombineGraph & operator=(ResidualCombineGraph && o) noexcept {
if (this != &o) {
free();
ctx = o.ctx; gf = o.gf; alloc = o.alloc;
residual_in = o.residual_in; hot_in = o.hot_in;
cold_in = o.cold_in; output = o.output;
o.ctx = nullptr; o.gf = nullptr; o.alloc = nullptr;
o.residual_in = nullptr; o.hot_in = nullptr;
o.cold_in = nullptr; o.output = nullptr;
}
return *this;
}
bool valid() const { return ctx && gf && alloc && output; }
void free();
void destroy();
Expand All @@ -40,6 +64,24 @@ struct GpuResidentState {

ResidualCombineGraph combine;

GpuResidentState() = default;
~GpuResidentState() { destroy(); }
GpuResidentState(const GpuResidentState &) = delete;
GpuResidentState & operator=(const GpuResidentState &) = delete;
GpuResidentState(GpuResidentState && o) noexcept
: ctx(o.ctx), buf(o.buf), act_cur(o.act_cur),
combine(std::move(o.combine)) {
o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
}
GpuResidentState & operator=(GpuResidentState && o) noexcept {
if (this != &o) {
destroy();
ctx = o.ctx; buf = o.buf; act_cur = o.act_cur;
combine = std::move(o.combine);
o.ctx = nullptr; o.buf = nullptr; o.act_cur = nullptr;
}
return *this;
}
bool valid() const { return ctx && buf && act_cur && combine.valid(); }
void destroy();
};
Expand Down
Loading
Loading