Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
02f3ad3
refactor: extract MoE hybrid mode into common layer for qwen35moe + l…
howard0su May 29, 2026
c8a9225
feat(pipeline): add routed FFN fast path with cached graphs
howard0su May 29, 2026
8fe762d
feat(pipeline): GPU-resident act_cur with async logits projection
howard0su May 29, 2026
2138e84
fix(test): pass MoeHybridStorage to init_pipelined_decode_state
howard0su May 29, 2026
1cd31e6
feat(laguna): print VRAM allocation budget and hot/cold split info
howard0su May 30, 2026
168dee5
fix(common): skip dense layers in MoE hybrid placement and storage
howard0su May 30, 2026
b036812
feat(laguna): hybrid prefill path + DFLASH_EXPERT_BUDGET_PCT support
howard0su May 30, 2026
481006c
feat(laguna): match qwen35moe dynamic placement print format
howard0su May 30, 2026
9221bf5
feat(laguna): auto dynamic placement like qwen35moe
howard0su May 30, 2026
235c189
perf(hybrid-prefill): remove sub-batching and reuse gallocr across la…
howard0su May 30, 2026
aaa2323
telemetry: instrument AR decode routed path + per-token breakdown
howard0su May 31, 2026
050b59a
fix: skip batched prefill FFN when layer has cold experts
howard0su May 31, 2026
79bbd60
perf: extend routed FFN cold-masking to attention layers
howard0su May 31, 2026
cacaff9
feat: add DFLASH_COLD_COMPUTE option for exact cold expert computation
howard0su May 31, 2026
709a6b9
feat: fused cold FFN kernel with OpenMP parallelization
howard0su May 31, 2026
32e0675
feat: hybrid routed+cold path for inline cold compute
howard0su May 31, 2026
4727d20
Remove agents.md
howard0su Jun 3, 2026
a660b37
fix(laguna): link test/bench targets against the active ggml backend
davide221 Jun 4, 2026
df468bd
fix(moe-hybrid): avoid ggml-cuda MMQ mul_mat_id crash on reduced hot …
davide221 Jun 4, 2026
f140dde
Merge main into layersplit_refactor
davide221 Jun 4, 2026
62766fe
docs(server): document hybrid MoE hot/cold expert split
davide221 Jun 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 18 additions & 11 deletions server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,14 @@ add_library(dflash_common STATIC
src/qwen35moe/qwen35moe_ffn.cpp
src/qwen35moe/qwen35moe_backend.cpp
src/qwen35moe/qwen35moe_daemon.cpp
src/qwen35moe/qwen35moe_routing_stats.cpp
src/qwen35moe/qwen35moe_expert_placement.cpp
src/qwen35moe/qwen35moe_hybrid_storage.cpp
src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
src/qwen35moe/qwen35moe_pipelined_decode.cpp
src/qwen35moe/qwen35moe_swap_manager.cpp
# ── Common MoE hybrid infrastructure ──
src/common/moe_hybrid_placement.cpp
src/common/moe_hybrid_routing_stats.cpp
src/common/moe_hybrid_storage.cpp
src/common/moe_hybrid_ffn_eval.cpp
src/common/cold_ffn_cpu.cpp
src/common/moe_hybrid_swap_manager.cpp
src/qwen35/layer_split_forward.cpp
src/qwen35/layer_split_daemon.cpp
src/qwen35/qwen35_backend.cpp
Expand Down Expand Up @@ -523,6 +525,11 @@ target_link_libraries(dflash_common
ggml-base
nlohmann_json::nlohmann_json
)
# OpenMP for parallel cold FFN kernel (saturate memory bandwidth).
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
target_link_libraries(dflash_common PRIVATE OpenMP::OpenMP_CXX)
endif()
if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
target_link_libraries(dflash_common PRIVATE hip::host)
endif()
Expand Down Expand Up @@ -638,32 +645,32 @@ if(DFLASH27B_TESTS)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target_laguna.cpp")
add_executable(smoke_load_target_laguna test/smoke_load_target_laguna.cpp)
target_include_directories(smoke_load_target_laguna PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ggml-cuda)
target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_laguna_forward.cpp")
add_executable(smoke_laguna_forward test/smoke_laguna_forward.cpp)
target_include_directories(smoke_laguna_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ggml-cuda)
target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_ttft.cpp")
add_executable(bench_laguna_ttft test/bench_laguna_ttft.cpp)
target_include_directories(bench_laguna_ttft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ggml-cuda)
target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_pflash.cpp")
add_executable(bench_laguna_pflash test/bench_laguna_pflash.cpp)
target_include_directories(bench_laguna_pflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ggml-cuda)
target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_generate.cpp")
add_executable(bench_laguna_generate test/bench_laguna_generate.cpp)
target_include_directories(bench_laguna_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ggml-cuda)
target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_laguna_daemon.cpp")
add_executable(test_laguna_daemon test/test_laguna_daemon.cpp)
target_include_directories(test_laguna_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ggml-cuda)
target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_forward.cpp")
add_executable(smoke_target_forward test/smoke_target_forward.cpp)
Expand Down
53 changes: 53 additions & 0 deletions server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,59 @@ Full `bench_llm.py` suite on Qwen3.6-27B UD-Q4_K_XL, 10 prompts, n_gen=256, RTX
| Math500 | 35.13 | 69.77 | 5.15 | **1.99×** |
| **Mean** | 34.97 | 69.19 | 5.17 | **1.98×** |

## Hybrid MoE (hot/cold expert split)

For MoE targets whose experts don't fit in VRAM, dflash can split experts
across the GPU and CPU: the most-used (**hot**) experts stay resident on the
GPU, the rest (**cold**) live in host RAM and are evaluated on the CPU,
overlapped with the GPU hot path. This trades some decode/prefill speed for
VRAM headroom, and is computed automatically at load by a dynamic-placement
pass. It applies to both MoE arches: `qwen35`/`qwen36` and `laguna`.

**When it triggers.** If the experts fit in the available VRAM budget, all
experts load to GPU (no split, fastest path). Otherwise placement keeps as
many hot experts as the budget allows and routes the rest to CPU. You can also
shrink the budget manually to force a split (e.g. to free VRAM for a longer
context or a larger target).

### Budget knobs

| Env | Arch | Effect |
|---|---|---|
| `DFLASH_EXPERT_BUDGET_MB N` | both | Cap hot-expert VRAM to `N` MB. Applies only when `N` is below the auto-computed budget; experts beyond it go cold (CPU). |
| `DFLASH_EXPERT_BUDGET_PCT P` | laguna | Keep hot experts to `P`% (`0<P<100`) of total expert bytes. Applies only when below the auto budget. |
| `DFLASH_MAX_CONTEXT N` | both | Override the max context used when sizing the KV cache (more KV = less VRAM left for hot experts). |

### Placement / tuning knobs (per arch)

Substitute `<ARCH>` = `LAGUNA` or `QWEN35MOE`:

| Env | Effect |
|---|---|
| `DFLASH_<ARCH>_HOTNESS <file>` | Expert frequency/hotness file driving which experts are placed hot. |
| `DFLASH_<ARCH>_TELEMETRY 1` | Log per-layer hot/cold FFN timing telemetry. |
| `DFLASH_<ARCH>_SWAP_MAX N` | Max hot/cold promotions per request boundary (runtime re-placement); `0` disables swapping. |
| `DFLASH_<ARCH>_SWAP_MIN_GAIN N` | Min observed-frequency gain before a cold expert is promoted to hot. |
| `DFLASH_<ARCH>_NEXT_PLACEMENT_OUT <file>` | Dump the placement chosen this run (warm-start the hotness file next time). |
| `DFLASH_QWEN35MOE_RUNTIME_STATS_OUT <file>` | (qwen only) Dump runtime routing-frequency stats. |

### Example

```bash
# Force ~8 GB of hot experts on GPU; the rest run cold on the CPU.
DFLASH_EXPERT_BUDGET_MB=8000 ./build/dflash_server models/laguna-xs2-Q4_K_M.gguf --port 8000
# Startup log e.g.: "dynamic placement result: 4717 hot experts, 5267 cold experts"
```

### Caveat: reduced-stack prefill chunking

When a layer's hot-expert stack is **reduced** (i.e. a genuine split), the
ggml-cuda MMQ `mul_mat_id` kernel illegal-accesses for certain batch sizes on
**both** HIP/gfx1151 and CUDA/sm_86. As a guard, hybrid-split **prefill** is
sliced into ≤4-token sub-batches (forcing the stable MMVQ path); **decode**
(single-token) is unaffected. This costs some prefill throughput on split
layers and is removed once the kernel is fixed upstream.

## Laguna-XS.2 target (experimental, Poolside MoE)

[Poolside Laguna-XS.2](https://huggingface.co/poolside/Laguna-XS.2) is a 40-layer MoE LLM with 256 experts (top-8) plus an always-on shared expert, per-layer head counts `[48,64,64,64]×10`, and a per-layer SWA pattern (window 512). It is **architecturally distinct from `qwen35`**, so dflash adds a hand-rolled CUDA forward path (`Path A`, ggml-only — no libllama dependency) that mirrors the qwen35 stack. The Q4_K_M GGUF lands at 18.77 GiB on a single RTX 3090; tok_embd stays CPU-only (110 MiB) to keep the GPU budget under 24 GB.
Expand Down
59 changes: 59 additions & 0 deletions server/src/common/cold_ffn_compute.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// ColdFfnCompute: Direct compute interface for cold expert FFN.
// Bypasses ggml graph dispatch overhead. Shared-memory model (CPU/Halo).
#pragma once

#include "ggml.h"
#include <cstdint>
#include <memory>

namespace dflash::common {

// Per-layer cold weight metadata — raw pointers into shared memory.
struct ColdFfnLayer {
const void * gate_up_data = nullptr; // fused [n_cold, n_ff*2, n_embd] quantized
const void * gate_data = nullptr; // separate gate [n_cold, n_ff, n_embd]
const void * up_data = nullptr; // separate up [n_cold, n_ff, n_embd]
const void * down_data = nullptr; // [n_cold, n_embd, n_ff] quantized

size_t gate_up_stride = 0; // bytes between experts in gate_up tensor
size_t gate_stride = 0; // bytes between experts in gate tensor
size_t up_stride = 0; // bytes between experts in up tensor
size_t down_stride = 0; // bytes between experts in down tensor

ggml_type gate_up_type = GGML_TYPE_Q4_K; // type for fused gate_up
ggml_type gate_type = GGML_TYPE_Q4_K; // type for separate gate
ggml_type up_type = GGML_TYPE_Q4_K; // type for separate up
ggml_type down_type = GGML_TYPE_Q4_K; // type for down projection
bool fused_gate_up = false; // true if gate+up are fused

// Scale factors (applied after matmul). 1.0 = no scaling.
float gate_up_scale = 1.0f;
float gate_scale = 1.0f;
float up_scale = 1.0f;
float down_scale = 1.0f;
};

// Abstract compute interface. Implementations: CPU (now), Halo (future).
struct ColdFfnCompute {
virtual ~ColdFfnCompute() = default;

// Compute cold expert FFN contributions and accumulate into output.
// input: [n_embd] F32 — post-norm hidden state
// ids: [n_cold] I32 — local cold expert indices
// weights: [n_cold] F32 — routing weights for each cold expert
// output: [n_embd] F32 — accumulated weighted expert outputs (zeroed by callee)
virtual void compute(
const ColdFfnLayer & layer,
const float * input,
const int32_t * ids,
const float * weights,
int n_cold,
int n_embd,
int n_ff,
float * output) = 0;
};

// Create CPU-based fused cold FFN compute.
std::unique_ptr<ColdFfnCompute> make_cpu_cold_ffn_compute(int n_ff_max);

} // namespace dflash::common
190 changes: 190 additions & 0 deletions server/src/common/cold_ffn_cpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
// CpuColdFfnCompute: Fused cold expert FFN using ggml vec_dot primitives.
// Bypasses ggml graph dispatch overhead. Uses OpenMP to saturate memory bandwidth.
// Memory-bandwidth bound at ~45 GB/s DDR4. Target: 15.7ms → ~3ms/token.

#include "cold_ffn_compute.h"
#include "ggml-cpu.h"

#include <cmath>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <algorithm>

#ifdef _OPENMP
#include <omp.h>
#endif

namespace dflash::common {

class CpuColdFfnCompute : public ColdFfnCompute {
int n_ff_max_;
int n_threads_;

// Per-thread scratch buffers for parallel down matmul
struct ThreadBuf {
std::vector<float> scratch; // [n_ff * 2] gate_up result + SwiGLU
std::vector<uint8_t> mid_conv; // down input converted to vec_dot_type
};
std::vector<ThreadBuf> thread_bufs_;
std::vector<uint8_t> inp_conv_; // input converted (shared, read-only during matmul)

public:
explicit CpuColdFfnCompute(int n_ff_max, int n_threads = 0) : n_ff_max_(n_ff_max) {
#ifdef _OPENMP
if (n_threads <= 0) {
const char * env = std::getenv("DFLASH_COLD_THREADS");
n_threads = env ? std::atoi(env) : 0;
}
n_threads_ = n_threads > 0 ? n_threads : std::min(omp_get_max_threads(), 8);
#else
n_threads_ = 1;
#endif
fprintf(stderr, "[cold_ffn] using %d threads\n", n_threads_);
thread_bufs_.resize(n_threads_);
for (auto & tb : thread_bufs_) {
tb.scratch.resize((size_t)n_ff_max * 2);
}
}

void compute(
const ColdFfnLayer & layer,
const float * input,
const int32_t * ids,
const float * weights,
int n_cold,
int n_embd,
int n_ff,
float * output) override {

if (n_cold <= 0) return;
std::memset(output, 0, sizeof(float) * (size_t)n_embd);

// Gate/up phase type traits
const ggml_type gu_type = layer.fused_gate_up ? layer.gate_up_type : layer.gate_type;
const auto * gu_cpu_traits = ggml_get_type_traits_cpu(gu_type);
const auto gu_vec_dot = gu_cpu_traits->vec_dot;
const auto gu_vec_dot_type = gu_cpu_traits->vec_dot_type;
const auto gu_from_float = ggml_get_type_traits_cpu(gu_vec_dot_type)->from_float;

// Down phase type traits (may differ from gate/up)
const auto * dn_cpu_traits = ggml_get_type_traits_cpu(layer.down_type);
const auto dn_vec_dot = dn_cpu_traits->vec_dot;
const auto dn_vec_dot_type = dn_cpu_traits->vec_dot_type;
const auto dn_from_float = ggml_get_type_traits_cpu(dn_vec_dot_type)->from_float;

const size_t inp_row_size = ggml_row_size(gu_vec_dot_type, n_embd);
const size_t mid_row_size = ggml_row_size(dn_vec_dot_type, n_ff);
const size_t gu_weight_row = ggml_row_size(gu_type, n_embd);
const size_t dn_weight_row = ggml_row_size(layer.down_type, n_ff);

// For separate gate/up — up may have a different type than gate
size_t up_weight_row = gu_weight_row;
const ggml_type up_type_actual = layer.fused_gate_up ? gu_type : layer.up_type;
(void)up_type_actual;
ggml_vec_dot_t up_vec_dot = gu_vec_dot;
ggml_type up_vdt = gu_vec_dot_type;
if (!layer.fused_gate_up && layer.up_type != layer.gate_type) {
const auto * up_cpu_traits = ggml_get_type_traits_cpu(layer.up_type);
up_vec_dot = up_cpu_traits->vec_dot;
up_vdt = up_cpu_traits->vec_dot_type;
up_weight_row = ggml_row_size(layer.up_type, n_embd);
}

// Ensure input conversion buffer is large enough
if (inp_conv_.size() < inp_row_size) inp_conv_.resize(inp_row_size);
// Ensure per-thread mid_conv buffers
for (auto & tb : thread_bufs_) {
if (tb.mid_conv.size() < mid_row_size) tb.mid_conv.resize(mid_row_size);
}

// Convert input for up if different type
std::vector<uint8_t> inp_conv_up;
if (!layer.fused_gate_up && up_vdt != gu_vec_dot_type) {
size_t up_inp_row_size = ggml_row_size(up_vdt, n_embd);
inp_conv_up.resize(up_inp_row_size);
auto up_from_float = ggml_get_type_traits_cpu(up_vdt)->from_float;
up_from_float(input, inp_conv_up.data(), n_embd);
}

// Convert input to gate's vec_dot format once
gu_from_float(input, inp_conv_.data(), n_embd);

for (int e = 0; e < n_cold; ++e) {
const int32_t eid = ids[e];
const float w = weights[e];
if (w == 0.0f) continue;

// Use thread 0's scratch for gate_up (serial phase)
float * scratch = thread_bufs_[0].scratch.data();

// ── Phase 1: gate_up matmul → scratch[0..n_ff*2) ──
// Parallel over rows (each row is independent, reading shared inp_conv_)
if (layer.fused_gate_up) {
const char * expert = (const char *)layer.gate_up_data + (size_t)eid * layer.gate_up_stride;
const int n_rows = n_ff * 2;
#ifdef _OPENMP
#pragma omp parallel for num_threads(n_threads_) schedule(static)
#endif
for (int row = 0; row < n_rows; ++row) {
const void * row_ptr = expert + (size_t)row * gu_weight_row;
gu_vec_dot(n_embd, &scratch[row], 0, row_ptr, 0, inp_conv_.data(), 0, 1);
}
if (layer.gate_up_scale != 1.0f) {
for (int i = 0; i < n_rows; ++i) scratch[i] *= layer.gate_up_scale;
}
} else {
const char * gate_expert = (const char *)layer.gate_data + (size_t)eid * layer.gate_stride;
const char * up_expert = (const char *)layer.up_data + (size_t)eid * layer.up_stride;
const uint8_t * up_inp = (!inp_conv_up.empty()) ? inp_conv_up.data() : inp_conv_.data();
#ifdef _OPENMP
#pragma omp parallel for num_threads(n_threads_) schedule(static)
#endif
for (int row = 0; row < n_ff; ++row) {
const void * gp = gate_expert + (size_t)row * gu_weight_row;
gu_vec_dot(n_embd, &scratch[row], 0, gp, 0, inp_conv_.data(), 0, 1);
const void * up = up_expert + (size_t)row * up_weight_row;
up_vec_dot(n_embd, &scratch[n_ff + row], 0, up, 0, up_inp, 0, 1);
}
if (layer.gate_scale != 1.0f) {
for (int i = 0; i < n_ff; ++i) scratch[i] *= layer.gate_scale;
}
if (layer.up_scale != 1.0f) {
for (int i = 0; i < n_ff; ++i) scratch[n_ff + i] *= layer.up_scale;
}
}

// ── Phase 2: SwiGLU activation ──
for (int i = 0; i < n_ff; ++i) {
const float gate = scratch[i];
const float up = scratch[n_ff + i];
scratch[i] = (gate / (1.0f + expf(-gate))) * up;
}

// ── Phase 3: down matmul → output (weighted accumulate) ──
// Convert SwiGLU result to down's vec_dot format (serial, small)
dn_from_float(scratch, thread_bufs_[0].mid_conv.data(), n_ff);
const uint8_t * mid_conv_data = thread_bufs_[0].mid_conv.data();

const char * down_expert = (const char *)layer.down_data + (size_t)eid * layer.down_stride;
const float scale = w * layer.down_scale;

// Parallel down matmul — each thread accumulates its own output rows
#ifdef _OPENMP
#pragma omp parallel for num_threads(n_threads_) schedule(static)
#endif
for (int row = 0; row < n_embd; ++row) {
float val;
const void * row_ptr = down_expert + (size_t)row * dn_weight_row;
dn_vec_dot(n_ff, &val, 0, row_ptr, 0, mid_conv_data, 0, 1);
output[row] += scale * val;
}
}
}
};

std::unique_ptr<ColdFfnCompute> make_cpu_cold_ffn_compute(int n_ff_max) {
return std::make_unique<CpuColdFfnCompute>(n_ff_max);
}

} // namespace dflash::common
Loading
Loading