Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,11 @@ if(DFLASH27B_TESTS)
target_include_directories(test_derived_scalars PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
add_test(NAME derived_scalars COMMAND test_derived_scalars)
endif()
# Optional test target: only defined when test/test_kvflash_placement.cpp is
# present in the source tree. No libraries are linked; the target only receives
# the project include directories.
# NOTE(review): presumably exercises kvflash_placement_decision() from
# src/common/kvflash_placement.h — confirm against the test source.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash_placement.cpp")
add_executable(test_kvflash_placement test/test_kvflash_placement.cpp)
target_include_directories(test_kvflash_placement PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
# Registered with CTest under the name "kvflash_placement".
add_test(NAME kvflash_placement COMMAND test_kvflash_placement)
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_bandit_integration.cpp")
add_executable(test_bandit_integration test/test_bandit_integration.cpp)
target_include_directories(test_bandit_integration PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
Expand Down Expand Up @@ -787,6 +792,12 @@ if(DFLASH27B_TESTS)
add_executable(test_kvflash_qk test/test_kvflash_qk.cpp)
target_include_directories(test_kvflash_qk PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
endif()
# Optional test target: only defined when test/test_kvflash_pool_sizing.cpp is
# present in the source tree.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash_pool_sizing.cpp")
# Pure unit test for kvflash_pool_from_env: no ggml link, no GPU.
# Only the project include directories are attached; nothing is linked.
add_executable(test_kvflash_pool_sizing test/test_kvflash_pool_sizing.cpp)
target_include_directories(test_kvflash_pool_sizing PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
# Registered with CTest under the name "kvflash_pool_sizing".
add_test(NAME kvflash_pool_sizing COMMAND test_kvflash_pool_sizing)
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_restore_delta.cpp")
add_executable(test_restore_delta test/test_restore_delta.cpp)
target_include_directories(test_restore_delta PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
Expand Down
54 changes: 54 additions & 0 deletions server/src/common/kvflash_placement.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// KVFlash placement KV-reservation rule (architecture-agnostic, header-only).
//
// Any MoE / weight-offload backend that places experts against a VRAM budget
// must decide how much KV to reserve. Reserving for `max_ctx` forces experts
// cold at high max_ctx even when KVFlash bounds the *resident* KV to a fixed
// pool. This helper centralises the rule so every backend (qwen35moe today,
// DeepSeek-V4 / future MoE next) inherits the "pool bounds the expert-placement
// cliff" win without re-deriving the byte math.
#pragma once

#include <cstdint>

namespace dflash::common {

// Outcome of kvflash_placement_decision(): how much KV the expert-placement
// budget should set aside, and why.
struct KvfPlacementDecision {
    uint64_t kv_total = 0;            // bytes to reserve for the KV cache
    int kv_ctx = 0;                   // tokens the reservation covers (pool or max_ctx)
    bool all_hot_full_kv = false;     // would ALL experts be hot with the FULL max_ctx KV?
    bool pool_reduced = false;        // did we reserve for the pool instead of max_ctx?
};

namespace detail {

// a * b, clamped to UINT64_MAX instead of wrapping.
inline uint64_t kvf_sat_mul(uint64_t a, uint64_t b) {
    if (a != 0 && b > UINT64_MAX / a) return UINT64_MAX;
    return a * b;
}

// a + b, clamped to UINT64_MAX instead of wrapping.
inline uint64_t kvf_sat_add(uint64_t a, uint64_t b) {
    const uint64_t sum = a + b;
    return sum < a ? UINT64_MAX : sum;
}

} // namespace detail

// Decide the KV reservation for VRAM-budget expert placement.
//
// Parameters:
//   kv_bytes_per_tok    KV bytes per context token.
//   max_ctx             maximum context in tokens; values <= 0 are treated as 0.
//   kvf_pool            resident KVFlash pool in tokens (0 = KVFlash inactive).
//   gpu_total           total device memory in bytes.
//   core_bytes, warm_bytes, safety_bytes, draft_bytes
//                       fixed reservations that come out of gpu_total before
//                       experts are placed.
//   total_expert_bytes  bytes needed to keep every expert hot.
//
// Returns a KvfPlacementDecision:
//   all_hot_full_kv reports whether the full max_ctx KV already fits all experts
//   hot — i.e. KVFlash is redundant; the caller's gate uses it to disable the
//   pool when unneeded (so a pool that is *itself* keeping experts hot is never
//   disabled).
//   When KVFlash is active AND the full reservation would force experts cold,
//   the reservation is reduced to the pool (pool_reduced = true) so experts
//   stay hot.
//
// All byte math saturates at UINT64_MAX rather than wrapping: a wrapped product
// or sum would look like a tiny reservation and falsely report all_hot_full_kv.
inline KvfPlacementDecision kvflash_placement_decision(
    uint64_t kv_bytes_per_tok, int max_ctx, int kvf_pool,
    uint64_t gpu_total, uint64_t core_bytes, uint64_t total_expert_bytes,
    uint64_t warm_bytes, uint64_t safety_bytes, uint64_t draft_bytes)
{
    // A negative context would otherwise cast to a huge unsigned token count.
    const int ctx_tokens = max_ctx > 0 ? max_ctx : 0;
    const uint64_t kv_full = detail::kvf_sat_mul(kv_bytes_per_tok, (uint64_t)ctx_tokens);

    // Everything that must be resident besides the experts, with full KV.
    uint64_t reserved = detail::kvf_sat_add(core_bytes, warm_bytes);
    reserved = detail::kvf_sat_add(reserved, safety_bytes);
    reserved = detail::kvf_sat_add(reserved, draft_bytes);
    reserved = detail::kvf_sat_add(reserved, kv_full);

    // Expert budget left over when the full max_ctx KV is reserved.
    const uint64_t eb_full = gpu_total > reserved ? gpu_total - reserved : 0;

    KvfPlacementDecision d;
    d.all_hot_full_kv = (eb_full >= total_expert_bytes);
    d.kv_ctx = ctx_tokens;
    d.kv_total = kv_full;

    // Only shrink the reservation when a pool exists, is smaller than max_ctx,
    // and the full reservation would actually push experts cold.
    if (kvf_pool > 0 && kvf_pool < ctx_tokens && !d.all_hot_full_kv) {
        d.kv_ctx = kvf_pool;
        d.kv_total = detail::kvf_sat_mul(kv_bytes_per_tok, (uint64_t)kvf_pool);
        d.pool_reduced = true;
    }
    return d;
}

} // namespace dflash::common
43 changes: 26 additions & 17 deletions server/src/qwen35/qwen35_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,24 @@ Qwen35Backend::Qwen35Backend(const Qwen35Config & cfg) : cfg_(cfg) {}

Qwen35Backend::~Qwen35Backend() { shutdown(); }

// "auto" pool budget: device-free minus a reserve (compute buffers + drafter
// when expected), converted at this model's pooled-KV density. Shared with MoE
// placement so reservation and runtime allocation size the pool identically.
KvFlashAutoBudget Qwen35Backend::make_kvflash_budget(const TargetWeights & w,
                                                     int64_t gpu_free) const {
    // KV element types start at Q8_0 and are then passed through
    // resolve_kv_types().
    // NOTE(review): resolve_kv_types presumably overrides these defaults from
    // configuration — confirm against its definition.
    ggml_type kv_k = GGML_TYPE_Q8_0;
    ggml_type kv_v = GGML_TYPE_Q8_0;
    dflash::resolve_kv_types(kv_k, kv_v);

    // Number of full-attention layers.
    // NOTE(review): divides by full_attention_interval; assumes it is non-zero.
    const int n_full = w.n_layer / w.full_attention_interval;

    // Bytes of one K row plus one V row for a single KV head.
    const int64_t kv_row_bytes = (int64_t)(ggml_row_size(kv_k, w.n_embd_head_k) +
                                           ggml_row_size(kv_v, w.n_embd_head_v));

    // Base reserve of 1.5 GiB, plus 1.7 GiB more when a drafter path is set.
    int64_t reserve = (int64_t)(1.5 * 1073741824.0);
    if (!kvflash_drafter_path_.empty()) {
        reserve += (int64_t)(1.7 * 1073741824.0);
    }

    KvFlashAutoBudget budget;
    budget.free_bytes      = gpu_free;
    budget.bytes_per_token = (int64_t)n_full * w.n_head_kv * kv_row_bytes;
    budget.reserve_bytes   = reserve;
    return budget;
}

// ── init() ──────────────────────────────────────────────────────────────

bool Qwen35Backend::init() {
Expand Down Expand Up @@ -242,29 +260,20 @@ bool Qwen35Backend::init() {
// point and the cache is not yet allocated, so device-free minus a
// reserve (compute buffers + the drafter when expected) is what the
// pool can really use, converted at this model's pooled-KV density.
KvFlashAutoBudget kvf_budget;
{
size_t gpu_free = 0, gpu_total = 0;
if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) {
ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
}
ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0;
dflash::resolve_kv_types(kv_k, kv_v);
const int n_full = w_.n_layer / w_.full_attention_interval;
kvf_budget.free_bytes = (int64_t)gpu_free;
kvf_budget.bytes_per_token = (int64_t)n_full * w_.n_head_kv *
(int64_t)(ggml_row_size(kv_k, w_.n_embd_head_k) +
ggml_row_size(kv_v, w_.n_embd_head_v));
kvf_budget.reserve_bytes = (int64_t)(1.5 * 1073741824.0) +
(kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));
size_t gpu_free = 0, gpu_total = 0;
if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) {
ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
}
KvFlashAutoBudget kvf_budget = make_kvflash_budget(w_, (int64_t)gpu_free);
kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{},
!kvflash_drafter_path_.empty() ||
kvflash_qk_policy_,
kvflash_scorer_expected(),
kvf_budget);
if (kvflash_tokens_ > 0) {
kvflash_tau_ = std::max(1, env_int_or_default("DFLASH_KVFLASH_TAU", 64));
}
// Subclass gate (e.g. MoE all-hot): may zero kvflash_tokens_ before the KV
// cache is sized, so create_target_cache allocates full max_ctx KV.
if (!post_kvflash_init_gate()) return false;
if (!create_target_cache(w_, cfg_.device.max_ctx, max_verify_tokens, target_backend_, cache_,
/*prefill_only=*/true, /*ctx_alloc=*/kvflash_tokens_)) {
std::fprintf(stderr, "cache: %s\n", dflash27b_last_error());
Expand Down
9 changes: 9 additions & 0 deletions server/src/qwen35/qwen35_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ class Qwen35Backend : public ModelBackend {
std::vector<int32_t> & out_tokens,
const DaemonIO & io);
virtual bool should_capture_moe_router() const { return false; }
// Hook after kvflash pool sizing, before create_target_cache: a subclass
// may disable the pool (kvflash_tokens_=0) when it is redundant. Default no-op.
// init() aborts (returns false) when this hook returns false; the default
// implementation always returns true.
virtual bool post_kvflash_init_gate() { return true; }
virtual void after_target_compute(StepGraph &,
int /*kv_start*/,
int /*n_tokens*/) {}
Expand Down Expand Up @@ -181,6 +184,12 @@ class Qwen35Backend : public ModelBackend {
int kvflash_tau_ = 64;
bool kvflash_drafter_failed_ = false; // don't retry a failed load
bool kvflash_active() const { return kvflash_tokens_ > 0; }
// Pool sizing inputs — shared so MoE placement reserves exactly the pool
// runtime allocates (else placement over-reserves KV and starves experts).
// Returns true when a drafter path is configured or the target-QK policy flag
// is set; this value is passed to kvflash_pool_from_env() when sizing the pool.
bool kvflash_scorer_expected() const {
return !kvflash_drafter_path_.empty() || kvflash_qk_policy_;
}
KvFlashAutoBudget make_kvflash_budget(const TargetWeights & w, int64_t gpu_free) const;
// Target-QK policy (--kvflash-policy qk): residency scored with the
// target's own pooled post-RoPE keys vs the current decode query
// (kvflash_qk.h); no drafter. Keys pool at chunk-seal time; the query
Expand Down
61 changes: 59 additions & 2 deletions server/src/qwen35moe/qwen35moe_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "../common/moe_hybrid_stream.h"
#include "../common/moe_hybrid_types.h"
#include "../common/moe_hybrid_types_impl.h"
#include "../common/kvflash_placement.h"
#include "common/ggml_graph_precision.h"
#include "common/sampler.h"
#include "common/dflash_spec_decode.h"
Expand Down Expand Up @@ -74,6 +75,9 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
if (placement.total_hot >= out.n_layer * out.n_expert) {
std::printf("[qwen35moe] all experts fit in VRAM, loading fully to GPU\n");
std::fflush(stdout);
// Record the placement result so post_kvflash_init_gate() can disable
// the KVFlash pool (moe_hybrid is null on this all-hot path).
placement_all_hot_ = true;
free_target_weights(out);
return load_target_gguf(cfg_.target_path, backend, out);
}
Expand Down Expand Up @@ -328,6 +332,36 @@ bool Qwen35MoeBackend::spark_bootstrap_finalize(const std::string & profile_path
return true;
}

bool Qwen35MoeBackend::post_kvflash_init_gate() {
    // No pool configured: nothing to gate.
    if (!kvflash_active()) {
        return true;
    }

    // The pool is redundant when placement found that every expert fits hot
    // even with the FULL max_ctx KV reserved (placement_all_hot_full_kv_, set
    // in load_dynamic_placement()). A pool that is itself what keeps experts
    // hot must stay enabled, so the full-KV flag is what is consulted here.
    bool pool_redundant = placement_all_hot_full_kv_;

    // Otherwise, a hybrid layout that ended up with zero cold experts also
    // makes the pool unnecessary.
    // NOTE(review): this branch does not check whether the zero-cold result
    // depended on a pool-reduced KV reservation — confirm that case cannot
    // reach here with moe_hybrid set.
    if (!pool_redundant && target_weights().moe_hybrid) {
        bool any_cold = false;
        for (const auto & layer : target_weights().moe_hybrid->layers) {
            if (layer.cold_expert_ids.size() > 0) {
                any_cold = true;
                break;
            }
        }
        pool_redundant = !any_cold;
    }

    if (!pool_redundant) {
        return true;
    }

    std::printf("[kvflash] disabled: placement all-hot at max_ctx %d, pool not needed\n",
                cfg_.device.max_ctx);
    std::fflush(stdout);

    // Reset pool state so the KV cache is sized without a pool.
    kvflash_tokens_ = 0;
    kvflash_tau_ = 64;
    kvflash_drafter_path_.clear();
    return true;
}

void Qwen35MoeBackend::maybe_post_request_swap() {
if (!routing_stats_) return;

Expand Down Expand Up @@ -2132,7 +2166,12 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
// KV cache: n_layer × 2 (K+V) × n_head_kv × head_dim × sizeof(fp16) × max_context
const uint64_t kv_bytes_per_tok = (uint64_t)w.n_layer * 2 *
(uint64_t)w.n_head_kv * (uint64_t)w.n_embd_head_k * 2;
const uint64_t kv_total = kv_bytes_per_tok * (uint64_t)max_context;
// Size the reservation with the SAME inputs runtime uses (scorer policy +
// VRAM budget); the bare-max_context call took the no-budget fallback
// (max_ctx/2) and over-reserved KV, starving experts of hot placement.
const int kvf_pool = kvflash_pool_from_env(
max_context, KvFlashConfig{}, kvflash_scorer_expected(),
make_kvflash_budget(w, (int64_t)gpu_free));

const uint64_t warm_cache_bytes = 200ULL * 1024 * 1024; // 200 MB warm/staging
uint64_t safety_bytes = 512ULL * 1024 * 1024; // 512 MB safety margin
Expand All @@ -2148,6 +2187,24 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
// Core model bytes = what's already used on GPU (non-expert tensors)
const uint64_t core_bytes = gpu_total - gpu_free;

// KVFlash reserves a fixed resident pool, not max_ctx, of KV. When active
// and the full reservation would force experts cold, reserve for the pool so
// experts stay hot (decouples max_ctx from the expert-placement cliff). The
// rule is centralised in kvflash_placement_decision() so future MoE backends
// (DeepSeek-V4, ...) inherit it instead of re-deriving the byte math.
const auto kvf_dec = dflash::common::kvflash_placement_decision(
kv_bytes_per_tok, max_context, kvf_pool,
gpu_total, core_bytes, total_expert_bytes,
warm_cache_bytes, safety_bytes, draft_reserve_bytes);
const uint64_t kv_total = kvf_dec.kv_total;
const int kv_ctx_log = kvf_dec.kv_ctx;
placement_all_hot_full_kv_ = kvf_dec.all_hot_full_kv;
if (kvf_dec.pool_reduced) {
std::printf("[kvflash] placement reserves pool KV (%d tokens, not max_ctx %d) "
"-> experts stay hot\n", kvf_pool, max_context);
std::fflush(stdout);
}

uint64_t expert_budget = 0;
if (gpu_total > core_bytes + kv_total + warm_cache_bytes + safety_bytes + draft_reserve_bytes) {
expert_budget = gpu_total - core_bytes - kv_total - warm_cache_bytes - safety_bytes - draft_reserve_bytes;
Expand Down Expand Up @@ -2186,7 +2243,7 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
gpu_total / 1024.0 / 1024.0 / 1024.0,
core_bytes / 1024.0 / 1024.0 / 1024.0,
kv_total / 1024.0 / 1024.0 / 1024.0,
max_context,
kv_ctx_log,
warm_cache_bytes / 1024.0 / 1024.0,
safety_bytes / 1024.0 / 1024.0,
expert_budget / 1024.0 / 1024.0 / 1024.0,
Expand Down
8 changes: 8 additions & 0 deletions server/src/qwen35moe/qwen35moe_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class Qwen35MoeBackend : public Qwen35Backend {

protected:
bool load_target_model(ggml_backend_t backend, TargetWeights & out) override;
bool post_kvflash_init_gate() override;
bool run_ar_decode_path(int committed, int n_gen,
std::vector<int32_t> & out_tokens,
const DaemonIO & io) override;
Expand All @@ -43,6 +44,13 @@ class Qwen35MoeBackend : public Qwen35Backend {
void after_target_compute(StepGraph & sg, int kv_start, int n_tokens) override;

private:
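// All-hot placement signal: set when load_target_model takes the all-hot
// early-return (moe_hybrid null). NOTE(review): post_kvflash_init_gate()
// currently decides from placement_all_hot_full_kv_ and the hybrid cold-expert
// count and does not read this flag — confirm whether it should.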
bool placement_all_hot_ = false;
// True iff all experts fit hot with the FULL max_ctx KV reservation
// (KVFlash redundant). When false but placement_all_hot_ is true, the pool
// is what kept experts hot — the gate must NOT disable KVFlash.
bool placement_all_hot_full_kv_ = false;
std::shared_ptr<MoeHybridRoutingStats> routing_stats_;
std::string routing_stats_out_path_;
std::string placement_out_path_;
Expand Down
66 changes: 66 additions & 0 deletions server/test/test_kvflash_moe_placement.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
# Integration test (GPU + model): KVFlash pool-sized KV reservation keeps MoE
# experts hot at high max_ctx, where the full-KV reservation would force them
# cold. Validates the qwen35moe placement-reservation fix end to end.
#
# Hardware-gated (needs a ~24GB GPU + the 35B-A3B MoE GGUF). Not wired into
# ctest; run manually or on a numbered-run box per CONTRIBUTING.md.
#
# TARGET=/path/Qwen3.6-35B-A3B-...Q3_K_M.gguf bash test_kvflash_moe_placement.sh
set -euo pipefail

# Repository root: two directories above this script's location.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Server binary under test; override with DFLASH_SERVER_BIN.
SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO/server/build/dflash_server}"
# Model and chat template; the defaults are developer-local paths, so set
# TARGET / CHAT_TEMPLATE in the environment on any other machine.
TARGET="${TARGET:-/home/peppi/models/qwen3.6-35b-a3b/Qwen3.6-35B-A3B-UD-Q3_K_M.gguf}"
CHAT_TEMPLATE="${CHAT_TEMPLATE:-/home/peppi/models/qwen3-coder-chat-template.jinja}"
# Address the server listens on and the readiness probe polls.
HOST=127.0.0.1; PORT="${PORT:-18080}"
MAX_CTX="${MAX_CTX:-131072}" # large enough that full-KV forces experts cold
# KVFlash pool size in tokens, passed as --kvflash in Arm B.
POOL="${POOL:-49152}"
# Lock file handed to flock when launching the server; override with DG_GPU_LOCK.
LOCK="${DG_GPU_LOCK:-/tmp/dg_gpu.lock}"

# Prerequisite gate: this test needs a built server, the model, the chat
# template and a few CLI tools. A missing prerequisite is a SKIP (exit 0), not a
# failure, so machines without the assets do not report a false FAIL.
[ -x "$SERVER_BIN" ] || { echo "SKIP: server not built: $SERVER_BIN"; exit 0; }
[ -f "$TARGET" ] || { echo "SKIP: target GGUF not found: $TARGET"; exit 0; }
# The template is always passed via --chat-template-file and defaults to a
# developer-local path, so gate on it exactly like the model file.
[ -f "$CHAT_TEMPLATE" ] || { echo "SKIP: chat template not found: $CHAT_TEMPLATE"; exit 0; }
# Tools used to launch, probe and stop the server.
for tool in flock curl pkill; do
  command -v "$tool" >/dev/null 2>&1 || { echo "SKIP: required tool not found: $tool"; exit 0; }
done

# Report a failure on stderr and abort with exit status 1.
fail() {
  echo "FAIL: $*" 1>&2
  exit 1
}

# Launch the server, wait for /v1/models, capture the placement line, kill it.
# Echoes the dynamic-placement result line.
placement_line() {
  # Starts the server with any extra arguments given to this function, polls
  # /v1/models for up to 90 attempts two seconds apart, stops the server, then
  # prints the placement-related log lines on stdout. Calls fail (exit 1) when
  # the server never becomes ready.
  local log pid ready=0
  log="$(mktemp)"

  # Extra arguments are forwarded as "$@" rather than through a local array:
  # expanding an empty array under `set -u` is an "unbound variable" error on
  # bash < 4.4, and Arm A calls this function with no arguments.
  ( flock "$LOCK" "$SERVER_BIN" "$TARGET" \
      --host "$HOST" --port "$PORT" --max-ctx "$MAX_CTX" --model-name luce \
      --chat-template-file "$CHAT_TEMPLATE" "$@" >"$log" 2>&1 ) &
  pid=$!

  for _ in $(seq 1 90); do
    curl -fsS "http://$HOST:$PORT/v1/models" >/dev/null 2>&1 && { ready=1; break; }
    # Stop polling early if the launcher has already exited.
    kill -0 "$pid" 2>/dev/null || break
    sleep 2
  done

  # Server-launcher runs under flock in a subshell; find + kill the real server.
  pkill -9 -f "$SERVER_BIN .*--port $PORT" 2>/dev/null || true
  wait "$pid" 2>/dev/null || true

  [ "$ready" = 1 ] || { cat "$log" >&2; rm -f "$log"; fail "server did not become ready"; }

  # No matching line is not an error here (callers validate the output); the
  # `|| true` keeps the log cleanup below reachable even if errexit is active.
  grep -E 'dynamic placement result|kvflash.*disabled|placement reserves pool' "$log" || true
  rm -f "$log"
}

# Pull the cold-expert count out of a "dynamic placement result" line; prints
# nothing when the line is absent.
cold_count() {
  sed -nE 's/.*result: [0-9]+ hot experts, ([0-9]+) cold experts.*/\1/p' <<<"$1"
}

# Arm A: baseline without KVFlash. The full max_ctx KV reservation is expected
# to push some experts cold.
echo "== Arm A: no KVFlash @max_ctx $MAX_CTX (expect the cliff: cold experts > 0) =="
A="$(placement_line)"
echo "$A"
cold_a="$(cold_count "$A")"
if ! [ -n "$cold_a" ]; then
  fail "could not parse Arm A cold-expert count"
fi
if ! [ "$cold_a" -gt 0 ]; then
  fail "expected cold experts without KVFlash at max_ctx $MAX_CTX, got $cold_a"
fi
echo " -> $cold_a cold experts (cliff confirmed)"

# Arm B: same max_ctx with a KVFlash pool. Placement must log the pool-sized
# reservation and end with zero cold experts.
echo "== Arm B: --kvflash $POOL @max_ctx $MAX_CTX (expect pool reservation, 0 cold) =="
B="$(placement_line --kvflash "$POOL")"
echo "$B"
if ! grep -q 'placement reserves pool KV' <<<"$B"; then
  fail "KVFlash did not reduce the KV reservation to the pool"
fi
cold_b="$(cold_count "$B")"
if ! [ -n "$cold_b" ]; then
  fail "could not parse Arm B cold-expert count"
fi
if ! [ "$cold_b" -eq 0 ]; then
  fail "expected 0 cold experts with KVFlash, got $cold_b"
fi
echo " -> 0 cold experts (experts stay hot via pool reservation)"

echo "PASS: kvflash MoE placement — pool reservation keeps experts hot (A: $cold_a cold -> B: 0 cold)"
Loading