Luce-Org · dusterbloom · Jun 20, 2026 · Jun 22, 2026
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -611,6 +611,11 @@ if(DFLASH27B_TESTS)
         target_include_directories(test_derived_scalars PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
         add_test(NAME derived_scalars COMMAND test_derived_scalars)
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash_placement.cpp")  # optional: only built when the source is present
+        add_executable(test_kvflash_placement test/test_kvflash_placement.cpp)
+        target_include_directories(test_kvflash_placement PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
+        add_test(NAME kvflash_placement COMMAND test_kvflash_placement)  # single-source test: include dirs only, nothing linked here
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_bandit_integration.cpp")
         add_executable(test_bandit_integration test/test_bandit_integration.cpp)
         target_include_directories(test_bandit_integration PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
@@ -787,6 +792,12 @@ if(DFLASH27B_TESTS)
         add_executable(test_kvflash_qk test/test_kvflash_qk.cpp)
         target_include_directories(test_kvflash_qk PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash_pool_sizing.cpp")
+        # Pure unit test for kvflash_pool_from_env (no ggml link, no GPU); runs under ctest as kvflash_pool_sizing.
+        add_executable(test_kvflash_pool_sizing test/test_kvflash_pool_sizing.cpp)
+        target_include_directories(test_kvflash_pool_sizing PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
+        add_test(NAME kvflash_pool_sizing COMMAND test_kvflash_pool_sizing)
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_restore_delta.cpp")
         add_executable(test_restore_delta test/test_restore_delta.cpp)
         target_include_directories(test_restore_delta PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})

diff --git a/server/src/common/kvflash_placement.h b/server/src/common/kvflash_placement.h
@@ -0,0 +1,68 @@
+// KVFlash placement KV-reservation rule (architecture-agnostic, header-only).
+//
+// Any MoE / weight-offload backend that places experts against a VRAM budget
+// must decide how much KV to reserve.  Reserving for `max_ctx` forces experts
+// cold at high max_ctx even when KVFlash bounds the *resident* KV to a fixed
+// pool.  This helper centralises the rule so every backend (qwen35moe today,
+// DeepSeek-V4 / future MoE next) inherits the "pool bounds the expert-placement
+// cliff" win without re-deriving the byte math.
+#pragma once
+
+#include <cstdint>
+
+namespace dflash::common {
+
+// Result of kvflash_placement_decision(): what to reserve for KV, and why.
+struct KvfPlacementDecision {
+    uint64_t kv_total       = 0;      // bytes to reserve for the KV cache
+    int      kv_ctx         = 0;      // tokens the reservation covers (pool or max_ctx)
+    bool     all_hot_full_kv = false; // would ALL experts be hot with the FULL max_ctx KV?
+    bool     pool_reduced   = false;  // did we reserve for the pool instead of max_ctx?
+};
+
+// Decide the KV reservation for VRAM-budget expert placement.
+//
+// kv_bytes_per_tok: KV bytes reserved per token of context.
+// max_ctx: configured context in tokens; values <= 0 reserve nothing.
+// kvf_pool: resident KVFlash pool in tokens (0 = KVFlash inactive).
+// gpu_total: total device bytes; core/warm/safety/draft_bytes: fixed
+//   reservations taken out before experts; total_expert_bytes: bytes needed
+//   for every expert to be hot.
+// all_hot_full_kv reports whether the full max_ctx KV already fits all experts
+//   hot — i.e. KVFlash is redundant; the caller's gate uses it to disable the
+//   pool when unneeded (so a pool that is *itself* keeping experts hot is never
+//   disabled).
+// When KVFlash is active AND the full reservation would force experts cold, the
+//   reservation is reduced to the pool so experts stay hot.
+inline KvfPlacementDecision kvflash_placement_decision(
+    uint64_t kv_bytes_per_tok, int max_ctx, int kvf_pool,
+    uint64_t gpu_total, uint64_t core_bytes, uint64_t total_expert_bytes,
+    uint64_t warm_bytes, uint64_t safety_bytes, uint64_t draft_bytes)
+{
+    KvfPlacementDecision d;
+    // Clamp before widening: casting a negative int straight to uint64_t turns
+    // it into a near-2^64 token count and wraps the byte math below.
+    const int      ctx     = max_ctx > 0 ? max_ctx : 0;
+    const uint64_t kv_full = kv_bytes_per_tok * (uint64_t)ctx;
+    const uint64_t fixed   = core_bytes + warm_bytes + safety_bytes + draft_bytes;
+
+    // Expert budget left once the fixed reservations and the FULL KV are taken
+    // out.  Compared by subtraction so `fixed + kv_full` is never formed; that
+    // sum could wrap for out-of-range inputs and fake a fit.
+    uint64_t eb_full = 0;
+    if (gpu_total > fixed && gpu_total - fixed > kv_full) {
+        eb_full = gpu_total - fixed - kv_full;
+    }
+    d.all_hot_full_kv = (eb_full >= total_expert_bytes);
+
+    d.kv_ctx   = ctx;
+    d.kv_total = kv_full;
+    if (kvf_pool > 0 && kvf_pool < ctx && !d.all_hot_full_kv) {
+        d.kv_ctx       = kvf_pool;
+        d.kv_total     = kv_bytes_per_tok * (uint64_t)kvf_pool;
+        d.pool_reduced = true;
+    }
+    return d;
+}
+
+} // namespace dflash::common
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
@@ -149,6 +149,24 @@ Qwen35Backend::Qwen35Backend(const Qwen35Config & cfg) : cfg_(cfg) {}
 
 Qwen35Backend::~Qwen35Backend() { shutdown(); }
 
+// Assemble the inputs for "auto" KVFlash pool sizing: the free device memory,
+// a fixed reserve (1.5 GiB, plus 1.7 GiB when a drafter path is set) and the
+// per-token KV row size. MoE placement calls this too, so both sites agree.
+KvFlashAutoBudget Qwen35Backend::make_kvflash_budget(const TargetWeights & w,
+                                                     int64_t gpu_free) const {
+    ggml_type type_k = GGML_TYPE_Q8_0, type_v = GGML_TYPE_Q8_0;
+    dflash::resolve_kv_types(type_k, type_v);
+    const int    full_layers = w.n_layer / w.full_attention_interval;
+    const size_t kv_row      = ggml_row_size(type_k, w.n_embd_head_k) +
+                               ggml_row_size(type_v, w.n_embd_head_v);
+    KvFlashAutoBudget budget;
+    budget.free_bytes      = gpu_free;
+    budget.bytes_per_token = (int64_t)full_layers * w.n_head_kv * (int64_t)kv_row;
+    budget.reserve_bytes   = (int64_t)(1.5 * 1073741824.0) +
+        (!kvflash_drafter_path_.empty() ? (int64_t)(1.7 * 1073741824.0) : 0);
+    return budget;
+}
+
 // ── init() ──────────────────────────────────────────────────────────────
 
 bool Qwen35Backend::init() {
@@ -242,29 +260,20 @@ bool Qwen35Backend::init() {
     // point and the cache is not yet allocated, so device-free minus a
     // reserve (compute buffers + the drafter when expected) is what the
     // pool can really use, converted at this model's pooled-KV density.
-    KvFlashAutoBudget kvf_budget;
-    {
-        size_t gpu_free = 0, gpu_total = 0;
-        if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) {
-            ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
-        }
-        ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0;
-        dflash::resolve_kv_types(kv_k, kv_v);
-        const int n_full = w_.n_layer / w_.full_attention_interval;
-        kvf_budget.free_bytes      = (int64_t)gpu_free;
-        kvf_budget.bytes_per_token = (int64_t)n_full * w_.n_head_kv *
-            (int64_t)(ggml_row_size(kv_k, w_.n_embd_head_k) +
-                      ggml_row_size(kv_v, w_.n_embd_head_v));
-        kvf_budget.reserve_bytes   = (int64_t)(1.5 * 1073741824.0) +
-            (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));
+    size_t gpu_free = 0, gpu_total = 0;
+    if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) {
+        ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
     }
+    KvFlashAutoBudget kvf_budget = make_kvflash_budget(w_, (int64_t)gpu_free);
     kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{},
-                                            !kvflash_drafter_path_.empty() ||
-                                            kvflash_qk_policy_,
+                                            kvflash_scorer_expected(),
                                             kvf_budget);
     if (kvflash_tokens_ > 0) {
         kvflash_tau_ = std::max(1, env_int_or_default("DFLASH_KVFLASH_TAU", 64));
     }
+    // Subclass gate (e.g. MoE all-hot): may zero kvflash_tokens_ before the KV
+    // cache is sized, so create_target_cache allocates full max_ctx KV.
+    if (!post_kvflash_init_gate()) return false;
     if (!create_target_cache(w_, cfg_.device.max_ctx, max_verify_tokens, target_backend_, cache_,
                              /*prefill_only=*/true, /*ctx_alloc=*/kvflash_tokens_)) {
         std::fprintf(stderr, "cache: %s\n", dflash27b_last_error());

diff --git a/server/src/qwen35/qwen35_backend.h b/server/src/qwen35/qwen35_backend.h
@@ -133,6 +133,9 @@ class Qwen35Backend : public ModelBackend {
                                     std::vector<int32_t> & out_tokens,
                                     const DaemonIO & io);
     virtual bool should_capture_moe_router() const { return false; }
+    // Hook run by init() after kvflash pool sizing and before create_target_cache: a subclass
+    // may disable the pool (kvflash_tokens_=0) when it is redundant. Returning false aborts init(). Default no-op.
+    virtual bool post_kvflash_init_gate() { return true; }
     virtual void after_target_compute(StepGraph &,
                                       int /*kv_start*/,
                                       int /*n_tokens*/) {}
@@ -181,6 +184,12 @@ class Qwen35Backend : public ModelBackend {
     int  kvflash_tau_    = 64;
     bool kvflash_drafter_failed_ = false;           // don't retry a failed load
     bool kvflash_active() const { return kvflash_tokens_ > 0; }
+    // Inputs to KVFlash pool sizing, shared so MoE placement can size its KV
+    // reservation with the same scorer flag and budget helper that init() uses.
+    bool kvflash_scorer_expected() const {
+        return kvflash_qk_policy_ || !kvflash_drafter_path_.empty();
+    }
+    KvFlashAutoBudget make_kvflash_budget(const TargetWeights & w, int64_t gpu_free) const;
     // Target-QK policy (--kvflash-policy qk): residency scored with the
     // target's own pooled post-RoPE keys vs the current decode query
     // (kvflash_qk.h); no drafter. Keys pool at chunk-seal time; the query

diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
@@ -4,6 +4,7 @@
 #include "../common/moe_hybrid_stream.h"
 #include "../common/moe_hybrid_types.h"
 #include "../common/moe_hybrid_types_impl.h"
+#include "../common/kvflash_placement.h"
 #include "common/ggml_graph_precision.h"
 #include "common/sampler.h"
 #include "common/dflash_spec_decode.h"
@@ -74,6 +75,9 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
     if (placement.total_hot >= out.n_layer * out.n_expert) {
         std::printf("[qwen35moe] all experts fit in VRAM, loading fully to GPU\n");
         std::fflush(stdout);
+        // Record that placement took the all-hot early return (moe_hybrid stays null
+        // here). NOTE(review): post_kvflash_init_gate() does not read this flag.
+        placement_all_hot_ = true;
         free_target_weights(out);
         return load_target_gguf(cfg_.target_path, backend, out);
     }
@@ -328,6 +332,36 @@ bool Qwen35MoeBackend::spark_bootstrap_finalize(const std::string & profile_path
     return true;
 }
 
+bool Qwen35MoeBackend::post_kvflash_init_gate() {
+    // Switch the KVFlash pool off when it buys nothing: dynamic placement found
+    // every expert hot under the FULL max_ctx KV reservation
+    // (placement_all_hot_full_kv_, set in load_dynamic_placement()), or a hybrid
+    // was built whose layers list no cold experts.  A pool that is itself what
+    // keeps experts hot (_full_kv_ false) must stay enabled.  Always returns true.
+    if (!kvflash_active()) return true;
+
+    bool pool_redundant = placement_all_hot_full_kv_;
+    if (!pool_redundant && target_weights().moe_hybrid) {
+        // NOTE(review): this fallback does not consult placement_all_hot_full_kv_;
+        // confirm a zero-cold hybrid cannot come from a pool-sized reservation.
+        pool_redundant = true;
+        for (const auto & layer : target_weights().moe_hybrid->layers) {
+            if (layer.cold_expert_ids.size() != 0) { pool_redundant = false; break; }
+        }
+    }
+    if (!pool_redundant) return true;
+
+    std::printf("[kvflash] disabled: placement all-hot at max_ctx %d, pool not needed\n",
+                cfg_.device.max_ctx);
+    std::fflush(stdout);
+    // Zeroed before the KV cache is sized, so it is allocated for full max_ctx.
+    kvflash_tokens_ = 0;
+    // Put tau back to its initial 64 and drop the drafter path with the pool.
+    kvflash_tau_    = 64;
+    kvflash_drafter_path_.clear();
+    return true;
+}
+
 void Qwen35MoeBackend::maybe_post_request_swap() {
     if (!routing_stats_) return;
 
@@ -2132,7 +2166,12 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
     // KV cache: n_layer × 2 (K+V) × n_head_kv × head_dim × sizeof(fp16) × max_context
     const uint64_t kv_bytes_per_tok = (uint64_t)w.n_layer * 2 *
         (uint64_t)w.n_head_kv * (uint64_t)w.n_embd_head_k * 2;
-    const uint64_t kv_total = kv_bytes_per_tok * (uint64_t)max_context;
+    // Size the reservation with the SAME inputs runtime uses (scorer policy +
+    // VRAM budget); the bare-max_context call took the no-budget fallback
+    // (max_ctx/2) and over-reserved KV, starving experts of hot placement.
+    const int      kvf_pool      = kvflash_pool_from_env(
+        max_context, KvFlashConfig{}, kvflash_scorer_expected(),
+        make_kvflash_budget(w, (int64_t)gpu_free));
 
     const uint64_t warm_cache_bytes = 200ULL * 1024 * 1024;  // 200 MB warm/staging
     uint64_t safety_bytes = 512ULL * 1024 * 1024;      // 512 MB safety margin
@@ -2148,6 +2187,24 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
     // Core model bytes = what's already used on GPU (non-expert tensors)
     const uint64_t core_bytes = gpu_total - gpu_free;
 
+    // KVFlash reserves a fixed resident pool, not max_ctx, of KV.  When active
+    // and the full reservation would force experts cold, reserve for the pool so
+    // experts stay hot (decouples max_ctx from the expert-placement cliff).  The
+    // rule is centralised in kvflash_placement_decision() so future MoE backends
+    // (DeepSeek-V4, ...) inherit it instead of re-deriving the byte math.
+    const auto kvf_dec = dflash::common::kvflash_placement_decision(
+        kv_bytes_per_tok, max_context, kvf_pool,
+        gpu_total, core_bytes, total_expert_bytes,
+        warm_cache_bytes, safety_bytes, draft_reserve_bytes);
+    const uint64_t kv_total   = kvf_dec.kv_total;
+    const int      kv_ctx_log = kvf_dec.kv_ctx;
+    placement_all_hot_full_kv_ = kvf_dec.all_hot_full_kv;
+    if (kvf_dec.pool_reduced) {
+        std::printf("[kvflash] placement reserves pool KV (%d tokens, not max_ctx %d) "
+                    "-> experts stay hot\n", kvf_pool, max_context);
+        std::fflush(stdout);
+    }
+
     uint64_t expert_budget = 0;
     if (gpu_total > core_bytes + kv_total + warm_cache_bytes + safety_bytes + draft_reserve_bytes) {
         expert_budget = gpu_total - core_bytes - kv_total - warm_cache_bytes - safety_bytes - draft_reserve_bytes;
@@ -2186,7 +2243,7 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
                 gpu_total / 1024.0 / 1024.0 / 1024.0,
                 core_bytes / 1024.0 / 1024.0 / 1024.0,
                 kv_total / 1024.0 / 1024.0 / 1024.0,
-                max_context,
+                kv_ctx_log,
                 warm_cache_bytes / 1024.0 / 1024.0,
                 safety_bytes / 1024.0 / 1024.0,
                 expert_budget / 1024.0 / 1024.0 / 1024.0,

diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h
@@ -34,6 +34,7 @@ class Qwen35MoeBackend : public Qwen35Backend {
 
 protected:
     bool load_target_model(ggml_backend_t backend, TargetWeights & out) override;
+    bool post_kvflash_init_gate() override;
     bool run_ar_decode_path(int committed, int n_gen,
                             std::vector<int32_t> & out_tokens,
                             const DaemonIO & io) override;
@@ -43,6 +44,13 @@ class Qwen35MoeBackend : public Qwen35Backend {
     void after_target_compute(StepGraph & sg, int kv_start, int n_tokens) override;
 
 private:
+    // Set when load_target_model takes the all-hot early-return (moe_hybrid null).
+    // NOTE(review): post_kvflash_init_gate() never reads it; it keys off _full_kv_.
+    bool placement_all_hot_ = false;
+    // True iff all experts fit hot with the FULL max_ctx KV reservation
+    // (KVFlash redundant). When false but placement_all_hot_ is true, the pool
+    // is what kept experts hot — the gate must NOT disable KVFlash.
+    bool placement_all_hot_full_kv_ = false;
     std::shared_ptr<MoeHybridRoutingStats> routing_stats_;
     std::string routing_stats_out_path_;
     std::string placement_out_path_;

diff --git a/server/test/test_kvflash_moe_placement.sh b/server/test/test_kvflash_moe_placement.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# Integration test (GPU + model): KVFlash pool-sized KV reservation keeps MoE
+# experts hot at high max_ctx, where the full-KV reservation would force them
+# cold. Validates the qwen35moe placement-reservation fix end to end.
+#
+# Hardware-gated (needs a ~24GB GPU + the 35B-A3B MoE GGUF). Not wired into
+# ctest; run manually or on a numbered-run box per CONTRIBUTING.md.
+#
+#   TARGET=/path/Qwen3.6-35B-A3B-...Q3_K_M.gguf bash test_kvflash_moe_placement.sh
+# Env overrides: DFLASH_SERVER_BIN, TARGET, CHAT_TEMPLATE, PORT, MAX_CTX, POOL, DG_GPU_LOCK.
+set -euo pipefail
+
+REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO/server/build/dflash_server}"
+TARGET="${TARGET:-/home/peppi/models/qwen3.6-35b-a3b/Qwen3.6-35B-A3B-UD-Q3_K_M.gguf}"
+CHAT_TEMPLATE="${CHAT_TEMPLATE:-/home/peppi/models/qwen3-coder-chat-template.jinja}"
+HOST=127.0.0.1; PORT="${PORT:-18080}"
+MAX_CTX="${MAX_CTX:-131072}"   # large enough that full-KV forces experts cold
+POOL="${POOL:-49152}"
+LOCK="${DG_GPU_LOCK:-/tmp/dg_gpu.lock}"
+
+# Skip (exit 0) when a prerequisite is missing; the chat template is passed to every launch below.
+[ -x "$SERVER_BIN" ]    || { echo "SKIP: server not built: $SERVER_BIN"; exit 0; }
+[ -f "$TARGET" ]        || { echo "SKIP: target GGUF not found: $TARGET"; exit 0; }
+[ -f "$CHAT_TEMPLATE" ] || { echo "SKIP: chat template not found: $CHAT_TEMPLATE"; exit 0; }
+
+fail() { echo "FAIL: $*" >&2; exit 1; }
+
+# Launch the server with any extra CLI args ("$@"), wait for /v1/models, kill it,
+# then echo the placement-related log lines (result / disabled / pool reservation).
+placement_line() {
+    local extra=("$@") log; log="$(mktemp)"
+    # ${extra[@]+...}: expanding an empty array under `set -u` is an unbound-variable error on bash < 4.4.
+    ( flock "$LOCK" "$SERVER_BIN" "$TARGET" \
+        --host "$HOST" --port "$PORT" --max-ctx "$MAX_CTX" --model-name luce \
+        --chat-template-file "$CHAT_TEMPLATE" ${extra[@]+"${extra[@]}"} >"$log" 2>&1 ) &
+    local pid=$! ready=0
+    for _ in $(seq 1 90); do
+        curl -fsS "http://$HOST:$PORT/v1/models" >/dev/null 2>&1 && { ready=1; break; }
+        kill -0 "$pid" 2>/dev/null || break
+        sleep 2
+    done
+    # Server-launcher runs under flock in a subshell; find + kill the real server.
+    pkill -9 -f "$SERVER_BIN .*--port $PORT" 2>/dev/null || true
+    wait "$pid" 2>/dev/null || true
+    [ "$ready" = 1 ] || { cat "$log" >&2; rm -f "$log"; fail "server did not become ready"; }
+    grep -E 'dynamic placement result|kvflash.*disabled|placement reserves pool' "$log" || true  # no match: still clean up
+    rm -f "$log"
+}
+
+echo "== Arm A: no KVFlash @max_ctx $MAX_CTX (expect the cliff: cold experts > 0) =="
+arm_a="$(placement_line)"
+echo "$arm_a"
+cold_a="$(sed -nE 's/.*result: [0-9]+ hot experts, ([0-9]+) cold experts.*/\1/p' <<<"$arm_a")"
+if [ -z "$cold_a" ]; then fail "could not parse Arm A cold-expert count"; fi
+if ! [ "$cold_a" -gt 0 ]; then fail "expected cold experts without KVFlash at max_ctx $MAX_CTX, got $cold_a"; fi
+echo "  -> $cold_a cold experts (cliff confirmed)"
+
+echo "== Arm B: --kvflash $POOL @max_ctx $MAX_CTX (expect pool reservation, 0 cold) =="
+arm_b="$(placement_line --kvflash "$POOL")"
+echo "$arm_b"
+# The pool-reservation log line must be present and no expert may be cold.
+if ! grep -q 'placement reserves pool KV' <<<"$arm_b"; then fail "KVFlash did not reduce the KV reservation to the pool"; fi
+cold_b="$(sed -nE 's/.*result: [0-9]+ hot experts, ([0-9]+) cold experts.*/\1/p' <<<"$arm_b")"
+if [ -z "$cold_b" ]; then fail "could not parse Arm B cold-expert count"; fi
+if ! [ "$cold_b" -eq 0 ]; then fail "expected 0 cold experts with KVFlash, got $cold_b"; fi
+echo "  -> 0 cold experts (experts stay hot via pool reservation)"
+
+echo "PASS: kvflash MoE placement — pool reservation keeps experts hot (A: $cold_a cold -> B: 0 cold)"