diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index d30efef7a..1382df5f2 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -611,6 +611,11 @@ if(DFLASH27B_TESTS)
         target_include_directories(test_derived_scalars PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
         add_test(NAME derived_scalars COMMAND test_derived_scalars)
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash_placement.cpp")
+        add_executable(test_kvflash_placement test/test_kvflash_placement.cpp)
+        target_include_directories(test_kvflash_placement PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
+        add_test(NAME kvflash_placement COMMAND test_kvflash_placement)
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_bandit_integration.cpp")
         add_executable(test_bandit_integration test/test_bandit_integration.cpp)
         target_include_directories(test_bandit_integration PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
@@ -787,6 +792,12 @@ if(DFLASH27B_TESTS)
         add_executable(test_kvflash_qk test/test_kvflash_qk.cpp)
         target_include_directories(test_kvflash_qk PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kvflash_pool_sizing.cpp")
+        # Pure unit test for kvflash_pool_from_env: no ggml link, no GPU.
+        add_executable(test_kvflash_pool_sizing test/test_kvflash_pool_sizing.cpp)
+        target_include_directories(test_kvflash_pool_sizing PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
+        add_test(NAME kvflash_pool_sizing COMMAND test_kvflash_pool_sizing)
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_restore_delta.cpp")
         add_executable(test_restore_delta test/test_restore_delta.cpp)
         target_include_directories(test_restore_delta PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
diff --git a/server/src/common/kvflash_placement.h b/server/src/common/kvflash_placement.h
new file mode 100644
index 000000000..70fb78724
--- /dev/null
+++ b/server/src/common/kvflash_placement.h
@@ -0,0 +1,54 @@
+// KVFlash placement KV-reservation rule (architecture-agnostic, header-only).
+//
+// Any MoE / weight-offload backend that places experts against a VRAM budget
+// must decide how much KV to reserve. Reserving for `max_ctx` forces experts
+// cold at high max_ctx even when KVFlash bounds the *resident* KV to a fixed
+// pool. This helper centralises the rule so every backend (qwen35moe today,
+// DeepSeek-V4 / future MoE next) inherits the "pool bounds the expert-placement
+// cliff" win without re-deriving the byte math.
+#pragma once
+
+#include <cstdint>
+
+namespace dflash::common {
+
+struct KvfPlacementDecision {
+    uint64_t kv_total        = 0;     // bytes to reserve for the KV cache
+    int      kv_ctx          = 0;     // tokens the reservation covers (pool or max_ctx)
+    bool     all_hot_full_kv = false; // would ALL experts be hot with the FULL max_ctx KV?
+    bool     pool_reduced    = false; // did we reserve for the pool instead of max_ctx?
+};
+
+// Decide the KV reservation for VRAM-budget expert placement.
+//
+// kvf_pool: resident KVFlash pool in tokens (0 = KVFlash inactive).
+// all_hot_full_kv reports whether the full max_ctx KV already fits all experts
+// hot — i.e. KVFlash is redundant; the caller's gate uses it to disable the
+// pool when unneeded (so a pool that is *itself* keeping experts hot is never
+// disabled).
+// When KVFlash is active AND the full reservation would force experts cold, the
+// reservation is reduced to the pool so experts stay hot.
+inline KvfPlacementDecision kvflash_placement_decision(
+    uint64_t kv_bytes_per_tok, int max_ctx, int kvf_pool,
+    uint64_t gpu_total, uint64_t core_bytes, uint64_t total_expert_bytes,
+    uint64_t warm_bytes, uint64_t safety_bytes, uint64_t draft_bytes)
+{
+    KvfPlacementDecision d;
+    const uint64_t kv_full = kv_bytes_per_tok * (uint64_t)max_ctx;
+    const uint64_t fixed = core_bytes + warm_bytes + safety_bytes + draft_bytes;
+
+    uint64_t eb_full = 0;
+    if (gpu_total > fixed + kv_full) eb_full = gpu_total - fixed - kv_full;
+    d.all_hot_full_kv = (eb_full >= total_expert_bytes);
+
+    d.kv_ctx = max_ctx;
+    d.kv_total = kv_full;
+    if (kvf_pool > 0 && kvf_pool < max_ctx && !d.all_hot_full_kv) {
+        d.kv_ctx = kvf_pool;
+        d.kv_total = kv_bytes_per_tok * (uint64_t)kvf_pool;
+        d.pool_reduced = true;
+    }
+    return d;
+}
+
+} // namespace dflash::common
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index 4b1978ed8..74175d773 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -149,6 +149,24 @@ Qwen35Backend::Qwen35Backend(const Qwen35Config & cfg) : cfg_(cfg) {}
 
 Qwen35Backend::~Qwen35Backend() { shutdown(); }
 
+// "auto" pool budget: device-free minus a reserve (compute buffers + drafter
+// when expected), converted at this model's pooled-KV density. Shared with MoE
+// placement so reservation and runtime allocation size the pool identically.
+KvFlashAutoBudget Qwen35Backend::make_kvflash_budget(const TargetWeights & w,
+                                                     int64_t gpu_free) const {
+    ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0;
+    dflash::resolve_kv_types(kv_k, kv_v);
+    const int n_full = w.n_layer / w.full_attention_interval;
+    KvFlashAutoBudget b;
+    b.free_bytes = gpu_free;
+    b.bytes_per_token = (int64_t)n_full * w.n_head_kv *
+        (int64_t)(ggml_row_size(kv_k, w.n_embd_head_k) +
+                  ggml_row_size(kv_v, w.n_embd_head_v));
+    b.reserve_bytes = (int64_t)(1.5 * 1073741824.0) +
+        (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));
+    return b;
+}
+
 // ── init() ──────────────────────────────────────────────────────────────
 
 bool Qwen35Backend::init() {
@@ -242,29 +260,20 @@ bool Qwen35Backend::init() {
     // point and the cache is not yet allocated, so device-free minus a
     // reserve (compute buffers + the drafter when expected) is what the
     // pool can really use, converted at this model's pooled-KV density.
-    KvFlashAutoBudget kvf_budget;
-    {
-        size_t gpu_free = 0, gpu_total = 0;
-        if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) {
-            ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
-        }
-        ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0;
-        dflash::resolve_kv_types(kv_k, kv_v);
-        const int n_full = w_.n_layer / w_.full_attention_interval;
-        kvf_budget.free_bytes = (int64_t)gpu_free;
-        kvf_budget.bytes_per_token = (int64_t)n_full * w_.n_head_kv *
-            (int64_t)(ggml_row_size(kv_k, w_.n_embd_head_k) +
-                      ggml_row_size(kv_v, w_.n_embd_head_v));
-        kvf_budget.reserve_bytes = (int64_t)(1.5 * 1073741824.0) +
-            (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));
+    size_t gpu_free = 0, gpu_total = 0;
+    if (ggml_backend_dev_t dev = ggml_backend_get_device(target_backend_)) {
+        ggml_backend_dev_memory(dev, &gpu_free, &gpu_total);
     }
+    KvFlashAutoBudget kvf_budget = make_kvflash_budget(w_, (int64_t)gpu_free);
     kvflash_tokens_ = kvflash_pool_from_env(cfg_.device.max_ctx, KvFlashConfig{},
-                                            !kvflash_drafter_path_.empty() ||
-                                                kvflash_qk_policy_,
+                                            kvflash_scorer_expected(),
                                             kvf_budget);
     if (kvflash_tokens_ > 0) {
         kvflash_tau_ = std::max(1, env_int_or_default("DFLASH_KVFLASH_TAU", 64));
     }
+    // Subclass gate (e.g. MoE all-hot): may zero kvflash_tokens_ before the KV
+    // cache is sized, so create_target_cache allocates full max_ctx KV.
+    if (!post_kvflash_init_gate()) return false;
     if (!create_target_cache(w_, cfg_.device.max_ctx, max_verify_tokens, target_backend_, cache_,
                              /*prefill_only=*/true, /*ctx_alloc=*/kvflash_tokens_)) {
         std::fprintf(stderr, "cache: %s\n", dflash27b_last_error());
diff --git a/server/src/qwen35/qwen35_backend.h b/server/src/qwen35/qwen35_backend.h
index 75be91b12..bc4fcdc0b 100644
--- a/server/src/qwen35/qwen35_backend.h
+++ b/server/src/qwen35/qwen35_backend.h
@@ -133,6 +133,9 @@ class Qwen35Backend : public ModelBackend {
                                     std::vector & out_tokens,
                                     const DaemonIO & io);
     virtual bool should_capture_moe_router() const { return false; }
+    // Hook after kvflash pool sizing, before create_target_cache: a subclass
+    // may disable the pool (kvflash_tokens_=0) when it is redundant. Default no-op.
+    virtual bool post_kvflash_init_gate() { return true; }
     virtual void after_target_compute(StepGraph &, int /*kv_start*/,
                                       int /*n_tokens*/) {}
 
@@ -181,6 +184,12 @@ class Qwen35Backend : public ModelBackend {
     int kvflash_tau_ = 64;
     bool kvflash_drafter_failed_ = false; // don't retry a failed load
     bool kvflash_active() const { return kvflash_tokens_ > 0; }
+    // Pool sizing inputs — shared so MoE placement reserves exactly the pool
+    // runtime allocates (else placement over-reserves KV and starves experts).
+    bool kvflash_scorer_expected() const {
+        return !kvflash_drafter_path_.empty() || kvflash_qk_policy_;
+    }
+    KvFlashAutoBudget make_kvflash_budget(const TargetWeights & w, int64_t gpu_free) const;
     // Target-QK policy (--kvflash-policy qk): residency scored with the
     // target's own pooled post-RoPE keys vs the current decode query
     // (kvflash_qk.h); no drafter. Keys pool at chunk-seal time; the query
diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
index f82ab6dff..933937521 100644
--- a/server/src/qwen35moe/qwen35moe_backend.cpp
+++ b/server/src/qwen35moe/qwen35moe_backend.cpp
@@ -4,6 +4,7 @@
 #include "../common/moe_hybrid_stream.h"
 #include "../common/moe_hybrid_types.h"
 #include "../common/moe_hybrid_types_impl.h"
+#include "../common/kvflash_placement.h"
 #include "common/ggml_graph_precision.h"
 #include "common/sampler.h"
 #include "common/dflash_spec_decode.h"
@@ -74,6 +75,9 @@ bool Qwen35MoeBackend::load_target_model(ggml_backend_t backend, TargetWeights &
     if (placement.total_hot >= out.n_layer * out.n_expert) {
         std::printf("[qwen35moe] all experts fit in VRAM, loading fully to GPU\n");
         std::fflush(stdout);
+        // Record the all-hot early return (moe_hybrid is null on this path).
+        // NOTE(review): write-only in this patch; the gate reads placement_all_hot_full_kv_.
+        placement_all_hot_ = true;
         free_target_weights(out);
         return load_target_gguf(cfg_.target_path, backend, out);
     }
@@ -328,6 +332,36 @@ bool Qwen35MoeBackend::spark_bootstrap_finalize(const std::string & profile_path
     return true;
 }
 
+bool Qwen35MoeBackend::post_kvflash_init_gate() {
+    // Gate: disable the KVFlash pool when dynamic placement confirmed all experts
+    // fit hot even with the FULL max_ctx KV reservation — the pool then reserves
+    // nothing useful (pure slot-map overhead). placement_all_hot_full_kv_ is set
+    // in load_dynamic_placement(). When the pool is what KEEPS experts hot
+    // (placement_all_hot_ true but _full_kv_ false), we must NOT disable it.
+    if (!kvflash_active()) return true;
+
+    bool should_disable = false;
+    if (placement_all_hot_full_kv_) {
+        should_disable = true;
+    } else if (target_weights().moe_hybrid) {
+        int total_cold = 0;
+        for (const auto & ls : target_weights().moe_hybrid->layers) {
+            total_cold += (int)ls.cold_expert_ids.size();
+        }
+        if (total_cold == 0) should_disable = true; // hybrid built but 0 cold
+    }
+
+    if (should_disable) {
+        std::printf("[kvflash] disabled: placement all-hot at max_ctx %d, pool not needed\n",
+                    cfg_.device.max_ctx);
+        std::fflush(stdout);
+        kvflash_tokens_ = 0;
+        kvflash_tau_ = 64;
+        kvflash_drafter_path_.clear();
+    }
+    return true;
+}
+
 void Qwen35MoeBackend::maybe_post_request_swap() {
     if (!routing_stats_) return;
 
@@ -2132,7 +2166,12 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
     // KV cache: n_layer × 2 (K+V) × n_head_kv × head_dim × sizeof(fp16) × max_context
     const uint64_t kv_bytes_per_tok = (uint64_t)w.n_layer * 2 *
         (uint64_t)w.n_head_kv * (uint64_t)w.n_embd_head_k * 2;
-    const uint64_t kv_total = kv_bytes_per_tok * (uint64_t)max_context;
+    // Size the reservation with the SAME inputs runtime uses (scorer policy +
+    // VRAM budget); the bare-max_context call took the no-budget fallback
+    // (max_ctx/2) and over-reserved KV, starving experts of hot placement.
+    const int kvf_pool = kvflash_pool_from_env(
+        max_context, KvFlashConfig{}, kvflash_scorer_expected(),
+        make_kvflash_budget(w, (int64_t)gpu_free));
 
     const uint64_t warm_cache_bytes = 200ULL * 1024 * 1024; // 200 MB warm/staging
     uint64_t safety_bytes = 512ULL * 1024 * 1024; // 512 MB safety margin
@@ -2148,6 +2187,24 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
     // Core model bytes = what's already used on GPU (non-expert tensors)
     const uint64_t core_bytes = gpu_total - gpu_free;
 
+    // KVFlash reserves a fixed resident pool, not max_ctx, of KV. When active
+    // and the full reservation would force experts cold, reserve for the pool so
+    // experts stay hot (decouples max_ctx from the expert-placement cliff). The
+    // rule is centralised in kvflash_placement_decision() so future MoE backends
+    // (DeepSeek-V4, ...) inherit it instead of re-deriving the byte math.
+    const auto kvf_dec = dflash::common::kvflash_placement_decision(
+        kv_bytes_per_tok, max_context, kvf_pool,
+        gpu_total, core_bytes, total_expert_bytes,
+        warm_cache_bytes, safety_bytes, draft_reserve_bytes);
+    const uint64_t kv_total = kvf_dec.kv_total;
+    const int kv_ctx_log = kvf_dec.kv_ctx;
+    placement_all_hot_full_kv_ = kvf_dec.all_hot_full_kv;
+    if (kvf_dec.pool_reduced) {
+        std::printf("[kvflash] placement reserves pool KV (%d tokens, not max_ctx %d) "
+                    "-> experts stay hot\n", kvf_pool, max_context);
+        std::fflush(stdout);
+    }
+
     uint64_t expert_budget = 0;
     if (gpu_total > core_bytes + kv_total + warm_cache_bytes + safety_bytes + draft_reserve_bytes) {
         expert_budget = gpu_total - core_bytes - kv_total - warm_cache_bytes - safety_bytes - draft_reserve_bytes;
@@ -2186,7 +2243,7 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
                 gpu_total / 1024.0 / 1024.0 / 1024.0,
                 core_bytes / 1024.0 / 1024.0 / 1024.0,
                 kv_total / 1024.0 / 1024.0 / 1024.0,
-                max_context,
+                kv_ctx_log,
                 warm_cache_bytes / 1024.0 / 1024.0,
                 safety_bytes / 1024.0 / 1024.0,
                 expert_budget / 1024.0 / 1024.0 / 1024.0,
diff --git a/server/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h
index 090225170..a731b4f7a 100644
--- a/server/src/qwen35moe/qwen35moe_backend.h
+++ b/server/src/qwen35moe/qwen35moe_backend.h
@@ -34,6 +34,7 @@ class Qwen35MoeBackend : public Qwen35Backend {
 
 protected:
     bool load_target_model(ggml_backend_t backend, TargetWeights & out) override;
+    bool post_kvflash_init_gate() override;
     bool run_ar_decode_path(int committed, int n_gen,
                             std::vector & out_tokens,
                             const DaemonIO & io) override;
@@ -43,6 +44,13 @@ class Qwen35MoeBackend : public Qwen35Backend {
     void after_target_compute(StepGraph & sg, int kv_start, int n_tokens) override;
 
 private:
+    // Set when load_target_model takes the all-hot early-return (moe_hybrid
+    // null). NOTE(review): never read in this patch — wire into the gate or drop.
+    bool placement_all_hot_ = false;
+    // True iff all experts fit hot with the FULL max_ctx KV reservation
+    // (KVFlash redundant). When false but placement_all_hot_ is true, the pool
+    // is what kept experts hot — the gate must NOT disable KVFlash.
+    bool placement_all_hot_full_kv_ = false;
     std::shared_ptr routing_stats_;
     std::string routing_stats_out_path_;
     std::string placement_out_path_;
diff --git a/server/test/test_kvflash_moe_placement.sh b/server/test/test_kvflash_moe_placement.sh
new file mode 100755
index 000000000..3bcc54cec
--- /dev/null
+++ b/server/test/test_kvflash_moe_placement.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# Integration test (GPU + model): KVFlash pool-sized KV reservation keeps MoE
+# experts hot at high max_ctx, where the full-KV reservation would force them
+# cold. Validates the qwen35moe placement-reservation fix end to end.
+#
+# Hardware-gated (needs a ~24GB GPU + the 35B-A3B MoE GGUF). Not wired into
+# ctest; run manually or on a numbered-run box per CONTRIBUTING.md.
+#
+#   TARGET=/path/Qwen3.6-35B-A3B-...Q3_K_M.gguf bash test_kvflash_moe_placement.sh
+set -euo pipefail
+
+REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO/server/build/dflash_server}"
+TARGET="${TARGET:-/home/peppi/models/qwen3.6-35b-a3b/Qwen3.6-35B-A3B-UD-Q3_K_M.gguf}"
+CHAT_TEMPLATE="${CHAT_TEMPLATE:-/home/peppi/models/qwen3-coder-chat-template.jinja}"
+HOST=127.0.0.1; PORT="${PORT:-18080}"
+MAX_CTX="${MAX_CTX:-131072}"   # large enough that full-KV forces experts cold
+POOL="${POOL:-49152}"
+LOCK="${DG_GPU_LOCK:-/tmp/dg_gpu.lock}"
+
+[ -x "$SERVER_BIN" ] || { echo "SKIP: server not built: $SERVER_BIN"; exit 0; }
+[ -f "$TARGET" ] || { echo "SKIP: target GGUF not found: $TARGET"; exit 0; }
+[ -f "$CHAT_TEMPLATE" ] || { echo "SKIP: chat template not found: $CHAT_TEMPLATE"; exit 0; }
+
+fail() { echo "FAIL: $*" >&2; exit 1; }
+
+# Launch the server, wait for /v1/models, capture the placement line, kill it.
+# Echoes the dynamic-placement result line.
+placement_line() {
+    # ${extra[@]+...}: a bare "${extra[@]}" on an empty array trips `set -u` on bash < 4.4.
+    local extra=("$@") log; log="$(mktemp)"
+    ( flock "$LOCK" "$SERVER_BIN" "$TARGET" \
+        --host "$HOST" --port "$PORT" --max-ctx "$MAX_CTX" --model-name luce \
+        --chat-template-file "$CHAT_TEMPLATE" ${extra[@]+"${extra[@]}"} >"$log" 2>&1 ) &
+    local pid=$!
+    local ready=0
+    for _ in $(seq 1 90); do
+        curl -fsS "http://$HOST:$PORT/v1/models" >/dev/null 2>&1 && { ready=1; break; }
+        kill -0 "$pid" 2>/dev/null || break
+        sleep 2
+    done
+    # Server-launcher runs under flock in a subshell; find + kill the real server.
+    pkill -9 -f "$SERVER_BIN .*--port $PORT" 2>/dev/null || true
+    wait "$pid" 2>/dev/null || true
+    [ "$ready" = 1 ] || { cat "$log" >&2; rm -f "$log"; fail "server did not become ready"; }
+    grep -E 'dynamic placement result|kvflash.*disabled|placement reserves pool' "$log"
+    rm -f "$log"
+}
+
+echo "== Arm A: no KVFlash @max_ctx $MAX_CTX (expect the cliff: cold experts > 0) =="
+A="$(placement_line)"
+echo "$A"
+cold_a="$(sed -nE 's/.*result: [0-9]+ hot experts, ([0-9]+) cold experts.*/\1/p' <<<"$A")"
+[ -n "$cold_a" ] || fail "could not parse Arm A cold-expert count"
+[ "$cold_a" -gt 0 ] || fail "expected cold experts without KVFlash at max_ctx $MAX_CTX, got $cold_a"
+echo " -> $cold_a cold experts (cliff confirmed)"
+
+echo "== Arm B: --kvflash $POOL @max_ctx $MAX_CTX (expect pool reservation, 0 cold) =="
+B="$(placement_line --kvflash "$POOL")"
+echo "$B"
+grep -q 'placement reserves pool KV' <<<"$B" \
+    || fail "KVFlash did not reduce the KV reservation to the pool"
+cold_b="$(sed -nE 's/.*result: [0-9]+ hot experts, ([0-9]+) cold experts.*/\1/p' <<<"$B")"
+[ -n "$cold_b" ] || fail "could not parse Arm B cold-expert count"
+[ "$cold_b" -eq 0 ] || fail "expected 0 cold experts with KVFlash, got $cold_b"
+echo " -> 0 cold experts (experts stay hot via pool reservation)"
+
+echo "PASS: kvflash MoE placement — pool reservation keeps experts hot (A: $cold_a cold -> B: 0 cold)"
diff --git a/server/test/test_kvflash_placement.cpp b/server/test/test_kvflash_placement.cpp
new file mode 100644
index 000000000..01da0f3ab
--- /dev/null
+++ b/server/test/test_kvflash_placement.cpp
@@ -0,0 +1,85 @@
+// Unit test for the KVFlash placement KV-reservation decision (no GPU).
+//
+// Behaviour under test: when KVFlash is active, the MoE expert placement must
+// reserve KV for the resident POOL, not max_ctx — otherwise a large max_ctx
+// reservation forces experts cold even though KVFlash bounds the resident KV.
+// The decision also reports whether the model is all-hot with the FULL max_ctx
+// KV (i.e. KVFlash is redundant) so the gate can disable the pool when unneeded.
+#include "../src/common/kvflash_placement.h"
+
+#include <cstdio>
+#include <cstdlib>
+
+using namespace dflash::common;
+
+static void expect(bool cond, const char * msg) {
+    if (!cond) {
+        std::fprintf(stderr, "FAIL: %s\n", msg);
+        std::exit(1);
+    }
+}
+
+int main() {
+    // qwen3.6-35B-A3B-like budget on a 24 GiB card:
+    //   ~80 KiB/token KV (5 GiB @ 65536, 10 GiB @ 131072)
+    //   experts ~13.19 GiB, core ~3.12 GiB, draft ~1.2 GiB present.
+    const uint64_t MiB = 1024ull * 1024;
+    const uint64_t GiB = 1024ull * MiB;
+    const uint64_t kv_per_tok = 80 * 1024; // bytes/token
+    const uint64_t gpu = 24 * GiB;
+    const uint64_t core = 3 * GiB + 122 * MiB; // ~3.12 GiB
+    const uint64_t experts = 13 * GiB + 195 * MiB; // ~13.19 GiB
+    const uint64_t warm = 200 * MiB;
+    const uint64_t safety = 256 * MiB; // reduced when draft present
+    const uint64_t draft = 1200 * MiB;
+
+    // Case 1 — max_ctx 65536, NO kvflash: reserve full ctx, fits all-hot.
+    {
+        auto d = kvflash_placement_decision(kv_per_tok, 65536, /*pool=*/0,
+                                            gpu, core, experts, warm, safety, draft);
+        expect(d.kv_ctx == 65536, "C1: no-kvflash reserves full ctx");
+        expect(d.all_hot_full_kv, "C1: 65536 full KV fits all experts hot");
+        expect(!d.pool_reduced, "C1: no pool reduction");
+    }
+
+    // Case 2 — max_ctx 65536 + pool 49152: full KV still fits all-hot, so KVFlash
+    // is redundant — do NOT reduce to the pool (the gate will disable it).
+    {
+        auto d = kvflash_placement_decision(kv_per_tok, 65536, /*pool=*/49152,
+                                            gpu, core, experts, warm, safety, draft);
+        expect(d.all_hot_full_kv, "C2: 65536 full KV still fits -> kvflash redundant");
+        expect(d.kv_ctx == 65536, "C2: keeps full ctx (gate disables kvflash)");
+        expect(!d.pool_reduced, "C2: no pool reduction when redundant");
+    }
+
+    // Case 3 (THE FIX) — max_ctx 131072 + pool 49152: full KV (10 GiB) forces
+    // experts cold, so reserve for the POOL -> experts stay hot.
+    {
+        auto d = kvflash_placement_decision(kv_per_tok, 131072, /*pool=*/49152,
+                                            gpu, core, experts, warm, safety, draft);
+        expect(!d.all_hot_full_kv, "C3: 131072 full KV forces experts cold");
+        expect(d.kv_ctx == 49152, "C3: reserves POOL ctx, not max_ctx");
+        expect(d.pool_reduced, "C3: pool reduction engaged");
+        expect(d.kv_total == kv_per_tok * 49152ull, "C3: kv_total sized to pool");
+        expect(d.kv_total < kv_per_tok * 131072ull, "C3: pool reservation < full");
+    }
+
+    // Case 4 — max_ctx 131072, NO kvflash: full ctx, cold cliff.
+    {
+        auto d = kvflash_placement_decision(kv_per_tok, 131072, /*pool=*/0,
+                                            gpu, core, experts, warm, safety, draft);
+        expect(d.kv_ctx == 131072, "C4: no-kvflash reserves full ctx");
+        expect(!d.all_hot_full_kv, "C4: 131072 no-kvflash -> cold cliff");
+    }
+
+    // Case 5 — pool >= max_ctx: pool can't exceed ctx, no reduction.
+    {
+        auto d = kvflash_placement_decision(kv_per_tok, 32768, /*pool=*/49152,
+                                            gpu, core, experts, warm, safety, draft);
+        expect(!d.pool_reduced, "C5: pool>=ctx -> no reduction");
+        expect(d.kv_ctx == 32768, "C5: keeps full ctx");
+    }
+
+    std::printf("PASS: kvflash placement decision (5 cases)\n");
+    return 0;
+}
diff --git a/server/test/test_kvflash_pool_sizing.cpp b/server/test/test_kvflash_pool_sizing.cpp
new file mode 100644
index 000000000..fc4dd64b3
--- /dev/null
+++ b/server/test/test_kvflash_pool_sizing.cpp
@@ -0,0 +1,53 @@
+// Pure unit test for kvflash_pool_from_env (kvflash_pager.h). No ggml, no GPU.
+//
+// Guards the MoE placement bug (PR #428): placement called pool sizing with
+// bare max_ctx (no budget, scorer_expected=false) and got the max_ctx/2
+// fallback, while runtime passed a real VRAM budget + scorer policy and got a
+// speed-capped value. Placement then over-reserved KV and starved experts.
+// These asserts pin the two behaviours so a future caller can't silently
+// reintroduce the divergence.
+#include "../src/common/kvflash_pager.h"
+
+#include <cstdio>
+#include <cstdlib>
+
+using namespace dflash::common;
+
+static void expect(bool cond, const char * msg) {
+    if (!cond) { std::fprintf(stderr, "FAIL: %s\n", msg); std::exit(1); }
+}
+
+int main() {
+    setenv("DFLASH_KVFLASH", "auto", 1);
+    unsetenv("DFLASH_KVFLASH_MAX_POOL");
+    const int max_ctx = 131072;
+
+    // No budget supplied -> fallback fraction of max_ctx (the buggy placement
+    // path). scorer_expected toggles 1/2 vs 1/4.
+    expect(kvflash_pool_from_env(max_ctx, KvFlashConfig{}, false) == max_ctx / 2,
+           "no-budget fallback should be max_ctx/2 without scorer");
+    expect(kvflash_pool_from_env(max_ctx, KvFlashConfig{}, true) == max_ctx / 4,
+           "no-budget fallback should be max_ctx/4 with scorer");
+
+    // Real budget with ample VRAM -> capped at the speed point (16384), far
+    // below max_ctx/2. This is what runtime actually allocates; placement must
+    // pass the same budget so it reserves this, not 65536.
+    KvFlashAutoBudget budget;
+    budget.free_bytes = 12LL * 1024 * 1024 * 1024; // 12 GiB free
+    budget.reserve_bytes = 1LL * 1024 * 1024 * 1024;
+    budget.bytes_per_token = 80 * 1024; // ~qwen35moe density
+    budget.speed_cap_tokens = 16384;
+    const int with_budget = kvflash_pool_from_env(max_ctx, KvFlashConfig{}, true, budget);
+    expect(with_budget == 16384, "ample-VRAM auto pool should hit the speed cap");
+    expect(with_budget < max_ctx / 2,
+           "budgeted pool must be smaller than the no-budget fallback "
+           "(the divergence that starved experts)");
+
+    // Tight VRAM -> budget binds below the cap, still well under max_ctx/2.
+    budget.free_bytes = 2LL * 1024 * 1024 * 1024; // 2 GiB free
+    const int tight = kvflash_pool_from_env(max_ctx, KvFlashConfig{}, true, budget);
+    expect(tight > 0 && tight <= 16384, "tight-VRAM pool stays within the cap");
+
+    std::printf("OK test_kvflash_pool_sizing\n");
+    return 0;
+}