From d4546a5105a09971178478a1663a2a60af14f15c Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Wed, 27 May 2026 09:08:00 +0200 Subject: [PATCH 01/16] feat(pflash): ee7 early-exit drafter + anchor-transitive cascade + bug-42 tail-capture guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ee7 truncates drafter forward at layer 7 of 28, scoring only those layers. 9.3× drafter wall at 128K (RTX 3090, Qwen3.6-27B-Q4_K_M target + Qwen2.5-0.5B-BF16 drafter). Anchor-transitive cascade rescues multi-hop on bimodal-density prompts (gated, default OFF). Bug #42 fix: tail-capture view-bounds guard at S%4096 in {1..7}. 5 unit tests included. Bench scripts split to follow-up PR. --- server/CMakeLists.txt | 39 ++ server/src/common/score_range.h | 48 +++ server/src/qwen3/anchor_scan.cpp | 169 +++++++++ server/src/qwen3/anchor_scan.h | 42 +++ server/src/qwen3/qwen3_drafter.cpp | 103 ++--- server/src/qwen3/qwen3_graph.cpp | 107 ++++-- server/src/qwen3/qwen3_loader.cpp | 12 + server/test/test_anchor_transitive.cpp | 355 ++++++++++++++++++ .../test_drafter_early_exit_score_range.cpp | 108 ++++++ .../test/test_drafter_tail_capture_guard.cpp | 128 +++++++ .../test_drafter_warm_path_regression.cpp | 164 ++++++++ 11 files changed, 1201 insertions(+), 74 deletions(-) create mode 100644 server/src/common/score_range.h create mode 100644 server/src/qwen3/anchor_scan.cpp create mode 100644 server/src/qwen3/anchor_scan.h create mode 100644 server/test/test_anchor_transitive.cpp create mode 100644 server/test/test_drafter_early_exit_score_range.cpp create mode 100644 server/test/test_drafter_tail_capture_guard.cpp create mode 100644 server/test/test_drafter_warm_path_regression.cpp diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 345ee8aee..cfdc22937 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -217,6 +217,7 @@ add_library(dflash_common STATIC 
src/draft/draft_gguf_loader.cpp src/draft/draft_safetensors_loader.cpp src/draft/draft_graph.cpp + src/qwen3/anchor_scan.cpp src/qwen3/qwen3_drafter.cpp src/qwen3/qwen3_loader.cpp src/qwen3/qwen3_graph.cpp @@ -572,6 +573,44 @@ if(DFLASH27B_TESTS) target_link_libraries(test_bandit_integration PRIVATE dflash_common) add_test(NAME bandit_integration COMMAND test_bandit_integration) endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_early_exit_score_range.cpp") + add_executable(test_drafter_early_exit_score_range + test/test_drafter_early_exit_score_range.cpp) + target_include_directories(test_drafter_early_exit_score_range PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/common) + add_test(NAME test_drafter_early_exit_score_range + COMMAND test_drafter_early_exit_score_range) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_anchor_transitive.cpp") + add_executable(test_anchor_transitive + test/test_anchor_transitive.cpp + src/qwen3/anchor_scan.cpp) + target_include_directories(test_anchor_transitive PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/qwen3) + add_test(NAME test_anchor_transitive + COMMAND test_anchor_transitive) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_warm_path_regression.cpp") + add_executable(test_drafter_warm_path_regression + test/test_drafter_warm_path_regression.cpp) + target_include_directories(test_drafter_warm_path_regression PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/common) + add_test(NAME test_drafter_warm_path_regression + COMMAND test_drafter_warm_path_regression) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_tail_capture_guard.cpp") + # GREEN phase: built with TAIL_GUARD_USE_NEW_FORMULA — must pass after Bug #42 fix. 
+ add_executable(test_drafter_tail_capture_guard + test/test_drafter_tail_capture_guard.cpp) + target_compile_definitions(test_drafter_tail_capture_guard PRIVATE + TAIL_GUARD_USE_NEW_FORMULA) + add_test(NAME test_drafter_tail_capture_guard + COMMAND test_drafter_tail_capture_guard) + # RED phase binary: same source WITHOUT the fix flag — documents the bug. + add_executable(test_drafter_tail_capture_guard_red + test/test_drafter_tail_capture_guard.cpp) + # No TAIL_GUARD_USE_NEW_FORMULA — uses old (buggy) guard, expected to FAIL. + endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp") add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp) target_link_libraries(test_draft_vs_reference PRIVATE dflash_common) diff --git a/server/src/common/score_range.h b/server/src/common/score_range.h new file mode 100644 index 000000000..1ad137207 --- /dev/null +++ b/server/src/common/score_range.h @@ -0,0 +1,48 @@ +// Pure helper: compute the [score_layer_start, score_layer_end) range for +// tail-attention scoring given the forward-pass layer limit and the optional +// SCORE_LAYERS count. +// +// Parameters: +// n_layer - total number of layers in the model (e.g. 28) +// score_layers - value of PFLASH_DRAFTER_SCORE_LAYERS (-1 = all) +// fwd_layer_limit - number of layers actually computed (== early_exit_n when +// early-exit is active, else n_layer) +// +// Semantics: SCORE_LAYERS is interpreted as "how many of the computed layers +// to score", counted from the END of the forward range [0, fwd_layer_limit). +// This way SCORE_LAYERS=7 with early_exit_n=7 scores layers [0,7) instead of +// producing the empty interval [7,7) that the old code yielded. +#pragma once + +#include + +namespace dflash::common { + +struct ScoreRange { + int start; // inclusive + int end; // exclusive + int count() const { return end - start; } + bool empty() const { return start >= end; } +}; + +// Compute the scoring layer range. 
+// When early-exit is active, SCORE_LAYERS selects the last SCORE_LAYERS layers +// of the computed range [0, fwd_layer_limit), not the last layers of the full model. +inline ScoreRange compute_score_range(int n_layer, int score_layers, int fwd_layer_limit) { + // score_layers <= 0 means "use all computed layers" + const int effective_n = fwd_layer_limit; + int start; + if (score_layers > 0 && score_layers < n_layer) { + // Clamp: can't request more layers than were computed. + int want = std::min(score_layers, effective_n); + start = effective_n - want; + } else { + start = 0; + } + int end = fwd_layer_limit; + // Clamp start to never exceed end. + if (start > end) start = end; + return { start, end }; +} + +} // namespace dflash::common diff --git a/server/src/qwen3/anchor_scan.cpp b/server/src/qwen3/anchor_scan.cpp new file mode 100644 index 000000000..e0088167a --- /dev/null +++ b/server/src/qwen3/anchor_scan.cpp @@ -0,0 +1,169 @@ +#include "anchor_scan.h" + +#include +#include +#include +#include + +namespace dflash::qwen3 { + +// Force chunk and its radius-neighborhood into `forced`. 
+static void force_neighborhood(std::vector& forced, int n_chunks, + int chunk, int radius) { + int lo = std::max(0, chunk - radius); + int hi = std::min(n_chunks - 1, chunk + radius); + for (int c = lo; c <= hi; ++c) forced[(size_t)c] = 1; +} + +void scan_and_force( + const std::vector& ids, + int body_end, + const std::vector& query_pool, + const AnchorScanCfg& cfg, + std::vector& forced) +{ + const int n_chunks = (int)forced.size(); + const int ngram = cfg.ngram; + const int search_end = std::max(0, body_end - ngram); + + for (int qi = 0; qi + ngram <= (int)query_pool.size(); ++qi) { + int hits = 0; + int hit_pos[8]; + for (int p = 0; p <= search_end && hits <= cfg.max_anchor_hits; ++p) { + bool same = true; + for (int k = 0; k < ngram; ++k) { + if (ids[(size_t)p + k] != query_pool[(size_t)qi + k]) { + same = false; + break; + } + } + if (same) { + if (hits < 8) hit_pos[hits] = p; + ++hits; + } + } + if (hits > 0 && hits <= cfg.max_anchor_hits) { + for (int i = 0; i < hits && i < 8; ++i) { + force_neighborhood(forced, n_chunks, + hit_pos[i] / cfg.chunk_size, + cfg.anchor_radius); + } + } + } +} + +// Helper: count set entries in forced. +static int count_set(const std::vector& forced) { + int n = 0; + for (uint8_t v : forced) n += (v != 0); + return n; +} + +void scan_and_force_transitive( + const std::vector& ids, + int body_end, + const std::vector& initial_query_pool, + const AnchorScanCfg& cfg, + int max_iters, + std::vector& forced) +{ + auto pool = initial_query_pool; + const int n_chunks = (int)forced.size(); + + // Precompute token frequencies in body once. + std::unordered_map body_freq; + body_freq.reserve((size_t)body_end); + for (int j = 0; j < body_end; ++j) ++body_freq[ids[(size_t)j]]; + + // Build inverted index: token -> list of body positions (for rare tokens only). 
+ std::unordered_map> rare_positions; + if (cfg.rare_token_max_freq > 0) { + for (auto& kv : body_freq) { + if (kv.second <= cfg.rare_token_max_freq) { + rare_positions[kv.first] = {}; + } + } + for (int p = 0; p < body_end; ++p) { + auto it = rare_positions.find(ids[(size_t)p]); + if (it != rare_positions.end()) it->second.push_back(p); + } + } + + // Pass-1: run the initial scan. + const int count_before_pass1 = count_set(forced); + scan_and_force(ids, body_end, pool, cfg, forced); + const int gained_pass1 = count_set(forced) - count_before_pass1; + + // Gating: if pass-1 already found many anchors, skip the cascade entirely. + if (cfg.cascade_min_anchor_count > 0 && gained_pass1 >= cfg.cascade_min_anchor_count) { + return; + } + + // Cascade loop: expand pool with newly-forced tokens and re-scan. + std::vector prev_forced; + for (int it = 0; it < max_iters; ++it) { + prev_forced = forced; + + // Rare-token single-match: worklist-driven so cascades within a pass are + // caught (e.g. hop3 forces hop2 which forces hop1 in one outer iteration). + if (cfg.rare_token_max_freq > 0) { + std::vector worklist; + for (int c = 0; c < n_chunks; ++c) { + if (forced[c] && !prev_forced[c]) worklist.push_back(c); + } + // On first iteration, seed from everything forced so far (pass-1 results). 
+ if (it == 0) { + worklist.clear(); + for (int c = 0; c < n_chunks; ++c) { + if (forced[c]) worklist.push_back(c); + } + } + for (int wi = 0; wi < (int)worklist.size(); ++wi) { + int c = worklist[wi]; + int s = c * cfg.chunk_size; + int e = std::min(body_end, (c + 1) * cfg.chunk_size); + for (int j = s; j < e; ++j) { + auto it2 = rare_positions.find(ids[(size_t)j]); + if (it2 == rare_positions.end()) continue; + for (int p : it2->second) { + int target_c = p / cfg.chunk_size; + if (!forced[(size_t)target_c]) { + force_neighborhood(forced, n_chunks, + target_c, cfg.anchor_radius); + worklist.push_back(target_c); + } + } + } + } + } + + // Hard cap: if we exceeded max_forced_count, revert this iteration and stop. + if (count_set(forced) > cfg.max_forced_count) { + forced = prev_forced; + break; + } + + if (forced == prev_forced) break; + + // Expand pool with tokens from newly-forced chunks (feeds next 4-gram pass). + for (int c = 0; c < n_chunks; ++c) { + if (forced[c] && !prev_forced[c]) { + int s = c * cfg.chunk_size; + int e = std::min((int)ids.size(), (c + 1) * cfg.chunk_size); + for (int j = s; j < e; ++j) pool.push_back(ids[j]); + } + } + + // 4-gram scan with expanded pool for next iteration. + prev_forced = forced; + scan_and_force(ids, body_end, pool, cfg, forced); + + // Hard cap check after 4-gram expansion too. + if (count_set(forced) > cfg.max_forced_count) { + forced = prev_forced; + break; + } + } +} + +} // namespace dflash::qwen3 diff --git a/server/src/qwen3/anchor_scan.h b/server/src/qwen3/anchor_scan.h new file mode 100644 index 000000000..8f75a0855 --- /dev/null +++ b/server/src/qwen3/anchor_scan.h @@ -0,0 +1,42 @@ +// N-gram anchor scan: mark chunks forced by token-match between a query pool +// and the body of an ids sequence. Pure CPU, no GPU, no model required. 
+#pragma once + +#include +#include +#include + +namespace dflash::qwen3 { + +struct AnchorScanCfg { + int chunk_size; + int anchor_radius; + int max_anchor_hits; + int ngram = 4; + int rare_token_max_freq = 8; // tokens appearing <= this many times in body count as rare + int cascade_min_anchor_count = 0; // skip cascade if pass-1 forced >= this many chunks (0 = always cascade) + int max_forced_count = INT_MAX; // hard cap on total forced chunks +}; + +// Marks chunks forced by ngram-matches between query_pool and ids[0..body_end). +// `forced` is in-out; new hits are OR-merged. Idempotent. +void scan_and_force( + const std::vector& ids, + int body_end, + const std::vector& query_pool, + const AnchorScanCfg& cfg, + std::vector& forced +); + +// Transitive variant: expands the query pool with tokens from newly-forced +// chunks and re-runs scan_and_force until a fixed point or max_iters reached. +void scan_and_force_transitive( + const std::vector& ids, + int body_end, + const std::vector& initial_query_pool, + const AnchorScanCfg& cfg, + int max_iters, + std::vector& forced +); + +} // namespace dflash::qwen3 diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp index f65cb079f..589351f9c 100644 --- a/server/src/qwen3/qwen3_drafter.cpp +++ b/server/src/qwen3/qwen3_drafter.cpp @@ -17,6 +17,7 @@ #include "qwen3_drafter_model.h" #include "common/backend_precision.h" #include "internal.h" +#include "anchor_scan.h" #include "ggml.h" #include "ggml-alloc.h" @@ -64,6 +65,13 @@ static int env_int(const char * name, int fallback) { return fallback; } +static float env_float(const char * name, float def) { + if (const char * v = std::getenv(name)) { + try { return std::stof(v); } catch (...) 
{} + } + return def; +} + static void force_chunk_neighborhood(std::vector & forced, int n_chunks, int chunk, int radius) { int lo = std::max(0, chunk - radius); @@ -548,33 +556,34 @@ static std::vector qwen35_score_and_compress( for (int c = std::max(0, n_chunks - t_n); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; } } - const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); - const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); - const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); - std::vector forced((size_t)n_chunks, 0); + const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); + const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); + const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); + const int anchor_ngram = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", 4); + const int rare_token_max_freq = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", 2); + + const float cascade_min_anchor_frac = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f); + const float max_forced_ratio = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", 1.3f); const int q0 = std::max(0, S - query_tokens); - constexpr int NGRAM = 4; - for (int q = q0; q + NGRAM <= S; ++q) { - int hits = 0; - int hit_pos[8]; - const int search_end = std::max(0, q0 - NGRAM); - for (int p = 0; p <= search_end && hits <= max_anchor_hits; ++p) { - bool same = true; - for (int k = 0; k < NGRAM; ++k) { - if (ids[(size_t)p + k] != ids[(size_t)q + k]) { same = false; break; } - } - if (same) { - if (hits < 8) hit_pos[hits] = p; - ++hits; - } - } - if (hits > 0 && hits <= max_anchor_hits) { - for (int i = 0; i < hits && i < 8; ++i) { - force_chunk_neighborhood(forced, n_chunks, hit_pos[i] / chunk_size, anchor_radius); - } - } + std::vector query_pool(ids.begin() + q0, ids.end()); + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg anchor_cfg{chunk_size, 
anchor_radius, + max_anchor_hits, anchor_ngram, + rare_token_max_freq}; + anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); + anchor_cfg.max_forced_count = (int)(max_forced_ratio * n_keep); + + const bool use_transitive = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0; + const int max_iters = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3); + if (use_transitive) { + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + anchor_cfg, max_iters, forced); + } else { + dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced); } + for (int c = 0; c < n_chunks; ++c) { if (forced[(size_t)c] && !selected[(size_t)c]) { selected[(size_t)c] = 1; @@ -740,34 +749,36 @@ std::vector drafter_score_and_compress( head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw)); tail_chunks = std::max(0, budget - head_chunks); } - const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); - const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); - const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); + const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); + const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); + const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); + const int anchor_ngram = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", 4); + const int rare_token_max_freq = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", 2); + + const float cascade_min_anchor_frac = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f); + const float max_forced_ratio = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", 1.3f); + std::vector selected_mask((size_t)n_chunks, 0); std::vector forced((size_t)n_chunks, 0); for (int c = 0; c < std::min(n_chunks, head_chunks); ++c) forced[(size_t)c] = 1; for (int c = std::max(0, n_chunks - tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1; const int q0 = std::max(0, S - query_tokens); - constexpr int NGRAM = 4; - for 
(int q = q0; q + NGRAM <= S; ++q) { - int hits = 0; - int hit_pos[8]; - const int search_end = std::max(0, q0 - NGRAM); - for (int p = 0; p <= search_end && hits <= max_anchor_hits; ++p) { - bool same = true; - for (int k = 0; k < NGRAM; ++k) { - if (ids[(size_t)p + k] != ids[(size_t)q + k]) { same = false; break; } - } - if (same) { - if (hits < 8) hit_pos[hits] = p; - ++hits; - } - } - if (hits > 0 && hits <= max_anchor_hits) { - for (int i = 0; i < hits && i < 8; ++i) { - force_chunk_neighborhood(forced, n_chunks, hit_pos[i] / chunk_size, anchor_radius); - } + { + std::vector query_pool(ids.begin() + q0, ids.end()); + dflash::qwen3::AnchorScanCfg anchor_cfg{chunk_size, anchor_radius, + max_anchor_hits, anchor_ngram, + rare_token_max_freq}; + anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); + anchor_cfg.max_forced_count = (int)(max_forced_ratio * n_keep); + + const bool use_transitive = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0; + const int max_iters = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3); + if (use_transitive) { + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + anchor_cfg, max_iters, forced); + } else { + dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced); } } diff --git a/server/src/qwen3/qwen3_graph.cpp b/server/src/qwen3/qwen3_graph.cpp index a23bcefb3..858bcd75e 100644 --- a/server/src/qwen3/qwen3_graph.cpp +++ b/server/src/qwen3/qwen3_graph.cpp @@ -35,6 +35,7 @@ #include "qwen3_drafter_model.h" #include "internal.h" #include "flashprefill.h" +#include "../common/score_range.h" #include "device_runtime.h" @@ -249,13 +250,39 @@ bool forward_qwen3_drafter_model( } running_max.assign((size_t)n_lookahead * S, -INFINITY); + // Compute score_layer_start early so we can avoid allocating K_norope/Q_norope + // for layers that will never be used in scoring. 
At S=128K the full K_norope + // allocation is ~5.6 GB (21 unused layers × 268 MB) — skipping it keeps total + // VRAM under 24 GB and eliminates the warm-path regression (A_compute 5.4x). + static const int score_layers_pre = []() -> int { + const char * e = std::getenv("PFLASH_DRAFTER_SCORE_LAYERS"); + if (e) { int v = std::atoi(e); if (v > 0) return v; } + return -1; + }(); + static const int early_exit_pre = []() -> int { + const char * e = std::getenv("PFLASH_DRAFTER_EARLY_EXIT_N"); + if (e) { int v = std::atoi(e); if (v > 0) return v; } + return -1; + }(); + // fwd_layer_limit_pre mirrors the fwd_layer_limit computed later in the loop. + const int fwd_layer_limit_pre = (early_exit_pre > 0 && early_exit_pre < w.n_layer) + ? early_exit_pre : w.n_layer; + // Use compute_score_range (same formula as the scoring loop) so the pre-alloc + // boundary is guaranteed to match the actual scoring boundary. + const ScoreRange pre_range = compute_score_range(w.n_layer, score_layers_pre, fwd_layer_limit_pre); + const int score_layer_start_pre = pre_range.start; + // Number of layers that participate in scoring (and need K_norope/Q_norope). + const int n_score_layers = pre_range.count(); + PersBuf hidden_buf, pos_buf, mask_tail_buf, Q_buf, attn_out_buf; std::vector K_curr_v((size_t)w.n_layer); std::vector V_curr_v((size_t)w.n_layer); std::vector Q_last_v((size_t)w.n_layer); - // NoPE: pre-RoPE K (full sequence) and Q tail; allocated only when nope_tail. - std::vector K_norope_v(nope_tail ? (size_t)w.n_layer : 0); - std::vector Q_norope_v(nope_tail ? (size_t)w.n_layer : 0); + // NoPE: only allocate K_norope/Q_norope for layers that will be scored. + // When score_layer_start_pre > 0 this trims up to 21 × 268 MB = 5.6 GB, + // preventing the VRAM overflow that causes the warm-path regression at 128K. + std::vector K_norope_v(nope_tail ? (size_t)n_score_layers : 0); + std::vector Q_norope_v(nope_tail ? 
(size_t)n_score_layers : 0); auto cleanup_all = [&]() { free_pers(hidden_buf); free_pers(pos_buf); @@ -294,9 +321,10 @@ bool forward_qwen3_drafter_model( cleanup_all(); return false; } - if (nope_tail) { - if (!make_pers(w.backend, half_type, 3, d_kv, K_norope_v[il]) || - !make_pers(w.backend, GGML_TYPE_F32, 3, d_ql, Q_norope_v[il])) { + if (nope_tail && il >= score_layer_start_pre && il < fwd_layer_limit_pre) { + const int si = il - score_layer_start_pre; + if (!make_pers(w.backend, half_type, 3, d_kv, K_norope_v[si]) || + !make_pers(w.backend, GGML_TYPE_F32, 3, d_ql, Q_norope_v[si])) { set_last_error("forward_qwen3: K_norope/Q_norope alloc failed at layer " + std::to_string(il)); cleanup_all(); return false; @@ -352,6 +380,10 @@ bool forward_qwen3_drafter_model( ggml_free(gctx); } + // PFLASH_DRAFTER_EARLY_EXIT_N: already read into early_exit_pre above. + // Alias used in the forward-loop limit below. + const int & early_exit_n = early_exit_pre; + // Per-layer A→FA→B loop. ggml_gallocr_t galloc = ggml_gallocr_new( ggml_backend_get_default_buffer_type(w.backend)); @@ -372,7 +404,10 @@ bool forward_qwen3_drafter_model( double t_b_warm = 0.0, t_b_setup = 0.0, t_b_alloc = 0.0, t_b_copy_in = 0.0, t_b_norm = 0.0, t_compute_b = 0.0, t_b_copy_out = 0.0; double t_fp = 0.0; - for (int il = 0; il < w.n_layer; ++il) { + const int fwd_layer_limit = (early_exit_n > 0 && early_exit_n < w.n_layer) + ? early_exit_n : w.n_layer; + + for (int il = 0; il < fwd_layer_limit; ++il) { const auto & L = w.layers[il]; const bool debug_first_layer = (il == 0 && std::getenv("DFLASH_FP_DEBUG_LAYER0") != nullptr); @@ -411,19 +446,22 @@ bool forward_qwen3_drafter_model( ggml_tensor * Q = ggml_mul_mat(gA, L.wq, h_norm); Q = ggml_reshape_3d(gA, Q, D, H, cl); - Q = ggml_rms_norm(gA, Q, eps); - Q = ggml_mul(gA, Q, L.q_norm); - // NoPE: capture pre-RoPE Q tail so the tail scorer is not biased by distance. 
- if (nope_tail) { + if (L.q_norm) { + Q = ggml_rms_norm(gA, Q, eps); + Q = ggml_mul(gA, Q, L.q_norm); + } + // NoPE: capture pre-RoPE Q tail (only for layers that will be scored). + if (nope_tail && il >= score_layer_start_pre) { + const int si = il - score_layer_start_pre; const int tail_lo_nr = S - n_lookahead; - if (tail_lo_nr >= cs && tail_lo_nr < cs + cl) { + if (tail_lo_nr >= cs && tail_lo_nr + n_lookahead <= cs + cl) { const int local_lo_nr = tail_lo_nr - cs; ggml_tensor * Q_prenrope_tail = ggml_view_3d( gA, Q, D, H, n_lookahead, Q->nb[1], Q->nb[2], (size_t)local_lo_nr * Q->nb[2]); ggml_build_forward_expand(gfA, - ggml_cpy(gA, Q_prenrope_tail, Q_norope_v[il].t)); + ggml_cpy(gA, Q_prenrope_tail, Q_norope_v[si].t)); } } Q = ggml_rope_ext(gA, Q, pos_chunk, nullptr, D, @@ -432,12 +470,15 @@ bool forward_qwen3_drafter_model( ggml_tensor * K = ggml_mul_mat(gA, L.wk, h_norm); K = ggml_reshape_3d(gA, K, D, Hk, cl); - K = ggml_rms_norm(gA, K, eps); - K = ggml_mul(gA, K, L.k_norm); - // NoPE: save pre-RoPE K chunk alongside K_curr_v. - if (nope_tail) { - const size_t kn_esz = ggml_element_size(K_norope_v[il].t); - ggml_tensor * Kn_dst = ggml_view_3d(gA, K_norope_v[il].t, D, Hk, cl, + if (L.k_norm) { + K = ggml_rms_norm(gA, K, eps); + K = ggml_mul(gA, K, L.k_norm); + } + // NoPE: save pre-RoPE K chunk (only for layers that will be scored). + if (nope_tail && il >= score_layer_start_pre) { + const int si = il - score_layer_start_pre; + const size_t kn_esz = ggml_element_size(K_norope_v[si].t); + ggml_tensor * Kn_dst = ggml_view_3d(gA, K_norope_v[si].t, D, Hk, cl, kn_esz * D, kn_esz * D * Hk, (size_t)cs * kn_esz * D * Hk); ggml_build_forward_expand(gfA, ggml_cpy(gA, K, Kn_dst)); @@ -466,7 +507,7 @@ bool forward_qwen3_drafter_model( // Copy Q tail to Q_last_v[il] in the chunk that contains the tail. 
const int tail_lo = S - n_lookahead; - if (tail_lo >= cs && tail_lo < cs + cl) { + if (tail_lo >= cs && tail_lo + n_lookahead <= cs + cl) { int local_lo = tail_lo - cs; ggml_tensor * Q_tail_local = ggml_view_3d( gA, Q, D, H, n_lookahead, @@ -707,12 +748,12 @@ bool forward_qwen3_drafter_model( } #endif - if (il == 0 || il == w.n_layer - 1) { + if (il == 0 || il == fwd_layer_limit - 1) { std::fprintf(stderr, "[qwen3-0.6b-fp] layer %d/%d done " "(A_setup=%.3fs A_alloc=%.3fs A_compute=%.3fs FP=%.3fs " "B_warm=%.3fs B_setup=%.3fs B_alloc=%.3fs B_copy_in=%.3fs B_norm=%.3fs B_compute=%.3fs B_copy_out=%.3fs)\n", - il + 1, w.n_layer, + il + 1, fwd_layer_limit, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out); std::fflush(stderr); @@ -724,19 +765,28 @@ bool forward_qwen3_drafter_model( auto t_fwd_end = std::chrono::steady_clock::now(); double t_fwd = std::chrono::duration(t_fwd_end - t_total_start).count(); - // Tail attention scoring (unchanged from previous impl). + // Tail attention scoring. + // score_layers_pre / compute_score_range already determined the range before + // allocation (to size K_norope_v correctly). Re-use that result here. + // score_layer_start_pre == score_layer_start by construction (same formula, + // same env vars, same fwd_layer_limit_pre == fwd_layer_limit). + const int score_layer_start = score_layer_start_pre; + const int score_layer_end = fwd_layer_limit; + std::vector probs_h((size_t)S * n_lookahead * H); auto t_score_start = std::chrono::steady_clock::now(); - for (int il = 0; il < w.n_layer; ++il) { + for (int il = score_layer_start; il < score_layer_end; ++il) { ggml_init_params ip{}; ip.mem_size = ggml_tensor_overhead() * 32 + ggml_graph_overhead() + 16 * 1024; ip.no_alloc = true; ggml_context * gctx = ggml_init(ip); + // K_norope_v / Q_norope_v are indexed from score_layer_start_pre. 
+ const int si = il - score_layer_start_pre; ggml_tensor * K_f32 = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, D, Hk, S); ggml_tensor * K_cast = ggml_cpy(gctx, - nope_tail ? K_norope_v[il].t : K_curr_v[il].t, K_f32); + nope_tail ? K_norope_v[si].t : K_curr_v[il].t, K_f32); ggml_tensor * K_perm = ggml_cont(gctx, ggml_permute(gctx, K_cast, 0, 2, 1, 3)); ggml_tensor * K_score = K_perm; @@ -749,7 +799,7 @@ bool forward_qwen3_drafter_model( } ggml_tensor * Q_tail_perm = ggml_cont(gctx, ggml_permute(gctx, - nope_tail ? Q_norope_v[il].t : Q_last_v[il].t, + nope_tail ? Q_norope_v[si].t : Q_last_v[il].t, 0, 2, 1, 3)); ggml_tensor * attn_score = ggml_mul_mat(gctx, K_score, Q_tail_perm); ggml_tensor * probs = ggml_soft_max_ext(gctx, attn_score, mask_tail_buf.t, @@ -796,8 +846,9 @@ bool forward_qwen3_drafter_model( double t_score = std::chrono::duration(t_total_end - t_score_start).count(); std::fprintf(stderr, "[qwen3-0.6b-fp] forward %.2fs (S=%d, A_setup=%.2fs A_alloc=%.2fs A_compute=%.2fs FP=%.2fs B_warm=%.2fs B_setup=%.2fs B_alloc=%.2fs B_copy_in=%.2fs B_norm=%.2fs B_compute=%.2fs B_copy_out=%.2fs) " - "tail-score %.2fs total %.2fs\n", - t_fwd, S, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out, t_score, t_fwd + t_score); + "tail-score %.2fs (layers %d-%d) total %.2fs\n", + t_fwd, S, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out, + t_score, score_layer_start, score_layer_end - 1, t_fwd + t_score); std::fflush(stderr); cleanup_all(); diff --git a/server/src/qwen3/qwen3_loader.cpp b/server/src/qwen3/qwen3_loader.cpp index ed38ee106..b7b35a85e 100644 --- a/server/src/qwen3/qwen3_loader.cpp +++ b/server/src/qwen3/qwen3_loader.cpp @@ -133,6 +133,18 @@ bool load_qwen3_drafter_model(const std::string & path, out.head_dim = (int)get_u32(gctx, "qwen3.attention.key_length", 128); out.rope_theta = get_f32(gctx, "qwen3.rope.freq_base", 
1000000.0f); + // Detect weight quant type from blk.0.attn_q.weight; support BF16 and Q8_0. + ggml_type wtype = GGML_TYPE_BF16; + { + int64_t tidx = gguf_find_tensor(gctx, "blk.0.attn_q.weight"); + if (tidx >= 0) { + wtype = gguf_get_tensor_type(gctx, tidx); + } + } + std::fprintf(stderr, "[qwen3-0.6b] detected weight type: %s\n", + wtype == GGML_TYPE_Q8_0 ? "Q8_0" : "BF16"); + std::fflush(stderr); + // Compute total tensor metadata size for context allocation. const int n_layer = out.n_layer; const int n_tensors_per_layer = 11; diff --git a/server/test/test_anchor_transitive.cpp b/server/test/test_anchor_transitive.cpp new file mode 100644 index 000000000..ae8a0bbce --- /dev/null +++ b/server/test/test_anchor_transitive.cpp @@ -0,0 +1,355 @@ +// TDD: anchor transitive multi-pass. +// +// T1 — single-pass query-match preserved (regression pin, PASS today) +// T2 — single-pass misses chain hops (characterises limitation, PASS today) +// T3 — transitive rescues all hops (RED until Phase 2) +// +// Pure CPU — no GPU, no model load. + +#include "../src/qwen3/anchor_scan.h" + +#include +#include +#include +#include + +#define REQUIRE(cond) \ + do { if (!(cond)) { \ + std::fprintf(stderr, "FAIL: %s line %d: %s\n", __FILE__, __LINE__, #cond); \ + std::exit(1); \ + } } while (0) + +static constexpr int32_t FILLER = 1; +static constexpr int32_t M1 = 1001, M2 = 1002, M3 = 1003; +static constexpr int CHUNK = 64; + +// Place a marker 4-gram [FILLER, FILLER, MARKER, FILLER] at position pos. +static void place_marker_4gram(std::vector& ids, int pos, int32_t marker) { + ids[(size_t)pos] = FILLER; + ids[(size_t)pos + 1] = FILLER; + ids[(size_t)pos + 2] = marker; + ids[(size_t)pos + 3] = FILLER; +} + +// T1 — single-pass finds a query-matching marker in the body. +static void t1_single_pass_match() { + const int N = 2048; + std::vector ids((size_t)N, FILLER); + + // Body marker at pos 100 (chunk 1). 
+ place_marker_4gram(ids, 100, M3); + // Same 4-gram in the query suffix at pos 2044 (inside query window). + place_marker_4gram(ids, 2044, M3); + + const int q0 = 1948; // N - 100 + std::vector query_pool(ids.begin() + q0, ids.end()); + + const int n_chunks = (N + CHUNK - 1) / CHUNK; + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4}; + dflash::qwen3::scan_and_force(ids, q0, query_pool, cfg, forced); + + // Chunk containing pos 100 must be forced. + const int target_chunk = 100 / CHUNK; // chunk 1 + REQUIRE(forced[(size_t)target_chunk] == 1); + + std::printf("T1 PASS: chunk %d forced by single-pass M3 match\n", target_chunk); +} + +// T2 — single-pass only forces the direct match; chain hops stay unforced. +static void t2_single_pass_misses_hops() { + const int N = 2048; + std::vector ids((size_t)N, FILLER); + + // hop1 at pos 200 (chunk 3): contains M1. + place_marker_4gram(ids, 200, M1); + + // hop2 at pos 600 (chunk 9): contains M2 + M1 (bridge to hop1). + place_marker_4gram(ids, 600, M2); + place_marker_4gram(ids, 604, M1); + + // hop3 at pos 1200 (chunk 18): contains M3 + M2 (bridge to hop2). + place_marker_4gram(ids, 1200, M3); + place_marker_4gram(ids, 1204, M2); + + // Query suffix at pos 2044: contains M3. + place_marker_4gram(ids, 2044, M3); + + const int q0 = 1948; + std::vector query_pool(ids.begin() + q0, ids.end()); + + const int n_chunks = (N + CHUNK - 1) / CHUNK; + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4}; + dflash::qwen3::scan_and_force(ids, q0, query_pool, cfg, forced); + + const int chunk_hop3 = 1200 / CHUNK; // 18 + const int chunk_hop2 = 600 / CHUNK; // 9 + const int chunk_hop1 = 200 / CHUNK; // 3 + + // Single-pass: only the direct M3 match at pos 1200 is forced. 
+ REQUIRE(forced[(size_t)chunk_hop3] == 1); + REQUIRE(forced[(size_t)chunk_hop2] == 0); + REQUIRE(forced[(size_t)chunk_hop1] == 0); + + std::printf("T2 PASS: chunk(%d) forced, chunk(%d) and chunk(%d) NOT forced (single-pass)\n", + chunk_hop3, chunk_hop2, chunk_hop1); +} + +// T3 — transitive rescues all hops (FAILS until Phase 2 implements the function). +static void t3_transitive_rescues_all() { + const int N = 2048; + std::vector ids((size_t)N, FILLER); + + place_marker_4gram(ids, 200, M1); + + place_marker_4gram(ids, 600, M2); + place_marker_4gram(ids, 604, M1); + + place_marker_4gram(ids, 1200, M3); + place_marker_4gram(ids, 1204, M2); + + place_marker_4gram(ids, 2044, M3); + + const int q0 = 1948; + std::vector initial_query_pool(ids.begin() + q0, ids.end()); + + const int n_chunks = (N + CHUNK - 1) / CHUNK; + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4}; + dflash::qwen3::scan_and_force_transitive(ids, q0, initial_query_pool, + cfg, /*max_iters=*/3, forced); + + const int chunk_hop3 = 1200 / CHUNK; + const int chunk_hop2 = 600 / CHUNK; + const int chunk_hop1 = 200 / CHUNK; + + REQUIRE(forced[(size_t)chunk_hop3] == 1); + REQUIRE(forced[(size_t)chunk_hop2] == 1); + REQUIRE(forced[(size_t)chunk_hop1] == 1); + + std::printf("T3 PASS: all hops forced transitively\n"); +} + +// T4 — variable-name reuse across templates (FAILS until v2 adds rare-token match). 
+// +// Token layout: +// FILLER=1, V1=2001(X42), V2=2002(Y42), V3=2003(Z42) +// Template-context tokens: A=3001,B=3002,C=3003,D=3004,E=3005,F=3006 +// Query-match tokens: X1=4001,X2=4002,X3=4003 +// +// hop3 (chunk 18, pos 1200): [X1,X2,V3,X3,E,V2,F,FILL] — 4-gram [X1,X2,V3,X3] matches query +// hop2 (chunk 9, pos 600): [C,V2,FILL,V1,D,FILL,FILL] — V2 in DIFFERENT context than hop3 +// hop1 (chunk 3, pos 200): [A,V1,FILL,B] — V1 in DIFFERENT context than hop2 +// query (pos 2044): [X1,X2,V3,X3] — matches hop3 4-gram exactly +// +// Pass 1 (4-gram): forces hop3. +// Pass 1 rare-token: V2 (freq=2) found in hop3 → also at pos 601 (hop2 chunk 9) → forces hop2. +// Pass 2 rare-token: V1 (freq=2) found in hop2 → also at pos 201 (hop1 chunk 3) → forces hop1. +// Today's impl (4-gram only) fails because V2 4-grams in hop3 ≠ V2 4-grams in hop2. +static void t4_rare_token_bridges_different_context() { + static constexpr int32_t V1 = 2001, V2 = 2002, V3 = 2003; + static constexpr int32_t A = 3001, B = 3002, C = 3003, D = 3004, E = 3005, F = 3006; + static constexpr int32_t X1 = 4001, X2 = 4002, X3 = 4003; + + const int N = 2048; + std::vector ids((size_t)N, FILLER); + + // hop1 (chunk 3, pos 200): [A, V1, FILL, B] + ids[200] = A; ids[201] = V1; ids[202] = FILLER; ids[203] = B; + + // hop2 (chunk 9, pos 600): [C, V2, FILL, V1, D, FILL, FILL] + ids[600] = C; ids[601] = V2; ids[602] = FILLER; ids[603] = V1; + ids[604] = D; ids[605] = FILLER; ids[606] = FILLER; + + // hop3 (chunk 18, pos 1200): [X1, X2, V3, X3, E, V2, F, FILL] + // V2 here is in 4-gram context [E,V2,F,FILL] — differs from hop2's [C,V2,FILL,V1] + ids[1200] = X1; ids[1201] = X2; ids[1202] = V3; ids[1203] = X3; + ids[1204] = E; ids[1205] = V2; ids[1206] = F; ids[1207] = FILLER; + + // query suffix (pos 2044): [X1, X2, V3, X3] — exact 4-gram match to hop3 + ids[2044] = X1; ids[2045] = X2; ids[2046] = V3; ids[2047] = X3; + + const int q0 = 1948; + std::vector initial_query_pool(ids.begin() + q0, ids.end()); + + const 
int n_chunks = (N + CHUNK - 1) / CHUNK; + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4, + /*rare_token_max_freq=*/8}; + dflash::qwen3::scan_and_force_transitive(ids, q0, initial_query_pool, + cfg, /*max_iters=*/3, forced); + + const int chunk_hop3 = 1200 / CHUNK; // 18 + const int chunk_hop2 = 600 / CHUNK; // 9 + const int chunk_hop1 = 200 / CHUNK; // 3 + + REQUIRE(forced[(size_t)chunk_hop3] == 1); + REQUIRE(forced[(size_t)chunk_hop2] == 1); + REQUIRE(forced[(size_t)chunk_hop1] == 1); + + std::printf("T4 PASS: all hops forced via rare-token bridge (V2 freq=2, V1 freq=2)\n"); +} + +// T5: gate closes when pass-1 already finds >= cascade_min_anchor_count chunks. +// +// Layout (N=4096, chunk=64 → 64 chunks): +// A common 4-gram [CMN,CMN,CMN,CMN] appears 50 times at scattered body positions. +// One forced chunk (chunk 5, pos 320) also contains a rare token RT (freq=2 overall). +// Its only other occurrence is at a separate body position in chunk 60 (pos 3840). +// Query suffix contains the common 4-gram → pass-1 forces all 50 matching chunks. +// +// With cascade_min_anchor_count=5: gained=50 >= 5 → gate closes → cascade skipped. +// chunk 60 (pos 3840, which has RT but is only reachable via cascade) stays UNFORCED. +// +// With cascade_min_anchor_count=0: gate open → cascade runs → chunk 60 gets forced. +// This contrast proves the gate is operative. +static void t5_gate_closes_when_pass1_finds_many() { + static constexpr int32_t CMN = 5001; // common token (4-gram made of it) + static constexpr int32_t RT = 5002; // rare token (freq=2) + + const int N = 4096; + const int n_chunks = (N + CHUNK - 1) / CHUNK; // 64 + std::vector ids((size_t)N, FILLER); + + // Place common 4-gram at 50 scattered body positions (chunks 0..49). + // Spaced 64 tokens apart to land in different chunks. + for (int i = 0; i < 50; ++i) { + int pos = i * 64 + 4; // pos 4, 68, 132, ...
(well within body) + ids[(size_t)pos] = CMN; + ids[(size_t)pos + 1] = CMN; + ids[(size_t)pos + 2] = CMN; + ids[(size_t)pos + 3] = CMN; + } + + // RT appears in chunk 5 (pos 320) and chunk 60 (pos 3840). + ids[320] = RT; + ids[3840] = RT; + + // Query suffix: just the common 4-gram so pass-1 fires on all 50 body positions. + const int q0 = N - 32; + ids[(size_t)q0] = CMN; + ids[(size_t)q0 + 1] = CMN; + ids[(size_t)q0 + 2] = CMN; + ids[(size_t)q0 + 3] = CMN; + std::vector query_pool(ids.begin() + q0, ids.end()); + + // --- Test A: gate CLOSED (cascade_min_anchor_count=5) --- + { + std::vector forced_a((size_t)n_chunks, 0); + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/64, /*ngram=*/4, + /*rare_token_max_freq=*/2, + /*cascade_min_anchor_count=*/5, + /*max_forced_count=*/INT_MAX}; + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + cfg, /*max_iters=*/3, forced_a); + + // Pass-1 forces chunks 0..49 (50 chunks); gate closes → cascade skipped. + // chunk 60 (pos 3840 has RT but only reachable via cascade) must be UNFORCED. + const int chunk_rt_extra = 3840 / CHUNK; // 60 + REQUIRE(forced_a[(size_t)chunk_rt_extra] == 0); + // chunk 5 (contains RT at pos 320) is forced by pass-1 (common 4-gram at pos 324). + REQUIRE(forced_a[5] == 1); + + std::printf("T5a PASS: gate closed (gained=50 >= min=5), chunk %d unforced\n", + chunk_rt_extra); + } + + // --- Test B: gate OPEN (cascade_min_anchor_count=0) → cascade forces chunk 60 --- + { + std::vector forced_b((size_t)n_chunks, 0); + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/64, /*ngram=*/4, + /*rare_token_max_freq=*/2, + /*cascade_min_anchor_count=*/0, + /*max_forced_count=*/INT_MAX}; + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + cfg, /*max_iters=*/3, forced_b); + + // Cascade runs; chunk 5 is forced by pass-1 and contains RT; + // RT at pos 3840 → chunk 60 forced via rare-token cascade. 
+ const int chunk_rt_extra = 3840 / CHUNK; + REQUIRE(forced_b[(size_t)chunk_rt_extra] == 1); + + std::printf("T5b PASS: gate open (min=0), cascade forced chunk %d via RT\n", + chunk_rt_extra); + } +} + +// T6: hard cap (max_forced_count) prevents runaway cascade. +// +// Layout (N=2048, chunk=64 → 32 chunks): +// Query contains 4-gram [TGR,TGR,TGR,TGR] which matches body chunk 0. +// Chunk 0 contains chain token C0 (freq=2): also appears in chunk 1. +// Chunk 1 contains chain token C1 (freq=2): also appears in chunk 2. +// ... 20 such chain links. +// Pass-1 forces chunk 0 only; cascade_min_anchor_count=0 leaves the gate open (as in T5b). +// Cascade rare-token worklist propagates: chunk 0→1→2→...→20 (20 more). +// max_forced_count=5 → cascade must stop before the total exceeds 5. Result: forced <= 5. +static void t6_hard_cap_prevents_runaway() { + static constexpr int32_t TGR = 7000; // trigger token for 4-gram pass-1 match + + const int N = 2048; + const int n_chunks = (N + CHUNK - 1) / CHUNK; // 32 + std::vector ids((size_t)N, FILLER); + + // body chunk 0 (pos 0): place 4-gram [TGR,TGR,TGR,TGR] so pass-1 forces it. + ids[0] = TGR; ids[1] = TGR; ids[2] = TGR; ids[3] = TGR; + + // Rare-token chain: C_i appears in chunk i (at offset 8) and chunk i+1 (at offset 9). + // Offsets 8 and 9 within each chunk don't collide between consecutive tokens. + // Cascade worklist: chunk i forced → C_i found at offset 8 → chunk i+1 forced. + for (int i = 0; i < 20; ++i) { + int32_t tok = 7100 + i; + ids[(size_t)(i * 64 + 8)] = tok; // in chunk i, offset 8 + ids[(size_t)((i + 1) * 64 + 9)] = tok; // in chunk i+1, offset 9 + } + + // Query suffix: contains [TGR,TGR,TGR,TGR] → pass-1 matches body chunk 0. + const int q0 = N - 64; + ids[(size_t)q0] = TGR; + ids[(size_t)q0 + 1] = TGR; + ids[(size_t)q0 + 2] = TGR; + ids[(size_t)q0 + 3] = TGR; + std::vector query_pool(ids.begin() + q0, ids.end()); + + // Without cap: cascade forces chunks 0..20 (21 chunks total). + // With cap=5: stops at 5.
+ std::vector forced((size_t)n_chunks, 0); + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4, + /*rare_token_max_freq=*/2, + /*cascade_min_anchor_count=*/0, + /*max_forced_count=*/5}; + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + cfg, /*max_iters=*/25, forced); + + int total_forced = 0; + for (int c = 0; c < n_chunks; ++c) total_forced += (int)forced[(size_t)c]; + + REQUIRE(total_forced <= 5); + REQUIRE(forced[0] == 1); // chunk 0 always forced by pass-1 + + std::printf("T6 PASS: hard cap engaged, forced=%d (cap=5, chain length=20)\n", + total_forced); +} + +int main() { + t1_single_pass_match(); + t2_single_pass_misses_hops(); + t3_transitive_rescues_all(); + t4_rare_token_bridges_different_context(); + t5_gate_closes_when_pass1_finds_many(); + t6_hard_cap_prevents_runaway(); + std::printf("\nAll anchor_transitive tests passed.\n"); + return 0; +} diff --git a/server/test/test_drafter_early_exit_score_range.cpp b/server/test/test_drafter_early_exit_score_range.cpp new file mode 100644 index 000000000..96e888e77 --- /dev/null +++ b/server/test/test_drafter_early_exit_score_range.cpp @@ -0,0 +1,108 @@ +// Unit tests for dflash::common::compute_score_range(). +// Plain int main(), no frameworks. +// +// Verifies that SCORE_LAYERS is interpreted relative to fwd_layer_limit +// (the early-exit boundary) rather than the full model depth, so that +// early_exit_n=7 + score_layers=7 produces the non-empty range [0,7) +// instead of the phantom-empty [7,7) the old inline code produced. + +#include "score_range.h" + +#include +#include + +// REQUIRE survives -DNDEBUG (bare assert does not). 
+#define REQUIRE(cond) \ + do { if (!(cond)) { \ + std::fprintf(stderr, "FAIL: %s line %d: %s\n", __FILE__, __LINE__, #cond); \ + std::exit(1); \ + } } while (0) + +using dflash::common::ScoreRange; +using dflash::common::compute_score_range; + +// T1 — The exact bug scenario: early_exit_n=7, score_layers=7, n_layer=28. +// OLD code: start = min(28-7, 7) = 7, end = 7 → empty loop. +// NEW code: effective_n=7, want=min(7,7)=7, start=7-7=0, end=7 → [0,7). +static void t1_bug_scenario() { + ScoreRange r = compute_score_range(/*n_layer=*/28, + /*score_layers=*/7, + /*fwd_layer_limit=*/7); + REQUIRE(r.start == 0 && "score_layer_start must be 0"); + REQUIRE(r.end == 7 && "score_layer_end must equal fwd_layer_limit"); + REQUIRE(!r.empty() && "range must be non-empty"); + REQUIRE(r.count() == 7); + printf("T1 pass: early_exit_n=7 score_layers=7 n_layer=28 -> [%d,%d)\n", + r.start, r.end); +} + +// T2 — No early exit (fwd_layer_limit == n_layer). +// score_layers=7 should pick the last 7 layers [21,28). +static void t2_no_early_exit() { + ScoreRange r = compute_score_range(28, 7, 28); + REQUIRE(r.start == 21); + REQUIRE(r.end == 28); + REQUIRE(!r.empty()); + REQUIRE(r.count() == 7); + printf("T2 pass: no early exit score_layers=7 -> [%d,%d)\n", r.start, r.end); +} + +// T3 — score_layers == -1 (all layers) with no early exit. +static void t3_all_layers_no_exit() { + ScoreRange r = compute_score_range(28, -1, 28); + REQUIRE(r.start == 0); + REQUIRE(r.end == 28); + REQUIRE(!r.empty()); + printf("T3 pass: score_layers=-1 no exit -> [%d,%d)\n", r.start, r.end); +} + +// T4 — All layers, with early exit at 14. +static void t4_all_layers_with_exit() { + ScoreRange r = compute_score_range(28, -1, 14); + REQUIRE(r.start == 0); + REQUIRE(r.end == 14); + REQUIRE(!r.empty()); + printf("T4 pass: score_layers=-1 early_exit=14 -> [%d,%d)\n", r.start, r.end); +} + +// T5 — SCORE_LAYERS larger than fwd_layer_limit: clamp to [0, fwd_layer_limit). 
+static void t5_score_layers_exceeds_exit() { + // score_layers=14 but only 7 computed: want = min(14,7) = 7, start=0 + ScoreRange r = compute_score_range(28, 14, 7); + REQUIRE(r.start == 0); + REQUIRE(r.end == 7); + REQUIRE(!r.empty()); + printf("T5 pass: score_layers=14 early_exit=7 -> [%d,%d)\n", r.start, r.end); +} + +// T6 — SCORE_LAYERS == n_layer (all layers) with no early exit. +static void t6_score_layers_equals_n_layer() { + ScoreRange r = compute_score_range(28, 28, 28); + // score_layers == n_layer → condition (score_layers < n_layer) is false → start=0 + REQUIRE(r.start == 0); + REQUIRE(r.end == 28); + REQUIRE(!r.empty()); + printf("T6 pass: score_layers=n_layer=28 -> [%d,%d)\n", r.start, r.end); +} + +// T7 — early_exit_n == 14, score_layers == 7: should produce [7,14). +static void t7_partial_exit_partial_score() { + ScoreRange r = compute_score_range(28, 7, 14); + REQUIRE(r.start == 7); + REQUIRE(r.end == 14); + REQUIRE(!r.empty()); + REQUIRE(r.count() == 7); + printf("T7 pass: early_exit=14 score_layers=7 -> [%d,%d)\n", r.start, r.end); +} + +int main() { + t1_bug_scenario(); + t2_no_early_exit(); + t3_all_layers_no_exit(); + t4_all_layers_with_exit(); + t5_score_layers_exceeds_exit(); + t6_score_layers_equals_n_layer(); + t7_partial_exit_partial_score(); + printf("\nAll score_range tests passed.\n"); + return 0; +} diff --git a/server/test/test_drafter_tail_capture_guard.cpp b/server/test/test_drafter_tail_capture_guard.cpp new file mode 100644 index 000000000..a00763e3e --- /dev/null +++ b/server/test/test_drafter_tail_capture_guard.cpp @@ -0,0 +1,128 @@ +// Unit tests for the tail-capture chunk-boundary guard in qwen3_graph.cpp. +// Reproduces Bug #42: ggml_view_3d overrun when S % chunk_size ∈ {1..7} +// and n_lookahead == 8. +// +// Pure integer arithmetic — no ggml, no GPU, no server deps. 
+// +// Root cause (codex's diagnosis, confirmed by momus's data audit): +// tail_lo = S - n_lookahead +// When the prompt has S = chunk_size + r tokens (r ∈ {1..7}), a second +// chunk is dispatched, but the first chunk's guard is still evaluated with +// cs=0, cl=chunk_size. tail_lo = chunk_size + r - n_lookahead = 4088 + r. +// +// OLD guard: tail_lo >= cs && tail_lo < cs + cl +// r=1..7: (4088+r) >= 0 && (4088+r) < 4096 → TRUE ← BUG: tail overruns +// +// NEW guard: tail_lo >= cs && tail_lo + n_lookahead <= cs + cl +// r=1..7: (4088+r) + 8 <= 4096 → 4096+r <= 4096 → FALSE ← correct: skip +// +// TDD RED/GREEN: +// RED (before patch): TAIL_GUARD_USE_NEW_FORMULA undefined → old guard inline → test FAILS. +// GREEN (after patch): TAIL_GUARD_USE_NEW_FORMULA defined via compiler flag → test PASSES. +// The patch to qwen3_graph.cpp changes the same 2 lines as this toggle. + +#include <cstdio> +#include <cstdlib> + +#define REQUIRE(cond) \ + do { if (!(cond)) { \ + std::fprintf(stderr, "FAIL: %s line %d: %s\n", __FILE__, __LINE__, #cond); \ + std::exit(1); \ + } } while (0) + +// The guard being tested — toggled by compile-time flag to reproduce RED/GREEN. +#ifdef TAIL_GUARD_USE_NEW_FORMULA +static bool tail_fits(int tail_lo, int cs, int cl, int n_lookahead) { + return tail_lo >= cs && tail_lo + n_lookahead <= cs + cl; // NEW (fix) +} +#else +static bool tail_fits(int tail_lo, int cs, int cl, int n_lookahead) { + (void)n_lookahead; + return tail_lo >= cs && tail_lo < cs + cl; // OLD (Bug #42) +} +#endif + +// T1: First chunk (cs=0, cl=4096), S = chunk_size + r for r ∈ {1..7}. +// Tail straddles the chunk boundary: tail_lo ∈ [4089..4095], needs 8 tokens +// → runs 1..7 tokens past the end → view must be SKIPPED. +// CORRECT answer: false. Old guard returns true → BUG → RED test FAILS.
+static void t1_straddling_tail_must_be_skipped() { + const int chunk_size = 4096, n_lookahead = 8; + const int cs = 0, cl = chunk_size; // first chunk + + for (int r = 1; r <= 7; r++) { + const int S = chunk_size + r; + const int tail_lo = S - n_lookahead; // = 4088 + r ∈ [4089..4095] + + const bool result = tail_fits(tail_lo, cs, cl, n_lookahead); + std::printf("T1 r=%d S=%d tail_lo=%d tail_hi=%d chunk=[%d,%d): fits=%d (expect 0)\n", + r, S, tail_lo, tail_lo + n_lookahead, cs, cs + cl, (int)result); + REQUIRE(!result && "tail overruns chunk boundary — guard must return false"); + } +} + +// T2: r=0 (S == chunk_size exactly). tail_lo=4088, tail_hi=4096=chunk end. Fits exactly. +// Both old and new guards agree: true. +static void t2_tail_fits_exactly_at_chunk_end() { + const int chunk_size = 4096, n_lookahead = 8; + const int cs = 0, cl = chunk_size; + const int S = chunk_size; + const int tail_lo = S - n_lookahead; // 4088 + + const bool result = tail_fits(tail_lo, cs, cl, n_lookahead); + std::printf("T2 r=0 S=%d tail_lo=%d: fits=%d (expect 1)\n", S, tail_lo, (int)result); + REQUIRE(result && "tail fits exactly at chunk end — must return true"); +} + +// T3: r=8 (S = chunk_size + 8). tail_lo=4096 — at cs+cl boundary, outside chunk. +// Both guards agree: false. +static void t3_tail_starts_outside_chunk() { + const int chunk_size = 4096, n_lookahead = 8; + const int cs = 0, cl = chunk_size; + const int S = chunk_size + 8; + const int tail_lo = S - n_lookahead; // 4096 + + const bool result = tail_fits(tail_lo, cs, cl, n_lookahead); + std::printf("T3 r=8 S=%d tail_lo=%d: fits=%d (expect 0)\n", S, tail_lo, (int)result); + REQUIRE(!result && "tail starts at next chunk — must return false"); +} + +// T4: Second chunk (cs=4096, cl=4096), S=8192, tail fully inside. +// tail_lo=8184, tail_hi=8192 == cs+cl. Both guards agree: true. 
+static void t4_second_chunk_tail_fits_exactly() { + const int chunk_size = 4096, n_lookahead = 8; + const int cs = chunk_size, cl = chunk_size; // second chunk + const int S = 2 * chunk_size; + const int tail_lo = S - n_lookahead; // 8184 + + const bool result = tail_fits(tail_lo, cs, cl, n_lookahead); + std::printf("T4 second chunk S=%d tail_lo=%d cs=%d: fits=%d (expect 1)\n", + S, tail_lo, cs, (int)result); + REQUIRE(result && "tail fits exactly in second chunk — must return true"); +} + +// T5: Second chunk, r=3. tail straddles end of second chunk. +// S = 2*4096 + 3 = 8195. tail_lo = 8187, tail_hi = 8195. cs+cl = 8192. +// New guard: 8195 <= 8192 → false. Old guard: 8187 < 8192 → true (BUG). +static void t5_second_chunk_straddling_tail_skipped() { + const int chunk_size = 4096, n_lookahead = 8; + const int cs = chunk_size, cl = chunk_size; // second chunk [4096,8192) + const int r = 3; + const int S = 2 * chunk_size + r; + const int tail_lo = S - n_lookahead; // 8187 + + const bool result = tail_fits(tail_lo, cs, cl, n_lookahead); + std::printf("T5 second chunk r=%d S=%d tail_lo=%d: fits=%d (expect 0)\n", + r, S, tail_lo, (int)result); + REQUIRE(!result && "tail straddles end of second chunk — must return false"); +} + +int main() { + t1_straddling_tail_must_be_skipped(); + t2_tail_fits_exactly_at_chunk_end(); + t3_tail_starts_outside_chunk(); + t4_second_chunk_tail_fits_exactly(); + t5_second_chunk_straddling_tail_skipped(); + std::printf("All tail_capture guard tests passed.\n"); + return 0; +} diff --git a/server/test/test_drafter_warm_path_regression.cpp b/server/test/test_drafter_warm_path_regression.cpp new file mode 100644 index 000000000..4a2015319 --- /dev/null +++ b/server/test/test_drafter_warm_path_regression.cpp @@ -0,0 +1,164 @@ +// Regression test: layer-subset warm-path buffer sizing fix. 
+// +// Root cause of the regression this test pins: when PFLASH_DRAFTER_SCORE_LAYERS=7 +// with a 28-layer model, the old code allocated K_norope_v for ALL 28 layers +// (~7.5 GB on RTX 3090 at S=128K) even though only 7 layers are read in scoring. +// The extra 21 × 268 MB = 5.6 GB pushed total VRAM above 24 GB, causing GPU +// page migration and a 5.4× A_compute regression on warm runs. +// +// The fix: size K_norope_v / Q_norope_v to n_score_layers (= score_range.count()), +// which equals 7 rather than 28. This test verifies the sizing formula via +// compute_score_range without needing a GPU. NOTE(review): the checks use bare assert(), which -DNDEBUG compiles out — build this test without NDEBUG. + +#include "score_range.h" + +#include <cassert> +#include <cstdio> + +using dflash::common::ScoreRange; +using dflash::common::compute_score_range; + +// Helper: compute n_score_layers as the fixed allocator does. +static int score_layer_count(int n_layer, int score_layers_env, int early_exit_env) { + const int fwd_limit = (early_exit_env > 0 && early_exit_env < n_layer) + ? early_exit_env : n_layer; + ScoreRange r = compute_score_range(n_layer, score_layers_env, fwd_limit); + return r.count(); +} + +// T1: baseline case — SCORE_LAYERS unset (-1), no early exit. +// K_norope_v should have n_layer entries. +static void t1_baseline_full_alloc() { + int n = score_layer_count(28, -1, -1); + assert(n == 28 && "baseline: all 28 layers must be allocated"); + printf("T1 pass: baseline n_score_layers=%d\n", n); +} + +// T2: L7 case — SCORE_LAYERS=7, no early exit. +// OLD: allocated 28 entries (5.6 GB wasted). NEW: 7 entries. +static void t2_l7_trimmed_alloc() { + int n = score_layer_count(28, 7, -1); + assert(n == 7 && "L7: only 7 K_norope entries must be allocated"); + printf("T2 pass: L7 n_score_layers=%d (was 28 before fix)\n", n); +} + +// T3: early-exit=14, SCORE_LAYERS=7. Scoring range [7,14), 7 layers.
+static void t3_early_exit_with_score_layers() { + int n = score_layer_count(28, 7, 14); + assert(n == 7); + printf("T3 pass: early_exit=14 score_layers=7 -> n_score_layers=%d\n", n); +} + +// T4: early-exit=7, SCORE_LAYERS=7 (the classic double-7 composition). +// Range [0,7), 7 layers. +static void t4_ee7_score7_composition() { + int n = score_layer_count(28, 7, 7); + assert(n == 7); + printf("T4 pass: ee7+score7 n_score_layers=%d\n", n); +} + +// T5: SCORE_LAYERS not set (all layers), early-exit=14. +// Scoring range [0,14), 14 layers needed. +static void t5_all_score_with_early_exit() { + int n = score_layer_count(28, -1, 14); + assert(n == 14); + printf("T5 pass: score_all early_exit=14 n_score_layers=%d\n", n); +} + +// T6: validate that score_layer_start_pre matches score_layer_start used +// in the scoring loop (must be identical for correct buffer indexing). +static void t6_start_pre_matches_loop_start() { + // Replicate the pre-alloc computation. + const int n_layer = 28, score_layers_env = 7, early_exit_env = -1; + const int fwd_limit = (early_exit_env > 0 && early_exit_env < n_layer) + ? early_exit_env : n_layer; + ScoreRange pre = compute_score_range(n_layer, score_layers_env, fwd_limit); + // Scoring loop uses the same fwd_layer_limit (== fwd_limit) and same env. + ScoreRange loop = compute_score_range(n_layer, score_layers_env, fwd_limit); + assert(pre.start == loop.start && "score_layer_start_pre must equal score_layer_start"); + assert(pre.end == loop.end); + printf("T6 pass: pre_start=%d loop_start=%d (match)\n", pre.start, loop.start); +} + +// T7: alloc loop boundary check — the alloc loop iterates 0..n_layer but must only +// fill K_norope_v for layers in [score_layer_start_pre, fwd_layer_limit_pre). +// This replicates the guard added to the alloc loop: il >= start AND il < fwd_limit. +// Before the fix: il was only bounded below (il >= start), causing K_norope_v[si] +// out-of-bounds when n_score_layers < n_layer (e.g. 
ee14: si 0..27 but vec size 14). +static void t7_alloc_loop_upper_bound() { + struct FakeVec { + int capacity; + int max_si_written = -1; + void write(int si) { + assert(si >= 0 && si < capacity && "si out of bounds"); + if (si > max_si_written) max_si_written = si; + } + }; + + // Simulate ee14 (no SCORE_LAYERS, early_exit=14, n_layer=28). + { + const int n_layer = 28, score_layers = -1, early_exit = 14; + const int fwd_limit = early_exit; + ScoreRange r = compute_score_range(n_layer, score_layers, fwd_limit); + const int n_score = r.count(); // 14 + FakeVec v{n_score}; + int writes = 0; + for (int il = 0; il < n_layer; ++il) { + // Correct guard: il >= start AND il < fwd_limit (the fix) + if (il >= r.start && il < fwd_limit) { + v.write(il - r.start); + writes++; + } + } + assert(writes == n_score && "ee14: must write exactly n_score_layers entries"); + printf("T7a pass: ee14 alloc writes=%d capacity=%d (no overflow)\n", writes, n_score); + } + + // Simulate ee7 (SCORE_LAYERS=7, early_exit=7, n_layer=28). + { + const int n_layer = 28, score_layers = 7, early_exit = 7; + const int fwd_limit = early_exit; + ScoreRange r = compute_score_range(n_layer, score_layers, fwd_limit); + const int n_score = r.count(); // 7 + FakeVec v{n_score}; + int writes = 0; + for (int il = 0; il < n_layer; ++il) { + if (il >= r.start && il < fwd_limit) { + v.write(il - r.start); + writes++; + } + } + assert(writes == n_score && "ee7: must write exactly 7 entries"); + printf("T7b pass: ee7 alloc writes=%d capacity=%d (no overflow)\n", writes, n_score); + } + + // Simulate baseline (no ee, no score_layers). 
+ { + const int n_layer = 28, score_layers = -1, early_exit = -1; + const int fwd_limit = n_layer; + ScoreRange r = compute_score_range(n_layer, score_layers, fwd_limit); + const int n_score = r.count(); // 28 + FakeVec v{n_score}; + int writes = 0; + for (int il = 0; il < n_layer; ++il) { + if (il >= r.start && il < fwd_limit) { + v.write(il - r.start); + writes++; + } + } + assert(writes == n_score && "baseline: must write 28 entries"); + printf("T7c pass: baseline alloc writes=%d capacity=%d (no overflow)\n", writes, n_score); + } +} + +int main() { + t1_baseline_full_alloc(); + t2_l7_trimmed_alloc(); + t3_early_exit_with_score_layers(); + t4_ee7_score7_composition(); + t5_all_score_with_early_exit(); + t6_start_pre_matches_loop_start(); + t7_alloc_loop_upper_bound(); + printf("\nAll warm-path regression tests passed.\n"); + return 0; +} From 94907a441726eb6249a6af1cc1b06d7d2194121e Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Wed, 27 May 2026 12:32:08 +0200 Subject: [PATCH 02/16] =?UTF-8?q?refactor(pflash):=20rename=20DFLASH=5FCOM?= =?UTF-8?q?PRESS=5F*=20=E2=86=92=20PFLASH=5FCOMPRESS=5F*=20(cascade=20env?= =?UTF-8?q?=20vars)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/src/qwen3/qwen3_drafter.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp index 589351f9c..67c9adc6a 100644 --- a/server/src/qwen3/qwen3_drafter.cpp +++ b/server/src/qwen3/qwen3_drafter.cpp @@ -559,11 +559,11 @@ static std::vector qwen35_score_and_compress( const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); - const int anchor_ngram = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", 4); - const int rare_token_max_freq = 
env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", 2); + const int anchor_ngram = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", 4); + const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", 2); - const float cascade_min_anchor_frac = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f); - const float max_forced_ratio = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", 1.3f); + const float cascade_min_anchor_frac = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f); + const float max_forced_ratio = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", 1.3f); const int q0 = std::max(0, S - query_tokens); std::vector query_pool(ids.begin() + q0, ids.end()); @@ -575,8 +575,8 @@ static std::vector qwen35_score_and_compress( anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); anchor_cfg.max_forced_count = (int)(max_forced_ratio * n_keep); - const bool use_transitive = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0; - const int max_iters = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3); + const bool use_transitive = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0; + const int max_iters = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3); if (use_transitive) { dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, anchor_cfg, max_iters, forced); @@ -752,11 +752,11 @@ std::vector drafter_score_and_compress( const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); - const int anchor_ngram = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", 4); - const int rare_token_max_freq = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", 2); + const int anchor_ngram = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", 4); + const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", 2); - const float cascade_min_anchor_frac = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f); - const float 
max_forced_ratio = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", 1.3f); + const float cascade_min_anchor_frac = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f); + const float max_forced_ratio = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", 1.3f); std::vector selected_mask((size_t)n_chunks, 0); std::vector forced((size_t)n_chunks, 0); @@ -772,8 +772,8 @@ std::vector drafter_score_and_compress( anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); anchor_cfg.max_forced_count = (int)(max_forced_ratio * n_keep); - const bool use_transitive = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0; - const int max_iters = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3); + const bool use_transitive = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0; + const int max_iters = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3); if (use_transitive) { dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, anchor_cfg, max_iters, forced); From 99f6b38d68c6b2d65bb6499d45c4d5763224482b Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Wed, 27 May 2026 15:12:49 +0200 Subject: [PATCH 03/16] fix(pflash): adaptive anchor_radius eliminates 64K NIAH cliff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At >=32K context the needle text is more likely to straddle multiple chunks (chunk_size=32), and the fixed anchor_radius=2 window (5 chunks ~160 tokens) loses the back half of the needle digits — the model retrieves '...is 4' but truncates/hallucinates the continuation. Adaptive scaling based on n_chunks: <32K context (<1024 chunks): radius=2, max_anchor_hits=8 (unchanged) 32-64K (1024-2047 chunks): radius=4, max_anchor_hits=16 >=64K (>=2048 chunks): radius=8, max_anchor_hits=32 Override via PFLASH_COMPRESS_ANCHOR_RADIUS / PFLASH_COMPRESS_MAX_ANCHOR_HITS env vars (legacy DFLASH_COMPRESS_* names still accepted). 
Validated at 49K context: NIAH needle 'kowefada 1596346' correctly retrieved (was: '1594' or hallucinated 'is 048394839483' before fix). Resolves the long-standing 'project_64k_quality_cliff' memory entry. --- server/src/qwen3/qwen3_drafter.cpp | 58 +++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp index 67c9adc6a..2fbf5850f 100644 --- a/server/src/qwen3/qwen3_drafter.cpp +++ b/server/src/qwen3/qwen3_drafter.cpp @@ -557,8 +557,33 @@ static std::vector qwen35_score_and_compress( } const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); - const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); - const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); + // Anchor radius scales with n_chunks to prevent the 64K NIAH cliff: + // at long context the needle text is more likely to straddle multiple + // chunks, and a fixed radius=2 window (5 chunks ~160 tokens) loses the + // back half of the needle. Adaptive: <32K = 2, 32-64K = 4, >=64K = 8. + // Override via PFLASH_COMPRESS_ANCHOR_RADIUS env var (>= 0 wins). + int anchor_radius; + { + const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1); + const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1); + if (env_r >= 0) anchor_radius = env_r; + else if (legacy_r >= 0) anchor_radius = legacy_r; + else if (n_chunks < 1024) anchor_radius = 2; + else if (n_chunks < 2048) anchor_radius = 4; + else anchor_radius = 8; + } + // max_anchor_hits scales the same way: at long context, distinctive + // anchors are sparser, so we can afford to keep more hits per qi. 
+ int max_anchor_hits; + { + const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); + const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); + if (env_h >= 0) max_anchor_hits = env_h; + else if (legacy_h >= 0) max_anchor_hits = legacy_h; + else if (n_chunks < 1024) max_anchor_hits = 8; + else if (n_chunks < 2048) max_anchor_hits = 16; + else max_anchor_hits = 32; + } const int anchor_ngram = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", 4); const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", 2); @@ -750,8 +775,33 @@ std::vector drafter_score_and_compress( tail_chunks = std::max(0, budget - head_chunks); } const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); - const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); - const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); + // Anchor radius scales with n_chunks to prevent the 64K NIAH cliff: + // at long context the needle text is more likely to straddle multiple + // chunks, and a fixed radius=2 window (5 chunks ~160 tokens) loses the + // back half of the needle. Adaptive: <32K = 2, 32-64K = 4, >=64K = 8. + // Override via PFLASH_COMPRESS_ANCHOR_RADIUS env var (>= 0 wins). + int anchor_radius; + { + const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1); + const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1); + if (env_r >= 0) anchor_radius = env_r; + else if (legacy_r >= 0) anchor_radius = legacy_r; + else if (n_chunks < 1024) anchor_radius = 2; + else if (n_chunks < 2048) anchor_radius = 4; + else anchor_radius = 8; + } + // max_anchor_hits scales the same way: at long context, distinctive + // anchors are sparser, so we can afford to keep more hits per qi. 
+ int max_anchor_hits; + { + const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); + const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); + if (env_h >= 0) max_anchor_hits = env_h; + else if (legacy_h >= 0) max_anchor_hits = legacy_h; + else if (n_chunks < 1024) max_anchor_hits = 8; + else if (n_chunks < 2048) max_anchor_hits = 16; + else max_anchor_hits = 32; + } const int anchor_ngram = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", 4); const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", 2); From 766e46dd85fe8f5056fffaa6dd70ee2b892b2561 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 00:41:28 +0200 Subject: [PATCH 04/16] bench: add eval_quality_compare.py for LongBench F1 regression detection --- dflash/scripts/eval_quality_compare.py | 166 +++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 dflash/scripts/eval_quality_compare.py diff --git a/dflash/scripts/eval_quality_compare.py b/dflash/scripts/eval_quality_compare.py new file mode 100644 index 000000000..cd4578e9e --- /dev/null +++ b/dflash/scripts/eval_quality_compare.py @@ -0,0 +1,166 @@ +"""MT-Bench quality comparator. + +Reads all results_*.json in the given directory (or current dir), +treats baseline_off as reference, and prints a markdown comparison table. 
+ +Usage: + python eval_quality_compare.py [--dir PATH] [--out PATH] +""" +import argparse +import json +import sys +from pathlib import Path + + +def load_results(path: Path) -> dict[tuple[int, int], str]: + """Returns {(question_id, turn_num): reply} for turn_num in {1, 2}.""" + mapping = {} + with open(path) as f: + records = json.load(f) + for r in records: + qid = r["question_id"] + mapping[(qid, 1)] = r["turn_1"] + mapping[(qid, 2)] = r["turn_2"] + return mapping + + +def lcp_ratio(a: str, b: str) -> float: + """Longest common prefix length / min(len(a), len(b)).""" + denom = min(len(a), len(b)) + if denom == 0: + return 1.0 if a == b else 0.0 + i = 0 + while i < denom and a[i] == b[i]: + i += 1 + return i / denom + + +def compare(ref: dict, cand: dict) -> dict: + """Compute comparison metrics between ref and cand reply maps.""" + keys = sorted(set(ref) & set(cand)) + if not keys: + return {"exact_match_rate": 0.0, "mean_lcp_ratio": 0.0, + "divergence_count": 0, "total_pairs": 0, + "first_5_divergences": []} + + exact = 0 + lcp_sum = 0.0 + divergences = [] + + for k in keys: + r, c = ref[k], cand[k] + if r == c: + exact += 1 + else: + if len(divergences) < 5: + qid, turn = k + divergences.append((qid, turn, r[:50], c[:50])) + lcp_sum += lcp_ratio(r, c) + + n = len(keys) + return { + "exact_match_rate": exact / n, + "mean_lcp_ratio": lcp_sum / n, + "divergence_count": n - exact, + "total_pairs": n, + "first_5_divergences": divergences, + } + + +def main() -> int: + ap = argparse.ArgumentParser(description="MT-Bench quality comparator") + ap.add_argument("--dir", type=Path, default=Path("."), + help="Directory containing results_*.json files") + ap.add_argument("--out", type=Path, + default=Path(__file__).parent.parent / "eval/summary.md", + help="Output markdown summary path") + args = ap.parse_args() + + result_files = sorted(args.dir.glob("results_*.json")) + if not result_files: + print(f"ERROR: no results_*.json found in {args.dir}", file=sys.stderr) + 
return 1 + + # Map config name -> result file + configs: dict[str, Path] = {} + for f in result_files: + # strip "results_" prefix and ".json" suffix + name = f.stem[len("results_"):] + configs[name] = f + + if "baseline_off" not in configs: + print("ERROR: baseline_off results not found — cannot compare", file=sys.stderr) + return 1 + + ref = load_results(configs["baseline_off"]) + + rows = [] + for name, path in configs.items(): + cand = load_results(path) + m = compare(ref, cand) + m["config"] = name + rows.append(m) + + # Sort: baseline_off first, then alphabetical + def sort_key(r): + if r["config"] == "baseline_off": + return (0, r["config"]) + return (1, r["config"]) + rows.sort(key=sort_key) + + # Sanity check: baseline_off_2 vs baseline_off + sanity_row = next((r for r in rows if r["config"] == "baseline_off_2"), None) + sanity_warning = "" + if sanity_row and sanity_row["exact_match_rate"] < 0.99: + sanity_warning = ( + f"WARNING: baseline_off_2 exact_match_rate={sanity_row['exact_match_rate']:.3f} " + f"< 0.99 — SERVER IS NONDETERMINISTIC. 
All other comparisons are suspect.\n\n" + ) + + # Build markdown table + lines = [] + if sanity_warning: + lines.append(f"> {sanity_warning.strip()}\n") + + lines.append("| config | exact_match_rate | mean_lcp_ratio | divergence_count | total_pairs |") + lines.append("|--------|-----------------|----------------|-----------------|-------------|") + for r in rows: + lines.append( + f"| {r['config']} " + f"| {r['exact_match_rate']:.3f} " + f"| {r['mean_lcp_ratio']:.3f} " + f"| {r['divergence_count']} " + f"| {r['total_pairs']} |" + ) + + lines.append("") + lines.append("## First 5 divergences per config (vs baseline_off)") + for r in rows: + if r["config"] == "baseline_off" or not r["first_5_divergences"]: + continue + lines.append(f"\n### {r['config']}") + lines.append("| qid | turn | ref (first 50) | cand (first 50) |") + lines.append("|-----|------|----------------|-----------------|") + for qid, turn, ref50, cand50 in r["first_5_divergences"]: + ref50_s = ref50.replace("|", "\\|").replace("\n", " ") + cand50_s = cand50.replace("|", "\\|").replace("\n", " ") + lines.append(f"| {qid} | {turn} | {ref50_s!r} | {cand50_s!r} |") + + table = "\n".join(lines) + + # Print to stdout + if sanity_warning: + print(f"\n{'!'*70}") + print(sanity_warning.strip()) + print(f"{'!'*70}\n") + print(table) + + # Write summary file + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(table + "\n") + print(f"\nSummary written to {args.out}", flush=True) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 8c1d705ffafdba32ed7a7679d746b77f55353393 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 09:24:10 +0200 Subject: [PATCH 05/16] feat(qwen35): derive scalars from weights, assert vs GGUF metadata Mirror the gemma4_backend.cpp:75-104 defensive pattern for the qwen35 target loader and the dflash decode draft loader. 
After loading weight tensors, derive head_dim / n_head / n_head_kv from wq->ne[1] / wk->ne[1] and compare against GGUF-declared values; set_last_error and return false on mismatch. Makes the 'stale scalar at graph-build time' bug class structurally impossible. Load-time only, no runtime cost. Existing well-formed GGUFs are unaffected (smoke verified). --- server/src/draft/draft_gguf_loader.cpp | 57 ++++++++++++++++++++++++ server/src/qwen35/gguf_target_loader.cpp | 45 +++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/server/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp index fbec7263b..73a9c17bd 100644 --- a/server/src/draft/draft_gguf_loader.cpp +++ b/server/src/draft/draft_gguf_loader.cpp @@ -349,6 +349,63 @@ bool load_draft_gguf(const std::string & path, gguf_free(gctx); + // Structural defense: derive scalar dims from weight tensor shapes and + // assert against GGUF-declared metadata (Bug #2 class prevention). + // All draft layers have wq/wk (no deltanet mix), so use layer 0. + // wq is plain Q-only (no gate), so ne[1] = n_head * head_dim. + // fc is [n_target_layers*n_embd, n_embd], so ne[0] = n_target_layers*n_embd. 
+ { + const DraftLayer & L0 = out.layers[0]; + const int64_t derived_q_dim = L0.wq->ne[1]; + const int64_t derived_kv_dim = L0.wk->ne[1]; + const int64_t expected_q_dim = (int64_t)out.n_head * out.head_dim; + const int64_t expected_kv_dim = (int64_t)out.n_head_kv * out.head_dim; + if (derived_q_dim != expected_q_dim) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "draft GGUF shape mismatch: blk.0.attn_q.weight->ne[1]=%lld " + "!= n_head*head_dim=%d*%d=%lld", + (long long)derived_q_dim, + out.n_head, out.head_dim, (long long)expected_q_dim); + set_last_error(buf); + return false; + } + if (derived_kv_dim != expected_kv_dim) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "draft GGUF shape mismatch: blk.0.attn_k.weight->ne[1]=%lld " + "!= n_head_kv*head_dim=%d*%d=%lld", + (long long)derived_kv_dim, + out.n_head_kv, out.head_dim, (long long)expected_kv_dim); + set_last_error(buf); + return false; + } + const int64_t derived_n_embd = L0.wq->ne[0]; + if (derived_n_embd != (int64_t)out.n_embd) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "draft GGUF shape mismatch: blk.0.attn_q.weight->ne[0]=%lld != n_embd=%d", + (long long)derived_n_embd, out.n_embd); + set_last_error(buf); + return false; + } + // fc: [n_target_layers*n_embd, n_embd] — check fc->ne[0] against derived expectation + if (out.n_target_layers > 0) { + const int64_t derived_fc_in = out.fc->ne[0]; + const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd; + if (derived_fc_in != expected_fc_in) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "draft GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld " + "!= n_target_layers*n_embd=%d*%d=%lld", + (long long)derived_fc_in, + out.n_target_layers, out.n_embd, (long long)expected_fc_in); + set_last_error(buf); + return false; + } + } + } + char summary[192]; std::snprintf(summary, sizeof(summary), "draft GGUF loaded: %" PRId64 " tensors, %.2f GiB on GPU", diff --git a/server/src/qwen35/gguf_target_loader.cpp 
b/server/src/qwen35/gguf_target_loader.cpp index 116ddafc0..9085c1a1b 100644 --- a/server/src/qwen35/gguf_target_loader.cpp +++ b/server/src/qwen35/gguf_target_loader.cpp @@ -738,6 +738,51 @@ bool load_target_gguf_partial(const std::string & path, gguf_free(gctx); + // Structural defense: derive scalar dims from weight tensor shapes and + // assert against GGUF-declared metadata. Catches stale/zero dw_ or w_ + // scalars before they silently corrupt graph-build (Bug #2 class). + // Uses the first full-attention layer (il = fai-1) because deltanet + // layers don't carry wq/wk. wq packs Q+gate so ne[1] = n_head*kl*2. + { + const int fa_il = out.full_attention_interval - 1; // first full-attn layer + const TargetLayer & fa = out.layers[(size_t)fa_il]; + if (fa.wq && fa.wk) { + const int64_t derived_q_dim = fa.wq->ne[1]; // n_head * head_dim * 2 + const int64_t derived_kv_dim = fa.wk->ne[1]; // n_head_kv * head_dim + const int64_t expected_q_dim = (int64_t)out.n_head * out.n_embd_head_k * 2; + const int64_t expected_kv_dim = (int64_t)out.n_head_kv * out.n_embd_head_k; + if (derived_q_dim != expected_q_dim) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GGUF shape mismatch: blk.%d.attn_q.weight->ne[1]=%lld " + "!= n_head*head_dim*2=%d*%d*2=%lld", + fa_il, (long long)derived_q_dim, + out.n_head, out.n_embd_head_k, (long long)expected_q_dim); + set_last_error(buf); + return false; + } + if (derived_kv_dim != expected_kv_dim) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GGUF shape mismatch: blk.%d.attn_k.weight->ne[1]=%lld " + "!= n_head_kv*head_dim=%d*%d=%lld", + fa_il, (long long)derived_kv_dim, + out.n_head_kv, out.n_embd_head_k, (long long)expected_kv_dim); + set_last_error(buf); + return false; + } + const int64_t derived_n_embd = fa.wq->ne[0]; // input dim = n_embd + if (derived_n_embd != (int64_t)out.n_embd) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GGUF shape mismatch: blk.%d.attn_q.weight->ne[0]=%lld != n_embd=%d", + fa_il, (long 
long)derived_n_embd, out.n_embd); + set_last_error(buf); + return false; + } + } + } + if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) { set_last_error("token_embd.weight not found or invalid type"); return false; From 699bb5c925ad6ceed353d9cb0cf4a131d7b03411 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 09:24:19 +0200 Subject: [PATCH 06/16] feat(pflash): adaptive composition via per-request fa_window override When pflash compresses, set gen_req.fa_window_override = effective_prompt + 256 so spec-decode verify sees the entire compressed prompt. Pflash already paid compute to pick which tokens matter; verify never throws any of them away. When the override would exceed 2 * cfg_.fa_window (spec-decode's drafter cost stops earning its tok/J), the C2 gate in qwen35_backend's generate() falls back to AR (fa_window=0, full attention). AR sees every kept token at every context; we choose mechanism, not visibility. Zero new CLI flags. --draft remains the only knob for composition; all per-request adaptation is internal. --- server/src/common/model_backend.h | 4 ++ server/src/qwen35/qwen35_backend.cpp | 61 ++++++++++++++++++------ server/src/qwen35/qwen35_dflash_target.h | 5 ++ server/src/server/http_server.cpp | 16 +++++++ 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h index 182b50030..3af273ace 100644 --- a/server/src/common/model_backend.h +++ b/server/src/common/model_backend.h @@ -100,6 +100,10 @@ struct GenerateRequest { const std::vector * hint_tokens = nullptr; // Optional thinking-budget hook — see BudgetHook docs above. BudgetHook budget_hook; + // Per-request override for target spec-decode verify fa_window. Set by + // http_server when pflash compresses, so verify sees the entire compressed + // prompt (not just the last cfg_.fa_window positions). Zero = no override. 
+ int fa_window_override = 0; }; struct GenerateResult { diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp index be83db452..471e8af07 100644 --- a/server/src/qwen35/qwen35_backend.cpp +++ b/server/src/qwen35/qwen35_backend.cpp @@ -553,6 +553,16 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req, sampler_rng_.seed(sampler_.seed); } + // Design 1: apply the per-request verify fa_window override (set by + // http_server when pflash compresses). cfg_.fa_window itself is never + // modified: every generate() re-sets the target's window here, so an override + // does not leak into later requests. Calling dflash_target() lazily constructs it on first use. + const int eff_fa_window = + (req.fa_window_override > 0) ? req.fa_window_override : cfg_.fa_window; + if (auto * dt = dynamic_cast<Qwen35DFlashTarget *>(dflash_target())) { + dt->set_fa_window(eff_fa_window); + } + // Zero delta-net recurrent state (SSM + conv) so a fresh prompt doesn't // inherit stale hidden state from the previous request. KV cache is // position-addressed and will be overwritten during prefill. @@ -568,22 +578,45 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req, auto t_prefill_end = std::chrono::steady_clock::now(); result.prefill_s = std::chrono::duration(t_prefill_end - t_prefill_start).count(); - // Decode (speculative) + // C2 adaptive-mechanism gate: pflash's override always reflects the + // FULL compressed-prompt size — we never cap visibility (would waste + // pflash's anchor-selection work). The gate here decides whether + // spec-decode's verify arithmetic still earns its drafter cost at + // that window size. Threshold 2× cfg_.fa_window: + // override <= 4096 (32K → ~1.5K, 64K → ~3K compressed) → spec-decode + // override > 4096 (128K → ~6.4K compressed) → AR fallback + // AR uses fa_window=0 (full attention) so every kept token is visible + // regardless of which path runs. We choose mechanism, not visibility.
+ const bool fa_within_budget = + (req.fa_window_override == 0) + || (eff_fa_window <= 2 * cfg_.fa_window); + + // Decode (speculative or AR) if (req.n_gen > 0) { auto t_decode_start = std::chrono::steady_clock::now(); - // Pass the budget hook into spec-decode. When token count nears - // the budget edge, do_spec_decode breaks out and tails off via - // AR with the hook still active — force-close fires correctly - // without sacrificing spec-decode throughput for the bulk of - // generation. Most requests never hit the tail because the - // model closes naturally well before the budget edge. - if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, - result.accept_rate, result.spec_decode_ran, - req.hint_tokens, &req.budget_hook, - &result.budget_forced_close, - &result.degenerate_decode_close)) { - result.error = "decode"; - return result; + if (!fa_within_budget) { + // AR fallback: fa_window override too wide for spec decode. + bool ok = do_ar_decode(committed, req.n_gen, result.tokens, out_io, + req.budget_hook, + &result.budget_forced_close, + &result.degenerate_decode_close); + out_io.emit(-1); + if (!ok) { result.error = "decode"; return result; } + } else { + // Pass the budget hook into spec-decode. When token count nears + // the budget edge, do_spec_decode breaks out and tails off via + // AR with the hook still active — force-close fires correctly + // without sacrificing spec-decode throughput for the bulk of + // generation. Most requests never hit the tail because the + // model closes naturally well before the budget edge. 
+ if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io, + result.accept_rate, result.spec_decode_ran, + req.hint_tokens, &req.budget_hook, + &result.budget_forced_close, + &result.degenerate_decode_close)) { + result.error = "decode"; + return result; + } } result.decode_s = std::chrono::duration( std::chrono::steady_clock::now() - t_decode_start).count(); diff --git a/server/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h index 6a72e48b5..69e134f1c 100644 --- a/server/src/qwen35/qwen35_dflash_target.h +++ b/server/src/qwen35/qwen35_dflash_target.h @@ -53,6 +53,11 @@ class Qwen35DFlashTarget : public DFlashTarget { int mask_token_id() const override; const std::vector & capture_layer_ids() const override; + // Per-call override for the verify-time flash-attention window. Used by + // do_spec_decode to widen the window when pflash compression has shrunk + // the prompt — see GenerateRequest.fa_window_override. + void set_fa_window(int fa) { fa_window_ = fa; } + private: TargetWeights & w_; TargetCache & cache_; diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index ab37805bf..465758b51 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -1267,6 +1267,22 @@ void HttpServer::worker_loop() { gen_req.sampler = req.sampler; gen_req.do_sample = req.sampler.needs_logit_processing(); gen_req.stream = false; // we handle streaming via on_token callback + // Design 1: when pflash compresses, widen the target spec-decode verify + // fa_window to cover the entire compressed prompt. Otherwise verify sees + // only the last cfg_.fa_window positions of the compressed sequence, + // losing needle context and truncating the answer at long ctx. + // + // Principle: pflash already paid compute to pick which tokens matter. + // Don't throw any of them away in verify by capping fa_window — that + // would waste pflash's work. 
Always request enough verify window to + // see the entire compressed prompt. The C2 gate in qwen35_backend.cpp + // then decides per request whether spec-decode arithmetic still beats + // AR at this window size; if not, AR fallback kicks in (which uses + // fa_window=0 → full attention over the compressed prompt). Either + // path sees every kept token. We choose mechanism, not visibility. + if (pflash_compressed) { + gen_req.fa_window_override = (int)effective_prompt.size() + 256; + } // Level 2 force-close: when thinking is opted in, the server is // configured with a hard-limit reply budget, and we resolved the From a676161a8c8d01bf30eccdf449e6c7e5027e7899 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 13:47:10 +0200 Subject: [PATCH 07/16] feat(pflash): PFLASH_*/DFLASH_* env-var dual aliasing + transitive cascade default-on Adds backwards-compat fallback wrappers for 6 cascade env vars in both standard and bandit code paths, so harness scripts using either spelling work against this binary. Emits a WARN to stderr each time a legacy DFLASH_* spelling is honored (the check runs on every score-and-compress call and is not de-duplicated). Also flips the default for `use_transitive` from `false` to `true` because the gated rare-token bridge improves multi-hop F1 with zero downside in the cascade-already-firing case. Two further cascade defaults change in the same diff: `cascade_min_anchor_frac` goes from 0.25 to 0.0 (gate off, cascade always runs) and `max_forced_ratio` goes from 1.3 to 10.0.
--- server/src/qwen3/qwen3_drafter.cpp | 103 +++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 14 deletions(-) diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp index 2fbf5850f..fb7cf7cf0 100644 --- a/server/src/qwen3/qwen3_drafter.cpp +++ b/server/src/qwen3/qwen3_drafter.cpp @@ -584,11 +584,35 @@ static std::vector qwen35_score_and_compress( else if (n_chunks < 2048) max_anchor_hits = 16; else max_anchor_hits = 32; } - const int anchor_ngram = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", 4); - const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", 2); - - const float cascade_min_anchor_frac = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f); - const float max_forced_ratio = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", 1.3f); + const int anchor_ngram = [&]{ + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM set without PFLASH_COMPRESS_ANCHOR_NGRAM; honoring legacy name (deprecated)\n"); return lv; } + return 4; + }(); + const int rare_token_max_freq = [&]{ + const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1); + const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ set without PFLASH_COMPRESS_RARE_MAX_FREQ; honoring legacy name (deprecated)\n"); return lv; } + return 2; + }(); + + const float cascade_min_anchor_frac = [&]{ + const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); + const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); + if (nv >= 0.0f) return nv; + if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC set without PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC; honoring legacy name (deprecated)\n"); return lv; } + return 0.0f; // gate 
off by default: always run cascade + }(); + const float max_forced_ratio = [&]{ + const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); + const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); + if (nv >= 0.0f) return nv; + if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO set without PFLASH_COMPRESS_MAX_FORCED_RATIO; honoring legacy name (deprecated)\n"); return lv; } + return 10.0f; // generous cap: allows bridge to rescue multi-hop (original: ~6x n_keep) + }(); const int q0 = std::max(0, S - query_tokens); std::vector query_pool(ids.begin() + q0, ids.end()); @@ -600,8 +624,20 @@ static std::vector qwen35_score_and_compress( anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); anchor_cfg.max_forced_count = (int)(max_forced_ratio * n_keep); - const bool use_transitive = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0; - const int max_iters = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3); + const bool use_transitive = [&]{ + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); + if (nv >= 0) return nv != 0; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE set without PFLASH_COMPRESS_ANCHOR_TRANSITIVE; honoring legacy name (deprecated)\n"); return lv != 0; } + return true; // on by default: gated rare-token bridge improves multi-hop F1 + }(); + const int max_iters = [&]{ + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS set without PFLASH_COMPRESS_ANCHOR_MAX_ITERS; honoring legacy name (deprecated)\n"); return lv; } + return 3; + }(); if (use_transitive) { dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, anchor_cfg, max_iters, forced); @@ -802,11 +838,35 @@ std::vector 
drafter_score_and_compress( else if (n_chunks < 2048) max_anchor_hits = 16; else max_anchor_hits = 32; } - const int anchor_ngram = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", 4); - const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", 2); - - const float cascade_min_anchor_frac = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f); - const float max_forced_ratio = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", 1.3f); + const int anchor_ngram = [&]{ + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM set without PFLASH_COMPRESS_ANCHOR_NGRAM; honoring legacy name (deprecated)\n"); return lv; } + return 4; + }(); + const int rare_token_max_freq = [&]{ + const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1); + const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ set without PFLASH_COMPRESS_RARE_MAX_FREQ; honoring legacy name (deprecated)\n"); return lv; } + return 2; + }(); + + const float cascade_min_anchor_frac = [&]{ + const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); + const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); + if (nv >= 0.0f) return nv; + if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC set without PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC; honoring legacy name (deprecated)\n"); return lv; } + return 0.0f; // gate off by default: always run cascade + }(); + const float max_forced_ratio = [&]{ + const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); + const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); + if (nv >= 0.0f) return nv; + if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO set without 
PFLASH_COMPRESS_MAX_FORCED_RATIO; honoring legacy name (deprecated)\n"); return lv; } + return 10.0f; // generous cap: allows bridge to rescue multi-hop (original: ~6x n_keep) + }(); std::vector selected_mask((size_t)n_chunks, 0); std::vector forced((size_t)n_chunks, 0); @@ -821,9 +881,24 @@ std::vector drafter_score_and_compress( rare_token_max_freq}; anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); anchor_cfg.max_forced_count = (int)(max_forced_ratio * n_keep); + std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d ratio=%.2f\n", + n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count, max_forced_ratio); + std::fflush(stderr); - const bool use_transitive = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0; - const int max_iters = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3); + const bool use_transitive = [&]{ + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); + if (nv >= 0) return nv != 0; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE set without PFLASH_COMPRESS_ANCHOR_TRANSITIVE; honoring legacy name (deprecated)\n"); return lv != 0; } + return true; // on by default: gated rare-token bridge improves multi-hop F1 + }(); + const int max_iters = [&]{ + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS set without PFLASH_COMPRESS_ANCHOR_MAX_ITERS; honoring legacy name (deprecated)\n"); return lv; } + return 3; + }(); if (use_transitive) { dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, anchor_cfg, max_iters, forced); From 6536b76df933682ff2038f5257215a54ddebda4a Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 16:00:20 
+0200 Subject: [PATCH 08/16] refactor(pflash): extract compress_cfg_from_env, kill qwen35/qwen3 path drift Single helper reads all 10 PFLASH_*/DFLASH_* env vars once. Both qwen35_score_and_compress and drafter_score_and_compress call it. Removes two 70-LOC duplicate env-reading blocks and the duplicated anchor-radius comment. Also removes dead force_chunk_neighborhood (no callers) and collapses the 4-overload load_drafter pyramid to one canonical implementation + 3 thin forwarders. --- server/src/qwen3/qwen3_drafter.cpp | 386 ++++++++++++----------------- 1 file changed, 157 insertions(+), 229 deletions(-) diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp index fb7cf7cf0..4261635f4 100644 --- a/server/src/qwen3/qwen3_drafter.cpp +++ b/server/src/qwen3/qwen3_drafter.cpp @@ -72,11 +72,110 @@ static float env_float(const char * name, float def) { return def; } -static void force_chunk_neighborhood(std::vector & forced, int n_chunks, - int chunk, int radius) { - int lo = std::max(0, chunk - radius); - int hi = std::min(n_chunks - 1, chunk + radius); - for (int c = lo; c <= hi; ++c) forced[(size_t)c] = 1; +// All pflash/dflash compression knobs read from env, derived per-request. +// anchor_radius and max_anchor_hits use an adaptive ladder keyed on n_chunks +// to prevent the 64K NIAH cliff; see docs/pflash-compress-cfg.md. +// Override any ladder value via PFLASH_COMPRESS_* env vars. 
+struct CompressCfg {
+    int query_tokens;                     // DFLASH_COMPRESS_QUERY_TOKENS (default 96)
+    int head_chunks;                      // leading chunks forced in, after budget scaling
+    int tail_chunks;                      // trailing chunks forced in, after budget scaling
+    dflash::qwen3::AnchorScanCfg anchor;  // anchor-scan knobs; chunk_size is not set here, callers assign it
+    bool use_transitive;                  // selects scan_and_force_transitive over scan_and_force
+    int max_iters;                        // iteration cap handed to scan_and_force_transitive
+};
+// Builds a CompressCfg from the environment; n_chunks picks the anchor ladder rung, n_keep bounds the forced head/tail chunks.
+static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) {
+    CompressCfg c{};
+    // Dual-named knobs: the PFLASH_* value wins when set (>= 0), else the legacy DFLASH_* value, else the default.
+    c.query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96);
+
+    // head/tail forced chunks scale so top-K scoring always gets slots; negative env values clamp to 0
+    const int h_raw = std::max(0, env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8));
+    const int t_raw = std::max(0, env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24));
+    c.head_chunks = h_raw;
+    c.tail_chunks = t_raw;
+    if (h_raw + t_raw > 0 && h_raw + t_raw >= n_keep) {  // sum > 0 keeps the division below defined
+        const int budget = std::max(1, n_keep - 1);
+        c.head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw));
+        c.tail_chunks = std::max(0, budget - c.head_chunks);
+    }
+
+    // anchor_radius: adaptive ladder prevents 64K NIAH cliff
+    // (n_chunks <1024: 2, <2048: 4, else 8); override via PFLASH_COMPRESS_ANCHOR_RADIUS
+    {
+        const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1);
+        const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1);
+        if (env_r >= 0) c.anchor.anchor_radius = env_r;
+        else if (legacy_r >= 0) c.anchor.anchor_radius = legacy_r;
+        else if (n_chunks < 1024) c.anchor.anchor_radius = 2;
+        else if (n_chunks < 2048) c.anchor.anchor_radius = 4;
+        else c.anchor.anchor_radius = 8;
+    }
+
+    // max_anchor_hits: same ladder — sparser anchors at long context
+    {
+        const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
+        const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
+        if (env_h >= 0) c.anchor.max_anchor_hits = env_h;
+        else if (legacy_h >= 0) c.anchor.max_anchor_hits = legacy_h;
+        else if (n_chunks < 1024) c.anchor.max_anchor_hits = 8;
+        else if (n_chunks < 2048) c.anchor.max_anchor_hits = 16;
+        else c.anchor.max_anchor_hits = 32;
+    }
+
+    c.anchor.ngram = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM deprecated, use PFLASH_COMPRESS_ANCHOR_NGRAM\n"); return lv; }
+        return 4;
+    }();
+
+    c.anchor.rare_token_max_freq = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1);
+        const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ deprecated, use PFLASH_COMPRESS_RARE_MAX_FREQ\n"); return lv; }
+        return 2;
+    }();
+
+    const float cascade_min_anchor_frac = [&]{
+        const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
+        const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC deprecated, use PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC\n"); return lv; }
+        return 0.0f;
+    }();
+
+    const float max_forced_ratio = [&]{
+        const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
+        const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO deprecated, use PFLASH_COMPRESS_MAX_FORCED_RATIO\n"); return lv; }
+        return 10.0f;
+    }();
+
+    c.anchor.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
+    c.anchor.max_forced_count = (int)(max_forced_ratio * n_keep);
+
+    c.use_transitive = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
+        if (nv >= 0) return nv != 0;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE deprecated, use PFLASH_COMPRESS_ANCHOR_TRANSITIVE\n"); return lv != 0; }
+        return true; // on by default; see docs/anchor-transitive.md
+    }();
+
+    c.max_iters = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS deprecated, use PFLASH_COMPRESS_ANCHOR_MAX_ITERS\n"); return lv; }
+        return 3;
+    }();
+
+    return c;
 }
 
 #if defined(DFLASH27B_BACKEND_HIP)
@@ -128,21 +227,6 @@ const char * drafter_arch_name(DrafterArch arch) {
     return "unknown";
 }
 
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, /*gpu=*/0, out);
-}
-
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  int gpu, DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, DrafterArch::Qwen3_0p6b, gpu, out);
-}
-
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  DrafterArch arch, DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, arch, /*gpu=*/0, out);
-}
-
 bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
                   DrafterArch arch, int gpu, DrafterContext & out) {
     if (gpu < 0) {
@@ -232,6 +316,22 @@ bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
     return true;
 }
 
+// Thin overloads for API compat; all forward to the canonical (gguf_path, gpu_layers, arch, gpu, out) form, whose gpu_layers parameter is unnamed and unused.
+bool load_drafter(const std::string & gguf_path, int gpu_layers, + DrafterContext & out) { + return load_drafter(gguf_path, gpu_layers, DrafterArch::Qwen3_0p6b, /*gpu=*/0, out); +} + +bool load_drafter(const std::string & gguf_path, int gpu_layers, + int gpu, DrafterContext & out) { + return load_drafter(gguf_path, gpu_layers, DrafterArch::Qwen3_0p6b, gpu, out); +} + +bool load_drafter(const std::string & gguf_path, int gpu_layers, + DrafterArch arch, DrafterContext & out) { + return load_drafter(gguf_path, gpu_layers, arch, /*gpu=*/0, out); +} + void free_drafter(DrafterContext & ctx) { free_drafter_weights(ctx); if (ctx.backend) { @@ -513,24 +613,23 @@ static std::vector qwen35_score_and_compress( const int n_chunks = (S + chunk_size - 1) / chunk_size; const int n_keep = std::max(1, (int)((float)n_chunks * keep_ratio)); - - std::vector smooth_score = score; - // Caller pool_kernel takes precedence; if zero/negative, fall back to env or 5. + const int pk = (pool_kernel > 0) ? pool_kernel : std::max(3, env_int("DFLASH_COMPRESS_POOL_KERNEL", 5)); - std::vector smoothed((size_t)S, 0.0f); - int half = pk / 2; - for (int j = 0; j < S; ++j) { - int lo = std::max(0, j - half); - int hi = std::min(S - 1, j + half); - float s = 0.0f; - int n = 0; - for (int k = lo; k <= hi; ++k) { s += score[(size_t)k]; ++n; } - smoothed[(size_t)j] = (n > 0) ? (s / (float)n) : 0.0f; + std::vector smooth_score((size_t)S, 0.0f); + { + int half = pk / 2; + for (int j = 0; j < S; ++j) { + int lo = std::max(0, j - half); + int hi = std::min(S - 1, j + half); + float s = 0.0f; + int n = 0; + for (int k = lo; k <= hi; ++k) { s += score[(size_t)k]; ++n; } + smooth_score[(size_t)j] = (n > 0) ? 
(s / (float)n) : 0.0f; + } } - smooth_score.swap(smoothed); - + std::vector> chunk_means; for (int c = 0; c < n_chunks; ++c) { int lo = c * chunk_size, hi = std::min(S, lo + chunk_size); @@ -539,108 +638,24 @@ static std::vector qwen35_score_and_compress( chunk_means.push_back({s / std::max(1, hi - lo), c}); } std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; }); - - std::vector selected((size_t)n_chunks, 0); - int count = 0; - // Scale head/tail forced chunks so they don't crowd out top-K scoring. - { - const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8); - const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24); - int h_n = h_raw, t_n = t_raw; - if (h_n + t_n >= n_keep) { - const int budget = std::max(1, n_keep - 1); - h_n = std::max(0, h_raw * budget / (h_raw + t_raw)); - t_n = std::max(0, budget - h_n); - } - for (int c = 0; c < std::min(n_chunks, h_n); ++c) { selected[(size_t)c] = 1; ++count; } - for (int c = std::max(0, n_chunks - t_n); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; } - } - const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); - // Anchor radius scales with n_chunks to prevent the 64K NIAH cliff: - // at long context the needle text is more likely to straddle multiple - // chunks, and a fixed radius=2 window (5 chunks ~160 tokens) loses the - // back half of the needle. Adaptive: <32K = 2, 32-64K = 4, >=64K = 8. - // Override via PFLASH_COMPRESS_ANCHOR_RADIUS env var (>= 0 wins). 
- int anchor_radius; - { - const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1); - const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1); - if (env_r >= 0) anchor_radius = env_r; - else if (legacy_r >= 0) anchor_radius = legacy_r; - else if (n_chunks < 1024) anchor_radius = 2; - else if (n_chunks < 2048) anchor_radius = 4; - else anchor_radius = 8; - } - // max_anchor_hits scales the same way: at long context, distinctive - // anchors are sparser, so we can afford to keep more hits per qi. - int max_anchor_hits; - { - const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); - const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); - if (env_h >= 0) max_anchor_hits = env_h; - else if (legacy_h >= 0) max_anchor_hits = legacy_h; - else if (n_chunks < 1024) max_anchor_hits = 8; - else if (n_chunks < 2048) max_anchor_hits = 16; - else max_anchor_hits = 32; - } - const int anchor_ngram = [&]{ - const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1); - const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1); - if (nv >= 0) return nv; - if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM set without PFLASH_COMPRESS_ANCHOR_NGRAM; honoring legacy name (deprecated)\n"); return lv; } - return 4; - }(); - const int rare_token_max_freq = [&]{ - const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1); - const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1); - if (nv >= 0) return nv; - if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ set without PFLASH_COMPRESS_RARE_MAX_FREQ; honoring legacy name (deprecated)\n"); return lv; } - return 2; - }(); + const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep); - const float cascade_min_anchor_frac = [&]{ - const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); - const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); - if (nv >= 0.0f) return nv; - if (lv >= 0.0f) { fprintf(stderr, "[WARN] 
DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC set without PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC; honoring legacy name (deprecated)\n"); return lv; } - return 0.0f; // gate off by default: always run cascade - }(); - const float max_forced_ratio = [&]{ - const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); - const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); - if (nv >= 0.0f) return nv; - if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO set without PFLASH_COMPRESS_MAX_FORCED_RATIO; honoring legacy name (deprecated)\n"); return lv; } - return 10.0f; // generous cap: allows bridge to rescue multi-hop (original: ~6x n_keep) - }(); + std::vector selected((size_t)n_chunks, 0); + int count = 0; + for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) { selected[(size_t)c] = 1; ++count; } + for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; } - const int q0 = std::max(0, S - query_tokens); + const int q0 = std::max(0, S - cfg.query_tokens); std::vector query_pool(ids.begin() + q0, ids.end()); std::vector forced((size_t)n_chunks, 0); - dflash::qwen3::AnchorScanCfg anchor_cfg{chunk_size, anchor_radius, - max_anchor_hits, anchor_ngram, - rare_token_max_freq}; - anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); - anchor_cfg.max_forced_count = (int)(max_forced_ratio * n_keep); + dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor; + anchor_cfg.chunk_size = chunk_size; - const bool use_transitive = [&]{ - const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); - const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); - if (nv >= 0) return nv != 0; - if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE set without PFLASH_COMPRESS_ANCHOR_TRANSITIVE; honoring legacy name (deprecated)\n"); return lv != 0; } - return true; // on by default: gated rare-token bridge 
improves multi-hop F1 - }(); - const int max_iters = [&]{ - const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); - const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); - if (nv >= 0) return nv; - if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS set without PFLASH_COMPRESS_ANCHOR_MAX_ITERS; honoring legacy name (deprecated)\n"); return lv; } - return 3; - }(); - if (use_transitive) { + if (cfg.use_transitive) { dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, - anchor_cfg, max_iters, forced); + anchor_cfg, cfg.max_iters, forced); } else { dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced); } @@ -652,16 +667,14 @@ static std::vector qwen35_score_and_compress( } } - // Global aggregation tasks often depend on repeated rare tokens that do - // not appear in the final query. Preserve high-frequency-but-not-filler - // token chunks before filling with model-score top-K. + // Global aggregation tasks: preserve high-frequency-but-not-filler token chunks. 
const int repeat_min = env_int("DFLASH_COMPRESS_REPEAT_MIN", 4); const int repeat_max = env_int("DFLASH_COMPRESS_REPEAT_MAX", 32); const int repeat_limit = env_int("DFLASH_COMPRESS_REPEAT_CHUNKS", n_keep); if (repeat_min > 1 && count < repeat_limit) { std::unordered_map freq; freq.reserve((size_t)S); - const int repeat_scan_end = std::max(0, S - query_tokens); + const int repeat_scan_end = std::max(0, S - cfg.query_tokens); for (int j = 0; j < repeat_scan_end; ++j) { ++freq[ids[(size_t)j]]; } @@ -689,12 +702,12 @@ static std::vector qwen35_score_and_compress( } } } - + for (auto [_, c] : chunk_means) { if (count >= n_keep) break; if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; } } - + std::vector out_ids; std::vector selected_chunks; for (int c = 0; c < n_chunks; ++c) { @@ -798,110 +811,25 @@ std::vector drafter_score_and_compress( std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; }); - // Retrieval tasks often repeat a rare key in the final query and in the - // needle span. Exact scores alone can keep the query while dropping the - // neighboring answer chunk, so force a small token-only anchor neighborhood. - // Head/tail forced chunks scale with n_keep so top-K scoring always gets slots. - const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8); - const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24); - int head_chunks = h_raw, tail_chunks = t_raw; - if (head_chunks + tail_chunks >= n_keep) { - const int budget = std::max(1, n_keep - 1); - head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw)); - tail_chunks = std::max(0, budget - head_chunks); - } - const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); - // Anchor radius scales with n_chunks to prevent the 64K NIAH cliff: - // at long context the needle text is more likely to straddle multiple - // chunks, and a fixed radius=2 window (5 chunks ~160 tokens) loses the - // back half of the needle. 
Adaptive: <32K = 2, 32-64K = 4, >=64K = 8. - // Override via PFLASH_COMPRESS_ANCHOR_RADIUS env var (>= 0 wins). - int anchor_radius; - { - const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1); - const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1); - if (env_r >= 0) anchor_radius = env_r; - else if (legacy_r >= 0) anchor_radius = legacy_r; - else if (n_chunks < 1024) anchor_radius = 2; - else if (n_chunks < 2048) anchor_radius = 4; - else anchor_radius = 8; - } - // max_anchor_hits scales the same way: at long context, distinctive - // anchors are sparser, so we can afford to keep more hits per qi. - int max_anchor_hits; - { - const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); - const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); - if (env_h >= 0) max_anchor_hits = env_h; - else if (legacy_h >= 0) max_anchor_hits = legacy_h; - else if (n_chunks < 1024) max_anchor_hits = 8; - else if (n_chunks < 2048) max_anchor_hits = 16; - else max_anchor_hits = 32; - } - const int anchor_ngram = [&]{ - const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1); - const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1); - if (nv >= 0) return nv; - if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM set without PFLASH_COMPRESS_ANCHOR_NGRAM; honoring legacy name (deprecated)\n"); return lv; } - return 4; - }(); - const int rare_token_max_freq = [&]{ - const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1); - const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1); - if (nv >= 0) return nv; - if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ set without PFLASH_COMPRESS_RARE_MAX_FREQ; honoring legacy name (deprecated)\n"); return lv; } - return 2; - }(); - - const float cascade_min_anchor_frac = [&]{ - const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); - const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); - if (nv >= 0.0f) return 
nv; - if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC set without PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC; honoring legacy name (deprecated)\n"); return lv; } - return 0.0f; // gate off by default: always run cascade - }(); - const float max_forced_ratio = [&]{ - const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); - const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); - if (nv >= 0.0f) return nv; - if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO set without PFLASH_COMPRESS_MAX_FORCED_RATIO; honoring legacy name (deprecated)\n"); return lv; } - return 10.0f; // generous cap: allows bridge to rescue multi-hop (original: ~6x n_keep) - }(); + const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep); std::vector selected_mask((size_t)n_chunks, 0); std::vector forced((size_t)n_chunks, 0); - for (int c = 0; c < std::min(n_chunks, head_chunks); ++c) forced[(size_t)c] = 1; - for (int c = std::max(0, n_chunks - tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1; + for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) forced[(size_t)c] = 1; + for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1; - const int q0 = std::max(0, S - query_tokens); + const int q0 = std::max(0, S - cfg.query_tokens); { std::vector query_pool(ids.begin() + q0, ids.end()); - dflash::qwen3::AnchorScanCfg anchor_cfg{chunk_size, anchor_radius, - max_anchor_hits, anchor_ngram, - rare_token_max_freq}; - anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); - anchor_cfg.max_forced_count = (int)(max_forced_ratio * n_keep); - std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d ratio=%.2f\n", - n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count, max_forced_ratio); + dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor; + anchor_cfg.chunk_size = chunk_size; + std::fprintf(stderr, 
"[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d\n", + n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count); std::fflush(stderr); - const bool use_transitive = [&]{ - const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); - const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); - if (nv >= 0) return nv != 0; - if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE set without PFLASH_COMPRESS_ANCHOR_TRANSITIVE; honoring legacy name (deprecated)\n"); return lv != 0; } - return true; // on by default: gated rare-token bridge improves multi-hop F1 - }(); - const int max_iters = [&]{ - const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); - const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); - if (nv >= 0) return nv; - if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS set without PFLASH_COMPRESS_ANCHOR_MAX_ITERS; honoring legacy name (deprecated)\n"); return lv; } - return 3; - }(); - if (use_transitive) { + if (cfg.use_transitive) { dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, - anchor_cfg, max_iters, forced); + anchor_cfg, cfg.max_iters, forced); } else { dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced); } From b7dd89b0dbb69254d3066f4db2091ea10d0e073d Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 16:04:39 +0200 Subject: [PATCH 09/16] chore(pflash): move narrative comments to docs/, trim mega-blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - qwen3_graph.cpp: collapse 18-line alg-note, trim VRAM prose (3 blocks), remove early_exit_n alias (inline early_exit_pre at call site) - qwen35_backend.cpp: C2 gate 9-line → 2-line + docs ref; do_ar_decode budget-hook 15-line → 4-line + docs ref - http_server.cpp: Design 1 rationale 13-line → 2-line + docs ref - model_backend.h: BudgetHook 23-line essay → 3-line + docs 
ref - gguf_target_loader.cpp: 4-line prose tail → 1-line - .gitignore: ignore *.git-head / *.pre-pflash-rename workdir artifacts - docs/: pflash-compress-cfg.md, pflash-adaptive-composition.md, anchor-transitive.md (consolidated rationale) --- .gitignore | 4 +++ docs/anchor-transitive.md | 15 ++++++++ docs/pflash-adaptive-composition.md | 18 ++++++++++ docs/pflash-compress-cfg.md | 46 ++++++++++++++++++++++++ server/src/common/model_backend.h | 31 ++-------------- server/src/qwen3/qwen3_graph.cpp | 42 +++++----------------- server/src/qwen35/gguf_target_loader.cpp | 5 +-- server/src/qwen35/qwen35_backend.cpp | 33 ++++------------- server/src/server/http_server.cpp | 15 ++------ 9 files changed, 103 insertions(+), 106 deletions(-) create mode 100644 docs/anchor-transitive.md create mode 100644 docs/pflash-adaptive-composition.md create mode 100644 docs/pflash-compress-cfg.md diff --git a/.gitignore b/.gitignore index b400bb6de..63ba50a0b 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,7 @@ fix-plan.md # Harness test artifacts .harness-work/ health + +# Workdir editor backup suffixes +*.git-head +*.pre-pflash-rename diff --git a/docs/anchor-transitive.md b/docs/anchor-transitive.md new file mode 100644 index 000000000..6f1b02f89 --- /dev/null +++ b/docs/anchor-transitive.md @@ -0,0 +1,15 @@ +# anchor transitive scan + +`scan_and_force_transitive` (anchor_scan.cpp) expands the query pool with +tokens from newly-forced chunks and re-runs `scan_and_force` until fixed +point or max_iters (default 3) is reached. + +Improves multi-hop retrieval: enables discovery of intermediate context +chunks whose tokens do not appear in the original query but connect +query-to-needle via shared rare tokens. + +Empirical result: F1=0.628 on LongBench HotpotQA at ee7 + keep=0.15 +(vs uncompressed F1=0.697). This is the ceiling for attention-score-based +prefill compression on this task; see bench/2026-05-25_longbench_hotpotqa/. + +On by default. 
Disable via PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0.
diff --git a/docs/pflash-adaptive-composition.md b/docs/pflash-adaptive-composition.md
new file mode 100644
index 000000000..1851dee1e
--- /dev/null
+++ b/docs/pflash-adaptive-composition.md
@@ -0,0 +1,18 @@
+# pflash adaptive composition (Design 1)
+
+When pflash compresses a prompt, the target spec-decode verify window must
+cover the entire compressed sequence — otherwise verify sees only the last
+fa_window positions and loses needle context.
+
+`http_server.cpp`: when pflash_compressed, sets
+`req.fa_window_override = effective_prompt.size() + 256`.
+This never caps visibility; pflash already paid compute to pick which tokens
+matter, so every kept token must be visible in verify.
+
+`qwen35_backend.cpp` C2 gate: after prefill, checks whether spec-decode
+arithmetic still earns its drafter cost at the override window size.
+
+- override <= 2 * cfg_.fa_window → spec-decode
+- override > 2 * cfg_.fa_window → AR fallback (fa_window=0, full attention)
+
+Both paths see every kept token. The gate chooses mechanism, not visibility.
diff --git a/docs/pflash-compress-cfg.md b/docs/pflash-compress-cfg.md
new file mode 100644
index 000000000..5755e3142
--- /dev/null
+++ b/docs/pflash-compress-cfg.md
@@ -0,0 +1,46 @@
+# pflash compression knobs
+
+The anchor, head/tail and query-token knobs are read on every call to `compress_cfg_from_env(n_chunks, n_keep)` in qwen3_drafter.cpp.
+DFLASH_COMPRESS_POOL_KERNEL and DFLASH_COMPRESS_REPEAT_* are read outside it (e.g. directly in `qwen35_score_and_compress`).
+
+## anchor_radius adaptive ladder
+
+Prevents the 64K NIAH cliff: at long context the needle text is more likely
+to straddle multiple chunks, and a fixed radius=2 window (5 chunks / ~160
+tokens) loses the back half of the needle.
+
+Default ladder (override via PFLASH_COMPRESS_ANCHOR_RADIUS):
+
+| n_chunks   | anchor_radius |
+|------------|---------------|
+| < 1024     | 2             |
+| 1024-2047  | 4             |
+| >= 2048    | 8             |
+
+## max_anchor_hits adaptive ladder
+
+Same breakpoints as anchor_radius.
At long context anchors are sparser, so +more hits per query token are affordable. + +| n_chunks | max_anchor_hits | +|------------|-----------------| +| < 1024 | 8 | +| 1024-2047 | 16 | +| >= 2048 | 32 | + +## anchor_transitive + +On by default. Gated rare-token bridge expands the query pool with tokens +from newly-forced chunks and re-runs anchor scan to fixed point. +Improves multi-hop F1 on LongBench HotpotQA (empirically; F1=0.628 ceiling +at ee7+anchor-transitive on RTX 3090 — see bench/2026-05-25_longbench_hotpotqa/). +Control via PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0 to disable. + +## head/tail chunk forcing + +Head and tail chunks are force-included before top-K scoring fills the +remainder. The counts scale with n_keep so top-K always gets at least one +slot even when head_raw + tail_raw >= n_keep. + +Defaults: head=8, tail=24 (override via DFLASH_COMPRESS_HEAD_CHUNKS / +DFLASH_COMPRESS_TAIL_CHUNKS). diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h index 3af273ace..f2a863418 100644 --- a/server/src/common/model_backend.h +++ b/server/src/common/model_backend.h @@ -46,35 +46,10 @@ struct DaemonIO { // ─── Generate request/result ──────────────────────────────────────────── -// Thinking-budget force-close hook. Mirrors antirez/ds4 ds4_eval.c's -// hard_limit_reply_budget semantics: when the budget remaining (n_gen -// minus tokens committed so far) falls to hard_limit_remaining, the -// next sampled tokens get overridden with close_token_ids in order, -// giving the model the remaining budget to write a visible answer -// after the injected close-tag sequence. -// -// Single vs multi-token close: -// Qwen3.6: is one added_token (id 248069). close_token_ids -// has size 1. One override + budget_close_injected=true. -// DeepSeek/laguna: tokenizes to 3 ordinary tokens -// ([1718, 37947, 32] for DS-V3). close_token_ids has -// size 3. Three consecutive overrides, then resume. 
-// -// This is "Level 2" of our thinking-budget migration: in-process -// mid-stream force-close, KV-continuous. Beats Level 1's phase-2 -// reprompt because the model never sees a fresh prefill — its KV -// state continues naturally after the injected close. -// -// Current implementation: AR-decode only. When budget_hook is set, -// backends MAY route generation through their AR path (skipping spec -// decode) — the perf trade-off is acceptable since this only kicks in -// for thinking-enabled requests. Spec-decode integration is a follow-up. +// Thinking-budget force-close hook; see docs/specs/thinking-budget.md. +// When (n_gen - committed) == hard_limit_remaining, overrides sampled +// tokens with close_token_ids (AR path only). Empty = disabled. struct BudgetHook { - // Multi-token close sequence injected when `(n_gen - committed)` - // drops to `hard_limit_remaining`. For Qwen3.x this is the - // canonical "Considering the limited time..." summarize-and-stop - // lead-in (tokenized at server startup); for non-qwen arches it's - // a single close-tag token. Empty = hook disabled. std::vector close_token_ids; int hard_limit_remaining = 0; }; diff --git a/server/src/qwen3/qwen3_graph.cpp b/server/src/qwen3/qwen3_graph.cpp index 858bcd75e..c2715a356 100644 --- a/server/src/qwen3/qwen3_graph.cpp +++ b/server/src/qwen3/qwen3_graph.cpp @@ -5,23 +5,10 @@ // buffers. Sliding-window flash-attention via ggml-cuda's tensor-core // `flash_attn_ext` keeps attention cost linear in S. // -// **Algorithmic note vs blog**: -// The blog stack is Liu Q-hook tail scoring + FlashPrefill block-sparse FA. -// The Liu Q-hook is implemented with a NoPE fix: by default (DFLASH_FP_NOPE_TAIL=1) -// the tail score uses pre-RoPE K/Q, removing the RoPE distance decay that -// buries early-position needle chunks and was causing NIAH failures. -// Set DFLASH_FP_NOPE_TAIL=0 to revert to post-RoPE scoring. 
The block-sparse FA is replaced -// with a sliding-window approximation here because (a) ggml-cuda's -// `flash_attn_ext` already gives tensor-core speed inside the ubatch -// graph, and (b) our own block-sparse CUDA kernel needs a tensor-core -// rewrite (mma.sync.aligned) to actually beat ggml's FA — see -// `src/flashprefill_kernels.cu` for the (slow) scalar reference path. -// At S=140K with W=512 sliding window the NIAH magic key still propagates -// through 28 layers and is recovered in the kept tokens, so this -// approximation passes the actual e2e correctness check the user cares -// about. The block-sparse FA upgrade remains the next deliverable for -// "match the article algorithmically", but is functionally equivalent -// for the deployed perf budget today. +// Tail score uses pre-RoPE K/Q (DFLASH_FP_NOPE_TAIL=1 default) to remove +// distance decay that buries early-position needle chunks (NIAH fix). +// Block-sparse FA replaced by sliding-window via ggml-cuda flash_attn_ext; +// BSA upgrade tracked in flashprefill_kernels.cu. // // Memory at S=140K, B=1, H=16, Hk=8, D=128, hidden=1024, ff=3072: // weights ~1.5 GB @@ -250,10 +237,8 @@ bool forward_qwen3_drafter_model( } running_max.assign((size_t)n_lookahead * S, -INFINITY); - // Compute score_layer_start early so we can avoid allocating K_norope/Q_norope - // for layers that will never be used in scoring. At S=128K the full K_norope - // allocation is ~5.6 GB (21 unused layers × 268 MB) — skipping it keeps total - // VRAM under 24 GB and eliminates the warm-path regression (A_compute 5.4x). + // Pre-compute score range to skip K_norope alloc for non-scoring layers. + // At S=128K this trims ~5.6 GB (21 × 268 MB); see test_drafter_warm_path_regression. 
static const int score_layers_pre = []() -> int { const char * e = std::getenv("PFLASH_DRAFTER_SCORE_LAYERS"); if (e) { int v = std::atoi(e); if (v > 0) return v; } @@ -264,23 +249,16 @@ bool forward_qwen3_drafter_model( if (e) { int v = std::atoi(e); if (v > 0) return v; } return -1; }(); - // fwd_layer_limit_pre mirrors the fwd_layer_limit computed later in the loop. const int fwd_layer_limit_pre = (early_exit_pre > 0 && early_exit_pre < w.n_layer) ? early_exit_pre : w.n_layer; - // Use compute_score_range (same formula as the scoring loop) so the pre-alloc - // boundary is guaranteed to match the actual scoring boundary. const ScoreRange pre_range = compute_score_range(w.n_layer, score_layers_pre, fwd_layer_limit_pre); const int score_layer_start_pre = pre_range.start; - // Number of layers that participate in scoring (and need K_norope/Q_norope). const int n_score_layers = pre_range.count(); PersBuf hidden_buf, pos_buf, mask_tail_buf, Q_buf, attn_out_buf; std::vector K_curr_v((size_t)w.n_layer); std::vector V_curr_v((size_t)w.n_layer); std::vector Q_last_v((size_t)w.n_layer); - // NoPE: only allocate K_norope/Q_norope for layers that will be scored. - // When score_layer_start_pre > 0 this trims up to 21 × 268 MB = 5.6 GB, - // preventing the VRAM overflow that causes the warm-path regression at 128K. std::vector K_norope_v(nope_tail ? (size_t)n_score_layers : 0); std::vector Q_norope_v(nope_tail ? (size_t)n_score_layers : 0); auto cleanup_all = [&]() { @@ -380,10 +358,6 @@ bool forward_qwen3_drafter_model( ggml_free(gctx); } - // PFLASH_DRAFTER_EARLY_EXIT_N: already read into early_exit_pre above. - // Alias used in the forward-loop limit below. - const int & early_exit_n = early_exit_pre; - // Per-layer A→FA→B loop. 
ggml_gallocr_t galloc = ggml_gallocr_new( ggml_backend_get_default_buffer_type(w.backend)); @@ -404,8 +378,8 @@ bool forward_qwen3_drafter_model( double t_b_warm = 0.0, t_b_setup = 0.0, t_b_alloc = 0.0, t_b_copy_in = 0.0, t_b_norm = 0.0, t_compute_b = 0.0, t_b_copy_out = 0.0; double t_fp = 0.0; - const int fwd_layer_limit = (early_exit_n > 0 && early_exit_n < w.n_layer) - ? early_exit_n : w.n_layer; + const int fwd_layer_limit = (early_exit_pre > 0 && early_exit_pre < w.n_layer) + ? early_exit_pre : w.n_layer; for (int il = 0; il < fwd_layer_limit; ++il) { const auto & L = w.layers[il]; diff --git a/server/src/qwen35/gguf_target_loader.cpp b/server/src/qwen35/gguf_target_loader.cpp index 9085c1a1b..8628eb3ab 100644 --- a/server/src/qwen35/gguf_target_loader.cpp +++ b/server/src/qwen35/gguf_target_loader.cpp @@ -38,10 +38,7 @@ // ssm_out.weight [inner, hidden] Q5_K // ffn_gate/up/down (same as full-attn) // -// This loader reads the file via ggml's built-in GGUF API, which returns a -// ggml_context pre-populated with tensors. We then wire that context onto -// the CUDA backend (via ggml_backend_alloc_ctx_tensors) and copy each -// tensor's bytes from the mmap'd file. +// Loads via ggml GGUF API; tensors copied from mmap to CUDA backend. #include "internal.h" #include "common/layer_split_utils.h" diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp index 471e8af07..feb21e7b2 100644 --- a/server/src/qwen35/qwen35_backend.cpp +++ b/server/src/qwen35/qwen35_backend.cpp @@ -578,15 +578,8 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req, auto t_prefill_end = std::chrono::steady_clock::now(); result.prefill_s = std::chrono::duration(t_prefill_end - t_prefill_start).count(); - // C2 adaptive-mechanism gate: pflash's override always reflects the - // FULL compressed-prompt size — we never cap visibility (would waste - // pflash's anchor-selection work). 
The gate here decides whether - // spec-decode's verify arithmetic still earns its drafter cost at - // that window size. Threshold 2× cfg_.fa_window: - // override <= 4096 (32K → ~1.5K, 64K → ~3K compressed) → spec-decode - // override > 4096 (128K → ~6.4K compressed) → AR fallback - // AR uses fa_window=0 (full attention) so every kept token is visible - // regardless of which path runs. We choose mechanism, not visibility. + // C2 gate: spec-decode when override <= 2x fa_window; AR fallback otherwise. + // Both paths see all kept tokens. See docs/pflash-adaptive-composition.md. const bool fa_within_budget = (req.fa_window_override == 0) || (eff_fa_window <= 2 * cfg_.fa_window); @@ -847,26 +840,12 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen, const BudgetHook & budget_hook, bool * forced_close_out, bool * degenerate_close_out) { - // Budget hook state. - // - budget_close_started: true once we've begun injecting the close - // sequence. Prevents re-triggering on continued forward generation. - // - close_inject_pos: index into budget_hook.close_token_ids for the - // NEXT token to inject. While < close_token_ids.size(), each - // iteration overrides the sampled token with the corresponding - // close-sequence token (single-token close = 1 override and done; - // multi-token close like DeepSeek/laguna [1718,37947,32] = 3 - // consecutive overrides). Once equal to close_token_ids.size(), - // normal sampling resumes (model writes visible answer). + // budget_close_started: prevents re-triggering; close_inject_pos: next + // token index to inject from close_token_ids. See docs/specs/thinking-budget.md. bool budget_close_started = false; int close_inject_pos = 0; - // Capture entry KV position so the budget check is in the - // "generated since entry" frame, not the absolute KV frame. 
- // n_gen is the gen-only count (or the remaining-budget remap done by - // spec-decode tail-off); subtracting committed_now (absolute KV = - // prompt_len + tokens generated this call) directly would treat - // prompt-length tokens as if they were generated output, firing - // force-close prompt_len tokens early on prompted requests and - // potentially going negative after spec-decode tail-off. + // committed_at_entry: anchors budget check to "generated since entry" frame, + // not absolute KV (avoids firing prompt_len tokens early). const int committed_at_entry = committed; auto maybe_force_close = [&](int32_t & tok, int committed_now) { if (budget_hook.close_token_ids.empty()) return; diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index 465758b51..346b9d8d4 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -1267,19 +1267,8 @@ void HttpServer::worker_loop() { gen_req.sampler = req.sampler; gen_req.do_sample = req.sampler.needs_logit_processing(); gen_req.stream = false; // we handle streaming via on_token callback - // Design 1: when pflash compresses, widen the target spec-decode verify - // fa_window to cover the entire compressed prompt. Otherwise verify sees - // only the last cfg_.fa_window positions of the compressed sequence, - // losing needle context and truncating the answer at long ctx. - // - // Principle: pflash already paid compute to pick which tokens matter. - // Don't throw any of them away in verify by capping fa_window — that - // would waste pflash's work. Always request enough verify window to - // see the entire compressed prompt. The C2 gate in qwen35_backend.cpp - // then decides per request whether spec-decode arithmetic still beats - // AR at this window size; if not, AR fallback kicks in (which uses - // fa_window=0 → full attention over the compressed prompt). Either - // path sees every kept token. We choose mechanism, not visibility. 
+ // Widen verify window to cover the full compressed prompt; C2 gate in + // qwen35_backend.cpp selects spec-decode vs AR. See docs/pflash-adaptive-composition.md. if (pflash_compressed) { gen_req.fa_window_override = (int)effective_prompt.size() + 256; } From ff0a6b9074a998302dc3bbfe88faeeb4709ce119 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 17:16:35 +0200 Subject: [PATCH 10/16] fix(server): append closed prefill in Jinja renderer when thinking is off The hard-coded renderer appends a closed think prefill when thinking is disabled. Some Qwen3.6 Jinja templates omit that final assistant suffix, leaving the model in the wrong decoding state for tool use. Mirror the hard-coded behavior here when the rendered prompt ends with a bare assistant generation prompt; tolerate trailing-whitespace variants (single \n, double \n\n, trailing space). Diagnosed by Round 5b D peer-chat showing dflash drafter accept_rate=0.0%: the drafter was distilled with the closed-think suffix in its training distribution; the Unsloth Qwen3-Coder template doesn't emit it, so target and drafter disagree on what comes after <|im_start|>assistant\n. --- server/src/server/chat_template.cpp | 30 ++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp index 1349109ad..4e972e495 100644 --- a/server/src/server/chat_template.cpp +++ b/server/src/server/chat_template.cpp @@ -411,7 +411,35 @@ std::string render_chat_template_jinja( jinja::runtime rt(ctx); jinja::value results = rt.execute(*prog); auto parts = jinja::runtime::gather_string_parts(results); - return parts->as_string().str(); + std::string rendered = parts->as_string().str(); + + // The hard-coded Qwen renderer appends a closed think prefill when + // thinking is disabled. 
Some Qwen3.6 Jinja templates omit that final + // assistant suffix, which leaves the model in the wrong decoding state + // for tool use. Mirror the hard-coded behavior here when the rendered + // prompt ends with a bare assistant generation prompt. + if (!enable_thinking) { + // Tolerate template variants that emit extra trailing whitespace + // after the assistant marker (single \n, double \n\n, trailing + // space). Strategy: trim trailing whitespace, check for the BARE + // assistant marker (no newline), then re-emit marker + prefill. + static constexpr char kAssistantBare[] = "<|im_start|>assistant"; + static constexpr char kAssistantPrefill[] = "<|im_start|>assistant\n\n\n\n\n"; + size_t trim_end = rendered.size(); + while (trim_end > 0) { + char c = rendered[trim_end - 1]; + if (c != ' ' && c != '\t' && c != '\n' && c != '\r') break; + --trim_end; + } + const size_t blen = sizeof(kAssistantBare) - 1; + if (trim_end >= blen && + rendered.compare(trim_end - blen, blen, kAssistantBare) == 0) { + rendered.resize(trim_end - blen); + rendered += kAssistantPrefill; + } + } + + return rendered; } catch (const std::exception & e) { throw std::runtime_error(std::string("jinja runtime: ") + e.what()); } From fc8c8e24c51fb1f7cd8197cc913544a4cf008448 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 17:20:37 +0200 Subject: [PATCH 11/16] fix(chat_template): gate closed-think prefill injection to Qwen3 arch only The previous commit applied the closed-think suffix to all Jinja-rendered prompts. Add arch_hint (ChatFormat) parameter to render_chat_template_jinja, defaulting to QWEN3, and guard the post-processing block with arch_hint == ChatFormat::QWEN3. Call site in http_server.cpp passes chat_format_ so other archs (Laguna, Gemma4) are unaffected. qwen35moe inherits ChatFormat::QWEN3 by design (matches drafter distillation). 
5 unit tests cover: thinking-off appends, thinking-on no-append, non-Qwen3 arch no-append (Laguna + Gemma4), qwen35moe inherits QWEN3, no double-append when template already closes the think block. Diagnosis + verification protocol in docs/pflash-drafter-template-alignment.md. --- docs/pflash-drafter-template-alignment.md | 95 +++++++++++++++++++++++ server/src/server/chat_template.cpp | 17 ++-- server/src/server/chat_template.h | 5 +- server/src/server/http_server.cpp | 3 +- server/test/test_server_unit.cpp | 89 +++++++++++++++++++++ 5 files changed, 200 insertions(+), 9 deletions(-) create mode 100644 docs/pflash-drafter-template-alignment.md diff --git a/docs/pflash-drafter-template-alignment.md b/docs/pflash-drafter-template-alignment.md new file mode 100644 index 000000000..3669b5ed9 --- /dev/null +++ b/docs/pflash-drafter-template-alignment.md @@ -0,0 +1,95 @@ +# Drafter / target distribution alignment via closed-think prefill + +## Problem + +PR #274 (adaptive composition) shipped on `feat/pflash-drafter-ee7`, validating +13× prefill TPS and +47% decode TPS at long context. It surfaced a load-bearing +ceiling on the dflash decode side: spec-decode `accept_rate` was capped at +13–21% on the opencode harness and went to 0.0% on a peer-chat call. Composition +arm decode TPS (24.4 tok/s) therefore stayed below pflash-only (33.0 tok/s) — +the drafter overhead wasn't amortizing through acceptance. + +## Diagnosis (the wrong hypothesis first) + +The peer-chat conversation suggested "drafter conditioned on a different chat +template than the target." Three Phase-1 Explore agents traced the code and +showed that framing is architecturally wrong: + +- Both target and drafter receive the **same** `effective_prompt` token IDs at + prefill. 
The chat template is applied **once** on the target side at + `server/src/server/http_server.cpp:996-1014`, tokenized with the target's + tokenizer at `:1014`, then flows to both target and drafter via + `gen_req.prompt = effective_prompt` at `:1265`. +- The drafter `dflash-draft-3.6-q4_k_m.gguf` does **not** apply any chat + template at runtime. `server/src/draft/draft_gguf_loader.cpp` doesn't read + the `tokenizer.chat_template` GGUF metadata key. + +A `--draft-chat-template` flag would fix nothing — there is no drafter-side +template-application code path to redirect. + +## Diagnosis (the actual root cause) + +The drafter GGUF **does** ship the official Qwen3.6 chat template as +`tokenizer.chat_template` metadata. That template appends +`<think>\n\n</think>\n\n` after `<|im_start|>assistant\n` when +`enable_thinking=false`. The drafter was distilled with that closed-think +suffix in its training distribution — every assistant turn it predicts +expects that prefix. + +The target's Unsloth Qwen3-Coder template (`project_unsloth_jinja_template_solves_tool_call` +in memory) does **not** append that suffix. So at the moment spec-decode +predicts the next token after `<|im_start|>assistant\n`: + +- drafter's distribution expects `<think></think>` literal tokens +- target's distribution expects the actual answer + +Drafter proposes `<think>...`, target rejects, falls back to AR. Repeat at +every position. `accept_rate` ≈ 0%. + +## Fix + +Make the **target's render** match the drafter's training distribution. +`render_chat_template_jinja` now appends `<think>\n\n</think>\n\n` after a +bare `<|im_start|>assistant` marker when **all three** of these hold: + +1. `arch_hint == ChatFormat::QWEN3` (gated to Qwen3-family — qwen35, qwen35moe; + Laguna / Gemma4 don't use ChatML tokens and must not be touched) +2. `!enable_thinking` +3.
The rendered prompt ends with the bare assistant marker (tolerant of + trailing whitespace variants: `\n`, `\n\n`, trailing space) + +Condition (3) prevents double-appending when a user-supplied template already +emits the closed-think suffix. + +## Multi-arch safety + +`chat_format_for_arch()` in `server/src/server/chat_template.cpp` returns: +- `ChatFormat::QWEN3` for `qwen3`, `qwen35`, `qwen35moe` +- `ChatFormat::LAGUNA` for `laguna` +- `ChatFormat::GEMMA4` for `gemma4` + +The suffix only fires for `QWEN3`. A new test +(`test_chat_format_for_arch_qwen35moe_returns_qwen3`) locks the qwen35moe → +QWEN3 inheritance so a future arch-enum addition doesn't silently flip +behavior. Tests also lock the Laguna/Gemma4 no-append case and the +no-double-append guard. + +## Expected impact + +- `accept_rate` lifts from 13–21% (and 0% on peer-chat) on Qwen3.6 dense with + Unsloth Qwen3-Coder template. Threshold for declaring the fix worked: + non-zero peer-chat accept_rate AND opencode harness accept_rate ≥30% on at + least 2 of 3 turns from Round 5b D. +- Composition arm decode TPS rises above pflash-only on long-generation + workloads (currently 24.4 vs 33.0; the gap exists because spec-decode + amortization is bounded by accept_rate). +- davide221's qwen35moe `chat CACHE` hang (issue #280) likely has the same + root cause via the same code path — qwen35moe inherits ChatFormat::QWEN3 + and the suffix will fire there too. + +## Out of scope + +The sibling commits on `fix/qwen36-claude-code-tool-calling` (target-side +tool-format normalization, scrub/truncate, Anthropic→Qwen tool shape, +param-name aliasing) ship as PR #276. They are not drafter alignment — they +are independent target-side tool-formatting improvements. 
diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp index 4e972e495..33f4bd864 100644 --- a/server/src/server/chat_template.cpp +++ b/server/src/server/chat_template.cpp @@ -360,7 +360,8 @@ std::string render_chat_template_jinja( const std::string & eos_token, bool add_generation_prompt, bool enable_thinking, - const std::string & tools_json) + const std::string & tools_json, + ChatFormat arch_hint) { if (template_src.empty()) { throw std::runtime_error("render_chat_template_jinja: template_src is empty"); @@ -413,12 +414,14 @@ std::string render_chat_template_jinja( auto parts = jinja::runtime::gather_string_parts(results); std::string rendered = parts->as_string().str(); - // The hard-coded Qwen renderer appends a closed think prefill when - // thinking is disabled. Some Qwen3.6 Jinja templates omit that final - // assistant suffix, which leaves the model in the wrong decoding state - // for tool use. Mirror the hard-coded behavior here when the rendered - // prompt ends with a bare assistant generation prompt. - if (!enable_thinking) { + // Qwen3/3.5/3.6 only: the hard-coded renderer appends a closed think + // prefill when thinking is disabled. Some Qwen3.6 Jinja templates omit + // that final assistant suffix, leaving the model in the wrong decoding + // state for tool use. Mirror the hard-coded behavior here when the + // rendered prompt ends with a bare assistant generation prompt. + // Other architectures (Laguna, Gemma4, ...) do not use ChatML tokens + // and must not be touched here. + if (arch_hint == ChatFormat::QWEN3 && !enable_thinking) { // Tolerate template variants that emit extra trailing whitespace // after the assistant marker (single \n, double \n\n, trailing // space). 
Strategy: trim trailing whitespace, check for the BARE diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h index ca7ef9db5..b544df245 100644 --- a/server/src/server/chat_template.h +++ b/server/src/server/chat_template.h @@ -63,6 +63,8 @@ ChatFormat chat_format_for_arch(const std::string & arch); // {{bos_token}} / {{eos_token}}). Use empty strings if unknown. // `tools_json` optional JSON array of tool definitions; when non-empty it // is parsed and injected as `tools` into the template context. +// `arch_hint` model architecture (controls arch-specific post-processing; +// the closed-think prefill injection is Qwen3/3.5/3.6 only). // // Internally caches the most recently parsed program per thread (avoids // re-parsing the template on every request). Throws std::runtime_error on @@ -74,6 +76,7 @@ std::string render_chat_template_jinja( const std::string & eos_token, bool add_generation_prompt = true, bool enable_thinking = false, - const std::string & tools_json = ""); + const std::string & tools_json = "", + ChatFormat arch_hint = ChatFormat::QWEN3); } // namespace dflash::common diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index 346b9d8d4..22076c4b9 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -1000,7 +1000,8 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { eos_str, /*add_generation_prompt=*/true, enable_thinking, - tools_json); + tools_json, + chat_format_); } catch (const std::exception & e) { send_error(fd, 500, std::string("chat template (jinja) render failed: ") + e.what()); diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index 1415aab30..fc54666c9 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -1133,6 +1133,90 @@ static void test_jinja_render_bad_tools_json_throws() { TEST_ASSERT(threw); } +// 
--------------------------------------------------------------------------- +// Drafter / target distribution alignment (closed prefill on Qwen3). +// The hard-coded Qwen renderer appends a closed think prefill when thinking is +// disabled; some Qwen3.6 Jinja templates omit it. render_chat_template_jinja +// mirrors the hard-coded behavior when arch_hint == QWEN3 && !enable_thinking +// && the rendered prompt ends with a bare assistant generation marker. +// --------------------------------------------------------------------------- + +static const char QWEN3_BARE_ASSISTANT_TPL[] = + "{%- for m in messages -%}" + "<|im_start|>{{ m.role }}\n{{ m.content }}<|im_end|>\n" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|im_start|>assistant\n" + "{%- endif -%}"; + +static void test_jinja_render_qwen3_closes_think_when_thinking_off() { + std::vector msgs = {{"user", "hi", ""}}; + std::string out = render_chat_template_jinja( + QWEN3_BARE_ASSISTANT_TPL, msgs, "", "", + /*add_gen=*/true, /*think=*/false, /*tools=*/"", + /*arch_hint=*/ChatFormat::QWEN3); + TEST_ASSERT(out.find("<|im_start|>assistant\n\n\n\n\n") != std::string::npos); +} + +static void test_jinja_render_does_not_close_think_when_thinking_on() { + std::vector msgs = {{"user", "hi", ""}}; + std::string out = render_chat_template_jinja( + QWEN3_BARE_ASSISTANT_TPL, msgs, "", "", + /*add_gen=*/true, /*think=*/true, /*tools=*/"", + /*arch_hint=*/ChatFormat::QWEN3); + TEST_ASSERT(out.find("") == std::string::npos); +} + +static void test_jinja_render_does_not_close_think_for_non_qwen3_arch() { + // Laguna and Gemma4 do not use ChatML tokens; the closed-think suffix + // must NOT be appended for them even if the rendered prompt happens to + // end with the same string. 
+ std::vector msgs = {{"user", "hi", ""}}; + std::string out_laguna = render_chat_template_jinja( + QWEN3_BARE_ASSISTANT_TPL, msgs, "", "", + /*add_gen=*/true, /*think=*/false, /*tools=*/"", + /*arch_hint=*/ChatFormat::LAGUNA); + TEST_ASSERT(out_laguna.find("") == std::string::npos); + std::string out_gemma4 = render_chat_template_jinja( + QWEN3_BARE_ASSISTANT_TPL, msgs, "", "", + /*add_gen=*/true, /*think=*/false, /*tools=*/"", + /*arch_hint=*/ChatFormat::GEMMA4); + TEST_ASSERT(out_gemma4.find("") == std::string::npos); +} + +static void test_chat_format_for_arch_qwen35moe_returns_qwen3() { + // qwen35moe MUST inherit ChatFormat::QWEN3 — the closed-think prefill + // depends on it, and a future enum-add must not silently flip behavior. + TEST_ASSERT(chat_format_for_arch("qwen35moe") == ChatFormat::QWEN3); + TEST_ASSERT(chat_format_for_arch("qwen35") == ChatFormat::QWEN3); + TEST_ASSERT(chat_format_for_arch("qwen3") == ChatFormat::QWEN3); + TEST_ASSERT(chat_format_for_arch("laguna") == ChatFormat::LAGUNA); + TEST_ASSERT(chat_format_for_arch("gemma4") == ChatFormat::GEMMA4); +} + +static void test_jinja_render_does_not_double_append_close_think() { + // A user-supplied template that already closes the think block must not + // get a second suffix from the bare-marker post-processing. + static const char TPL_ALREADY_CLOSED[] = + "{%- for m in messages -%}" + "<|im_start|>{{ m.role }}\n{{ m.content }}<|im_end|>\n" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|im_start|>assistant\n\n\n\n\n" + "{%- endif -%}"; + std::vector msgs = {{"user", "hi", ""}}; + std::string out = render_chat_template_jinja( + TPL_ALREADY_CLOSED, msgs, "", "", + /*add_gen=*/true, /*think=*/false, /*tools=*/"", + /*arch_hint=*/ChatFormat::QWEN3); + // Exactly one — the one the template emitted itself. + size_t first = out.find(""); + size_t second = (first == std::string::npos) ? 
std::string::npos + : out.find("", first + 1); + TEST_ASSERT(first != std::string::npos); + TEST_ASSERT(second == std::string::npos); +} + static void test_normalize_responses_tool_followup_messages() { ToolMemory tool_memory; const std::string call_id = "call_exec_001"; @@ -2539,6 +2623,11 @@ int main() { RUN_TEST(test_jinja_render_empty_tools_skipped); RUN_TEST(test_jinja_render_bos_eos_threaded); RUN_TEST(test_jinja_render_empty_template_throws); + RUN_TEST(test_jinja_render_qwen3_closes_think_when_thinking_off); + RUN_TEST(test_jinja_render_does_not_close_think_when_thinking_on); + RUN_TEST(test_jinja_render_does_not_close_think_for_non_qwen3_arch); + RUN_TEST(test_chat_format_for_arch_qwen35moe_returns_qwen3); + RUN_TEST(test_jinja_render_does_not_double_append_close_think); RUN_TEST(test_jinja_render_bad_tools_json_throws); RUN_TEST(test_normalize_responses_tool_followup_messages); From e64a2b80669f5da5e2e3c8dd3599fd4774ab387b Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Thu, 28 May 2026 19:04:09 +0200 Subject: [PATCH 12/16] refactor(c2-gate): wire c2_spec_decode_permitted into qwen35_backend Extract the C2 spec-decode gate from an inline expression in qwen35_backend.cpp into a pure predicate header c2_gate.h. Zero behavior change. Identical math: (fa_window_override == 0) || (fa_window_override <= 2 * fa_window_cfg) The new header documents the empirically-derived rationale: at compressed KV sizes (pflash compression of long prompts), T_draft/T_target ratio approaches 1, eliminating spec-decode's profit margin over AR. Empirical at D_composition 128K replay: AR=27.5 tok/s vs forced spec-decode=5.74 tok/s. The gate correctly blocks spec-decode when eff_fa_window > 2*fa_window_cfg. Adds 5 unit tests locking in the predicate's behavior with explicit Round 5 4-arm matrix bench citations. 
Files: - server/src/qwen35/c2_gate.h (new) - server/src/qwen35/qwen35_backend.cpp (+1 include, inline -> call) - server/test/test_server_unit.cpp (+60 LOC, 5 tests) --- server/src/qwen35/c2_gate.h | 31 ++++++++++++++ server/src/qwen35/qwen35_backend.cpp | 6 ++- server/test/test_server_unit.cpp | 60 ++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 server/src/qwen35/c2_gate.h diff --git a/server/src/qwen35/c2_gate.h b/server/src/qwen35/c2_gate.h new file mode 100644 index 000000000..51c644e2c --- /dev/null +++ b/server/src/qwen35/c2_gate.h @@ -0,0 +1,31 @@ +// C2 gate predicate — pure function, no GPU/model deps. +// Extracted from qwen35_backend.cpp for testability. +// +// Reasoning: when pflash compresses a 128K prompt to ~11K tokens, the +// target KV at decode time = 11K (small). T_target is fast (small KV), +// T_draft ≈ constant. r = T_draft/T_target ≈ 1, so spec-decode does NOT +// win over AR. Empirical: D_composition 128K: AR=27.5 tok/s, spec=5.74 tok/s. +// Gate correctly blocks spec-decode when eff_fa_window > 2*fa_window_cfg. +#pragma once + +namespace dflash::common { + +// Returns true if spec-decode should be attempted. +// fa_window_override: 0 = no pflash; else = compressed_prompt_size + 256 +// fa_window_cfg : cfg_.fa_window (default 2048) +// kv_committed : KV position after prefill (unused; kept for future use) +// +// Gate: permit spec-decode when eff_fa_window <= 2 * fa_window_cfg. +// For uncompressed (override==0): always permit. +// For pflash-compressed: permit only when compressed_size <= 3840 tokens +// (2*fa_window_cfg - 256 at the default fa_window_cfg = 2048). Above that, AR is faster +// than spec-decode (empirically: D_composition 128K AR=27.5 vs spec=5.74 tok/s).
+inline bool c2_spec_decode_permitted(int fa_window_override, + int fa_window_cfg, + int kv_committed) { + (void)kv_committed; + return (fa_window_override == 0) + || (fa_window_override <= 2 * fa_window_cfg); +} + +} // namespace dflash::common diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp index feb21e7b2..82d0ea6ca 100644 --- a/server/src/qwen35/qwen35_backend.cpp +++ b/server/src/qwen35/qwen35_backend.cpp @@ -6,6 +6,7 @@ #include "common/dflash_draft_graph.h" #include "peer_access.h" #include "attn_masks.h" +#include "qwen35/c2_gate.h" #include "common/sampler.h" #include "common/io_utils.h" #include "common/restore_delta.h" @@ -581,8 +582,9 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req, // C2 gate: spec-decode when override <= 2x fa_window; AR fallback otherwise. // Both paths see all kept tokens. See docs/pflash-adaptive-composition.md. const bool fa_within_budget = - (req.fa_window_override == 0) - || (eff_fa_window <= 2 * cfg_.fa_window); + dflash::common::c2_spec_decode_permitted(req.fa_window_override, + cfg_.fa_window, + /*kv_committed*/ 0); // Decode (speculative or AR) if (req.n_gen > 0) { diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index fc54666c9..eb53299fc 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -23,6 +23,7 @@ #include "placement/placement_config.h" #include "common/layer_split_backend.h" #include "common/layer_split_utils.h" +#include "qwen35/c2_gate.h" #include #include @@ -2532,6 +2533,58 @@ static void test_generate_result_accept_rate_zero_when_no_spec_decode() { TEST_ASSERT(r.accept_rate == 0.0f); } +// ═══════════════════════════════════════════════════════════════════════ +// C2 gate: c2_spec_decode_permitted() unit tests +// +// Gate logic: permit spec-decode when eff_fa_window <= 2*fa_window_cfg. +// eff_fa_window = fa_window_override when set, else fa_window_cfg. 
+// +// Empirical validation (Round 5 bench): +// - D_composition 128K: effective_in=10988, eff_fa_window=11244 > 4096 +// → gate BLOCKS spec-decode → AR at 27.5 tok/s (correct — spec at 5.74) +// - D_composition short: eff_fa_window <= 4096 → gate permits spec-decode +// ═══════════════════════════════════════════════════════════════════════ + +static void test_c2_gate_no_override_always_permits() { + // fa_window_override == 0 → no pflash, always spec-decode permitted. + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 1)); + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 4096)); + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 131072)); +} + +static void test_c2_gate_128k_compressed_blocks_spec() { + // Round 5 D 128K: effective_in=10988, fa_window_override=11244. + // 11244 > 2*2048=4096 → gate correctly BLOCKS spec-decode (AR wins empirically). + int fa_window_cfg = 2048; + int compressed_size = 10988; + int fa_window_override = compressed_size + 256; // = 11244 + TEST_ASSERT(!dflash::common::c2_spec_decode_permitted( + fa_window_override, fa_window_cfg, compressed_size)); +} + +static void test_c2_gate_65k_compressed_blocks_spec() { + // D 65K cell: effective_in≈5383, fa_window_override≈5639 > 4096 → blocks. + int compressed_size = 5383; + int fa_window_override = compressed_size + 256; + TEST_ASSERT(!dflash::common::c2_spec_decode_permitted( + fa_window_override, 2048, compressed_size)); +} + +static void test_c2_gate_small_compressed_permits_spec() { + // Small compressed KV (override <= 2*fa_window): spec-decode permitted. + // fa_window_override=3000 <= 4096 → permit + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(3000, 2048, 2744)); + // fa_window_override=4096 == 2*2048 → permit (at boundary) + TEST_ASSERT(dflash::common::c2_spec_decode_permitted(4096, 2048, 3840)); +} + +static void test_c2_gate_boundary_at_2x_fa_window() { + // At exactly 2*fa_window_cfg: permit (<=). 
+ TEST_ASSERT(dflash::common::c2_spec_decode_permitted(4096, 2048, 3840)); + // At 2*fa_window_cfg + 1: block. + TEST_ASSERT(!dflash::common::c2_spec_decode_permitted(4097, 2048, 3841)); +} + int main() { std::fprintf(stderr, "══════════════════════════════════════════\n"); std::fprintf(stderr, " Server Unit Tests\n"); @@ -2698,6 +2751,13 @@ int main() { RUN_TEST(test_generate_result_accept_rate_in_usage_anthropic); RUN_TEST(test_generate_result_accept_rate_zero_when_no_spec_decode); + std::fprintf(stderr, "\n── C2 gate (spec-decode gate) ──\n"); + RUN_TEST(test_c2_gate_no_override_always_permits); + RUN_TEST(test_c2_gate_128k_compressed_blocks_spec); + RUN_TEST(test_c2_gate_65k_compressed_blocks_spec); + RUN_TEST(test_c2_gate_small_compressed_permits_spec); + RUN_TEST(test_c2_gate_boundary_at_2x_fa_window); + std::fprintf(stderr, "\n══════════════════════════════════════════\n"); std::fprintf(stderr, " Results: %d assertions, %d failures\n", test_count, test_failures); From 8c1f37db0a538b7e792ac0650a340b7df63d7001 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Fri, 29 May 2026 17:23:29 +0200 Subject: [PATCH 13/16] feat(pflash): effective-size admission gate + keep-ratio guard (keep default 0.10) - Gate context-window admission on post-compression effective size, not raw, so >128K-raw prompts compress to fit max_ctx instead of 400 / oversized KV reservation. - Pre-compression keep-ratio sanity guard (raw*keep+max_out>max_ctx); the real effective-size gate runs post-compression in worker_loop. - Default prefill-keep-ratio 0.05 -> 0.10: real ~2x compression on agentic content (0.25 over-forces anchor-transitive to ~100% = no-op + rejects >128K). - Evidence (RTX3090, agentic replay, keep=0.10): 167K raw admitted -> 71K eff (42.6%), prefill 145s vs 845s forced; 32-128K real compression; tool-parse intact; 1629 unit asserts green; 14-cell P/PD sweep zero crashes. 
--- server/src/server/http_server.cpp | 63 ++++++++++++++++++++++- server/src/server/http_server.h | 19 ++++++- server/src/server/server_main.cpp | 2 +- server/test/test_server_unit.cpp | 83 ++++++++++++++++++++++++++++++- 4 files changed, 162 insertions(+), 5 deletions(-) diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index 22076c4b9..219bccf6d 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -77,6 +77,32 @@ static size_t json_array_size(const json & value) { return value.is_array() ? value.size() : 0; } +// ─── Admission gate ────────────────────────────────────────────────────── +// Pre-compression sanity guard uses first principles: reject only when even +// best-case compression cannot fit — (double)raw*keep_ratio + max_output > max_ctx. +// This is keep-ratio-derived, so it correctly admits large prompts at low +// keep ratios rather than using a hardcoded 4× multiplier calibrated to 0.25. + +bool check_admission(int effective_size, int raw_size, + int max_output, int max_ctx, bool pflash_on, + float pflash_keep_ratio) { + if (max_ctx <= 0) return true; // no limit configured + if (pflash_on) { + // Pre-compression guard: reject only when even best-case compression + // cannot fit. Skip when keep_ratio <= 0 (degenerate config; let the + // post-compression gate decide). + if (pflash_keep_ratio > 0.0f) { + if ((double)raw_size * pflash_keep_ratio + max_output > (double)max_ctx) + return false; + } + // Pre-compression guard passed: admit. The real effective-size gate + // runs post-compression (caller passes pflash_on=false after pflash). + return true; + } + // Non-pflash (or post-compression): check effective size directly. + return effective_size + max_output <= max_ctx; +} + // Build the /props response body. // // Non-static so unit tests can call it directly (declared in http_server.h). 
@@ -1027,8 +1053,27 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { return true; // handled (with error) } - // Check context length. - if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) { + // Pre-compression admission: reject non-pflash requests that can't fit, + // and pflash requests whose raw prompt cannot possibly compress to fit + // (first-principles guard: raw*keep_ratio + max_output > max_ctx). + // The real post-compression gate runs in worker_loop after pflash runs. + const int raw_size = (int)req.prompt_tokens.size(); + const bool pflash_will_run = + config_.max_ctx > 0 && + config_.pflash_mode != ServerConfig::PflashMode::OFF && + drafter_tokenizer_ != nullptr && + (config_.pflash_mode == ServerConfig::PflashMode::ALWAYS || + raw_size >= config_.pflash_threshold); + if (!check_admission(raw_size, raw_size, req.max_output, config_.max_ctx, + /*pflash_on=*/false) && !pflash_will_run) { + // Non-pflash path: raw is the effective size, reject immediately. + send_error(fd, 400, "prompt + max_tokens exceeds context window"); + return true; + } + if (pflash_will_run && + !check_admission(raw_size, raw_size, req.max_output, config_.max_ctx, + /*pflash_on=*/true, config_.pflash_keep_ratio)) { + // Pre-compression guard: best-case compression still can't fit. send_error(fd, 400, "prompt + max_tokens exceeds context window"); return true; } @@ -1230,6 +1275,20 @@ void HttpServer::worker_loop() { } } + // Effective-size admission gate: check post-compression prompt fits max_ctx. + // For non-pflash requests this was already checked in handle_client; + // for pflash requests the raw guard passed but the effective size may + // still be too large (unlikely but possible if compression ratio is poor). + // Use pflash_on=false here so the function directly checks effective size + // (pflash_on=true only runs the pre-compression guard, not useful here). 
+ if (!check_admission((int)effective_prompt.size(), (int)req.prompt_tokens.size(), + req.max_output, config_.max_ctx, + /*pflash_on=*/false, + config_.pflash_keep_ratio)) { + fail_request(400, "prompt + max_tokens exceeds context window"); + continue; + } + // Build generate request. // // Thinking-budget v2 (Level 2): when caller opts in via diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 2fb3e4661..803cf74e8 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -143,7 +143,7 @@ struct ServerConfig { enum class PflashMode { OFF, AUTO, ALWAYS }; PflashMode pflash_mode = PflashMode::OFF; int pflash_threshold = 32000; // token count threshold for AUTO mode - float pflash_keep_ratio = 0.05f; // fraction of tokens to keep + float pflash_keep_ratio = 0.10f; // fraction of tokens to keep std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B) int pflash_drafter_gpu = 0; // backend-local GPU for PFlash drafter bool pflash_remote_drafter = false; // use IPC drafter for mixed backends @@ -317,6 +317,23 @@ struct ServerJob { ServerJob * next = nullptr; }; +// ─── Admission gate (pure, testable) ──────────────────────────────────── +// Returns true when the request should be admitted (effective prompt fits). +// +// effective_size : post-compression prompt token count (== raw_size when +// pflash is off or the prompt is below threshold). +// raw_size : pre-compression token count; used for the pre-compression +// sanity guard: reject early when even best-case compression +// cannot fit — i.e. raw*keep_ratio + max_output > max_ctx. +// max_output : request's requested generation tokens. +// max_ctx : server's configured context window (--max-ctx). +// pflash_on : true to run only the pre-compression keep-ratio guard (before pflash runs); false to check effective_size directly (non-pflash requests and the post-compression gate). +// pflash_keep_ratio: configured keep fraction; drives the pre-compression guard. +// Guard is skipped when <= 0.
+bool check_admission(int effective_size, int raw_size, + int max_output, int max_ctx, bool pflash_on, + float pflash_keep_ratio = 0.10f); + // ─── Parse session_id from a chat-completion JSON body ────────────────── // Returns empty string when session_id is absent or not a string (int/null/array). // Checks extra_body.session_id first, then top-level session_id. diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index 3dcb23a5a..156c84afe 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -205,7 +205,7 @@ static void print_usage(const char * prog) { "PFlash (speculative prefill compression):\n" " --prefill-compression off|auto|always (default: off)\n" " --prefill-threshold Token threshold for auto mode (default: 32000)\n" - " --prefill-keep-ratio Fraction of tokens to keep (default: 0.05)\n" + " --prefill-keep-ratio Fraction of tokens to keep (default: 0.10)\n" " --prefill-drafter Drafter GGUF for compression (Qwen3-0.6B)\n" " --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n" " --lazy-draft Park decode draft when idle to save VRAM\n" diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index eb53299fc..1e6a1bd39 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -890,7 +890,7 @@ static void test_pflash_config_defaults() { ServerConfig cfg; TEST_ASSERT(cfg.pflash_mode == ServerConfig::PflashMode::OFF); TEST_ASSERT(cfg.pflash_threshold == 32000); - TEST_ASSERT(cfg.pflash_keep_ratio > 0.04f && cfg.pflash_keep_ratio < 0.06f); + TEST_ASSERT(cfg.pflash_keep_ratio > 0.09f && cfg.pflash_keep_ratio < 0.11f); TEST_ASSERT(cfg.pflash_drafter_path.empty()); TEST_ASSERT(!cfg.pflash_skip_park); } @@ -954,6 +954,76 @@ static void test_pflash_threshold_always_mode() { TEST_ASSERT(should); } +// ═══════════════════════════════════════════════════════════════════════ +// Admission gate tests (check_admission pure helper) +// 
═══════════════════════════════════════════════════════════════════════ + +static void test_admission_pflash_raw_large_effective_fits() { + // pflash on, raw=170000, effective=65000, max_output=512, max_ctx=131072 → ADMITTED + TEST_ASSERT(check_admission(/*effective=*/65000, /*raw=*/170000, + /*max_output=*/512, /*max_ctx=*/131072, + /*pflash_on=*/true)); +} + +static void test_admission_pflash_effective_too_large() { + // Post-compression: effective still too large → REJECTED. + // The post-compression call uses pflash_on=false (direct effective check). + TEST_ASSERT(!check_admission(/*effective=*/131000, /*raw=*/170000, + /*max_output=*/512, /*max_ctx=*/131072, + /*pflash_on=*/false)); +} + +static void test_admission_no_pflash_raw_too_large() { + // pflash off, raw > max_ctx → REJECTED (unchanged from original behavior) + TEST_ASSERT(!check_admission(/*effective=*/100000, /*raw=*/100000, + /*max_output=*/512, /*max_ctx=*/8192, + /*pflash_on=*/false)); +} + +static void test_admission_small_request_admitted() { + // Normal small request → ADMITTED regardless of pflash flag + TEST_ASSERT(check_admission(/*effective=*/1000, /*raw=*/1000, + /*max_output=*/512, /*max_ctx=*/8192, + /*pflash_on=*/false)); + TEST_ASSERT(check_admission(/*effective=*/1000, /*raw=*/1000, + /*max_output=*/512, /*max_ctx=*/8192, + /*pflash_on=*/true)); +} + +static void test_admission_pflash_raw_sanity_guard() { + // pflash on, keep_ratio=0.25 (explicit guard-test input), raw=32769: + // 32769*0.25 + 512 = 8704.25 > 8192 → REJECTED. 
+ TEST_ASSERT(!check_admission(/*effective=*/1000, /*raw=*/32769, + /*max_output=*/512, /*max_ctx=*/8192, + /*pflash_on=*/true, /*keep_ratio=*/0.25f)); +} + +static void test_admission_no_max_ctx_always_admits() { + // max_ctx=0 means no limit: always admit + TEST_ASSERT(check_admission(/*effective=*/999999, /*raw=*/999999, + /*max_output=*/9999, /*max_ctx=*/0, + /*pflash_on=*/false)); +} + +static void test_admission_keep_ratio_derived_guard_admits_low_ratio() { + // keep_ratio=0.05, raw=65536 (8× max_ctx=8192): + // best-case effective = 65536*0.05 = 3276.8 tokens. + // 3276.8 + 512 = 3788.8 < 8192 → guard PASSES → ADMITTED. + // The old hardcoded 4× guard would have rejected (65536 > 4*8192=32768). + TEST_ASSERT(check_admission(/*effective=*/65536, /*raw=*/65536, + /*max_output=*/512, /*max_ctx=*/8192, + /*pflash_on=*/true, /*keep_ratio=*/0.05f)); +} + +static void test_admission_keep_ratio_derived_guard_rejects_impossible() { + // keep_ratio=0.05, raw=2_000_000, max_ctx=8192: + // best-case effective = 2000000*0.05 = 100000 tokens. + // 100000 + 512 = 100512 > 8192 → REJECTED. 
+ TEST_ASSERT(!check_admission(/*effective=*/2000000, /*raw=*/2000000, + /*max_output=*/512, /*max_ctx=*/8192, + /*pflash_on=*/true, /*keep_ratio=*/0.05f)); +} + static void test_pflash_placement_same_backend_local() { DevicePlacement target; target.backend = compiled_placement_backend(); @@ -2663,6 +2733,17 @@ int main() { RUN_TEST(test_pflash_compress_result_defaults); RUN_TEST(test_pflash_threshold_auto_mode); RUN_TEST(test_pflash_threshold_always_mode); + + std::fprintf(stderr, "\n── Admission gate ──\n"); + RUN_TEST(test_admission_pflash_raw_large_effective_fits); + RUN_TEST(test_admission_pflash_effective_too_large); + RUN_TEST(test_admission_no_pflash_raw_too_large); + RUN_TEST(test_admission_small_request_admitted); + RUN_TEST(test_admission_pflash_raw_sanity_guard); + RUN_TEST(test_admission_no_max_ctx_always_admits); + RUN_TEST(test_admission_keep_ratio_derived_guard_admits_low_ratio); + RUN_TEST(test_admission_keep_ratio_derived_guard_rejects_impossible); + RUN_TEST(test_pflash_placement_same_backend_local); RUN_TEST(test_pflash_placement_mixed_backend_remote); RUN_TEST(test_pflash_placement_auto_draft_follows_target); From fbc2d412ab1cf467a9b16212147b53b49c477dfd Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Sat, 30 May 2026 20:51:22 +0200 Subject: [PATCH 14/16] feat(pflash): adaptive compression-regime router (correct-by-construction, default-off) Pure per-request router that picks pFlash compression per prompt instead of a global keep. Ships DISABLED: the default policy is an exact no-op, so enabling is opt-in and cannot regress recall. - regime_router.h: pure decide_regime (v1) + decide_v2 (type-gate + sparse-prompt guard + recency floor) + recency_floor_for() size-adaptive helper. 172 unit tests, -Werror clean. Default policy returns FullCascade/full-keep for any input. 
- qwen3_drafter.cpp: PFLASH_RECENCY_FLOOR_TOKENS mechanism (0 = off default; -1 = auto = min(1024, 0.04*S)); env_int negative-sentinel fix. - CMakeLists: test_regime_router target. Evidence (bench/2026-05-30_*): indexer-prune dead (frac_prune=0 across scorers); the splitter is prompt TYPE, not cascade-expansion (rho=-0.27); sparse-plumbing turns fail under compression but pass at full keep (OFF=100%), so route them to full keep. Recency floor validated zero-sum at fixed keep (breaks controls), kept as off-by-default infra, not the mechanism. Router path: agentic-rich -> ~25% keep (~3x prefill/wall), agentic-sparse -> full keep, retrieval -> cascade. Not yet wired into the live request handler (follow-up). --- server/CMakeLists.txt | 8 + server/src/common/regime_router.h | 191 ++++++++ server/src/qwen3/qwen3_drafter.cpp | 51 +- server/test/test_regime_router.cpp | 715 +++++++++++++++++++++++++++++ 4 files changed, 963 insertions(+), 2 deletions(-) create mode 100644 server/src/common/regime_router.h create mode 100644 server/test/test_regime_router.cpp diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index cfdc22937..dd8812c0d 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -581,6 +581,14 @@ if(DFLASH27B_TESTS) add_test(NAME test_drafter_early_exit_score_range COMMAND test_drafter_early_exit_score_range) endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_regime_router.cpp") + add_executable(test_regime_router + test/test_regime_router.cpp) + target_include_directories(test_regime_router PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/common) + add_test(NAME regime_router + COMMAND test_regime_router) + endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_anchor_transitive.cpp") add_executable(test_anchor_transitive test/test_anchor_transitive.cpp diff --git a/server/src/common/regime_router.h b/server/src/common/regime_router.h new file mode 100644 index 000000000..7c9917eb7 --- /dev/null +++ b/server/src/common/regime_router.h @@ 
// Pure, correct-by-construction adaptive compression-regime router.
// No IO, no globals, no GPU, no ggml/llama deps — header-only.
//
// Decides whether the transitive anchor cascade should run at full expansion
// (FullCascade, recall-preserving default) or be throttled
// (Throttle, fires ONLY when expansion_ratio >= policy threshold).
//
// Build (standalone):
//   g++-11 -std=gnu++17 -O2 -I. -o test_regime_router test/test_regime_router.cpp
// CMake:  cmake --build build --target test_regime_router -j
//         ctest -R regime_router --output-on-failure
#pragma once

#include <cmath>
#include <limits>

namespace dflash::common {

// ─── Input ───────────────────────────────────────────────────────────────────

// All inputs are cheap lexical counts already available in the cascade path.
struct CascadeStats {
    int n_chunks;
    int forced_anchor_only;    // chunks forced by BASE anchors, pre-cascade
    int forced_after_cascade;  // chunks forced AFTER transitive cascade
    int prompt_tokens;         // S
    int keep_floor_chunks;     // ceil(keep_ratio * n_chunks) budget (informational)
};

// ─── Policy ──────────────────────────────────────────────────────────────────

struct RouterPolicy {
    int    threshold_tokens = 32000;  // below this: passthrough
    // DEFAULT disabled: no finite expansion ratio can reach +infinity, so the
    // Throttle branch of decide_regime is unreachable with a default policy.
    double expansion_throttle_ratio = std::numeric_limits<double>::infinity();
    int    min_anchor_chunks = 1;     // don't throttle if too few anchors
};

// ─── Output ──────────────────────────────────────────────────────────────────

enum class Regime { FullCascade, Throttle };

struct RegimeDecision {
    Regime      regime;
    double      expansion_ratio;
    const char* reason;  // points at a string literal; never null
};

// ─── Core function ───────────────────────────────────────────────────────────

// decide_regime — pure, no IO, no globals.
//
// Expansion ratio R = forced_after_cascade / forced_anchor_only
// (defined as 1.0 when forced_anchor_only <= 0 to avoid division by zero).
//
// Returns {regime, R, reason}. The checks run in this order and the first
// match wins:
//   "degenerate"             n_chunks <= 0 or a negative forced count
//   "below_threshold"        prompt_tokens < p.threshold_tokens
//   "too_few_anchors"        forced_anchor_only < p.min_anchor_chunks
//   "cascade_over_expansion" R >= p.expansion_throttle_ratio  → Throttle
//   "default_safe"           otherwise
//
// Throttle is produced ONLY by the "cascade_over_expansion" branch; every
// other path returns FullCascade so the default posture is recall-preserving.
inline RegimeDecision decide_regime(const CascadeStats& s, const RouterPolicy& p) {
    // R is computed up front because every return path reports it, including
    // the degenerate one.
    const double R = (s.forced_anchor_only > 0)
        ? static_cast<double>(s.forced_after_cascade) / s.forced_anchor_only
        : 1.0;

    // Guard: degenerate inputs — return FullCascade, no further processing.
    if (s.n_chunks <= 0 || s.forced_anchor_only < 0 || s.forced_after_cascade < 0)
        return { Regime::FullCascade, R, "degenerate" };

    // Passthrough: prompt too short to compress meaningfully.
    if (s.prompt_tokens < p.threshold_tokens)
        return { Regime::FullCascade, R, "below_threshold" };

    // Guard: too few base anchors — throttle would be meaningless.
    if (s.forced_anchor_only < p.min_anchor_chunks)
        return { Regime::FullCascade, R, "too_few_anchors" };

    // Only transition: cascade expanded beyond the policy limit.
    if (R >= p.expansion_throttle_ratio)
        return { Regime::Throttle, R, "cascade_over_expansion" };

    return { Regime::FullCascade, R, "default_safe" };
}

// ─── V2 Router ───────────────────────────────────────────────────────────────
//
// Adaptive compression router v2.
// Splits on prompt TYPE (agentic vs retrieval) rather than cascade expansion
// ratio R (which was refuted as a keep predictor, Spearman ρ=-0.27).
//
// Additional guards:
//   sparse_prompt_guard — skip compression when new_content_tokens is tiny
//     (plumbing turns: recent orchestration continuity must not be dropped)
//   recency_floor_turns — always keep the last K turns whole in the agentic path
//
// Sentinel for "keep all turns" recency in SAFE decisions.
// `inline constexpr` (C++17) gives the header one definition shared by every
// translation unit instead of a separate internal-linkage copy per TU.
inline constexpr int kRecencyKeepAll = (1 << 20);

// Size-adaptive recency floor sentinel.
// When recency_floor_tokens == kRecencyFloorAuto the compress path computes
//   R = min(1024, ceil(0.04 * prompt_tokens))
// instead of using a fixed token count. 0 = off (no-op default).
inline constexpr int kRecencyFloorAuto = -1;

struct RequestFeatures {
    bool is_agentic;          // tool schemas / tool_use|tool_result blocks present
    int  prompt_tokens;       // total S
    int  new_content_tokens;  // newest turn content size (sparse-plumbing detector)
};

struct RouterPolicyV2 {
    bool   enabled = false;                  // DEFAULT DISABLED → exact no-op
    int    threshold_tokens = 32000;         // below → passthrough
    double agentic_keep_target = 0.25;       // conservative floor, closes empty-failure tail
    double full_keep_target = 1.0;           // retrieval/QA & safe fallbacks
    int    recency_floor_turns = 2;          // keep last K turns whole (continuity)
    int    sparse_new_content_tokens = 256;  // below this → sparse_prompt_guard fires
};

// recency_floor_for — pure helper, no IO.
//
// Returns the concrete token floor for a given prompt size and policy:
//   recency_floor_tokens == 0                       → 0 (off, no-op)
//   recency_floor_tokens == kRecencyFloorAuto (-1)  → min(1024, ceil(0.04 * prompt_tokens)),
//                                                     with a negative prompt_tokens treated as 0
//   recency_floor_tokens  > 0                       → recency_floor_tokens (explicit override)
//   any other negative value                        → 0 (treated as off)
//
// "one turn equivalent" lower-bound: the agentic throttle path in decide_v2
// already reserves recency_floor_turns whole turns; this helper computes the
// token-count floor passed to the compress path for the token-budget guard.
inline int recency_floor_for(int prompt_tokens, int recency_floor_tokens) {
    if (recency_floor_tokens == 0)
        return 0;

    if (recency_floor_tokens == kRecencyFloorAuto) {
        // min(1024, ceil(0.04 * S)) — scales with context, caps at 1024.
        const int clamped_prompt = (prompt_tokens < 0) ? 0 : prompt_tokens;
        const int adaptive = static_cast<int>(
            std::ceil(0.04 * static_cast<double>(clamped_prompt)));
        return (adaptive < 1024) ? adaptive : 1024;
    }

    // Explicit positive override; unknown negative values fall back to off.
    return (recency_floor_tokens > 0) ? recency_floor_tokens : 0;
}

struct RouterDecisionV2 {
    double      keep_target;
    int         recency_floor_turns;
    bool        cascade;
    const char* reason;  // points at a string literal; never null
};

// decide_v2 — pure, no IO, no globals.
//
// SAFE path: keep_target=full_keep_target, recency=kRecencyKeepAll, cascade=true.
// Returns SAFE when (first match wins):
//   - p.enabled == false                                   ("disabled_noop")
//   - f.prompt_tokens <= 0 || f.new_content_tokens < 0     ("degenerate")
//   - f.prompt_tokens < p.threshold_tokens                 ("below_threshold")
//   - f.new_content_tokens < p.sparse_new_content_tokens   ("sparse_prompt_guard")
// Throttling path (only when all guards pass):
//   - is_agentic → {agentic_keep_target, recency_floor_turns, cascade=false}
//   - else       → {full_keep_target,    recency_floor_turns, cascade=true}
inline RouterDecisionV2 decide_v2(const RequestFeatures& f,
                                  const RouterPolicyV2& p) {
    // SAFE return: keep everything, cascade on, recency = keep-all. Built only
    // for the branch that actually fires.
    const auto safe = [&p](const char* reason) -> RouterDecisionV2 {
        return { p.full_keep_target, kRecencyKeepAll, true, reason };
    };

    // 1. Deploy no-op: disabled router is an exact no-op (correct-by-construction).
    if (!p.enabled)
        return safe("disabled_noop");

    // 2. Degenerate inputs: prompt_tokens <= 0 or new_content_tokens < 0.
    if (f.prompt_tokens <= 0 || f.new_content_tokens < 0)
        return safe("degenerate");

    // 3. Below threshold: prompt too short to compress meaningfully.
    if (f.prompt_tokens < p.threshold_tokens)
        return safe("below_threshold");

    // 4. Sparse-prompt guard: tiny new-content turn (plumbing class).
    //    Compression would drop recent orchestration continuity with no anchor signal.
    if (f.new_content_tokens < p.sparse_new_content_tokens)
        return safe("sparse_prompt_guard");

    // 5. Throttling paths — all guards passed.
    if (f.is_agentic)
        return { p.agentic_keep_target, p.recency_floor_turns, false, "agentic_throttle" };

    return { p.full_keep_target, p.recency_floor_turns, true, "retrieval_full" };
}

}  // namespace dflash::common
+ { + const char * rfv = std::getenv("PFLASH_RECENCY_FLOOR_TOKENS"); + if (rfv) { + c.recency_floor_tokens = std::atoi(rfv); // preserves -1 sentinel + } else { + c.recency_floor_tokens = 0; + } + } + return c; } @@ -650,6 +669,19 @@ static std::vector qwen35_score_and_compress( std::vector query_pool(ids.begin() + q0, ids.end()); std::vector forced((size_t)n_chunks, 0); + // Recency floor: force-keep the last R tokens worth of chunks before anchor + // scoring so that recent wiring-sequence turns are never dropped regardless + // of anchor seed quality. R=0 is a no-op (default). R=-1 = auto. + { + const int R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens); + if (R > 0) { + const int floor_tok = std::min(S, R); + const int floor_start_tok = S - floor_tok; + const int floor_start_chunk = floor_start_tok / chunk_size; + for (int c = floor_start_chunk; c < n_chunks; ++c) forced[(size_t)c] = 1; + } + } + dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor; anchor_cfg.chunk_size = chunk_size; @@ -818,13 +850,28 @@ std::vector drafter_score_and_compress( for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) forced[(size_t)c] = 1; for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1; + // Recency floor: force-keep the last R tokens worth of chunks before anchor + // scoring so that recent wiring-sequence turns are never dropped regardless + // of anchor seed quality. R=0 is a no-op (default). R=-1 = auto. 
+ { + const int R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens); + if (R > 0) { + const int floor_tok = std::min(S, R); + const int floor_start_tok = S - floor_tok; + const int floor_start_chunk = floor_start_tok / chunk_size; + for (int c = floor_start_chunk; c < n_chunks; ++c) forced[(size_t)c] = 1; + } + } + const int q0 = std::max(0, S - cfg.query_tokens); { + const int resolved_R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens); std::vector query_pool(ids.begin() + q0, ids.end()); dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor; anchor_cfg.chunk_size = chunk_size; - std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d\n", - n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count); + std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d recency_floor=%d (resolved=%d)\n", + n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count, + cfg.recency_floor_tokens, resolved_R); std::fflush(stderr); if (cfg.use_transitive) { diff --git a/server/test/test_regime_router.cpp b/server/test/test_regime_router.cpp new file mode 100644 index 000000000..f92a0b512 --- /dev/null +++ b/server/test/test_regime_router.cpp @@ -0,0 +1,715 @@ +// Unit tests for dflash::common::decide_regime() — pure function, no GPU. 
+// +// Build (standalone, from repo root): +// g++-11 -std=gnu++17 -O2 -I server/src/common +// -o /tmp/test_regime_router server/test/test_regime_router.cpp +// CMake: +// cmake --build build --target test_regime_router -j +// ctest -R regime_router --output-on-failure + +#include "regime_router.h" + +#include +#include +#include +#include + +using namespace dflash::common; + +// ─── Minimal test framework (mirrors test_adaptive_keep_ratio.cpp) ─────────── + +static int test_failures = 0; +static int test_count = 0; + +#define TEST_ASSERT(expr) do { \ + test_count++; \ + if (!(expr)) { \ + test_failures++; \ + std::fprintf(stderr, " FAIL: %s:%d: %s\n", __FILE__, __LINE__, #expr); \ + } \ +} while (0) + +#define TEST_ASSERT_MSG(expr, msg) do { \ + test_count++; \ + if (!(expr)) { \ + test_failures++; \ + std::fprintf(stderr, " FAIL: %s:%d: %s -- %s\n", \ + __FILE__, __LINE__, #expr, msg); \ + } \ +} while (0) + +#define RUN_TEST(fn) do { \ + std::fprintf(stderr, " %s ...", #fn); \ + int before = test_failures; \ + fn(); \ + if (test_failures == before) std::fprintf(stderr, " ok\n"); \ + else std::fprintf(stderr, "\n"); \ +} while (0) + +static inline bool approx_eq(double a, double b, double eps = 1e-9) { + return std::fabs(a - b) < eps; +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +// Build a policy with expansion_throttle_ratio disabled (default safe). +static RouterPolicy default_policy() { return {}; } + +// Build a policy that throttles at ratio >= r. 
+static RouterPolicy throttle_policy(double r, + int threshold = 32000, + int min_anchor = 1) { + RouterPolicy p; + p.threshold_tokens = threshold; + p.expansion_throttle_ratio = r; + p.min_anchor_chunks = min_anchor; + return p; +} + +static CascadeStats make_stats(int n_chunks, + int anchor_only, + int after_cascade, + int prompt_tokens, + int keep_floor = 0) { + return { n_chunks, anchor_only, after_cascade, prompt_tokens, keep_floor }; +} + +// ─── T1: DEPLOY-NO-OP ──────────────────────────────────────────────────────── +// With the DEFAULT RouterPolicy (ratio=INFINITY), decide_regime must return +// FullCascade for ANY stats, including pathologically large expansion. + +static void t1_deploy_noop() { + RouterPolicy p = default_policy(); + + // Normal case + { + auto d = decide_regime(make_stats(100, 10, 20, 50000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T1a: default policy must always give FullCascade"); + } + // Huge expansion: forced_anchor_only=10, forced_after_cascade=1000, prompt=100K + { + auto d = decide_regime(make_stats(500, 10, 1000, 100000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T1b: huge expansion with default policy must be FullCascade"); + } + // Prompt below threshold + { + auto d = decide_regime(make_stats(50, 5, 500, 1000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T1c: short prompt with default policy must be FullCascade"); + } + // Zero anchors + { + auto d = decide_regime(make_stats(100, 0, 0, 100000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T1d: zero anchors with default policy must be FullCascade"); + } + // Sweep: 50 random-ish stat combinations + for (int i = 1; i <= 50; ++i) { + CascadeStats s = make_stats(i * 10, + i, + i * 100, // R = 100, very high + i * 5000); + auto d = decide_regime(s, p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T1-sweep: default policy must be FullCascade for all stats"); + } +} + +// ─── T2: DEGENERATE 
────────────────────────────────────────────────────────── +// Degenerate inputs must not crash or div-by-zero, and must return FullCascade. + +static void t2_degenerate() { + RouterPolicy p = throttle_policy(2.0); // would throttle if R >= 2 + + // n_chunks == 0 + { + auto d = decide_regime(make_stats(0, 5, 10, 100000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T2a: n_chunks=0 must return FullCascade"); + TEST_ASSERT_MSG(std::isfinite(d.expansion_ratio), + "T2a: expansion_ratio must be finite when n_chunks=0"); + } + // forced_anchor_only == 0 (no anchors before cascade) → R defaults to 1.0 + { + auto d = decide_regime(make_stats(100, 0, 50, 100000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T2b: forced_anchor_only=0 must return FullCascade"); + TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0), + "T2b: expansion_ratio must be 1.0 when forced_anchor_only=0"); + } + // Negative forced_anchor_only + { + auto d = decide_regime(make_stats(100, -1, 50, 100000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T2c: negative forced_anchor_only must return FullCascade"); + TEST_ASSERT_MSG(std::isfinite(d.expansion_ratio), + "T2c: expansion_ratio must be finite for negative anchor count"); + } + // Negative forced_after_cascade + { + auto d = decide_regime(make_stats(100, 5, -1, 100000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T2d: negative forced_after_cascade must return FullCascade"); + } + // Both negative + { + auto d = decide_regime(make_stats(100, -3, -7, 100000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T2e: both counts negative must return FullCascade"); + } +} + +// ─── T3: BELOW-THRESHOLD ───────────────────────────────────────────────────── +// prompt_tokens < threshold → FullCascade regardless of R and finite ratio. 
+ +static void t3_below_threshold() { + RouterPolicy p = throttle_policy(1.5, /*threshold=*/32000, /*min_anchor=*/1); + + // prompt = threshold - 1 (just below) + { + auto d = decide_regime(make_stats(100, 10, 1000, 31999), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T3a: prompt just below threshold must be FullCascade"); + TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold", + "T3a: reason must be 'below_threshold'"); + } + // prompt = 0 + { + auto d = decide_regime(make_stats(100, 10, 9999, 0), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T3b: prompt=0 must be FullCascade"); + } + // Even with R = 1000 and finite ratio = 2.0, still FullCascade below threshold + { + auto d = decide_regime(make_stats(200, 5, 5000, 100), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T3c: tiny prompt, huge R, finite ratio -> FullCascade"); + } +} + +// ─── T4: TOO-FEW-ANCHORS ───────────────────────────────────────────────────── +// forced_anchor_only < min_anchor_chunks → FullCascade. 
+ +static void t4_too_few_anchors() { + RouterPolicy p = throttle_policy(2.0, /*threshold=*/32000, /*min_anchor=*/3); + // forced_anchor_only = 2 < min_anchor = 3 + { + auto d = decide_regime(make_stats(100, 2, 1000, 50000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T4a: anchors below min must be FullCascade"); + } + // forced_anchor_only = 0 < min_anchor = 3 + { + auto d = decide_regime(make_stats(100, 0, 500, 50000), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T4b: zero anchors below min must be FullCascade"); + } + // forced_anchor_only = 3 == min_anchor = 3: NOT too few → may throttle + { + auto d = decide_regime(make_stats(100, 3, 300, 50000), p); + // R = 300/3 = 100 >= 2.0 → should be Throttle + TEST_ASSERT_MSG(d.regime == Regime::Throttle, + "T4c: anchors == min AND R >= ratio must throttle"); + } +} + +// ─── T5: MONOTONE ──────────────────────────────────────────────────────────── +// With a finite ratio policy, once Throttle triggers at R it must stay Throttle +// for all larger R. 
+ +static void t5_monotone() { + // Policy: ratio=3.0, threshold=32000, min_anchor=1, prompt_tokens=50000 + RouterPolicy p = throttle_policy(3.0, 32000, 1); + const int prompt = 50000; + const int anchor = 10; // fixed; vary after_cascade to control R + + // R = 2.9 → FullCascade + { + // after = anchor * R = 10 * 2.9 = 29 + auto d = decide_regime(make_stats(100, anchor, 29, prompt), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T5a: R=2.9 < 3.0 must be FullCascade"); + } + // R = 3.0 → Throttle (boundary: >= triggers) + { + // after = 10 * 3 = 30 + auto d = decide_regime(make_stats(100, anchor, 30, prompt), p); + TEST_ASSERT_MSG(d.regime == Regime::Throttle, + "T5b: R=3.0 == ratio must be Throttle"); + } + // R = 10.0 → Throttle + { + auto d = decide_regime(make_stats(100, anchor, 100, prompt), p); + TEST_ASSERT_MSG(d.regime == Regime::Throttle, + "T5c: R=10.0 >> ratio must be Throttle"); + } + // Monotone sweep: for all integer R from 1 to 100, once Throttle appears + // it must not flip back to FullCascade. + bool seen_throttle = false; + bool monotone = true; + for (int r_int = 1; r_int <= 100; ++r_int) { + // after = anchor * r_int → exact integer R + auto d = decide_regime(make_stats(200, anchor, anchor * r_int, prompt), p); + if (d.regime == Regime::Throttle) { + seen_throttle = true; + } else if (seen_throttle) { + // Flipped back to FullCascade after Throttle was seen: not monotone + monotone = false; + std::fprintf(stderr, + " MONOTONE VIOLATION at R=%d: Throttle then FullCascade\n", + r_int); + break; + } + } + TEST_ASSERT_MSG(seen_throttle, "T5d: sweep must trigger Throttle at some R"); + TEST_ASSERT_MSG(monotone, "T5e: regime must be monotone (no FullCascade after Throttle)"); +} + +// ─── T6: BOUNDARY ──────────────────────────────────────────────────────────── +// R exactly == ratio → Throttle; R = ratio - epsilon → FullCascade. 
+ +static void t6_boundary() { + const double ratio = 5.0; + RouterPolicy p = throttle_policy(ratio, 32000, 1); + const int anchor = 1000; // use large anchor to get precise integer ratios + const int prompt = 50000; + + // R exactly == ratio: after = anchor * ratio = 5000 + { + auto d = decide_regime(make_stats(500, anchor, anchor * (int)ratio, prompt), p); + TEST_ASSERT_MSG(d.regime == Regime::Throttle, + "T6a: R exactly == ratio must be Throttle"); + } + + // R = ratio - epsilon where epsilon = 0.5/anchor (one less chunk → R < ratio) + { + // after = anchor * ratio - 1 = 4999 → R = 4.999 < 5.0 + auto d = decide_regime(make_stats(500, anchor, anchor * (int)ratio - 1, prompt), p); + TEST_ASSERT_MSG(d.regime == Regime::FullCascade, + "T6b: R just below ratio must be FullCascade"); + } +} + +// ─── T7: RATIO-VALUE ───────────────────────────────────────────────────────── +// Check that expansion_ratio is computed correctly. + +static void t7_ratio_value() { + RouterPolicy p = default_policy(); // regime doesn't matter; check ratio value + + // forced_anchor_only=10, forced_after_cascade=85 → R = 8.5 + { + auto d = decide_regime(make_stats(100, 10, 85, 50000), p); + TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 8.5), + "T7a: R must be 85/10 = 8.5"); + } + // forced_anchor_only=0 → R must be 1.0 (no div-by-zero) + { + auto d = decide_regime(make_stats(100, 0, 50, 50000), p); + TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0), + "T7b: forced_anchor_only=0 must give expansion_ratio=1.0"); + } + // forced_anchor_only=5, forced_after_cascade=5 → R = 1.0 + { + auto d = decide_regime(make_stats(100, 5, 5, 50000), p); + TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0), + "T7c: equal anchors before/after must give R=1.0"); + } + // forced_anchor_only=7, forced_after_cascade=7 → R = 1.0 (no expansion) + { + auto d = decide_regime(make_stats(100, 7, 7, 50000), p); + TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0), + "T7d: no cascade expansion must give R=1.0"); + } + // 
Verify ratio when throttle policy triggers: ratio value should still be correct + { + RouterPolicy tp = throttle_policy(3.0); + auto d = decide_regime(make_stats(100, 4, 20, 50000), tp); + // R = 20/4 = 5.0 → Throttle, ratio = 5.0 + TEST_ASSERT_MSG(d.regime == Regime::Throttle, + "T7e: R=5.0 >= 3.0 must throttle"); + TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 5.0), + "T7e: expansion_ratio must be 5.0"); + } +} + +// ─── V2 helpers ────────────────────────────────────────────────────────────── + +// Default v2 policy: disabled (deploy no-op). +static RouterPolicyV2 default_v2_policy() { return {}; } + +// Enabled v2 policy with default field values. +static RouterPolicyV2 enabled_v2_policy() { + RouterPolicyV2 p; + p.enabled = true; + return p; +} + +static RequestFeatures make_features(bool is_agentic, + int prompt_tokens, + int new_content_tokens) { + return { is_agentic, prompt_tokens, new_content_tokens }; +} + +// ─── T8: DEPLOY-NO-OP (v2) ─────────────────────────────────────────────────── +// enabled=false → SAFE for every input, including is_agentic=true and huge prompts. +// Correct-by-construction: disabled router must be an exact no-op. + +static void t8_v2_deploy_noop() { + RouterPolicyV2 p = default_v2_policy(); // enabled=false + + // Baseline: normal agentic prompt, well above threshold. + { + auto d = decide_v2(make_features(true, 100000, 10000), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T8a: disabled->keep_target must be full_keep_target"); + TEST_ASSERT_MSG(d.cascade, + "T8a: disabled->cascade must be true"); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T8a: disabled->recency must be keep-all sentinel"); + } + // Sweep: all combinations of is_agentic, varying prompt and new_content sizes. + for (int i = 0; i < 4; ++i) { + bool agentic = (i & 1) != 0; + int prompt = (i & 2) ? 100000 : 500; + int new_toks = (i & 2) ? 
10000 : 10; + auto d = decide_v2(make_features(agentic, prompt, new_toks), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T8-sweep: disabled->keep_target must be full_keep_target"); + TEST_ASSERT_MSG(d.cascade, + "T8-sweep: disabled->cascade must be true"); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T8-sweep: disabled->recency must be keep-all sentinel"); + } + // Explicitly: is_agentic=true, large prompt, large new_content — must be SAFE. + { + auto d = decide_v2(make_features(true, 200000, 50000), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T8b: disabled, agentic, huge prompt -> SAFE"); + TEST_ASSERT_MSG(d.cascade, "T8b: disabled -> cascade=true"); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T8b: disabled -> recency keep-all"); + } +} + +// ─── T9: SPARSE-PROMPT GUARD (failure-class fix) ───────────────────────────── +// is_agentic=true, prompt above threshold, BUT new_content < sparse threshold. +// This is the LONG_A-t11/LONG_B-t10 plumbing class: a tiny tool_result riding +// on long history. Compression must NOT throttle here (would drop continuity). + +static void t9_sparse_prompt_guard() { + RouterPolicyV2 p = enabled_v2_policy(); + + // Canonical failure case: 3-word tool_result on 43K history. + { + auto d = decide_v2(make_features(true, 43000, 8), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T9a: sparse agentic turn must be SAFE (full keep), not throttled"); + TEST_ASSERT_MSG(d.cascade, + "T9a: sparse_prompt_guard must cascade=true (SAFE)"); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T9a: sparse_prompt_guard -> recency keep-all"); + TEST_ASSERT_MSG(std::string(d.reason) == "sparse_prompt_guard", + "T9a: reason must be 'sparse_prompt_guard'"); + } + // new_content = sparse_new_content_tokens - 1 (just below the guard). 
+ { + auto d = decide_v2(make_features(true, 50000, p.sparse_new_content_tokens - 1), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T9b: new_content just below sparse threshold -> SAFE"); + TEST_ASSERT_MSG(std::string(d.reason) == "sparse_prompt_guard", + "T9b: reason must be 'sparse_prompt_guard'"); + } + // new_content = 0 (degenerate new turn, still sparse guard NOT degenerate path). + // Note: 0 < sparse_new_content_tokens (256) so sparse guard fires first. + { + auto d = decide_v2(make_features(true, 40000, 0), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T9c: new_content=0 -> SAFE (sparse guard or degenerate, both SAFE)"); + } + // Confirm: new_content = sparse_new_content_tokens (AT the boundary → NOT sparse). + // is_agentic=true above threshold with enough new content → throttle kicks in. + { + auto d = decide_v2(make_features(true, 50000, p.sparse_new_content_tokens), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.agentic_keep_target), + "T9d: new_content==sparse threshold -> agentic throttle applies"); + TEST_ASSERT_MSG(!d.cascade, + "T9d: agentic throttle -> cascade=false"); + } +} + +// ─── T10: AGENTIC-THROTTLE ─────────────────────────────────────────────────── +// enabled, is_agentic=true, prompt > threshold, new_content > sparse threshold +// → keep_target=agentic_keep_target, cascade=false, recency >= 1. 
+ +static void t10_agentic_throttle() { + RouterPolicyV2 p = enabled_v2_policy(); + + { + auto d = decide_v2(make_features(true, 40000, 3000), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.agentic_keep_target), + "T10a: agentic throttle -> keep_target=agentic_keep_target"); + TEST_ASSERT_MSG(!d.cascade, + "T10a: agentic throttle -> cascade=false"); + TEST_ASSERT_MSG(d.recency_floor_turns == p.recency_floor_turns, + "T10a: agentic throttle -> recency matches policy"); + TEST_ASSERT_MSG(d.recency_floor_turns >= 1, + "T10a: recency_floor_turns must be >= 1 (continuity guaranteed)"); + TEST_ASSERT_MSG(std::string(d.reason) == "agentic_throttle", + "T10a: reason must be 'agentic_throttle'"); + } + // Custom policy: verify fields propagate. + { + RouterPolicyV2 p2 = p; + p2.agentic_keep_target = 0.30; + p2.recency_floor_turns = 5; + auto d = decide_v2(make_features(true, 60000, 1000), p2); + TEST_ASSERT_MSG(approx_eq(d.keep_target, 0.30), + "T10b: custom agentic_keep_target propagated"); + TEST_ASSERT_MSG(d.recency_floor_turns == 5, + "T10b: custom recency_floor_turns propagated"); + } +} + +// ─── T11: RETRIEVAL-FULL ───────────────────────────────────────────────────── +// enabled, is_agentic=false, prompt > threshold, new_content > sparse threshold +// → cascade=true, keep_target=full_keep_target. + +static void t11_retrieval_full() { + RouterPolicyV2 p = enabled_v2_policy(); + + { + auto d = decide_v2(make_features(false, 40000, 3000), p); + TEST_ASSERT_MSG(d.cascade, + "T11a: retrieval -> cascade=true"); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T11a: retrieval -> keep_target=full_keep_target"); + TEST_ASSERT_MSG(std::string(d.reason) == "retrieval_full", + "T11a: reason must be 'retrieval_full'"); + } + // Custom full_keep_target. 
+ { + RouterPolicyV2 p2 = p; + p2.full_keep_target = 0.80; + auto d = decide_v2(make_features(false, 50000, 5000), p2); + TEST_ASSERT_MSG(approx_eq(d.keep_target, 0.80), + "T11b: custom full_keep_target propagated"); + TEST_ASSERT_MSG(d.cascade, "T11b: retrieval -> cascade=true"); + } +} + +// ─── T12: BELOW-THRESHOLD (v2) ─────────────────────────────────────────────── +// prompt_tokens < threshold_tokens → SAFE regardless of is_agentic and new_content. + +static void t12_v2_below_threshold() { + RouterPolicyV2 p = enabled_v2_policy(); + + // Agentic, just below threshold, plenty of new content. + { + auto d = decide_v2(make_features(true, p.threshold_tokens - 1, 5000), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T12a: agentic, below threshold -> SAFE"); + TEST_ASSERT_MSG(d.cascade, + "T12a: below threshold -> cascade=true"); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T12a: below threshold -> recency keep-all"); + TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold", + "T12a: reason must be 'below_threshold'"); + } + // Non-agentic, at threshold boundary - 1. + { + auto d = decide_v2(make_features(false, p.threshold_tokens - 1, 5000), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T12b: non-agentic, below threshold -> SAFE"); + } + // Custom threshold. + { + RouterPolicyV2 p2 = p; + p2.threshold_tokens = 10000; + auto d = decide_v2(make_features(true, 9999, 2000), p2); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p2.full_keep_target), + "T12c: custom threshold, below it -> SAFE"); + TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold", + "T12c: reason must be 'below_threshold'"); + } +} + +// ─── T13: RECENCY-FLOOR INVARIANT ──────────────────────────────────────────── +// In every throttling decision (non-SAFE), recency_floor_turns >= 1. +// In every SAFE decision, recency_floor_turns >= kRecencyKeepAll. 
+ +static void t13_recency_floor_invariant() { + RouterPolicyV2 p = enabled_v2_policy(); + + // Throttle path (agentic): recency >= 1. + { + auto d = decide_v2(make_features(true, 50000, 1000), p); + TEST_ASSERT_MSG(!approx_eq(d.keep_target, p.full_keep_target) || + d.recency_floor_turns >= 1, + "T13a: throttled decision must have recency >= 1"); + TEST_ASSERT_MSG(d.recency_floor_turns >= 1, + "T13a: agentic throttle recency_floor_turns >= 1 (continuity)"); + } + // SAFE paths: recency must be keep-all. + // disabled + { + RouterPolicyV2 pd; pd.enabled = false; + auto d = decide_v2(make_features(true, 50000, 1000), pd); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T13b: disabled SAFE recency must be keep-all"); + } + // sparse_prompt_guard + { + auto d = decide_v2(make_features(true, 50000, 5), p); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T13c: sparse_prompt_guard SAFE recency must be keep-all"); + } + // below_threshold + { + auto d = decide_v2(make_features(true, 1000, 500), p); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T13d: below_threshold SAFE recency must be keep-all"); + } + // retrieval_full path: recency = policy value (not keep-all, it's a throttle-adjacent path) + { + auto d = decide_v2(make_features(false, 50000, 1000), p); + TEST_ASSERT_MSG(d.recency_floor_turns >= 1, + "T13e: retrieval_full recency >= 1"); + } + // Custom recency_floor_turns: verify agentic propagates it. + for (int k = 1; k <= 10; ++k) { + RouterPolicyV2 pk = p; + pk.recency_floor_turns = k; + auto d = decide_v2(make_features(true, 50000, 1000), pk); + TEST_ASSERT_MSG(d.recency_floor_turns == k, + "T13f: agentic throttle recency must equal policy recency_floor_turns"); + } +} + +// ─── T14: DEGENERATE (v2) ──────────────────────────────────────────────────── +// prompt_tokens <= 0 or new_content_tokens < 0 → SAFE (no crash, no garbage). 
+ +static void t14_v2_degenerate() { + RouterPolicyV2 p = enabled_v2_policy(); + + // prompt_tokens = 0 + { + auto d = decide_v2(make_features(true, 0, 500), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T14a: prompt_tokens=0 -> SAFE"); + TEST_ASSERT_MSG(d.cascade, "T14a: degenerate -> cascade=true"); + TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, + "T14a: degenerate -> recency keep-all"); + TEST_ASSERT_MSG(std::string(d.reason) == "degenerate", + "T14a: reason must be 'degenerate'"); + } + // prompt_tokens < 0 + { + auto d = decide_v2(make_features(false, -1, 100), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T14b: negative prompt_tokens -> SAFE"); + TEST_ASSERT_MSG(std::string(d.reason) == "degenerate", + "T14b: reason must be 'degenerate'"); + } + // new_content_tokens < 0 + { + auto d = decide_v2(make_features(true, 50000, -1), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T14c: negative new_content_tokens -> SAFE"); + TEST_ASSERT_MSG(std::string(d.reason) == "degenerate", + "T14c: reason must be 'degenerate'"); + } + // Both degenerate + { + auto d = decide_v2(make_features(true, -5, -10), p); + TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), + "T14d: both degenerate -> SAFE"); + } +} + +// ─── T15: RECENCY_FLOOR_FOR — off ──────────────────────────────────────────── +// recency_floor_tokens == 0 → always 0 regardless of prompt size. 
+ +static void t15_recency_floor_off() { + // 0 → off + TEST_ASSERT_MSG(recency_floor_for(0, 0) == 0, "T15a: S=0 R=0 -> 0"); + TEST_ASSERT_MSG(recency_floor_for(1000, 0) == 0, "T15b: S=1000 R=0 -> 0"); + TEST_ASSERT_MSG(recency_floor_for(100000, 0) == 0, "T15c: S=100K R=0 -> 0"); + // Negative R (shouldn't happen but must be safe) + TEST_ASSERT_MSG(recency_floor_for(10000, -2) == 0, "T15d: negative R (not sentinel) -> 0"); +} + +// ─── T16: RECENCY_FLOOR_FOR — auto ─────────────────────────────────────────── +// kRecencyFloorAuto (-1) → min(1024, ceil(0.04 * S)). + +static void t16_recency_floor_auto() { + const int A = kRecencyFloorAuto; + + // S=0: ceil(0.04*0)=0 + TEST_ASSERT_MSG(recency_floor_for(0, A) == 0, "T16a: S=0 auto -> 0"); + // S=1000: ceil(0.04*1000)=40 + TEST_ASSERT_MSG(recency_floor_for(1000, A) == 40, "T16b: S=1000 auto -> 40"); + // S=10000: ceil(0.04*10000)=400 + TEST_ASSERT_MSG(recency_floor_for(10000, A) == 400, "T16c: S=10K auto -> 400"); + // S=25000: ceil(0.04*25000)=1000 + TEST_ASSERT_MSG(recency_floor_for(25000, A) == 1000, "T16d: S=25K auto -> 1000"); + // S=25001: ceil(0.04*25001)=1001 but capped at 1024 + // actually 0.04*25001=1000.04 → ceil=1001 < 1024 → 1001 + TEST_ASSERT_MSG(recency_floor_for(25001, A) == 1001, "T16e: S=25001 auto -> 1001"); + // S=25600: 0.04*25600=1024.0 → ceil=1024 + TEST_ASSERT_MSG(recency_floor_for(25600, A) == 1024, "T16f: S=25600 auto -> 1024"); + // S=26000: 0.04*26000=1040 → ceil=1040 but capped at 1024 + TEST_ASSERT_MSG(recency_floor_for(26000, A) == 1024, "T16g: S=26000 auto -> cap 1024"); + // S=100000: 0.04*100000=4000 → capped at 1024 + TEST_ASSERT_MSG(recency_floor_for(100000, A) == 1024, "T16h: S=100K auto -> cap 1024"); + // S=-1: negative prompt treated as 0 → 0 + TEST_ASSERT_MSG(recency_floor_for(-1, A) == 0, "T16i: S=-1 auto -> 0"); +} + +// ─── T17: RECENCY_FLOOR_FOR — explicit ─────────────────────────────────────── +// Any explicit positive value is returned unchanged (no prompt-size 
influence). + +static void t17_recency_floor_explicit() { + // Explicit override ignores prompt size + TEST_ASSERT_MSG(recency_floor_for(1000, 512) == 512, "T17a: explicit 512"); + TEST_ASSERT_MSG(recency_floor_for(100000, 512) == 512, "T17b: explicit 512, large S"); + TEST_ASSERT_MSG(recency_floor_for(1000, 1024) == 1024, "T17c: explicit 1024"); + TEST_ASSERT_MSG(recency_floor_for(1000, 2048) == 2048, "T17d: explicit 2048 > cap"); + TEST_ASSERT_MSG(recency_floor_for(0, 256) == 256, "T17e: explicit 256, S=0"); + // Monotone: explicit > auto at short prompts + const int A = kRecencyFloorAuto; + TEST_ASSERT_MSG(recency_floor_for(1000, 512) > recency_floor_for(1000, A), + "T17f: explicit 512 > auto(1000)=40"); +} + +// ─── main ───────────────────────────────────────────────────────────────────── + +int main() { + std::fprintf(stderr, "=== test_regime_router ===\n"); + + RUN_TEST(t1_deploy_noop); + RUN_TEST(t2_degenerate); + RUN_TEST(t3_below_threshold); + RUN_TEST(t4_too_few_anchors); + RUN_TEST(t5_monotone); + RUN_TEST(t6_boundary); + RUN_TEST(t7_ratio_value); + + std::fprintf(stderr, "--- v2 ---\n"); + RUN_TEST(t8_v2_deploy_noop); + RUN_TEST(t9_sparse_prompt_guard); + RUN_TEST(t10_agentic_throttle); + RUN_TEST(t11_retrieval_full); + RUN_TEST(t12_v2_below_threshold); + RUN_TEST(t13_recency_floor_invariant); + RUN_TEST(t14_v2_degenerate); + + std::fprintf(stderr, "--- recency_floor_for ---\n"); + RUN_TEST(t15_recency_floor_off); + RUN_TEST(t16_recency_floor_auto); + RUN_TEST(t17_recency_floor_explicit); + + std::fprintf(stderr, "\n%d tests, %d failures\n", test_count, test_failures); + return (test_failures == 0) ? 
0 : 1; +} From b31544f0e160c2879a5ffd89dcd3989fcc6f1a87 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Sat, 30 May 2026 22:12:22 +0200 Subject: [PATCH 15/16] feat(pflash): wire type-gate router into live handler; prune disproven mechanisms Wires the compression-regime router into the request path (default-OFF via PFLASH_ROUTER_ENABLE, so this commit is a no-op until enabled) and prunes the router down to only what this session validated. Net -433 lines. Live path (when enabled): - detect agentic vs retrieval at admission (tool schemas / tool_use|tool_result blocks / tool_calls present). JSON-walking lives at the handler boundary (http_server.cpp); the router header stays pure (stdlib-only). - agentic -> cascade off, keep_target 0.25 (~3x prefill/wall, content permitting) - retrieval -> cascade on, full keep - per-request use_transitive threaded through the compress path. Pruned (mechanisms disproven this session, see bench/2026-05-30_*): - v1 cascade-expansion router (R): refuted as keep predictor, Spearman rho=-0.27. - recency-token floor: validated zero-sum at fixed keep (displaces top-K middle context, breaks controls). Removed from header + drafter. - decide_v2 sparse-prompt guard (new_content_tokens): does not separate fails-from-passes under compression. regime_router.h: 271 -> 96 LOC, zero non-stdlib includes, standalone -Werror, 43 unit tests (decide_v2 type-gate + detect_request_type truth-table). Known follow-ups (NOT in this commit): (1) hard empty/degenerate safety guard (result.tokens.size()/degenerate_decode_close already at the update site) to catch empty responses; (2) reconcile bandit keep range [0.025,0.20] vs router floor 0.25 (bandit currently wins per-session); (3) confident-off-task closure needs client/app feedback, not a label-free reward (Momus-confirmed). 
--- server/src/common/model_backend.h | 4 + server/src/common/regime_router.h | 195 +++----- server/src/qwen3/qwen3_backend.cpp | 4 +- server/src/qwen3/qwen3_drafter.cpp | 84 ++-- server/src/qwen3/qwen3_drafter.h | 24 +- server/src/qwen35/qwen35_backend.cpp | 4 +- server/src/server/http_server.cpp | 72 ++- server/src/server/http_server.h | 6 + server/src/server/server_main.cpp | 16 + server/test/test_regime_router.cpp | 644 +++++---------------------- 10 files changed, 310 insertions(+), 743 deletions(-) diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h index f2a863418..0d8a85d5b 100644 --- a/server/src/common/model_backend.h +++ b/server/src/common/model_backend.h @@ -177,6 +177,10 @@ struct ModelBackend { std::string drafter_path; // GGUF path (for lazy-load) int drafter_gpu = 0; // backend-local GPU for PFlash drafter bool skip_park = false; // true on >=32GB GPUs + // Per-request transitive-cascade override (-1 = use env default). + // 0 = off (agentic path: suppress cascade to avoid anchor bloat). + // 1 = on (retrieval path: full expansion, same as today). + int use_transitive = -1; }; struct CompressResult { diff --git a/server/src/common/regime_router.h b/server/src/common/regime_router.h index 7c9917eb7..426c0c985 100644 --- a/server/src/common/regime_router.h +++ b/server/src/common/regime_router.h @@ -1,12 +1,13 @@ -// Pure, correct-by-construction adaptive compression-regime router. -// No IO, no globals, no GPU, no ggml/llama deps — header-only. +// Adaptive compression-regime router v2. +// No IO, no globals, no GPU, no ggml/llama deps — header-only, stdlib-only. // -// Decides whether the transitive anchor cascade should run at full expansion -// (FullCascade, recall-preserving default) or be throttled -// (Throttle, fires ONLY when expansion_ratio >= policy threshold). +// Splits on prompt TYPE (agentic vs retrieval). +// V1 R-router (cascade expansion ratio) was refuted as a keep predictor (ρ=-0.27). 
+// Sparse-prompt guard and recency floor were validated zero-sum; removed. // // Build (standalone): -// g++-11 -std=gnu++17 -O2 -I. -o test_regime_router test/test_regime_router.cpp +// g++-11 -std=gnu++17 -O2 -I server/src/common +// -o /tmp/test_regime_router server/test/test_regime_router.cpp // CMake: cmake --build build --target test_regime_router -j // ctest -R regime_router --output-on-failure #pragma once @@ -16,176 +17,80 @@ namespace dflash::common { -// ─── Input ─────────────────────────────────────────────────────────────────── - -// All inputs are cheap lexical counts already available in the cascade path. -struct CascadeStats { - int n_chunks; - int forced_anchor_only; // chunks forced by BASE anchors, pre-cascade - int forced_after_cascade; // chunks forced AFTER transitive cascade - int prompt_tokens; // S - int keep_floor_chunks; // ceil(keep_ratio * n_chunks) budget (informational) -}; - -// ─── Policy ────────────────────────────────────────────────────────────────── - -struct RouterPolicy { - int threshold_tokens = 32000; // below this: passthrough - double expansion_throttle_ratio = INFINITY; // DEFAULT disabled - int min_anchor_chunks = 1; // don't throttle if too few anchors -}; - -// ─── Output ────────────────────────────────────────────────────────────────── - -enum class Regime { FullCascade, Throttle }; - -struct RegimeDecision { - Regime regime; - double expansion_ratio; - const char* reason; -}; - -// ─── Core function ─────────────────────────────────────────────────────────── - -// decide_regime — pure, no IO, no globals. -// -// Expansion ratio R = forced_after_cascade / forced_anchor_only -// (defined as 1.0 when forced_anchor_only == 0 to avoid division by zero). -// -// Transition to Throttle ONLY on the last branch; every other path returns -// FullCascade so the default deployment posture is recall-preserving. 
-inline RegimeDecision decide_regime(const CascadeStats& s, const RouterPolicy& p) { - // Compute R first (needed for degenerate guard + return value). - const double R = (s.forced_anchor_only > 0) - ? static_cast(s.forced_after_cascade) / s.forced_anchor_only - : 1.0; - - // Guard: degenerate inputs — return FullCascade, no further processing. - if (s.n_chunks <= 0 || s.forced_anchor_only < 0 || s.forced_after_cascade < 0) - return { Regime::FullCascade, R, "degenerate" }; - - // Passthrough: prompt too short to compress meaningfully. - if (s.prompt_tokens < p.threshold_tokens) - return { Regime::FullCascade, R, "below_threshold" }; - - // Guard: too few base anchors — throttle would be meaningless. - if (s.forced_anchor_only < p.min_anchor_chunks) - return { Regime::FullCascade, R, "too_few_anchors" }; - - // Only transition: cascade expanded beyond the policy limit. - if (R >= p.expansion_throttle_ratio) - return { Regime::Throttle, R, "cascade_over_expansion" }; - - return { Regime::FullCascade, R, "default_safe" }; -} - // ─── V2 Router ─────────────────────────────────────────────────────────────── -// -// Adaptive compression router v2. -// Splits on prompt TYPE (agentic vs retrieval) rather than cascade expansion -// ratio R (which was refuted as a keep predictor, Spearman ρ=-0.27). -// -// Additional guards: -// sparse_prompt_guard — skip compression when new_content_tokens is tiny -// (plumbing turns: recent orchestration continuity must not be dropped) -// recency_floor_turns — always keep the last K turns whole in the agentic path -// -// Sentinel for "keep all turns" recency in SAFE decisions: -static constexpr int kRecencyKeepAll = (1 << 20); - -// Size-adaptive recency floor sentinel. -// When recency_floor_tokens == kRecencyFloorAuto the compress path computes -// R = min(1024, ceil(0.04 * prompt_tokens)) -// instead of using a fixed token count. 0 = off (no-op default). 
-static constexpr int kRecencyFloorAuto = -1; struct RequestFeatures { - bool is_agentic; // tool schemas / tool_use|tool_result blocks present - int prompt_tokens; // total S - int new_content_tokens; // newest turn content size (sparse-plumbing detector) + bool is_agentic; // tool schemas / tool_use|tool_result blocks present + int prompt_tokens; // total S }; struct RouterPolicyV2 { - bool enabled = false; // DEFAULT DISABLED → exact no-op - int threshold_tokens = 32000; // below → passthrough - double agentic_keep_target = 0.25; // conservative floor, closes empty-failure tail - double full_keep_target = 1.0; // retrieval/QA & safe fallbacks - int recency_floor_turns = 2; // keep last K turns whole (continuity) - int sparse_new_content_tokens = 256; // below this → sparse_prompt_guard fires + bool enabled = false; // DEFAULT DISABLED → exact no-op + int threshold_tokens = 32000; // below → passthrough + double agentic_keep_target = 0.25; // conservative floor, agentic path + double full_keep_target = 1.0; // retrieval/QA & safe fallbacks }; -// recency_floor_for — pure helper, no IO. -// -// Returns the concrete token floor for a given prompt size and policy: -// recency_floor_tokens == 0 → 0 (off, no-op) -// recency_floor_tokens == kRecencyFloorAuto (-1) -// → min(1024, ceil(0.04 * prompt_tokens)) -// recency_floor_tokens > 0 → recency_floor_tokens (explicit override) -// -// "one turn equivalent" lower-bound: the agentic throttle path in decide_v2 -// already reserves recency_floor_turns whole turns; this helper computes the -// token-count floor passed to the compress path for the token-budget guard. -inline int recency_floor_for(int prompt_tokens, int recency_floor_tokens) { - if (recency_floor_tokens == 0) - return 0; - if (recency_floor_tokens == kRecencyFloorAuto) { - // min(1024, ceil(0.04 * S)) — scales with context, caps at 1024 - const int adaptive = static_cast( - std::ceil(0.04 * static_cast(prompt_tokens < 0 ? 
0 : prompt_tokens))); - return (adaptive < 1024) ? adaptive : 1024; - } - // Explicit positive override. - return (recency_floor_tokens > 0) ? recency_floor_tokens : 0; -} - struct RouterDecisionV2 { double keep_target; - int recency_floor_turns; bool cascade; const char* reason; }; // decide_v2 — pure, no IO, no globals. // -// SAFE path: keep_target=full_keep_target, recency=kRecencyKeepAll, cascade=true. +// SAFE path: keep_target=full_keep_target, cascade=true. // Returns SAFE when: -// - p.enabled == false (deploy no-op, correct-by-construction) -// - f.prompt_tokens <= 0 || f.new_content_tokens < 0 (degenerate) -// - f.prompt_tokens < p.threshold_tokens (below threshold) -// - f.new_content_tokens < p.sparse_new_content_tokens (sparse_prompt_guard) +// - p.enabled == false (deploy no-op, correct-by-construction) +// - f.prompt_tokens <= 0 (degenerate) +// - f.prompt_tokens < p.threshold_tokens (below threshold) // Throttling path (only when all guards pass): -// - is_agentic → {agentic_keep_target, recency_floor_turns, cascade=false} -// - else → {full_keep_target, recency_floor_turns, cascade=true} +// - is_agentic → {agentic_keep_target, cascade=false, "agentic_throttle"} +// - else → {full_keep_target, cascade=true, "retrieval_full"} inline RouterDecisionV2 decide_v2(const RequestFeatures& f, const RouterPolicyV2& p) { - // Helper: SAFE return (keep everything, cascade on, recency = keep-all). 
- const RouterDecisionV2 SAFE_disabled = { p.full_keep_target, kRecencyKeepAll, true, "disabled_noop" }; - const RouterDecisionV2 SAFE_degenerate = { p.full_keep_target, kRecencyKeepAll, true, "degenerate" }; - const RouterDecisionV2 SAFE_below_threshold = { p.full_keep_target, kRecencyKeepAll, true, "below_threshold" }; - const RouterDecisionV2 SAFE_sparse = { p.full_keep_target, kRecencyKeepAll, true, "sparse_prompt_guard" }; + const RouterDecisionV2 SAFE_disabled = { p.full_keep_target, true, "disabled_noop" }; + const RouterDecisionV2 SAFE_degenerate = { p.full_keep_target, true, "degenerate" }; + const RouterDecisionV2 SAFE_below_threshold = { p.full_keep_target, true, "below_threshold" }; - // 1. Deploy no-op: disabled router is an exact no-op (correct-by-construction). if (!p.enabled) return SAFE_disabled; - // 2. Degenerate inputs: prompt_tokens <= 0 or new_content_tokens < 0. - if (f.prompt_tokens <= 0 || f.new_content_tokens < 0) + if (f.prompt_tokens <= 0) return SAFE_degenerate; - // 3. Below threshold: prompt too short to compress meaningfully. if (f.prompt_tokens < p.threshold_tokens) return SAFE_below_threshold; - // 4. Sparse-prompt guard: tiny new-content turn (plumbing class). - // Compression would drop recent orchestration continuity with no anchor signal. - if (f.new_content_tokens < p.sparse_new_content_tokens) - return SAFE_sparse; - - // 5. Throttling paths — all guards passed. if (f.is_agentic) - return { p.agentic_keep_target, p.recency_floor_turns, false, "agentic_throttle" }; + return { p.agentic_keep_target, false, "agentic_throttle" }; + + return { p.full_keep_target, true, "retrieval_full" }; +} - return { p.full_keep_target, p.recency_floor_turns, true, "retrieval_full" }; +// ─── TYPE GATE ─────────────────────────────────────────────────────────────── +// +// Coarse request-type classifier. Pure function — no IO, no globals, no JSON. +// +// Agentic signals (any one is sufficient): +// 1. 
has_tools — tools array was non-null and non-empty +// 2. has_tool_use_blocks — any message content contained a tool_use or +// tool_result block (Anthropic style) +// 3. has_tool_calls — any assistant message had a non-empty tool_calls +// array (OpenAI style) +// +// The caller is responsible for extracting these bools from the wire format. +// Default: Retrieval (safe — never compresses more than intended). + +enum class RequestType { Agentic, Retrieval }; + +// detect_request_type — pure, stdlib-only, no IO. +inline RequestType detect_request_type(bool has_tools, + bool has_tool_use_blocks, + bool has_tool_calls) { + if (has_tools || has_tool_use_blocks || has_tool_calls) + return RequestType::Agentic; + return RequestType::Retrieval; } } // namespace dflash::common diff --git a/server/src/qwen3/qwen3_backend.cpp b/server/src/qwen3/qwen3_backend.cpp index e2adc7f65..bc0e9178b 100644 --- a/server/src/qwen3/qwen3_backend.cpp +++ b/server/src/qwen3/qwen3_backend.cpp @@ -952,7 +952,9 @@ ModelBackend::CompressResult Qwen3Backend::compress(const CompressRequest & req) } result.compressed_ids = drafter_score_and_compress( - drafter_ctx_, req.input_ids, req.keep_ratio); + drafter_ctx_, req.input_ids, req.keep_ratio, + /*chunk_size=*/32, /*n_lookahead=*/8, /*pool_kernel=*/13, + req.use_transitive); result.ok = true; if (!req.skip_park && !was_parked) unpark("target"); diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp index 833247b70..852fc96e1 100644 --- a/server/src/qwen3/qwen3_drafter.cpp +++ b/server/src/qwen3/qwen3_drafter.cpp @@ -18,7 +18,6 @@ #include "common/backend_precision.h" #include "internal.h" #include "anchor_scan.h" -#include "regime_router.h" #include "ggml.h" #include "ggml-alloc.h" @@ -81,13 +80,13 @@ struct CompressCfg { int query_tokens; int head_chunks; int tail_chunks; - int recency_floor_tokens; // PFLASH_RECENCY_FLOOR_TOKENS: force-keep last N tokens (0 = off) dflash::qwen3::AnchorScanCfg anchor; bool 
use_transitive; int max_iters; }; -static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) { +static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep, + int use_transitive_override = -1) { CompressCfg c{}; c.query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); @@ -162,6 +161,10 @@ static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) { c.anchor.max_forced_count = (int)(max_forced_ratio * n_keep); c.use_transitive = [&]{ + // Per-request override (0=off, 1=on) from router decision takes precedence. + if (use_transitive_override == 0) return false; + if (use_transitive_override == 1) return true; + // Fallback: read from env (same as before, no behaviour change when -1). const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); if (nv >= 0) return nv != 0; @@ -177,23 +180,6 @@ static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) { return 3; }(); - // Recency floor: unconditionally force-keep the last R tokens of the prompt - // body before anchor scoring. DEFAULT 0 = no-op (unchanged behavior). - // 0 = off - // -1 = auto: min(1024, ceil(0.04 * S)) [resolved at compress time when S is known] - // >0 = explicit token count - // Note: env_int() rejects negatives, so read raw and parse to preserve -1. - // Rescues recent wiring-sequence turns dropped when anchors seed from a - // short/sparse tail (e.g. bare [tool_result] turns). 
- { - const char * rfv = std::getenv("PFLASH_RECENCY_FLOOR_TOKENS"); - if (rfv) { - c.recency_floor_tokens = std::atoi(rfv); // preserves -1 sentinel - } else { - c.recency_floor_tokens = 0; - } - } - return c; } @@ -381,7 +367,8 @@ static std::vector qwen35_score_and_compress( float keep_ratio, int chunk_size, int n_lookahead, - int pool_kernel) { + int pool_kernel, + int use_transitive_override = -1) { const int S = (int)ids.size(); const int hidden = w.n_embd; @@ -658,7 +645,7 @@ static std::vector qwen35_score_and_compress( } std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; }); - const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep); + const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep, use_transitive_override); std::vector selected((size_t)n_chunks, 0); int count = 0; @@ -669,19 +656,6 @@ static std::vector qwen35_score_and_compress( std::vector query_pool(ids.begin() + q0, ids.end()); std::vector forced((size_t)n_chunks, 0); - // Recency floor: force-keep the last R tokens worth of chunks before anchor - // scoring so that recent wiring-sequence turns are never dropped regardless - // of anchor seed quality. R=0 is a no-op (default). R=-1 = auto. 
- { - const int R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens); - if (R > 0) { - const int floor_tok = std::min(S, R); - const int floor_start_tok = S - floor_tok; - const int floor_start_chunk = floor_start_tok / chunk_size; - for (int c = floor_start_chunk; c < n_chunks; ++c) forced[(size_t)c] = 1; - } - } - dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor; anchor_cfg.chunk_size = chunk_size; @@ -775,7 +749,8 @@ std::vector drafter_score_and_compress( float keep_ratio, int chunk_size, int n_lookahead, - int pool_kernel) { + int pool_kernel, + int use_transitive_override) { if (!ctx.loaded) { set_last_error("drafter not loaded"); return {}; @@ -786,7 +761,7 @@ std::vector drafter_score_and_compress( return {}; } auto * st = static_cast(ctx.arch_state); - return qwen35_score_and_compress(st->weights, ids, keep_ratio, chunk_size, n_lookahead, pool_kernel); + return qwen35_score_and_compress(st->weights, ids, keep_ratio, chunk_size, n_lookahead, pool_kernel, use_transitive_override); } const int S = (int)ids.size(); if (S < n_lookahead + 1) { @@ -843,35 +818,20 @@ std::vector drafter_score_and_compress( std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; }); - const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep); + const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep, use_transitive_override); std::vector selected_mask((size_t)n_chunks, 0); std::vector forced((size_t)n_chunks, 0); for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) forced[(size_t)c] = 1; for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1; - // Recency floor: force-keep the last R tokens worth of chunks before anchor - // scoring so that recent wiring-sequence turns are never dropped regardless - // of anchor seed quality. R=0 is a no-op (default). R=-1 = auto. 
- { - const int R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens); - if (R > 0) { - const int floor_tok = std::min(S, R); - const int floor_start_tok = S - floor_tok; - const int floor_start_chunk = floor_start_tok / chunk_size; - for (int c = floor_start_chunk; c < n_chunks; ++c) forced[(size_t)c] = 1; - } - } - const int q0 = std::max(0, S - cfg.query_tokens); { - const int resolved_R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens); std::vector query_pool(ids.begin() + q0, ids.end()); dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor; anchor_cfg.chunk_size = chunk_size; - std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d recency_floor=%d (resolved=%d)\n", - n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count, - cfg.recency_floor_tokens, resolved_R); + std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d\n", + n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count); std::fflush(stderr); if (cfg.use_transitive) { @@ -935,4 +895,18 @@ std::vector drafter_score_and_compress( return out; } +// ABI-stable 6-arg overload — old callers compiled before the use_transitive_override +// parameter was added link here without requiring recompilation. +std::vector drafter_score_and_compress( + DrafterContext & ctx, + const std::vector & ids, + float keep_ratio, + int chunk_size, + int n_lookahead, + int pool_kernel) { + return drafter_score_and_compress(ctx, ids, keep_ratio, + chunk_size, n_lookahead, pool_kernel, + /*use_transitive_override=*/-1); +} + } // namespace dflash::common diff --git a/server/src/qwen3/qwen3_drafter.h b/server/src/qwen3/qwen3_drafter.h index e5424f9dd..08aed3e9b 100644 --- a/server/src/qwen3/qwen3_drafter.h +++ b/server/src/qwen3/qwen3_drafter.h @@ -66,13 +66,27 @@ void free_drafter_weights(DrafterContext & ctx); // Score importance per token via Liu Q-hook tail attention, then chunk-top-K // span merge. 
Returns surviving token IDs (drafter vocab). // -// ids input token IDs of length S -// keep_ratio fraction of `chunk_size`-token chunks to keep -// chunk_size span granularity (default 32) -// n_lookahead trailing Q tokens used for tail attention (default 8) -// pool_kernel AvgPool kernel for score smoothing (default 13) +// ids input token IDs of length S +// keep_ratio fraction of `chunk_size`-token chunks to keep +// chunk_size span granularity (default 32) +// n_lookahead trailing Q tokens used for tail attention (default 8) +// pool_kernel AvgPool kernel for score smoothing (default 13) +// use_transitive_override -1 = read from env (default, no behaviour change) +// 0 = cascade off (agentic path) +// 1 = cascade on (retrieval path) // // On failure returns empty vector + sets last_error. +std::vector drafter_score_and_compress( + DrafterContext & ctx, + const std::vector & ids, + float keep_ratio, + int chunk_size, + int n_lookahead, + int pool_kernel, + int use_transitive_override); + +// Backward-compatible 6-arg overload — ABI-stable wrapper, defined in qwen3_drafter.cpp. +// Old callers compiled against the 6-arg signature continue to link without recompile. 
std::vector drafter_score_and_compress( DrafterContext & ctx, const std::vector & ids, diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp index 82d0ea6ca..9859cffae 100644 --- a/server/src/qwen35/qwen35_backend.cpp +++ b/server/src/qwen35/qwen35_backend.cpp @@ -396,7 +396,9 @@ ModelBackend::CompressResult Qwen35Backend::compress(const CompressRequest & req } result.compressed_ids = drafter_score_and_compress( - drafter_ctx_, req.input_ids, req.keep_ratio); + drafter_ctx_, req.input_ids, req.keep_ratio, + /*chunk_size=*/32, /*n_lookahead=*/8, /*pool_kernel=*/13, + req.use_transitive); result.ok = !result.compressed_ids.empty(); if (result.ok) { std::fprintf(stderr, "[compress] %zu -> %zu tokens\n", diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index 219bccf6d..f2518ba1e 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -1222,10 +1222,74 @@ void HttpServer::worker_loop() { // 3. Compress via typed API ModelBackend::CompressRequest creq; creq.input_ids = std::move(drafter_ids); - // Bandit: use per-session keep_ratio if session_id provided. - creq.keep_ratio = req.session_id.empty() - ? config_.pflash_keep_ratio - : sessions_.get_keep_ratio(req.session_id); + + // TYPE-GATE router (default-off via pflash_router.enabled). + // When enabled, detect request type and override keep_ratio + + // cascade per the v2 policy. When disabled → exact no-op. + { + // Extract agentic-signal bools from the parsed JSON + // (json-walking belongs at the handler boundary, not + // in the pure router header). 
+ const bool _has_tools = + req.tools.is_array() && !req.tools.empty(); + bool _has_tool_use_blocks = false; + bool _has_tool_calls = false; + if (req.messages.is_array()) { + for (const auto & _msg : req.messages) { + if (!_msg.is_object()) continue; + if (_msg.contains("tool_calls")) { + const auto & _tc = _msg["tool_calls"]; + if (_tc.is_array() && !_tc.empty()) + _has_tool_calls = true; + } + if (_msg.contains("content")) { + const auto & _c = _msg["content"]; + if (_c.is_array()) { + for (const auto & _b : _c) { + if (!_b.is_object()) continue; + const std::string _bt = _b.value("type", ""); + if (_bt == "tool_use" || _bt == "tool_result") + _has_tool_use_blocks = true; + } + } + } + } + } + const bool is_agentic = (detect_request_type( + _has_tools, _has_tool_use_blocks, _has_tool_calls) + == RequestType::Agentic); + const RequestFeatures rf { + is_agentic, + n_prompt + }; + const RouterDecisionV2 rd = decide_v2(rf, config_.pflash_router); + if (config_.pflash_router.enabled) { + // Router is on: apply per-request keep + cascade override. + // Bandit keeps winning if session_id is present — bandit + // is the M2 lever for agentic keep level tuning. + // For M1 the TYPE decision overrides keep_ratio when no + // session bandit is active. + if (req.session_id.empty()) { + creq.keep_ratio = (float)rd.keep_target; + } else { + creq.keep_ratio = sessions_.get_keep_ratio(req.session_id); + } + // cascade = use_transitive: 0 = off, 1 = on, -1 = env default + creq.use_transitive = rd.cascade ? 1 : 0; + std::fprintf(stderr, + "[pflash-router] type=%s keep=%.3f cascade=%s reason=%s\n", + is_agentic ? "agentic" : "retrieval", + creq.keep_ratio, + rd.cascade ? "on" : "off", + rd.reason); + } else { + // Router disabled: legacy keep_ratio path, no change. + creq.keep_ratio = req.session_id.empty() + ? config_.pflash_keep_ratio + : sessions_.get_keep_ratio(req.session_id); + // use_transitive stays at -1 (env default). 
+ } + } creq.drafter_path = config_.pflash_drafter_path; creq.drafter_gpu = config_.pflash_drafter_gpu; creq.skip_park = config_.pflash_skip_park; diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 803cf74e8..f4fe57316 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -12,6 +12,7 @@ #pragma once #include "common/model_backend.h" +#include "common/regime_router.h" #include "tokenizer.h" #include "chat_template.h" #include "tool_memory.h" @@ -151,6 +152,11 @@ struct ServerConfig { bool pflash_skip_park = false; // skip park/unpark for >=32GB GPUs bool lazy_draft = false; // park decode draft when idle to save VRAM + // TYPE-gate compression router (v2). + // Default: disabled (exact no-op, correct-by-construction). + // Enable via PFLASH_ROUTER_ENABLE=1 env var at server startup. + RouterPolicyV2 pflash_router; // enabled=false by default + // Disk prefix cache std::string disk_cache_dir; // empty = disabled size_t disk_cache_budget_mb = 4096; // max disk usage in MB diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index 156c84afe..21b8379f3 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -540,6 +540,21 @@ int main(int argc, char ** argv) { sconfig.pflash_threshold, sconfig.pflash_keep_ratio, sconfig.pflash_drafter_gpu, (int)sconfig.pflash_skip_park); + // TYPE-gate router: opt-in via env var, default-off. + { + const char * router_env = std::getenv("PFLASH_ROUTER_ENABLE"); + if (router_env && *router_env && std::strcmp(router_env, "0") != 0) { + sconfig.pflash_router.enabled = true; + // Inherit pflash threshold so the router fires at the same + // token count as the compression admission gate. 
+ sconfig.pflash_router.threshold_tokens = sconfig.pflash_threshold; + std::fprintf(stderr, + "[server] pflash-router: ENABLED (type-gate v2) " + "threshold=%d agentic_keep=%.3f\n", + sconfig.pflash_router.threshold_tokens, + sconfig.pflash_router.agentic_keep_target); + } + } } // Create backend. @@ -771,6 +786,7 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[server] │ pflash_skip_park= %s\n", sconfig.pflash_skip_park ? "ON" : "off"); std::fprintf(stderr, "[server] │ fp_use_bsa = %s\n", getenv("DFLASH_FP_USE_BSA") ? "ON" : "off"); std::fprintf(stderr, "[server] │ fp_alpha = %s\n", getenv("DFLASH_FP_ALPHA") ? getenv("DFLASH_FP_ALPHA") : "0.12 (default)"); + std::fprintf(stderr, "[server] │ pflash_router = %s\n", sconfig.pflash_router.enabled ? "ON" : "off"); } if (bargs.draft_path) { std::fprintf(stderr, "[server] │ lazy_draft = %s\n", sconfig.lazy_draft ? "ON" : "off"); diff --git a/server/test/test_regime_router.cpp b/server/test/test_regime_router.cpp index f92a0b512..b45370f99 100644 --- a/server/test/test_regime_router.cpp +++ b/server/test/test_regime_router.cpp @@ -1,7 +1,16 @@ -// Unit tests for dflash::common::decide_regime() — pure function, no GPU. +// Unit tests for the pflash regime router v2 — pure function, no GPU. +// +// Tests kept: t8 (deploy-noop), t10 (agentic-throttle), t11 (retrieval-full), +// t12 (below-threshold), t14 (degenerate), t18 (detect_request_type). +// +// Tests removed: +// t1-t7 — v1 R-router (decide_regime), refuted (ρ=-0.27), deleted. +// t9 — sparse_prompt_guard, validated zero-sum, deleted. +// t13 — recency_floor_invariant, deleted with recency floor feature. +// t15-t17 — recency_floor_for, deleted with recency floor feature. 
// // Build (standalone, from repo root): -// g++-11 -std=gnu++17 -O2 -I server/src/common +// g++-11 -std=gnu++17 -O2 -Wall -Wextra -Werror -I server/src/common // -o /tmp/test_regime_router server/test/test_regime_router.cpp // CMake: // cmake --build build --target test_regime_router -j @@ -11,12 +20,11 @@ #include #include -#include #include using namespace dflash::common; -// ─── Minimal test framework (mirrors test_adaptive_keep_ratio.cpp) ─────────── +// ─── Minimal test framework ─────────────────────────────────────────────────── static int test_failures = 0; static int test_count = 0; @@ -52,437 +60,86 @@ static inline bool approx_eq(double a, double b, double eps = 1e-9) { // ─── Helpers ───────────────────────────────────────────────────────────────── -// Build a policy with expansion_throttle_ratio disabled (default safe). -static RouterPolicy default_policy() { return {}; } - -// Build a policy that throttles at ratio >= r. -static RouterPolicy throttle_policy(double r, - int threshold = 32000, - int min_anchor = 1) { - RouterPolicy p; - p.threshold_tokens = threshold; - p.expansion_throttle_ratio = r; - p.min_anchor_chunks = min_anchor; - return p; -} - -static CascadeStats make_stats(int n_chunks, - int anchor_only, - int after_cascade, - int prompt_tokens, - int keep_floor = 0) { - return { n_chunks, anchor_only, after_cascade, prompt_tokens, keep_floor }; -} - -// ─── T1: DEPLOY-NO-OP ──────────────────────────────────────────────────────── -// With the DEFAULT RouterPolicy (ratio=INFINITY), decide_regime must return -// FullCascade for ANY stats, including pathologically large expansion. 
- -static void t1_deploy_noop() { - RouterPolicy p = default_policy(); - - // Normal case - { - auto d = decide_regime(make_stats(100, 10, 20, 50000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T1a: default policy must always give FullCascade"); - } - // Huge expansion: forced_anchor_only=10, forced_after_cascade=1000, prompt=100K - { - auto d = decide_regime(make_stats(500, 10, 1000, 100000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T1b: huge expansion with default policy must be FullCascade"); - } - // Prompt below threshold - { - auto d = decide_regime(make_stats(50, 5, 500, 1000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T1c: short prompt with default policy must be FullCascade"); - } - // Zero anchors - { - auto d = decide_regime(make_stats(100, 0, 0, 100000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T1d: zero anchors with default policy must be FullCascade"); - } - // Sweep: 50 random-ish stat combinations - for (int i = 1; i <= 50; ++i) { - CascadeStats s = make_stats(i * 10, - i, - i * 100, // R = 100, very high - i * 5000); - auto d = decide_regime(s, p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T1-sweep: default policy must be FullCascade for all stats"); - } -} - -// ─── T2: DEGENERATE ────────────────────────────────────────────────────────── -// Degenerate inputs must not crash or div-by-zero, and must return FullCascade. 
- -static void t2_degenerate() { - RouterPolicy p = throttle_policy(2.0); // would throttle if R >= 2 - - // n_chunks == 0 - { - auto d = decide_regime(make_stats(0, 5, 10, 100000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T2a: n_chunks=0 must return FullCascade"); - TEST_ASSERT_MSG(std::isfinite(d.expansion_ratio), - "T2a: expansion_ratio must be finite when n_chunks=0"); - } - // forced_anchor_only == 0 (no anchors before cascade) → R defaults to 1.0 - { - auto d = decide_regime(make_stats(100, 0, 50, 100000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T2b: forced_anchor_only=0 must return FullCascade"); - TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0), - "T2b: expansion_ratio must be 1.0 when forced_anchor_only=0"); - } - // Negative forced_anchor_only - { - auto d = decide_regime(make_stats(100, -1, 50, 100000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T2c: negative forced_anchor_only must return FullCascade"); - TEST_ASSERT_MSG(std::isfinite(d.expansion_ratio), - "T2c: expansion_ratio must be finite for negative anchor count"); - } - // Negative forced_after_cascade - { - auto d = decide_regime(make_stats(100, 5, -1, 100000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T2d: negative forced_after_cascade must return FullCascade"); - } - // Both negative - { - auto d = decide_regime(make_stats(100, -3, -7, 100000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T2e: both counts negative must return FullCascade"); - } -} - -// ─── T3: BELOW-THRESHOLD ───────────────────────────────────────────────────── -// prompt_tokens < threshold → FullCascade regardless of R and finite ratio. 
- -static void t3_below_threshold() { - RouterPolicy p = throttle_policy(1.5, /*threshold=*/32000, /*min_anchor=*/1); - - // prompt = threshold - 1 (just below) - { - auto d = decide_regime(make_stats(100, 10, 1000, 31999), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T3a: prompt just below threshold must be FullCascade"); - TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold", - "T3a: reason must be 'below_threshold'"); - } - // prompt = 0 - { - auto d = decide_regime(make_stats(100, 10, 9999, 0), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T3b: prompt=0 must be FullCascade"); - } - // Even with R = 1000 and finite ratio = 2.0, still FullCascade below threshold - { - auto d = decide_regime(make_stats(200, 5, 5000, 100), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T3c: tiny prompt, huge R, finite ratio -> FullCascade"); - } -} - -// ─── T4: TOO-FEW-ANCHORS ───────────────────────────────────────────────────── -// forced_anchor_only < min_anchor_chunks → FullCascade. 
- -static void t4_too_few_anchors() { - RouterPolicy p = throttle_policy(2.0, /*threshold=*/32000, /*min_anchor=*/3); - // forced_anchor_only = 2 < min_anchor = 3 - { - auto d = decide_regime(make_stats(100, 2, 1000, 50000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T4a: anchors below min must be FullCascade"); - } - // forced_anchor_only = 0 < min_anchor = 3 - { - auto d = decide_regime(make_stats(100, 0, 500, 50000), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T4b: zero anchors below min must be FullCascade"); - } - // forced_anchor_only = 3 == min_anchor = 3: NOT too few → may throttle - { - auto d = decide_regime(make_stats(100, 3, 300, 50000), p); - // R = 300/3 = 100 >= 2.0 → should be Throttle - TEST_ASSERT_MSG(d.regime == Regime::Throttle, - "T4c: anchors == min AND R >= ratio must throttle"); - } -} - -// ─── T5: MONOTONE ──────────────────────────────────────────────────────────── -// With a finite ratio policy, once Throttle triggers at R it must stay Throttle -// for all larger R. 
- -static void t5_monotone() { - // Policy: ratio=3.0, threshold=32000, min_anchor=1, prompt_tokens=50000 - RouterPolicy p = throttle_policy(3.0, 32000, 1); - const int prompt = 50000; - const int anchor = 10; // fixed; vary after_cascade to control R - - // R = 2.9 → FullCascade - { - // after = anchor * R = 10 * 2.9 = 29 - auto d = decide_regime(make_stats(100, anchor, 29, prompt), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T5a: R=2.9 < 3.0 must be FullCascade"); - } - // R = 3.0 → Throttle (boundary: >= triggers) - { - // after = 10 * 3 = 30 - auto d = decide_regime(make_stats(100, anchor, 30, prompt), p); - TEST_ASSERT_MSG(d.regime == Regime::Throttle, - "T5b: R=3.0 == ratio must be Throttle"); - } - // R = 10.0 → Throttle - { - auto d = decide_regime(make_stats(100, anchor, 100, prompt), p); - TEST_ASSERT_MSG(d.regime == Regime::Throttle, - "T5c: R=10.0 >> ratio must be Throttle"); - } - // Monotone sweep: for all integer R from 1 to 100, once Throttle appears - // it must not flip back to FullCascade. - bool seen_throttle = false; - bool monotone = true; - for (int r_int = 1; r_int <= 100; ++r_int) { - // after = anchor * r_int → exact integer R - auto d = decide_regime(make_stats(200, anchor, anchor * r_int, prompt), p); - if (d.regime == Regime::Throttle) { - seen_throttle = true; - } else if (seen_throttle) { - // Flipped back to FullCascade after Throttle was seen: not monotone - monotone = false; - std::fprintf(stderr, - " MONOTONE VIOLATION at R=%d: Throttle then FullCascade\n", - r_int); - break; - } - } - TEST_ASSERT_MSG(seen_throttle, "T5d: sweep must trigger Throttle at some R"); - TEST_ASSERT_MSG(monotone, "T5e: regime must be monotone (no FullCascade after Throttle)"); -} - -// ─── T6: BOUNDARY ──────────────────────────────────────────────────────────── -// R exactly == ratio → Throttle; R = ratio - epsilon → FullCascade. 
- -static void t6_boundary() { - const double ratio = 5.0; - RouterPolicy p = throttle_policy(ratio, 32000, 1); - const int anchor = 1000; // use large anchor to get precise integer ratios - const int prompt = 50000; - - // R exactly == ratio: after = anchor * ratio = 5000 - { - auto d = decide_regime(make_stats(500, anchor, anchor * (int)ratio, prompt), p); - TEST_ASSERT_MSG(d.regime == Regime::Throttle, - "T6a: R exactly == ratio must be Throttle"); - } - - // R = ratio - epsilon where epsilon = 0.5/anchor (one less chunk → R < ratio) - { - // after = anchor * ratio - 1 = 4999 → R = 4.999 < 5.0 - auto d = decide_regime(make_stats(500, anchor, anchor * (int)ratio - 1, prompt), p); - TEST_ASSERT_MSG(d.regime == Regime::FullCascade, - "T6b: R just below ratio must be FullCascade"); - } -} - -// ─── T7: RATIO-VALUE ───────────────────────────────────────────────────────── -// Check that expansion_ratio is computed correctly. - -static void t7_ratio_value() { - RouterPolicy p = default_policy(); // regime doesn't matter; check ratio value - - // forced_anchor_only=10, forced_after_cascade=85 → R = 8.5 - { - auto d = decide_regime(make_stats(100, 10, 85, 50000), p); - TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 8.5), - "T7a: R must be 85/10 = 8.5"); - } - // forced_anchor_only=0 → R must be 1.0 (no div-by-zero) - { - auto d = decide_regime(make_stats(100, 0, 50, 50000), p); - TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0), - "T7b: forced_anchor_only=0 must give expansion_ratio=1.0"); - } - // forced_anchor_only=5, forced_after_cascade=5 → R = 1.0 - { - auto d = decide_regime(make_stats(100, 5, 5, 50000), p); - TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0), - "T7c: equal anchors before/after must give R=1.0"); - } - // forced_anchor_only=7, forced_after_cascade=7 → R = 1.0 (no expansion) - { - auto d = decide_regime(make_stats(100, 7, 7, 50000), p); - TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0), - "T7d: no cascade expansion must give R=1.0"); - } - // 
Verify ratio when throttle policy triggers: ratio value should still be correct - { - RouterPolicy tp = throttle_policy(3.0); - auto d = decide_regime(make_stats(100, 4, 20, 50000), tp); - // R = 20/4 = 5.0 → Throttle, ratio = 5.0 - TEST_ASSERT_MSG(d.regime == Regime::Throttle, - "T7e: R=5.0 >= 3.0 must throttle"); - TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 5.0), - "T7e: expansion_ratio must be 5.0"); - } -} - -// ─── V2 helpers ────────────────────────────────────────────────────────────── - -// Default v2 policy: disabled (deploy no-op). static RouterPolicyV2 default_v2_policy() { return {}; } -// Enabled v2 policy with default field values. static RouterPolicyV2 enabled_v2_policy() { RouterPolicyV2 p; p.enabled = true; return p; } -static RequestFeatures make_features(bool is_agentic, - int prompt_tokens, - int new_content_tokens) { - return { is_agentic, prompt_tokens, new_content_tokens }; +static RequestFeatures make_features(bool is_agentic, int prompt_tokens) { + return { is_agentic, prompt_tokens }; } -// ─── T8: DEPLOY-NO-OP (v2) ─────────────────────────────────────────────────── +// ─── T8: DEPLOY-NO-OP ──────────────────────────────────────────────────────── // enabled=false → SAFE for every input, including is_agentic=true and huge prompts. -// Correct-by-construction: disabled router must be an exact no-op. static void t8_v2_deploy_noop() { RouterPolicyV2 p = default_v2_policy(); // enabled=false - // Baseline: normal agentic prompt, well above threshold. 
{ - auto d = decide_v2(make_features(true, 100000, 10000), p); + auto d = decide_v2(make_features(true, 100000), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), "T8a: disabled->keep_target must be full_keep_target"); - TEST_ASSERT_MSG(d.cascade, - "T8a: disabled->cascade must be true"); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T8a: disabled->recency must be keep-all sentinel"); + TEST_ASSERT_MSG(d.cascade, "T8a: disabled->cascade must be true"); + TEST_ASSERT_MSG(std::string(d.reason) == "disabled_noop", + "T8a: disabled->reason must be 'disabled_noop'"); } - // Sweep: all combinations of is_agentic, varying prompt and new_content sizes. + // Sweep all combinations of is_agentic and prompt sizes. for (int i = 0; i < 4; ++i) { - bool agentic = (i & 1) != 0; - int prompt = (i & 2) ? 100000 : 500; - int new_toks = (i & 2) ? 10000 : 10; - auto d = decide_v2(make_features(agentic, prompt, new_toks), p); + bool agentic = (i & 1) != 0; + int prompt = (i & 2) ? 100000 : 500; + auto d = decide_v2(make_features(agentic, prompt), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), "T8-sweep: disabled->keep_target must be full_keep_target"); - TEST_ASSERT_MSG(d.cascade, - "T8-sweep: disabled->cascade must be true"); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T8-sweep: disabled->recency must be keep-all sentinel"); + TEST_ASSERT_MSG(d.cascade, "T8-sweep: disabled->cascade must be true"); } - // Explicitly: is_agentic=true, large prompt, large new_content — must be SAFE. + // Explicitly: is_agentic=true, large prompt — must be SAFE. 
{ - auto d = decide_v2(make_features(true, 200000, 50000), p); + auto d = decide_v2(make_features(true, 200000), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), "T8b: disabled, agentic, huge prompt -> SAFE"); TEST_ASSERT_MSG(d.cascade, "T8b: disabled -> cascade=true"); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T8b: disabled -> recency keep-all"); - } -} - -// ─── T9: SPARSE-PROMPT GUARD (failure-class fix) ───────────────────────────── -// is_agentic=true, prompt above threshold, BUT new_content < sparse threshold. -// This is the LONG_A-t11/LONG_B-t10 plumbing class: a tiny tool_result riding -// on long history. Compression must NOT throttle here (would drop continuity). - -static void t9_sparse_prompt_guard() { - RouterPolicyV2 p = enabled_v2_policy(); - - // Canonical failure case: 3-word tool_result on 43K history. - { - auto d = decide_v2(make_features(true, 43000, 8), p); - TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), - "T9a: sparse agentic turn must be SAFE (full keep), not throttled"); - TEST_ASSERT_MSG(d.cascade, - "T9a: sparse_prompt_guard must cascade=true (SAFE)"); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T9a: sparse_prompt_guard -> recency keep-all"); - TEST_ASSERT_MSG(std::string(d.reason) == "sparse_prompt_guard", - "T9a: reason must be 'sparse_prompt_guard'"); - } - // new_content = sparse_new_content_tokens - 1 (just below the guard). - { - auto d = decide_v2(make_features(true, 50000, p.sparse_new_content_tokens - 1), p); - TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), - "T9b: new_content just below sparse threshold -> SAFE"); - TEST_ASSERT_MSG(std::string(d.reason) == "sparse_prompt_guard", - "T9b: reason must be 'sparse_prompt_guard'"); - } - // new_content = 0 (degenerate new turn, still sparse guard NOT degenerate path). - // Note: 0 < sparse_new_content_tokens (256) so sparse guard fires first. 
- { - auto d = decide_v2(make_features(true, 40000, 0), p); - TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), - "T9c: new_content=0 -> SAFE (sparse guard or degenerate, both SAFE)"); - } - // Confirm: new_content = sparse_new_content_tokens (AT the boundary → NOT sparse). - // is_agentic=true above threshold with enough new content → throttle kicks in. - { - auto d = decide_v2(make_features(true, 50000, p.sparse_new_content_tokens), p); - TEST_ASSERT_MSG(approx_eq(d.keep_target, p.agentic_keep_target), - "T9d: new_content==sparse threshold -> agentic throttle applies"); - TEST_ASSERT_MSG(!d.cascade, - "T9d: agentic throttle -> cascade=false"); } } // ─── T10: AGENTIC-THROTTLE ─────────────────────────────────────────────────── -// enabled, is_agentic=true, prompt > threshold, new_content > sparse threshold -// → keep_target=agentic_keep_target, cascade=false, recency >= 1. +// enabled, is_agentic=true, prompt > threshold +// → keep_target=agentic_keep_target, cascade=false. static void t10_agentic_throttle() { RouterPolicyV2 p = enabled_v2_policy(); { - auto d = decide_v2(make_features(true, 40000, 3000), p); + auto d = decide_v2(make_features(true, 40000), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.agentic_keep_target), "T10a: agentic throttle -> keep_target=agentic_keep_target"); - TEST_ASSERT_MSG(!d.cascade, - "T10a: agentic throttle -> cascade=false"); - TEST_ASSERT_MSG(d.recency_floor_turns == p.recency_floor_turns, - "T10a: agentic throttle -> recency matches policy"); - TEST_ASSERT_MSG(d.recency_floor_turns >= 1, - "T10a: recency_floor_turns must be >= 1 (continuity guaranteed)"); + TEST_ASSERT_MSG(!d.cascade, "T10a: agentic throttle -> cascade=false"); TEST_ASSERT_MSG(std::string(d.reason) == "agentic_throttle", "T10a: reason must be 'agentic_throttle'"); } - // Custom policy: verify fields propagate. + // Custom agentic_keep_target. 
{ - RouterPolicyV2 p2 = p; - p2.agentic_keep_target = 0.30; - p2.recency_floor_turns = 5; - auto d = decide_v2(make_features(true, 60000, 1000), p2); + RouterPolicyV2 p2 = p; + p2.agentic_keep_target = 0.30; + auto d = decide_v2(make_features(true, 60000), p2); TEST_ASSERT_MSG(approx_eq(d.keep_target, 0.30), "T10b: custom agentic_keep_target propagated"); - TEST_ASSERT_MSG(d.recency_floor_turns == 5, - "T10b: custom recency_floor_turns propagated"); + TEST_ASSERT_MSG(!d.cascade, "T10b: agentic -> cascade=false"); } } // ─── T11: RETRIEVAL-FULL ───────────────────────────────────────────────────── -// enabled, is_agentic=false, prompt > threshold, new_content > sparse threshold +// enabled, is_agentic=false, prompt > threshold // → cascade=true, keep_target=full_keep_target. static void t11_retrieval_full() { RouterPolicyV2 p = enabled_v2_policy(); { - auto d = decide_v2(make_features(false, 40000, 3000), p); - TEST_ASSERT_MSG(d.cascade, - "T11a: retrieval -> cascade=true"); + auto d = decide_v2(make_features(false, 40000), p); + TEST_ASSERT_MSG(d.cascade, "T11a: retrieval -> cascade=true"); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), "T11a: retrieval -> keep_target=full_keep_target"); TEST_ASSERT_MSG(std::string(d.reason) == "retrieval_full", @@ -490,44 +147,41 @@ static void t11_retrieval_full() { } // Custom full_keep_target. { - RouterPolicyV2 p2 = p; - p2.full_keep_target = 0.80; - auto d = decide_v2(make_features(false, 50000, 5000), p2); + RouterPolicyV2 p2 = p; + p2.full_keep_target = 0.80; + auto d = decide_v2(make_features(false, 50000), p2); TEST_ASSERT_MSG(approx_eq(d.keep_target, 0.80), "T11b: custom full_keep_target propagated"); TEST_ASSERT_MSG(d.cascade, "T11b: retrieval -> cascade=true"); } } -// ─── T12: BELOW-THRESHOLD (v2) ─────────────────────────────────────────────── -// prompt_tokens < threshold_tokens → SAFE regardless of is_agentic and new_content. 
+// ─── T12: BELOW-THRESHOLD ──────────────────────────────────────────────────── +// prompt_tokens < threshold_tokens → SAFE regardless of is_agentic. static void t12_v2_below_threshold() { RouterPolicyV2 p = enabled_v2_policy(); - // Agentic, just below threshold, plenty of new content. + // Agentic, just below threshold. { - auto d = decide_v2(make_features(true, p.threshold_tokens - 1, 5000), p); + auto d = decide_v2(make_features(true, p.threshold_tokens - 1), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), "T12a: agentic, below threshold -> SAFE"); - TEST_ASSERT_MSG(d.cascade, - "T12a: below threshold -> cascade=true"); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T12a: below threshold -> recency keep-all"); + TEST_ASSERT_MSG(d.cascade, "T12a: below threshold -> cascade=true"); TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold", "T12a: reason must be 'below_threshold'"); } - // Non-agentic, at threshold boundary - 1. + // Non-agentic, just below threshold. { - auto d = decide_v2(make_features(false, p.threshold_tokens - 1, 5000), p); + auto d = decide_v2(make_features(false, p.threshold_tokens - 1), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), "T12b: non-agentic, below threshold -> SAFE"); } // Custom threshold. { - RouterPolicyV2 p2 = p; - p2.threshold_tokens = 10000; - auto d = decide_v2(make_features(true, 9999, 2000), p2); + RouterPolicyV2 p2 = p; + p2.threshold_tokens = 10000; + auto d = decide_v2(make_features(true, 9999), p2); TEST_ASSERT_MSG(approx_eq(d.keep_target, p2.full_keep_target), "T12c: custom threshold, below it -> SAFE"); TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold", @@ -535,152 +189,91 @@ static void t12_v2_below_threshold() { } } -// ─── T13: RECENCY-FLOOR INVARIANT ──────────────────────────────────────────── -// In every throttling decision (non-SAFE), recency_floor_turns >= 1. -// In every SAFE decision, recency_floor_turns >= kRecencyKeepAll. 
- -static void t13_recency_floor_invariant() { - RouterPolicyV2 p = enabled_v2_policy(); - - // Throttle path (agentic): recency >= 1. - { - auto d = decide_v2(make_features(true, 50000, 1000), p); - TEST_ASSERT_MSG(!approx_eq(d.keep_target, p.full_keep_target) || - d.recency_floor_turns >= 1, - "T13a: throttled decision must have recency >= 1"); - TEST_ASSERT_MSG(d.recency_floor_turns >= 1, - "T13a: agentic throttle recency_floor_turns >= 1 (continuity)"); - } - // SAFE paths: recency must be keep-all. - // disabled - { - RouterPolicyV2 pd; pd.enabled = false; - auto d = decide_v2(make_features(true, 50000, 1000), pd); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T13b: disabled SAFE recency must be keep-all"); - } - // sparse_prompt_guard - { - auto d = decide_v2(make_features(true, 50000, 5), p); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T13c: sparse_prompt_guard SAFE recency must be keep-all"); - } - // below_threshold - { - auto d = decide_v2(make_features(true, 1000, 500), p); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T13d: below_threshold SAFE recency must be keep-all"); - } - // retrieval_full path: recency = policy value (not keep-all, it's a throttle-adjacent path) - { - auto d = decide_v2(make_features(false, 50000, 1000), p); - TEST_ASSERT_MSG(d.recency_floor_turns >= 1, - "T13e: retrieval_full recency >= 1"); - } - // Custom recency_floor_turns: verify agentic propagates it. - for (int k = 1; k <= 10; ++k) { - RouterPolicyV2 pk = p; - pk.recency_floor_turns = k; - auto d = decide_v2(make_features(true, 50000, 1000), pk); - TEST_ASSERT_MSG(d.recency_floor_turns == k, - "T13f: agentic throttle recency must equal policy recency_floor_turns"); - } -} - -// ─── T14: DEGENERATE (v2) ──────────────────────────────────────────────────── -// prompt_tokens <= 0 or new_content_tokens < 0 → SAFE (no crash, no garbage). 
+// ─── T14: DEGENERATE ───────────────────────────────────────────────────────── +// prompt_tokens <= 0 → SAFE (no crash, no garbage). static void t14_v2_degenerate() { RouterPolicyV2 p = enabled_v2_policy(); // prompt_tokens = 0 { - auto d = decide_v2(make_features(true, 0, 500), p); + auto d = decide_v2(make_features(true, 0), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), "T14a: prompt_tokens=0 -> SAFE"); TEST_ASSERT_MSG(d.cascade, "T14a: degenerate -> cascade=true"); - TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll, - "T14a: degenerate -> recency keep-all"); TEST_ASSERT_MSG(std::string(d.reason) == "degenerate", "T14a: reason must be 'degenerate'"); } // prompt_tokens < 0 { - auto d = decide_v2(make_features(false, -1, 100), p); + auto d = decide_v2(make_features(false, -1), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), "T14b: negative prompt_tokens -> SAFE"); TEST_ASSERT_MSG(std::string(d.reason) == "degenerate", "T14b: reason must be 'degenerate'"); } - // new_content_tokens < 0 - { - auto d = decide_v2(make_features(true, 50000, -1), p); - TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), - "T14c: negative new_content_tokens -> SAFE"); - TEST_ASSERT_MSG(std::string(d.reason) == "degenerate", - "T14c: reason must be 'degenerate'"); - } // Both degenerate { - auto d = decide_v2(make_features(true, -5, -10), p); + auto d = decide_v2(make_features(true, -5), p); TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target), - "T14d: both degenerate -> SAFE"); + "T14c: negative agentic -> SAFE"); } } -// ─── T15: RECENCY_FLOOR_FOR — off ──────────────────────────────────────────── -// recency_floor_tokens == 0 → always 0 regardless of prompt size. 
- -static void t15_recency_floor_off() { - // 0 → off - TEST_ASSERT_MSG(recency_floor_for(0, 0) == 0, "T15a: S=0 R=0 -> 0"); - TEST_ASSERT_MSG(recency_floor_for(1000, 0) == 0, "T15b: S=1000 R=0 -> 0"); - TEST_ASSERT_MSG(recency_floor_for(100000, 0) == 0, "T15c: S=100K R=0 -> 0"); - // Negative R (shouldn't happen but must be safe) - TEST_ASSERT_MSG(recency_floor_for(10000, -2) == 0, "T15d: negative R (not sentinel) -> 0"); -} - -// ─── T16: RECENCY_FLOOR_FOR — auto ─────────────────────────────────────────── -// kRecencyFloorAuto (-1) → min(1024, ceil(0.04 * S)). - -static void t16_recency_floor_auto() { - const int A = kRecencyFloorAuto; - - // S=0: ceil(0.04*0)=0 - TEST_ASSERT_MSG(recency_floor_for(0, A) == 0, "T16a: S=0 auto -> 0"); - // S=1000: ceil(0.04*1000)=40 - TEST_ASSERT_MSG(recency_floor_for(1000, A) == 40, "T16b: S=1000 auto -> 40"); - // S=10000: ceil(0.04*10000)=400 - TEST_ASSERT_MSG(recency_floor_for(10000, A) == 400, "T16c: S=10K auto -> 400"); - // S=25000: ceil(0.04*25000)=1000 - TEST_ASSERT_MSG(recency_floor_for(25000, A) == 1000, "T16d: S=25K auto -> 1000"); - // S=25001: ceil(0.04*25001)=1001 but capped at 1024 - // actually 0.04*25001=1000.04 → ceil=1001 < 1024 → 1001 - TEST_ASSERT_MSG(recency_floor_for(25001, A) == 1001, "T16e: S=25001 auto -> 1001"); - // S=25600: 0.04*25600=1024.0 → ceil=1024 - TEST_ASSERT_MSG(recency_floor_for(25600, A) == 1024, "T16f: S=25600 auto -> 1024"); - // S=26000: 0.04*26000=1040 → ceil=1040 but capped at 1024 - TEST_ASSERT_MSG(recency_floor_for(26000, A) == 1024, "T16g: S=26000 auto -> cap 1024"); - // S=100000: 0.04*100000=4000 → capped at 1024 - TEST_ASSERT_MSG(recency_floor_for(100000, A) == 1024, "T16h: S=100K auto -> cap 1024"); - // S=-1: negative prompt treated as 0 → 0 - TEST_ASSERT_MSG(recency_floor_for(-1, A) == 0, "T16i: S=-1 auto -> 0"); -} +// ─── T18: detect_request_type — bool truth-table ───────────────────────────── +// +// Exhaustive 3-bit truth table: any true → Agentic, all false → Retrieval. 
+// No JSON dependency; the caller extracts bools at the handler boundary. -// ─── T17: RECENCY_FLOOR_FOR — explicit ─────────────────────────────────────── -// Any explicit positive value is returned unchanged (no prompt-size influence). - -static void t17_recency_floor_explicit() { - // Explicit override ignores prompt size - TEST_ASSERT_MSG(recency_floor_for(1000, 512) == 512, "T17a: explicit 512"); - TEST_ASSERT_MSG(recency_floor_for(100000, 512) == 512, "T17b: explicit 512, large S"); - TEST_ASSERT_MSG(recency_floor_for(1000, 1024) == 1024, "T17c: explicit 1024"); - TEST_ASSERT_MSG(recency_floor_for(1000, 2048) == 2048, "T17d: explicit 2048 > cap"); - TEST_ASSERT_MSG(recency_floor_for(0, 256) == 256, "T17e: explicit 256, S=0"); - // Monotone: explicit > auto at short prompts - const int A = kRecencyFloorAuto; - TEST_ASSERT_MSG(recency_floor_for(1000, 512) > recency_floor_for(1000, A), - "T17f: explicit 512 > auto(1000)=40"); +static void t18_detect_request_type() { + // All-false → Retrieval (safe default). + { + auto type = detect_request_type(false, false, false); + TEST_ASSERT_MSG(type == RequestType::Retrieval, + "T18a: all false -> Retrieval"); + } + // has_tools only → Agentic. + { + auto type = detect_request_type(true, false, false); + TEST_ASSERT_MSG(type == RequestType::Agentic, + "T18b: has_tools=true -> Agentic"); + } + // has_tool_use_blocks only → Agentic. + { + auto type = detect_request_type(false, true, false); + TEST_ASSERT_MSG(type == RequestType::Agentic, + "T18c: has_tool_use_blocks=true -> Agentic"); + } + // has_tool_calls only → Agentic. + { + auto type = detect_request_type(false, false, true); + TEST_ASSERT_MSG(type == RequestType::Agentic, + "T18d: has_tool_calls=true -> Agentic"); + } + // has_tools + has_tool_use_blocks → Agentic. 
+ { + auto type = detect_request_type(true, true, false); + TEST_ASSERT_MSG(type == RequestType::Agentic, + "T18e: has_tools + has_tool_use_blocks -> Agentic"); + } + // has_tools + has_tool_calls → Agentic. + { + auto type = detect_request_type(true, false, true); + TEST_ASSERT_MSG(type == RequestType::Agentic, + "T18f: has_tools + has_tool_calls -> Agentic"); + } + // has_tool_use_blocks + has_tool_calls → Agentic. + { + auto type = detect_request_type(false, true, true); + TEST_ASSERT_MSG(type == RequestType::Agentic, + "T18g: has_tool_use_blocks + has_tool_calls -> Agentic"); + } + // All true → Agentic. + { + auto type = detect_request_type(true, true, true); + TEST_ASSERT_MSG(type == RequestType::Agentic, + "T18h: all true -> Agentic"); + } } // ─── main ───────────────────────────────────────────────────────────────────── @@ -688,27 +281,14 @@ static void t17_recency_floor_explicit() { int main() { std::fprintf(stderr, "=== test_regime_router ===\n"); - RUN_TEST(t1_deploy_noop); - RUN_TEST(t2_degenerate); - RUN_TEST(t3_below_threshold); - RUN_TEST(t4_too_few_anchors); - RUN_TEST(t5_monotone); - RUN_TEST(t6_boundary); - RUN_TEST(t7_ratio_value); - - std::fprintf(stderr, "--- v2 ---\n"); RUN_TEST(t8_v2_deploy_noop); - RUN_TEST(t9_sparse_prompt_guard); RUN_TEST(t10_agentic_throttle); RUN_TEST(t11_retrieval_full); RUN_TEST(t12_v2_below_threshold); - RUN_TEST(t13_recency_floor_invariant); RUN_TEST(t14_v2_degenerate); - std::fprintf(stderr, "--- recency_floor_for ---\n"); - RUN_TEST(t15_recency_floor_off); - RUN_TEST(t16_recency_floor_auto); - RUN_TEST(t17_recency_floor_explicit); + std::fprintf(stderr, "--- detect_request_type ---\n"); + RUN_TEST(t18_detect_request_type); std::fprintf(stderr, "\n%d tests, %d failures\n", test_count, test_failures); return (test_failures == 0) ? 
0 : 1; From 8fc961b54f44a98d697899e47df27c04753c1ac6 Mon Sep 17 00:00:00 2001 From: dusterbloom <32869278+dusterbloom@users.noreply.github.com> Date: Sat, 30 May 2026 22:43:16 +0200 Subject: [PATCH 16/16] feat(pflash): empty-response guard + bandit floor reconciliation (task #10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the two validated pieces of the adaptive-keep path (the label-free quality-reward idea was dropped — Momus-confirmed it can't catch confident off-task). Default-OFF; router gates these to agentic-routed requests. - regime_router.h: two pure helpers (stdlib-only, TDD'd) — clamp_keep_to_floor(bandit_keep, router_floor, agentic): agentic effective keep = max(bandit_keep, floor) so the bandit's 0.20 ceiling can no longer silently undercut the router's 0.25 floor. compression_failed(tokens, degenerate_close, agentic_compressed, min=8): true on empty/degenerate output of an agentic compressed turn. - adaptive_keep_ratio.h: per-session recover_full_next flag (+ set/consume). - http_server.cpp: floor clamp at keep-apply; at the post-generate update site, on compression_failed → skip the bandit update (failure noise) and set the session to full keep for the next turn (deterministic recovery from the empty-response failure class, e.g. LONG_B t10). The guard threshold is compression_failed()'s default min_tokens=8; the call site passes no override, so a PFLASH_GUARD_MIN_TOKENS env knob is not wired up by this patch. - 59 standalone unit tests, -Werror. LIVE-VALIDATED on RTX 3090 (server up on :18097, 34K-token prompts): - type-gate: agentic→keep 0.250/cascade-off, retrieval→cascade-on. - guard recovery loop: turn1 compression_failed→full-keep-next (resp_tokens=13, bandit update skipped); turn2 same session recover_full_next consumed→keep 1.0. - floor clamp fired: agentic bandit 0.100 < floor 0.250 → 0.250. Launch config (24GB): GGML_CUDA_NO_VMM=1 + --max-ctx 49152 (139264 KV OOMs the 3090 — that was the pre-existing bad_alloc, not this change). Still default-OFF via PFLASH_ROUTER_ENABLE.
--- server/src/common/regime_router.h | 32 +++++++ server/src/server/adaptive_keep_ratio.h | 38 ++++++++- server/src/server/http_server.cpp | 69 ++++++++++++--- server/test/test_regime_router.cpp | 106 ++++++++++++++++++++++++ 4 files changed, 230 insertions(+), 15 deletions(-) diff --git a/server/src/common/regime_router.h b/server/src/common/regime_router.h index 426c0c985..4c03eff8f 100644 --- a/server/src/common/regime_router.h +++ b/server/src/common/regime_router.h @@ -68,6 +68,38 @@ inline RouterDecisionV2 decide_v2(const RequestFeatures& f, return { p.full_keep_target, true, "retrieval_full" }; } +// ─── PIECE 1: floor clamp ──────────────────────────────────────────────────── +// +// When the router routed a request as agentic, the bandit must not compress +// harder than the router's agentic_keep_target floor. Non-agentic sessions +// are passed through unchanged (bandit drives retrieval sessions freely). +// +// Pure, stdlib-only, no IO. +inline double clamp_keep_to_floor(double bandit_keep, + double router_floor, + bool agentic) { + if (!agentic) return bandit_keep; + return bandit_keep >= router_floor ? bandit_keep : router_floor; +} + +// ─── PIECE 2: compression failure guard ────────────────────────────────────── +// +// Returns true when a compressed agentic turn produced an empty or degenerate +// response. Used to skip the bandit update (failure noise) and schedule a +// full-keep recovery for the next turn. +// +// Fires ONLY on the agentic+compressed path — non-compressed failures are not +// our fault and do not need recovery. +// +// Pure, stdlib-only, no IO. +inline bool compression_failed(int response_tokens, + bool degenerate_close, + bool agentic_compressed, + int min_tokens = 8) { + if (!agentic_compressed) return false; + return response_tokens < min_tokens || degenerate_close; +} + // ─── TYPE GATE ─────────────────────────────────────────────────────────────── // // Coarse request-type classifier. 
Pure function — no IO, no globals, no JSON. diff --git a/server/src/server/adaptive_keep_ratio.h b/server/src/server/adaptive_keep_ratio.h index 959b87bce..36a815917 100644 --- a/server/src/server/adaptive_keep_ratio.h +++ b/server/src/server/adaptive_keep_ratio.h @@ -9,9 +9,10 @@ namespace dflash::common { struct AdaptiveKeepRatioState { - float ema = 0.0f; - float last_keep = 0.10f; - int turn_count = 0; + float ema = 0.0f; + float last_keep = 0.10f; + int turn_count = 0; + bool recover_full_next = false; // set by compression_failed guard; cleared after one turn }; constexpr float kBanditEmaAlpha = 0.7f; @@ -90,6 +91,37 @@ class HttpServerSessions { return it->second.state.turn_count; } + // Schedule full-keep recovery for the next turn of this session. + // Called by the compression_failed guard when an agentic compressed turn + // produced an empty or degenerate response. Creates the session entry if + // it does not exist yet (guard may fire before any bandit update). + void set_recover_full_next(const std::string& session_id) { + std::lock_guard lock(mu_); + auto it = map_.find(session_id); + if (it == map_.end()) { + evict_if_full_locked(); + lru_.push_front(session_id); + AdaptiveKeepRatioState s{}; + s.recover_full_next = true; + map_.emplace(session_id, Entry{s, lru_.begin()}); + } else { + it->second.state.recover_full_next = true; + lru_.splice(lru_.begin(), lru_, it->second.lru_it); + } + } + + // Returns true and clears the flag if recovery was scheduled; false otherwise. + // One-shot: the flag is consumed on read so the next turn runs normally. 
+ bool consume_recover_full_next(const std::string& session_id) { + std::lock_guard lock(mu_); + auto it = map_.find(session_id); + if (it == map_.end()) return false; + lru_.splice(lru_.begin(), lru_, it->second.lru_it); + if (!it->second.state.recover_full_next) return false; + it->second.state.recover_full_next = false; + return true; + } + size_t size() const { std::lock_guard lock(mu_); return map_.size(); diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index f2518ba1e..5818dfd1f 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -1187,6 +1187,7 @@ void HttpServer::worker_loop() { // If pflash is enabled and prompt exceeds threshold, compress. std::vector effective_prompt = req.prompt_tokens; bool pflash_compressed = false; + bool pflash_is_agentic = false; // hoisted for post-generate guard if (config_.pflash_mode != ServerConfig::PflashMode::OFF && drafter_tokenizer_ != nullptr) @@ -1258,6 +1259,7 @@ void HttpServer::worker_loop() { const bool is_agentic = (detect_request_type( _has_tools, _has_tool_use_blocks, _has_tool_calls) == RequestType::Agentic); + pflash_is_agentic = is_agentic; // hoist for post-generate guard const RequestFeatures rf { is_agentic, n_prompt @@ -1272,7 +1274,32 @@ void HttpServer::worker_loop() { if (req.session_id.empty()) { creq.keep_ratio = (float)rd.keep_target; } else { - creq.keep_ratio = sessions_.get_keep_ratio(req.session_id); + // PIECE 2: recover_full_next — one-shot full-keep recovery + // after a compression_failed turn. Consumed here (one turn). + if (!req.session_id.empty() && + sessions_.consume_recover_full_next(req.session_id)) { + creq.keep_ratio = (float)config_.pflash_router.full_keep_target; + std::fprintf(stderr, + "[pflash-guard] recover_full_next consumed — " + "session=%s full_keep=%.3f\n", + req.session_id.c_str(), creq.keep_ratio); + } else { + // PIECE 1: floor clamp — bandit must not undercut + // the router's agentic floor. 
+ float raw_keep = sessions_.get_keep_ratio(req.session_id); + creq.keep_ratio = (float)clamp_keep_to_floor( + raw_keep, + config_.pflash_router.agentic_keep_target, + is_agentic); + if (is_agentic && creq.keep_ratio > raw_keep) { + std::fprintf(stderr, + "[pflash-router] floor-clamp: " + "agentic bandit %.3f < floor %.3f → %.3f\n", + raw_keep, + config_.pflash_router.agentic_keep_target, + creq.keep_ratio); + } + } } // cascade = use_transitive: 0 = off, 1 = on, -1 = env default creq.use_transitive = rd.cascade ? 1 : 0; @@ -1620,18 +1647,36 @@ void HttpServer::worker_loop() { // doesn't grow monotonically across requests with different sizes. backend_.release_scratch(); - // Bandit: update when spec decode actually ran — including 0-accept case, - // which signals the current keep_ratio is too low. - if (!req.session_id.empty() && result.spec_decode_ran) { - float old_keep = sessions_.get_keep_ratio(req.session_id); - int old_turn = sessions_.turn_count(req.session_id); - sessions_.update(req.session_id, result.accept_rate); - float new_keep = sessions_.get_keep_ratio(req.session_id); - float ema = sessions_.get_ema(req.session_id); + // PIECE 2: compression failure guard — deterministic recovery. 
+ // When an agentic compressed turn produces an empty or degenerate response: + // (a) skip the bandit update (failure noise — don't reward/penalise) + // (b) schedule full-keep recovery for the next turn of this session + const bool agentic_compressed = pflash_is_agentic && pflash_compressed; + const int n_response_tokens = (int)result.tokens.size(); + if (!req.session_id.empty() && + compression_failed(n_response_tokens, result.degenerate_decode_close, + agentic_compressed)) { std::fprintf(stderr, - "[pflash-bandit] session=%s turn=%d keep=%.4f->%.4f ema=%.3f accept=%.3f\n", - req.session_id.c_str(), old_turn + 1, - old_keep, new_keep, ema, result.accept_rate); + "[pflash-guard] compression_failed → full-keep next: " + "session=%s resp_tokens=%d degenerate=%s\n", + req.session_id.c_str(), n_response_tokens, + result.degenerate_decode_close ? "true" : "false"); + sessions_.set_recover_full_next(req.session_id); + // No bandit update this turn: it lives in the else branch below and is skipped even if spec_decode_ran is true. + } else { + // Bandit: update when spec decode actually ran — including 0-accept case, + // which signals the current keep_ratio is too low. 
+ if (!req.session_id.empty() && result.spec_decode_ran) { + float old_keep = sessions_.get_keep_ratio(req.session_id); + int old_turn = sessions_.turn_count(req.session_id); + sessions_.update(req.session_id, result.accept_rate); + float new_keep = sessions_.get_keep_ratio(req.session_id); + float ema = sessions_.get_ema(req.session_id); + std::fprintf(stderr, + "[pflash-bandit] session=%s turn=%d keep=%.4f->%.4f ema=%.3f accept=%.3f\n", + req.session_id.c_str(), old_turn + 1, + old_keep, new_keep, ema, result.accept_rate); + } } diff --git a/server/test/test_regime_router.cpp b/server/test/test_regime_router.cpp index b45370f99..215145f90 100644 --- a/server/test/test_regime_router.cpp +++ b/server/test/test_regime_router.cpp @@ -276,6 +276,108 @@ static void t18_detect_request_type() { } } +// ─── T19: clamp_keep_to_floor ──────────────────────────────────────────────── +// agentic=true → effective keep = max(bandit_keep, router_floor) +// agentic=false → pass through bandit_keep unchanged +// bandit_keep > floor → no clamping even for agentic + +static void t19_clamp_keep_to_floor() { + // Agentic + bandit below floor → clamped up to floor. + { + double result = clamp_keep_to_floor(0.10, 0.25, /*agentic=*/true); + TEST_ASSERT_MSG(approx_eq(result, 0.25), + "T19a: agentic, bandit 0.10 < floor 0.25 -> clamped to 0.25"); + } + // Agentic + bandit == floor → returns floor. + { + double result = clamp_keep_to_floor(0.25, 0.25, /*agentic=*/true); + TEST_ASSERT_MSG(approx_eq(result, 0.25), + "T19b: agentic, bandit == floor -> 0.25"); + } + // Agentic + bandit above floor → no clamping (bandit wins). + { + double result = clamp_keep_to_floor(0.30, 0.25, /*agentic=*/true); + TEST_ASSERT_MSG(approx_eq(result, 0.30), + "T19c: agentic, bandit 0.30 > floor 0.25 -> 0.30 (bandit wins)"); + } + // Non-agentic → pass through, even if below floor. 
+ { + double result = clamp_keep_to_floor(0.05, 0.25, /*agentic=*/false); + TEST_ASSERT_MSG(approx_eq(result, 0.05), + "T19d: non-agentic -> 0.05 passed through unchanged"); + } + // Non-agentic, bandit above floor → pass through. + { + double result = clamp_keep_to_floor(0.50, 0.25, /*agentic=*/false); + TEST_ASSERT_MSG(approx_eq(result, 0.50), + "T19e: non-agentic, bandit above floor -> 0.50 passed through"); + } + // Agentic, bandit=0.0 (minimum possible) → clamped to floor. + { + double result = clamp_keep_to_floor(0.0, 0.25, /*agentic=*/true); + TEST_ASSERT_MSG(approx_eq(result, 0.25), + "T19f: agentic, bandit=0.0 -> clamped to floor 0.25"); + } +} + +// ─── T20: compression_failed truth table ───────────────────────────────────── +// Returns true iff agentic_compressed && (response_tokens < min_tokens || degenerate_close). +// When not agentic_compressed, always false. + +static void t20_compression_failed() { + // agentic_compressed=true, response_tokens < min_tokens → failed. + { + bool result = compression_failed(/*response_tokens=*/3, /*degenerate_close=*/false, + /*agentic_compressed=*/true, /*min_tokens=*/8); + TEST_ASSERT_MSG(result, "T20a: agentic, 3 tokens < 8 min -> failed=true"); + } + // agentic_compressed=true, response_tokens == min_tokens-1 → failed. + { + bool result = compression_failed(7, false, true, 8); + TEST_ASSERT_MSG(result, "T20b: agentic, 7 < 8 -> failed=true"); + } + // agentic_compressed=true, response_tokens == min_tokens → NOT failed. + { + bool result = compression_failed(8, false, true, 8); + TEST_ASSERT_MSG(!result, "T20c: agentic, 8 == 8 -> failed=false"); + } + // agentic_compressed=true, response_tokens > min_tokens → NOT failed (normal). + { + bool result = compression_failed(100, false, true, 8); + TEST_ASSERT_MSG(!result, "T20d: agentic, 100 tokens, normal -> failed=false"); + } + // agentic_compressed=true, degenerate_close=true (even with enough tokens) → failed. 
+ { + bool result = compression_failed(50, /*degenerate_close=*/true, true, 8); + TEST_ASSERT_MSG(result, "T20e: agentic, degenerate_close -> failed=true"); + } + // agentic_compressed=true, both degenerate + empty → failed. + { + bool result = compression_failed(0, true, true, 8); + TEST_ASSERT_MSG(result, "T20f: agentic, 0 tokens + degenerate -> failed=true"); + } + // agentic_compressed=false, even with empty response → NOT failed (not our fault). + { + bool result = compression_failed(0, false, /*agentic_compressed=*/false, 8); + TEST_ASSERT_MSG(!result, "T20g: not agentic_compressed, empty -> failed=false"); + } + // agentic_compressed=false, degenerate_close=true → NOT failed (guard only fires on compression path). + { + bool result = compression_failed(0, true, false, 8); + TEST_ASSERT_MSG(!result, "T20h: not agentic_compressed, degenerate -> failed=false"); + } + // Default min_tokens=8: verify default is honoured. + { + bool result = compression_failed(5, false, true); + TEST_ASSERT_MSG(result, "T20i: agentic, 5<8 with default min_tokens -> failed=true"); + } + // Default min_tokens=8: 8 tokens → not failed. + { + bool result = compression_failed(8, false, true); + TEST_ASSERT_MSG(!result, "T20j: agentic, 8 tokens with default min_tokens -> failed=false"); + } +} + // ─── main ───────────────────────────────────────────────────────────────────── int main() { @@ -290,6 +392,10 @@ int main() { std::fprintf(stderr, "--- detect_request_type ---\n"); RUN_TEST(t18_detect_request_type); + std::fprintf(stderr, "--- floor clamp + compression_failed ---\n"); + RUN_TEST(t19_clamp_keep_to_floor); + RUN_TEST(t20_compression_failed); + std::fprintf(stderr, "\n%d tests, %d failures\n", test_count, test_failures); return (test_failures == 0) ? 0 : 1; }