From d4546a5105a09971178478a1663a2a60af14f15c Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Wed, 27 May 2026 09:08:00 +0200
Subject: [PATCH 01/16] feat(pflash): ee7 early-exit drafter +
 anchor-transitive cascade + bug-42 tail-capture guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ee7 truncates drafter forward at layer 7 of 28, scoring only those layers.
9.3× faster drafter wall time at 128K (RTX 3090, Qwen3.6-27B-Q4_K_M target + Qwen2.5-0.5B-BF16 drafter).
Anchor-transitive cascade rescues multi-hop on bimodal-density prompts (gated, default OFF).
Bug #42 fix: tail-capture view-bounds guard at S%4096 in {1..7}.

5 unit tests included. Bench scripts split to follow-up PR.
---
 server/CMakeLists.txt                         |  39 ++
 server/src/common/score_range.h               |  48 +++
 server/src/qwen3/anchor_scan.cpp              | 169 +++++++++
 server/src/qwen3/anchor_scan.h                |  42 +++
 server/src/qwen3/qwen3_drafter.cpp            | 103 ++---
 server/src/qwen3/qwen3_graph.cpp              | 107 ++++--
 server/src/qwen3/qwen3_loader.cpp             |  12 +
 server/test/test_anchor_transitive.cpp        | 355 ++++++++++++++++++
 .../test_drafter_early_exit_score_range.cpp   | 108 ++++++
 .../test/test_drafter_tail_capture_guard.cpp  | 128 +++++++
 .../test_drafter_warm_path_regression.cpp     | 164 ++++++++
 11 files changed, 1201 insertions(+), 74 deletions(-)
 create mode 100644 server/src/common/score_range.h
 create mode 100644 server/src/qwen3/anchor_scan.cpp
 create mode 100644 server/src/qwen3/anchor_scan.h
 create mode 100644 server/test/test_anchor_transitive.cpp
 create mode 100644 server/test/test_drafter_early_exit_score_range.cpp
 create mode 100644 server/test/test_drafter_tail_capture_guard.cpp
 create mode 100644 server/test/test_drafter_warm_path_regression.cpp

diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index 345ee8aee..cfdc22937 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -217,6 +217,7 @@ add_library(dflash_common STATIC
     src/draft/draft_gguf_loader.cpp
     src/draft/draft_safetensors_loader.cpp
     src/draft/draft_graph.cpp
+    src/qwen3/anchor_scan.cpp
     src/qwen3/qwen3_drafter.cpp
     src/qwen3/qwen3_loader.cpp
     src/qwen3/qwen3_graph.cpp
@@ -572,6 +573,44 @@ if(DFLASH27B_TESTS)
         target_link_libraries(test_bandit_integration PRIVATE dflash_common)
         add_test(NAME bandit_integration COMMAND test_bandit_integration)
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_early_exit_score_range.cpp")
+        add_executable(test_drafter_early_exit_score_range
+            test/test_drafter_early_exit_score_range.cpp)
+        target_include_directories(test_drafter_early_exit_score_range PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
+        add_test(NAME test_drafter_early_exit_score_range
+            COMMAND test_drafter_early_exit_score_range)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_anchor_transitive.cpp")
+        add_executable(test_anchor_transitive
+            test/test_anchor_transitive.cpp
+            src/qwen3/anchor_scan.cpp)
+        target_include_directories(test_anchor_transitive PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/qwen3)
+        add_test(NAME test_anchor_transitive
+            COMMAND test_anchor_transitive)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_warm_path_regression.cpp")
+        add_executable(test_drafter_warm_path_regression
+            test/test_drafter_warm_path_regression.cpp)
+        target_include_directories(test_drafter_warm_path_regression PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
+        add_test(NAME test_drafter_warm_path_regression
+            COMMAND test_drafter_warm_path_regression)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_drafter_tail_capture_guard.cpp")
+        # GREEN phase: built with TAIL_GUARD_USE_NEW_FORMULA — must pass after Bug #42 fix.
+        add_executable(test_drafter_tail_capture_guard
+            test/test_drafter_tail_capture_guard.cpp)
+        target_compile_definitions(test_drafter_tail_capture_guard PRIVATE
+            TAIL_GUARD_USE_NEW_FORMULA)
+        add_test(NAME test_drafter_tail_capture_guard
+            COMMAND test_drafter_tail_capture_guard)
+        # RED phase binary: same source WITHOUT the fix flag — documents the bug.
+        add_executable(test_drafter_tail_capture_guard_red
+            test/test_drafter_tail_capture_guard.cpp)
+        # No TAIL_GUARD_USE_NEW_FORMULA — old (buggy) guard, expected to FAIL; deliberately not registered with add_test, so CTest never runs it.
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp")
         add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp)
         target_link_libraries(test_draft_vs_reference PRIVATE dflash_common)
diff --git a/server/src/common/score_range.h b/server/src/common/score_range.h
new file mode 100644
index 000000000..1ad137207
--- /dev/null
+++ b/server/src/common/score_range.h
@@ -0,0 +1,48 @@
+// Pure helper: compute the [score_layer_start, score_layer_end) range for
+// tail-attention scoring given the forward-pass layer limit and the optional
+// SCORE_LAYERS count.
+//
+// Parameters:
+//   n_layer        - total number of layers in the model (e.g. 28)
+//   score_layers   - value of PFLASH_DRAFTER_SCORE_LAYERS (-1 = all)
+//   fwd_layer_limit - number of layers actually computed (== early_exit_n when
+//                    early-exit is active, else n_layer)
+//
+// Semantics: SCORE_LAYERS is interpreted as "how many of the computed layers
+// to score", counted from the END of the forward range [0, fwd_layer_limit).
+// This way SCORE_LAYERS=7 with early_exit_n=7 scores layers [0,7) instead of
+// producing the empty interval [7,7) that the old code yielded.
+#pragma once
+
+#include <algorithm>
+
+namespace dflash::common {
+
+struct ScoreRange {
+    int start; // first scored layer index (inclusive)
+    int end;   // one past the last scored layer index (exclusive)
+    int count() const { return end - start; }
+    bool empty() const { return start >= end; }
+};
+
+// Compute the scoring layer range [start, end), where end == fwd_layer_limit.
+// SCORE_LAYERS selects the LAST `score_layers` layers of the computed range
+// [0, fwd_layer_limit): start = fwd_layer_limit - min(score_layers, fwd_layer_limit).
+inline ScoreRange compute_score_range(int n_layer, int score_layers, int fwd_layer_limit) {
+    // score_layers <= 0 (or >= n_layer) means "use all computed layers".
+    const int effective_n = fwd_layer_limit;
+    int start;
+    if (score_layers > 0 && score_layers < n_layer) {
+        // Clamp: can't request more layers than were computed.
+        int want = std::min(score_layers, effective_n);
+        start = effective_n - want;
+    } else {
+        start = 0;
+    }
+    int end = fwd_layer_limit;
+    // Clamp start to never exceed end (only reachable when fwd_layer_limit < 0).
+    if (start > end) start = end;
+    return { start, end };
+}
+
+} // namespace dflash::common
diff --git a/server/src/qwen3/anchor_scan.cpp b/server/src/qwen3/anchor_scan.cpp
new file mode 100644
index 000000000..e0088167a
--- /dev/null
+++ b/server/src/qwen3/anchor_scan.cpp
@@ -0,0 +1,169 @@
+#include "anchor_scan.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+
+namespace dflash::qwen3 {
+
+// Mark `chunk` and every chunk within `radius` of it, clamped to [0, n_chunks), as forced.
+static void force_neighborhood(std::vector<uint8_t>& forced, int n_chunks,
+                                int chunk, int radius) {
+    const int first = (chunk - radius > 0) ? chunk - radius : 0;
+    const int last  = (chunk + radius < n_chunks - 1) ? chunk + radius : n_chunks - 1;
+    for (int idx = first; idx <= last; ++idx) forced[static_cast<size_t>(idx)] = 1;
+}
+
+// Marks the chunks of every body n-gram that also occurs in `query_pool`.
+// - Only n-grams lying entirely inside ids[0, body_end) are compared, so a body
+//   shorter than cfg.ngram (or a non-positive ngram/chunk_size) forces nothing and
+//   never reads past body_end or ids.end().
+// - A query n-gram with more than cfg.max_anchor_hits body occurrences is ignored;
+//   at most 8 hit positions are recorded per n-gram.
+void scan_and_force(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& query_pool,
+    const AnchorScanCfg& cfg,
+    std::vector<uint8_t>& forced)
+{
+    constexpr int kMaxRecorded = 8;  // capacity of hit_pos; further hits are only counted
+    const int n_chunks = (int)forced.size();
+    const int ngram    = cfg.ngram;
+    const int body_len = std::min(body_end, (int)ids.size());
+    // Degenerate inputs: nothing can match, and chunk_size <= 0 would divide by zero.
+    if (ngram <= 0 || cfg.chunk_size <= 0 || body_len < ngram) return;
+    const int last_start = body_len - ngram;  // last position where a full n-gram fits
+
+    for (int qi = 0; qi + ngram <= (int)query_pool.size(); ++qi) {
+        int hits = 0;
+        int hit_pos[kMaxRecorded];
+        for (int p = 0; p <= last_start && hits <= cfg.max_anchor_hits; ++p) {
+            if (!std::equal(query_pool.begin() + qi, query_pool.begin() + qi + ngram,
+                            ids.begin() + p)) continue;
+            if (hits < kMaxRecorded) hit_pos[hits] = p;
+            ++hits;
+        }
+        if (hits == 0 || hits > cfg.max_anchor_hits) continue;
+        for (int i = 0; i < hits && i < kMaxRecorded; ++i) {
+            force_neighborhood(forced, n_chunks, hit_pos[i] / cfg.chunk_size, cfg.anchor_radius);
+        }
+    }
+}
+
+// Returns how many entries of `forced` are non-zero.
+static int count_set(const std::vector<uint8_t>& forced) {
+    const auto is_set = [](uint8_t flag) { return flag != 0; };
+    const auto total  = std::count_if(forced.begin(), forced.end(), is_set);
+    return static_cast<int>(total);
+}
+
+// Transitive anchor scan.  After the initial n-gram pass, each iteration follows
+// rare-token links out of the newly forced chunks, appends the tokens of the chunks
+// that step forced to the query pool, and re-runs scan_and_force -- until nothing
+// changes, max_iters is reached, or cfg.max_forced_count is exceeded (rolled back).
+void scan_and_force_transitive(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& initial_query_pool,
+    const AnchorScanCfg& cfg,
+    int max_iters,
+    std::vector<uint8_t>& forced)
+{
+    auto pool = initial_query_pool;
+    const int n_chunks = (int)forced.size();
+    // Clamp so every ids[] access is in range; chunk_size <= 0 would divide by zero.
+    body_end = std::max(0, std::min(body_end, (int)ids.size()));
+    if (cfg.chunk_size <= 0) return;
+
+    // Token frequencies in the body, computed once.
+    std::unordered_map<int32_t, int> body_freq;
+    body_freq.reserve((size_t)body_end);
+    for (int j = 0; j < body_end; ++j) ++body_freq[ids[(size_t)j]];
+
+    // Inverted index: rare token -> body positions.  Stays empty when rare matching is off.
+    std::unordered_map<int32_t, std::vector<int>> rare_positions;
+    if (cfg.rare_token_max_freq > 0) {
+        for (const auto& kv : body_freq) {
+            if (kv.second <= cfg.rare_token_max_freq) rare_positions[kv.first] = {};
+        }
+        for (int p = 0; p < body_end; ++p) {
+            auto it = rare_positions.find(ids[(size_t)p]);
+            if (it != rare_positions.end()) it->second.push_back(p);
+        }
+    }
+
+    // Pass-1: run the initial scan.
+    const int count_before_pass1 = count_set(forced);
+    scan_and_force(ids, body_end, pool, cfg, forced);
+    const int gained_pass1 = count_set(forced) - count_before_pass1;
+
+    // Gating: if pass-1 already found many anchors, skip the cascade entirely.
+    if (cfg.cascade_min_anchor_count > 0 && gained_pass1 >= cfg.cascade_min_anchor_count) return;
+
+    // `rare_seen` is the forced set as it stood before the latest n-gram re-scan.  It
+    // starts all-zero so iteration 0 seeds the worklist from everything forced so far;
+    // later iterations seed from the chunks the preceding re-scan added.  The worklist
+    // must be built BEFORE `prev_forced` is refreshed: diffing against a snapshot taken
+    // in the same iteration is always empty and would skip the rare pass after iteration 0.
+    std::vector<uint8_t> rare_seen(forced.size(), 0);
+    std::vector<uint8_t> prev_forced;
+    for (int it = 0; it < max_iters; ++it) {
+        std::vector<int> worklist;
+        if (cfg.rare_token_max_freq > 0) {
+            for (int c = 0; c < n_chunks; ++c) {
+                if (forced[c] && !rare_seen[c]) worklist.push_back(c);
+            }
+        }
+        prev_forced = forced;  // rollback / convergence snapshot for this iteration
+
+        // Rare-token single-match: worklist-driven so cascades within a pass are
+        // caught (e.g. hop3 forces hop2 which forces hop1 in one outer iteration).
+        for (size_t wi = 0; wi < worklist.size(); ++wi) {
+            const int c = worklist[wi];
+            const int s = c * cfg.chunk_size;
+            const int e = std::min(body_end, (c + 1) * cfg.chunk_size);
+            for (int j = s; j < e; ++j) {
+                auto it2 = rare_positions.find(ids[(size_t)j]);
+                if (it2 == rare_positions.end()) continue;
+                for (int p : it2->second) {
+                    const int target_c = p / cfg.chunk_size;
+                    // Skip positions outside the chunk table and chunks already forced.
+                    if (target_c >= n_chunks || forced[(size_t)target_c]) continue;
+                    force_neighborhood(forced, n_chunks, target_c, cfg.anchor_radius);
+                    worklist.push_back(target_c);
+                }
+            }
+        }
+
+        // Hard cap: if we exceeded max_forced_count, revert this iteration and stop.
+        if (count_set(forced) > cfg.max_forced_count) {
+            forced = prev_forced;
+            break;
+        }
+
+        if (forced == prev_forced) break;
+
+        // Expand pool with tokens from newly-forced chunks (feeds the n-gram re-scan).
+        for (int c = 0; c < n_chunks; ++c) {
+            if (forced[c] && !prev_forced[c]) {
+                int s = c * cfg.chunk_size;
+                int e = std::min((int)ids.size(), (c + 1) * cfg.chunk_size);
+                for (int j = s; j < e; ++j) pool.push_back(ids[j]);
+            }
+        }
+
+        // n-gram scan with the expanded pool; its additions seed the next rare pass.
+        rare_seen = forced;
+        scan_and_force(ids, body_end, pool, cfg, forced);
+
+        // Hard cap check after the n-gram expansion too.
+        if (count_set(forced) > cfg.max_forced_count) {
+            forced = rare_seen;
+            break;
+        }
+    }
+}
+
+} // namespace dflash::qwen3
diff --git a/server/src/qwen3/anchor_scan.h b/server/src/qwen3/anchor_scan.h
new file mode 100644
index 000000000..8f75a0855
--- /dev/null
+++ b/server/src/qwen3/anchor_scan.h
@@ -0,0 +1,42 @@
+// N-gram anchor scan: mark chunks forced by token-match between a query pool
+// and the body of an ids sequence.  Pure CPU, no GPU, no model required.
+#pragma once
+
+#include <climits>
+#include <cstdint>
+#include <vector>
+
+namespace dflash::qwen3 {
+
+struct AnchorScanCfg {
+    int chunk_size;                     // tokens per chunk; body position p belongs to chunk p / chunk_size
+    int anchor_radius;                  // chunks forced on each side of a matching chunk
+    int max_anchor_hits;                // n-grams with more body hits than this are ignored
+    int ngram = 4;                      // n-gram length in tokens
+    int rare_token_max_freq = 8;        // tokens appearing <= this many times in body count as rare (<= 0 disables rare-token matching)
+    int cascade_min_anchor_count = 0;   // skip cascade if pass-1 forced >= this many chunks (0 = always cascade)
+    int max_forced_count = INT_MAX;     // hard cap on total forced chunks; a cascade step exceeding it is reverted
+};
+
+// Marks chunks forced by ngram-matches between query_pool and ids[0..body_end).
+// `forced` is in-out; new hits are OR-merged.  Idempotent.
+void scan_and_force(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& query_pool,
+    const AnchorScanCfg& cfg,
+    std::vector<uint8_t>& forced
+);
+
+// Transitive variant: after the initial scan, follows rare-token links and re-runs
+// scan_and_force with an expanded pool until a fixed point, max_iters, or cfg.max_forced_count.
+void scan_and_force_transitive(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& initial_query_pool,
+    const AnchorScanCfg& cfg,
+    int max_iters,
+    std::vector<uint8_t>& forced
+);
+
+} // namespace dflash::qwen3
diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
index f65cb079f..589351f9c 100644
--- a/server/src/qwen3/qwen3_drafter.cpp
+++ b/server/src/qwen3/qwen3_drafter.cpp
@@ -17,6 +17,7 @@
 #include "qwen3_drafter_model.h"
 #include "common/backend_precision.h"
 #include "internal.h"
+#include "anchor_scan.h"
 
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -64,6 +65,13 @@ static int env_int(const char * name, int fallback) {
     return fallback;
 }
 
+// Reads environment variable `name` as a float; returns `def` when unset or unparsable.
+static float env_float(const char * name, float def) {
+    const char * raw = std::getenv(name);
+    if (raw == nullptr) return def;
+    try { return std::stof(raw); } catch (...) { return def; }
+}
+
 static void force_chunk_neighborhood(std::vector<uint8_t> & forced, int n_chunks,
                                      int chunk, int radius) {
     int lo = std::max(0, chunk - radius);
@@ -548,33 +556,34 @@ static std::vector<int32_t> qwen35_score_and_compress(
         for (int c = std::max(0, n_chunks - t_n); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; }
     }
 
-    const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96);
-    const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2);
-    const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
-    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+    const int query_tokens        = env_int("DFLASH_COMPRESS_QUERY_TOKENS",   96);
+    const int anchor_radius       = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS",   2);
+    const int max_anchor_hits     = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
+    const int anchor_ngram        = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM",    4);
+    const int rare_token_max_freq = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ",   2);
+
+    const float cascade_min_anchor_frac = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f);
+    const float max_forced_ratio        = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO",        1.3f);
 
     const int q0 = std::max(0, S - query_tokens);
-    constexpr int NGRAM = 4;
-    for (int q = q0; q + NGRAM <= S; ++q) {
-        int hits = 0;
-        int hit_pos[8];
-        const int search_end = std::max(0, q0 - NGRAM);
-        for (int p = 0; p <= search_end && hits <= max_anchor_hits; ++p) {
-            bool same = true;
-            for (int k = 0; k < NGRAM; ++k) {
-                if (ids[(size_t)p + k] != ids[(size_t)q + k]) { same = false; break; }
-            }
-            if (same) {
-                if (hits < 8) hit_pos[hits] = p;
-                ++hits;
-            }
-        }
-        if (hits > 0 && hits <= max_anchor_hits) {
-            for (int i = 0; i < hits && i < 8; ++i) {
-                force_chunk_neighborhood(forced, n_chunks, hit_pos[i] / chunk_size, anchor_radius);
-            }
-        }
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+
+    dflash::qwen3::AnchorScanCfg anchor_cfg{chunk_size, anchor_radius,
+                                             max_anchor_hits, anchor_ngram,
+                                             rare_token_max_freq};
+    anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
+    anchor_cfg.max_forced_count         = (int)(max_forced_ratio * n_keep);
+
+    const bool use_transitive = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0;
+    const int  max_iters      = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS",  3);
+    if (use_transitive) {
+        dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                                  anchor_cfg, max_iters, forced);
+    } else {
+        dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced);
     }
+
     for (int c = 0; c < n_chunks; ++c) {
         if (forced[(size_t)c] && !selected[(size_t)c]) {
             selected[(size_t)c] = 1;
@@ -740,34 +749,36 @@ std::vector<int32_t> drafter_score_and_compress(
         head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw));
         tail_chunks = std::max(0, budget - head_chunks);
     }
-    const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96);
-    const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2);
-    const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
+    const int query_tokens        = env_int("DFLASH_COMPRESS_QUERY_TOKENS",   96);
+    const int anchor_radius       = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS",   2);
+    const int max_anchor_hits     = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
+    const int anchor_ngram        = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM",    4);
+    const int rare_token_max_freq = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ",   2);
+
+    const float cascade_min_anchor_frac = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f);
+    const float max_forced_ratio        = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO",        1.3f);
+
     std::vector<uint8_t> selected_mask((size_t)n_chunks, 0);
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
     for (int c = 0; c < std::min(n_chunks, head_chunks); ++c) forced[(size_t)c] = 1;
     for (int c = std::max(0, n_chunks - tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1;
 
     const int q0 = std::max(0, S - query_tokens);
-    constexpr int NGRAM = 4;
-    for (int q = q0; q + NGRAM <= S; ++q) {
-        int hits = 0;
-        int hit_pos[8];
-        const int search_end = std::max(0, q0 - NGRAM);
-        for (int p = 0; p <= search_end && hits <= max_anchor_hits; ++p) {
-            bool same = true;
-            for (int k = 0; k < NGRAM; ++k) {
-                if (ids[(size_t)p + k] != ids[(size_t)q + k]) { same = false; break; }
-            }
-            if (same) {
-                if (hits < 8) hit_pos[hits] = p;
-                ++hits;
-            }
-        }
-        if (hits > 0 && hits <= max_anchor_hits) {
-            for (int i = 0; i < hits && i < 8; ++i) {
-                force_chunk_neighborhood(forced, n_chunks, hit_pos[i] / chunk_size, anchor_radius);
-            }
+    {
+        std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+        dflash::qwen3::AnchorScanCfg anchor_cfg{chunk_size, anchor_radius,
+                                                 max_anchor_hits, anchor_ngram,
+                                                 rare_token_max_freq};
+        anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
+        anchor_cfg.max_forced_count         = (int)(max_forced_ratio * n_keep);
+
+        const bool use_transitive = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0;
+        const int  max_iters      = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS",  3);
+        if (use_transitive) {
+            dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                                      anchor_cfg, max_iters, forced);
+        } else {
+            dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced);
         }
     }
 
diff --git a/server/src/qwen3/qwen3_graph.cpp b/server/src/qwen3/qwen3_graph.cpp
index a23bcefb3..858bcd75e 100644
--- a/server/src/qwen3/qwen3_graph.cpp
+++ b/server/src/qwen3/qwen3_graph.cpp
@@ -35,6 +35,7 @@
 #include "qwen3_drafter_model.h"
 #include "internal.h"
 #include "flashprefill.h"
+#include "../common/score_range.h"
 
 #include "device_runtime.h"
 
@@ -249,13 +250,39 @@ bool forward_qwen3_drafter_model(
     }
     running_max.assign((size_t)n_lookahead * S, -INFINITY);
 
+    // Compute score_layer_start early so we can avoid allocating K_norope/Q_norope
+    // for layers that will never be used in scoring.  At S=128K the full K_norope
+    // allocation is ~5.6 GB (21 unused layers × 268 MB) — skipping it keeps total
+    // VRAM under 24 GB and eliminates the warm-path regression (A_compute 5.4x).
+    static const int score_layers_pre = []() -> int {
+        const char * e = std::getenv("PFLASH_DRAFTER_SCORE_LAYERS");
+        if (e) { int v = std::atoi(e); if (v > 0) return v; }
+        return -1;
+    }();
+    static const int early_exit_pre = []() -> int {
+        const char * e = std::getenv("PFLASH_DRAFTER_EARLY_EXIT_N");
+        if (e) { int v = std::atoi(e); if (v > 0) return v; }
+        return -1;
+    }();
+    // fwd_layer_limit_pre mirrors the fwd_layer_limit computed later in the loop.
+    const int fwd_layer_limit_pre = (early_exit_pre > 0 && early_exit_pre < w.n_layer)
+        ? early_exit_pre : w.n_layer;
+    // Use compute_score_range (same formula as the scoring loop) so the pre-alloc
+    // boundary is guaranteed to match the actual scoring boundary.
+    const ScoreRange pre_range = compute_score_range(w.n_layer, score_layers_pre, fwd_layer_limit_pre);
+    const int score_layer_start_pre = pre_range.start;
+    // Number of layers that participate in scoring (and need K_norope/Q_norope).
+    const int n_score_layers = pre_range.count();
+
     PersBuf hidden_buf, pos_buf, mask_tail_buf, Q_buf, attn_out_buf;
     std::vector<PersBuf> K_curr_v((size_t)w.n_layer);
     std::vector<PersBuf> V_curr_v((size_t)w.n_layer);
     std::vector<PersBuf> Q_last_v((size_t)w.n_layer);
-    // NoPE: pre-RoPE K (full sequence) and Q tail; allocated only when nope_tail.
-    std::vector<PersBuf> K_norope_v(nope_tail ? (size_t)w.n_layer : 0);
-    std::vector<PersBuf> Q_norope_v(nope_tail ? (size_t)w.n_layer : 0);
+    // NoPE: only allocate K_norope/Q_norope for layers that will be scored.
+    // When score_layer_start_pre > 0 this trims up to 21 × 268 MB = 5.6 GB,
+    // preventing the VRAM overflow that causes the warm-path regression at 128K.
+    std::vector<PersBuf> K_norope_v(nope_tail ? (size_t)n_score_layers : 0);
+    std::vector<PersBuf> Q_norope_v(nope_tail ? (size_t)n_score_layers : 0);
     auto cleanup_all = [&]() {
         free_pers(hidden_buf);
         free_pers(pos_buf);
@@ -294,9 +321,10 @@ bool forward_qwen3_drafter_model(
                 cleanup_all();
                 return false;
             }
-            if (nope_tail) {
-                if (!make_pers(w.backend, half_type, 3, d_kv, K_norope_v[il]) ||
-                    !make_pers(w.backend, GGML_TYPE_F32, 3, d_ql, Q_norope_v[il])) {
+            if (nope_tail && il >= score_layer_start_pre && il < fwd_layer_limit_pre) {
+                const int si = il - score_layer_start_pre;
+                if (!make_pers(w.backend, half_type, 3, d_kv, K_norope_v[si]) ||
+                    !make_pers(w.backend, GGML_TYPE_F32, 3, d_ql, Q_norope_v[si])) {
                     set_last_error("forward_qwen3: K_norope/Q_norope alloc failed at layer " + std::to_string(il));
                     cleanup_all();
                     return false;
@@ -352,6 +380,10 @@ bool forward_qwen3_drafter_model(
         ggml_free(gctx);
     }
 
+    // PFLASH_DRAFTER_EARLY_EXIT_N: already read into early_exit_pre above.
+    // Alias used in the forward-loop limit below.
+    const int & early_exit_n = early_exit_pre;
+
     // Per-layer A→FA→B loop.
     ggml_gallocr_t galloc = ggml_gallocr_new(
         ggml_backend_get_default_buffer_type(w.backend));
@@ -372,7 +404,10 @@ bool forward_qwen3_drafter_model(
     double t_b_warm = 0.0, t_b_setup = 0.0, t_b_alloc = 0.0, t_b_copy_in = 0.0, t_b_norm = 0.0, t_compute_b = 0.0, t_b_copy_out = 0.0;
     double t_fp = 0.0;
 
-    for (int il = 0; il < w.n_layer; ++il) {
+    const int fwd_layer_limit = (early_exit_n > 0 && early_exit_n < w.n_layer)
+        ? early_exit_n : w.n_layer;
+
+    for (int il = 0; il < fwd_layer_limit; ++il) {
         const auto & L = w.layers[il];
         const bool debug_first_layer = (il == 0 && std::getenv("DFLASH_FP_DEBUG_LAYER0") != nullptr);
 
@@ -411,19 +446,22 @@ bool forward_qwen3_drafter_model(
 
             ggml_tensor * Q = ggml_mul_mat(gA, L.wq, h_norm);
             Q = ggml_reshape_3d(gA, Q, D, H, cl);
-            Q = ggml_rms_norm(gA, Q, eps);
-            Q = ggml_mul(gA, Q, L.q_norm);
-            // NoPE: capture pre-RoPE Q tail so the tail scorer is not biased by distance.
-            if (nope_tail) {
+            if (L.q_norm) {
+                Q = ggml_rms_norm(gA, Q, eps);
+                Q = ggml_mul(gA, Q, L.q_norm);
+            }
+            // NoPE: capture pre-RoPE Q tail (only for layers that will be scored).
+            if (nope_tail && il >= score_layer_start_pre) {
+                const int si = il - score_layer_start_pre;
                 const int tail_lo_nr = S - n_lookahead;
-                if (tail_lo_nr >= cs && tail_lo_nr < cs + cl) {
+                if (tail_lo_nr >= cs && tail_lo_nr + n_lookahead <= cs + cl) {
                     const int local_lo_nr = tail_lo_nr - cs;
                     ggml_tensor * Q_prenrope_tail = ggml_view_3d(
                         gA, Q, D, H, n_lookahead,
                         Q->nb[1], Q->nb[2],
                         (size_t)local_lo_nr * Q->nb[2]);
                     ggml_build_forward_expand(gfA,
-                        ggml_cpy(gA, Q_prenrope_tail, Q_norope_v[il].t));
+                        ggml_cpy(gA, Q_prenrope_tail, Q_norope_v[si].t));
                 }
             }
             Q = ggml_rope_ext(gA, Q, pos_chunk, nullptr, D,
@@ -432,12 +470,15 @@ bool forward_qwen3_drafter_model(
 
             ggml_tensor * K = ggml_mul_mat(gA, L.wk, h_norm);
             K = ggml_reshape_3d(gA, K, D, Hk, cl);
-            K = ggml_rms_norm(gA, K, eps);
-            K = ggml_mul(gA, K, L.k_norm);
-            // NoPE: save pre-RoPE K chunk alongside K_curr_v.
-            if (nope_tail) {
-                const size_t kn_esz = ggml_element_size(K_norope_v[il].t);
-                ggml_tensor * Kn_dst = ggml_view_3d(gA, K_norope_v[il].t, D, Hk, cl,
+            if (L.k_norm) {
+                K = ggml_rms_norm(gA, K, eps);
+                K = ggml_mul(gA, K, L.k_norm);
+            }
+            // NoPE: save pre-RoPE K chunk (only for layers that will be scored).
+            if (nope_tail && il >= score_layer_start_pre) {
+                const int si = il - score_layer_start_pre;
+                const size_t kn_esz = ggml_element_size(K_norope_v[si].t);
+                ggml_tensor * Kn_dst = ggml_view_3d(gA, K_norope_v[si].t, D, Hk, cl,
                                                     kn_esz * D, kn_esz * D * Hk,
                                                     (size_t)cs * kn_esz * D * Hk);
                 ggml_build_forward_expand(gfA, ggml_cpy(gA, K, Kn_dst));
@@ -466,7 +507,7 @@ bool forward_qwen3_drafter_model(
 
             // Copy Q tail to Q_last_v[il] in the chunk that contains the tail.
             const int tail_lo = S - n_lookahead;
-            if (tail_lo >= cs && tail_lo < cs + cl) {
+            if (tail_lo >= cs && tail_lo + n_lookahead <= cs + cl) {
                 int local_lo = tail_lo - cs;
                 ggml_tensor * Q_tail_local = ggml_view_3d(
                     gA, Q, D, H, n_lookahead,
@@ -707,12 +748,12 @@ bool forward_qwen3_drafter_model(
         }
 #endif
 
-        if (il == 0 || il == w.n_layer - 1) {
+        if (il == 0 || il == fwd_layer_limit - 1) {
             std::fprintf(stderr,
                          "[qwen3-0.6b-fp] layer %d/%d done "
                          "(A_setup=%.3fs A_alloc=%.3fs A_compute=%.3fs FP=%.3fs "
                          "B_warm=%.3fs B_setup=%.3fs B_alloc=%.3fs B_copy_in=%.3fs B_norm=%.3fs B_compute=%.3fs B_copy_out=%.3fs)\n",
-                         il + 1, w.n_layer,
+                         il + 1, fwd_layer_limit,
                          t_a_setup, t_a_alloc, t_compute_a, t_fp,
                          t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out);
             std::fflush(stderr);
@@ -724,19 +765,28 @@ bool forward_qwen3_drafter_model(
     auto t_fwd_end = std::chrono::steady_clock::now();
     double t_fwd = std::chrono::duration<double>(t_fwd_end - t_total_start).count();
 
-    // Tail attention scoring (unchanged from previous impl).
+    // Tail attention scoring.
+    // score_layers_pre / compute_score_range already determined the range before
+    // allocation (to size K_norope_v correctly).  Re-use that result here.
+    // score_layer_start_pre == score_layer_start by construction (same formula,
+    // same env vars, same fwd_layer_limit_pre == fwd_layer_limit).
+    const int score_layer_start  = score_layer_start_pre;
+    const int score_layer_end    = fwd_layer_limit;
+
     std::vector<float> probs_h((size_t)S * n_lookahead * H);
     auto t_score_start = std::chrono::steady_clock::now();
 
-    for (int il = 0; il < w.n_layer; ++il) {
+    for (int il = score_layer_start; il < score_layer_end; ++il) {
         ggml_init_params ip{};
         ip.mem_size = ggml_tensor_overhead() * 32 + ggml_graph_overhead() + 16 * 1024;
         ip.no_alloc = true;
         ggml_context * gctx = ggml_init(ip);
 
+        // K_norope_v / Q_norope_v are indexed from score_layer_start_pre.
+        const int si = il - score_layer_start_pre;
         ggml_tensor * K_f32 = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, D, Hk, S);
         ggml_tensor * K_cast = ggml_cpy(gctx,
-            nope_tail ? K_norope_v[il].t : K_curr_v[il].t, K_f32);
+            nope_tail ? K_norope_v[si].t : K_curr_v[il].t, K_f32);
         ggml_tensor * K_perm = ggml_cont(gctx,
             ggml_permute(gctx, K_cast, 0, 2, 1, 3));
         ggml_tensor * K_score = K_perm;
@@ -749,7 +799,7 @@ bool forward_qwen3_drafter_model(
         }
         ggml_tensor * Q_tail_perm = ggml_cont(gctx,
             ggml_permute(gctx,
-                nope_tail ? Q_norope_v[il].t : Q_last_v[il].t,
+                nope_tail ? Q_norope_v[si].t : Q_last_v[il].t,
                 0, 2, 1, 3));
         ggml_tensor * attn_score = ggml_mul_mat(gctx, K_score, Q_tail_perm);
         ggml_tensor * probs = ggml_soft_max_ext(gctx, attn_score, mask_tail_buf.t,
@@ -796,8 +846,9 @@ bool forward_qwen3_drafter_model(
     double t_score = std::chrono::duration<double>(t_total_end - t_score_start).count();
     std::fprintf(stderr,
         "[qwen3-0.6b-fp] forward %.2fs (S=%d, A_setup=%.2fs A_alloc=%.2fs A_compute=%.2fs FP=%.2fs B_warm=%.2fs B_setup=%.2fs B_alloc=%.2fs B_copy_in=%.2fs B_norm=%.2fs B_compute=%.2fs B_copy_out=%.2fs)  "
-        "tail-score %.2fs  total %.2fs\n",
-        t_fwd, S, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out, t_score, t_fwd + t_score);
+        "tail-score %.2fs (layers %d-%d)  total %.2fs\n",
+        t_fwd, S, t_a_setup, t_a_alloc, t_compute_a, t_fp, t_b_warm, t_b_setup, t_b_alloc, t_b_copy_in, t_b_norm, t_compute_b, t_b_copy_out,
+        t_score, score_layer_start, score_layer_end - 1, t_fwd + t_score);
     std::fflush(stderr);
 
     cleanup_all();
diff --git a/server/src/qwen3/qwen3_loader.cpp b/server/src/qwen3/qwen3_loader.cpp
index ed38ee106..b7b35a85e 100644
--- a/server/src/qwen3/qwen3_loader.cpp
+++ b/server/src/qwen3/qwen3_loader.cpp
@@ -133,6 +133,18 @@ bool load_qwen3_drafter_model(const std::string & path,
     out.head_dim   = (int)get_u32(gctx, "qwen3.attention.key_length", 128);
     out.rope_theta = get_f32(gctx, "qwen3.rope.freq_base", 1000000.0f);
 
+    // Detect weight quant type from blk.0.attn_q.weight; support BF16 and Q8_0.
+    ggml_type wtype = GGML_TYPE_BF16;
+    {
+        int64_t tidx = gguf_find_tensor(gctx, "blk.0.attn_q.weight");
+        if (tidx >= 0) {
+            wtype = gguf_get_tensor_type(gctx, tidx);
+        }
+    }
+    std::fprintf(stderr, "[qwen3-0.6b] detected weight type: %s\n",
+                 wtype == GGML_TYPE_Q8_0 ? "Q8_0" : "BF16");
+    std::fflush(stderr);
+
     // Compute total tensor metadata size for context allocation.
     const int n_layer = out.n_layer;
     const int n_tensors_per_layer = 11;
diff --git a/server/test/test_anchor_transitive.cpp b/server/test/test_anchor_transitive.cpp
new file mode 100644
index 000000000..ae8a0bbce
--- /dev/null
+++ b/server/test/test_anchor_transitive.cpp
@@ -0,0 +1,355 @@
+// TDD: anchor transitive multi-pass.
+//
+// T1 — single-pass query-match preserved (regression pin)
+// T2 — single-pass misses chain hops (characterises the limitation)
+// T3 — transitive rescues all hops; T4 — rare-token bridge across different contexts
+// T5 — cascade gate (cascade_min_anchor_count); T6 — hard cap (max_forced_count)
+// Pure CPU — no GPU, no model load.
+
+#include "../src/qwen3/anchor_scan.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#define REQUIRE(cond) \
+    do { if (!(cond)) { \
+        std::fprintf(stderr, "FAIL: %s line %d: %s\n", __FILE__, __LINE__, #cond); \
+        std::exit(1); \
+    } } while (0)  // prints file/line/expression to stderr and exits with status 1; does not depend on NDEBUG
+
+static constexpr int32_t FILLER = 1;  // background token; every fixture vector starts as all-FILLER
+static constexpr int32_t M1 = 1001, M2 = 1002, M3 = 1003;  // marker tokens used by T1–T3
+static constexpr int CHUNK = 64;  // chunk size: first AnchorScanCfg field and divisor for chunk indices
+
+// Writes the 4-gram [FILLER, FILLER, marker, FILLER] into ids[pos..pos+3] (no bounds check).
+static void place_marker_4gram(std::vector<int32_t>& ids, int pos, int32_t marker) {
+    const size_t first = (size_t)pos;
+    const int32_t gram[4] = {FILLER, FILLER, marker, FILLER};
+    // Copy the pattern slot by slot; the marker lands at pos + 2.
+    for (size_t k = 0; k < 4; ++k) ids[first + k] = gram[k];
+}
+
+// T1 — single-pass scan_and_force must force the body chunk whose 4-gram recurs in the query pool.
+static void t1_single_pass_match() {
+    const int N = 2048;
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // Body 4-gram at pos 100..103 (marker token at 102), all inside chunk 1.
+    place_marker_4gram(ids, 100, M3);
+    // Same 4-gram at pos 2044..2047, inside the query window [q0, N).
+    place_marker_4gram(ids, 2044, M3);
+
+    const int q0 = 1948;  // N - 100
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4};
+    dflash::qwen3::scan_and_force(ids, q0, query_pool, cfg, forced);
+
+    // The chunk containing pos 100 must have its flag set.
+    const int target_chunk = 100 / CHUNK;  // chunk 1
+    REQUIRE(forced[(size_t)target_chunk] == 1);
+
+    std::printf("T1 PASS: chunk %d forced by single-pass M3 match\n", target_chunk);
+}
+
+// T2 — single-pass scan_and_force forces only the chunk that matches the query directly; bridged hops stay unforced.
+static void t2_single_pass_misses_hops() {
+    const int N = 2048;
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // hop1 at pos 200 (chunk 3): contains M1.
+    place_marker_4gram(ids, 200, M1);
+
+    // hop2 at pos 600 (chunk 9): contains M2 + M1 (bridge to hop1).
+    place_marker_4gram(ids, 600, M2);
+    place_marker_4gram(ids, 604, M1);
+
+    // hop3 at pos 1200 (chunk 18): contains M3 + M2 (bridge to hop2).
+    place_marker_4gram(ids, 1200, M3);
+    place_marker_4gram(ids, 1204, M2);
+
+    // Query suffix at pos 2044: contains M3 only, so only hop3 matches the query directly.
+    place_marker_4gram(ids, 2044, M3);
+
+    const int q0 = 1948;  // N - 100
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4};
+    dflash::qwen3::scan_and_force(ids, q0, query_pool, cfg, forced);
+
+    const int chunk_hop3 = 1200 / CHUNK;  // 18
+    const int chunk_hop2 = 600  / CHUNK;  // 9
+    const int chunk_hop1 = 200  / CHUNK;  // 3
+
+    // Single-pass: only the direct M3 match at pos 1200 is forced.
+    REQUIRE(forced[(size_t)chunk_hop3] == 1);
+    REQUIRE(forced[(size_t)chunk_hop2] == 0);
+    REQUIRE(forced[(size_t)chunk_hop1] == 0);
+
+    std::printf("T2 PASS: chunk(%d) forced, chunk(%d) and chunk(%d) NOT forced (single-pass)\n",
+                chunk_hop3, chunk_hop2, chunk_hop1);
+}
+
+// T3 — scan_and_force_transitive must force all three hops (written RED-first, before the function existed).
+static void t3_transitive_rescues_all() {
+    const int N = 2048;
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    place_marker_4gram(ids, 200, M1);  // hop1 (chunk 3)
+
+    place_marker_4gram(ids, 600, M2);  // hop2 (chunk 9) ...
+    place_marker_4gram(ids, 604, M1);  // ... bridges to hop1 via M1
+
+    place_marker_4gram(ids, 1200, M3);  // hop3 (chunk 18) ...
+    place_marker_4gram(ids, 1204, M2);  // ... bridges to hop2 via M2
+
+    place_marker_4gram(ids, 2044, M3);  // query suffix matches hop3 only
+
+    const int q0 = 1948;  // N - 100
+    std::vector<int32_t> initial_query_pool(ids.begin() + q0, ids.end());
+
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4};
+    dflash::qwen3::scan_and_force_transitive(ids, q0, initial_query_pool,
+                                              cfg, /*max_iters=*/3, forced);
+
+    const int chunk_hop3 = 1200 / CHUNK;
+    const int chunk_hop2 = 600  / CHUNK;
+    const int chunk_hop1 = 200  / CHUNK;
+
+    REQUIRE(forced[(size_t)chunk_hop3] == 1);
+    REQUIRE(forced[(size_t)chunk_hop2] == 1);
+    REQUIRE(forced[(size_t)chunk_hop1] == 1);
+
+    std::printf("T3 PASS: all hops forced transitively\n");
+}
+
+// T4 — variable-name reuse across templates (passes only when rare-token matching is enabled).
+//
+// Token layout:
+//   FILLER=1, V1=2001(X42), V2=2002(Y42), V3=2003(Z42)
+//   Template-context tokens: A=3001,B=3002,C=3003,D=3004,E=3005,F=3006
+//   Query-match tokens: X1=4001,X2=4002,X3=4003
+//
+// hop3 (chunk 18, pos 1200): [X1,X2,V3,X3,E,V2,F,FILL] — 4-gram [X1,X2,V3,X3] matches query
+// hop2 (chunk  9, pos  600): [C,V2,FILL,V1,D,FILL,FILL] — V2 in DIFFERENT context than hop3
+// hop1 (chunk  3, pos  200): [A,V1,FILL,B]              — V1 in DIFFERENT context than hop2
+// query (pos 2044):          [X1,X2,V3,X3]              — matches hop3 4-gram exactly
+//
+// Pass 1 (4-gram): forces hop3.
+// Pass 1 rare-token: V2 (freq=2) found in hop3 → also at pos 601 (hop2 chunk 9) → forces hop2.
+// Pass 2 rare-token: V1 (freq=2) found in hop2 → also at pos 201 (hop1 chunk 3) → forces hop1.
+// A 4-gram-only cascade cannot bridge this: V2 4-grams in hop3 ≠ V2 4-grams in hop2.
+static void t4_rare_token_bridges_different_context() {
+    static constexpr int32_t V1 = 2001, V2 = 2002, V3 = 2003;
+    static constexpr int32_t A = 3001, B = 3002, C = 3003, D = 3004, E = 3005, F = 3006;
+    static constexpr int32_t X1 = 4001, X2 = 4002, X3 = 4003;
+
+    const int N = 2048;
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // hop1 (chunk 3, pos 200): [A, V1, FILL, B]
+    ids[200] = A; ids[201] = V1; ids[202] = FILLER; ids[203] = B;
+
+    // hop2 (chunk 9, pos 600): [C, V2, FILL, V1, D, FILL, FILL]
+    ids[600] = C; ids[601] = V2; ids[602] = FILLER; ids[603] = V1;
+    ids[604] = D; ids[605] = FILLER; ids[606] = FILLER;
+
+    // hop3 (chunk 18, pos 1200): [X1, X2, V3, X3, E, V2, F, FILL]
+    // V2 here is in 4-gram context [E,V2,F,FILL] — differs from hop2's [C,V2,FILL,V1]
+    ids[1200] = X1; ids[1201] = X2; ids[1202] = V3; ids[1203] = X3;
+    ids[1204] = E;  ids[1205] = V2; ids[1206] = F;  ids[1207] = FILLER;
+
+    // query suffix (pos 2044): [X1, X2, V3, X3] — exact 4-gram match to hop3
+    ids[2044] = X1; ids[2045] = X2; ids[2046] = V3; ids[2047] = X3;
+
+    const int q0 = 1948;  // N - 100
+    std::vector<int32_t> initial_query_pool(ids.begin() + q0, ids.end());
+
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4,
+                                     /*rare_token_max_freq=*/8};
+    dflash::qwen3::scan_and_force_transitive(ids, q0, initial_query_pool,
+                                              cfg, /*max_iters=*/3, forced);
+
+    const int chunk_hop3 = 1200 / CHUNK;  // 18
+    const int chunk_hop2 =  600 / CHUNK;  //  9
+    const int chunk_hop1 =  200 / CHUNK;  //  3
+
+    REQUIRE(forced[(size_t)chunk_hop3] == 1);
+    REQUIRE(forced[(size_t)chunk_hop2] == 1);
+    REQUIRE(forced[(size_t)chunk_hop1] == 1);
+
+    std::printf("T4 PASS: all hops forced via rare-token bridge (V2 freq=2, V1 freq=2)\n");
+}
+
+// T5: gate closes when pass-1 already finds >= cascade_min_anchor_count chunks.
+//
+// Layout (N=4096, chunk=64 → 64 chunks):
+//   A common 4-gram [CMN,CMN,CMN,CMN] appears 50 times, once in each of chunks 0..49.
+//   One of those chunks (chunk 5, pos 320) also contains the rare token RT.
+//   RT appears exactly once more, in chunk 60 (pos 3840), so its total frequency is 2.
+//   Query suffix contains the common 4-gram → pass-1 forces all 50 matching chunks.
+//
+// With cascade_min_anchor_count=5: gained=50 >= 5 → gate closes → cascade skipped.
+// chunk 60 (pos 3840, which has RT but is only reachable via cascade) stays UNFORCED.
+//
+// With cascade_min_anchor_count=0: gate open → cascade runs → chunk 60 gets forced.
+// This contrast proves the gate is operative.
+static void t5_gate_closes_when_pass1_finds_many() {
+    static constexpr int32_t CMN = 5001;  // common token (4-gram made of it)
+    static constexpr int32_t RT  = 5002;  // rare token (freq=2)
+
+    const int N = 4096;
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;  // 64
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // Place common 4-gram at 50 scattered body positions (chunks 0..49).
+    // Spaced 64 tokens apart to land in different chunks.
+    for (int i = 0; i < 50; ++i) {
+        int pos = i * 64 + 4;  // pos 4, 68, 132, ... (well within body)
+        ids[(size_t)pos]     = CMN;
+        ids[(size_t)pos + 1] = CMN;
+        ids[(size_t)pos + 2] = CMN;
+        ids[(size_t)pos + 3] = CMN;
+    }
+
+    // RT appears in chunk 5 (pos 320) and chunk 60 (pos 3840).
+    ids[320] = RT;
+    ids[3840] = RT;
+
+    // Query suffix: just the common 4-gram so pass-1 fires on all 50 body positions.
+    const int q0 = N - 32;
+    ids[(size_t)q0]     = CMN;
+    ids[(size_t)q0 + 1] = CMN;
+    ids[(size_t)q0 + 2] = CMN;
+    ids[(size_t)q0 + 3] = CMN;
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+
+    // --- Test A: gate CLOSED (cascade_min_anchor_count=5) ---
+    {
+        std::vector<uint8_t> forced_a((size_t)n_chunks, 0);
+        dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                         /*max_anchor_hits=*/64, /*ngram=*/4,
+                                         /*rare_token_max_freq=*/2,
+                                         /*cascade_min_anchor_count=*/5,
+                                         /*max_forced_count=*/INT_MAX};  // NOTE(review): INT_MAX needs <climits>; presumably reached via anchor_scan.h — confirm
+        dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                                  cfg, /*max_iters=*/3, forced_a);
+
+        // Pass-1 forces chunks 0..49 (50 chunks); gate closes → cascade skipped.
+        // chunk 60 (pos 3840 has RT but only reachable via cascade) must be UNFORCED.
+        const int chunk_rt_extra = 3840 / CHUNK;  // 60
+        REQUIRE(forced_a[(size_t)chunk_rt_extra] == 0);
+        // chunk 5 (contains RT at pos 320) is forced by pass-1 (common 4-gram at pos 324).
+        REQUIRE(forced_a[5] == 1);
+
+        std::printf("T5a PASS: gate closed (gained=50 >= min=5), chunk %d unforced\n",
+                    chunk_rt_extra);
+    }
+
+    // --- Test B: gate OPEN (cascade_min_anchor_count=0) → cascade forces chunk 60 ---
+    {
+        std::vector<uint8_t> forced_b((size_t)n_chunks, 0);
+        dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                         /*max_anchor_hits=*/64, /*ngram=*/4,
+                                         /*rare_token_max_freq=*/2,
+                                         /*cascade_min_anchor_count=*/0,
+                                         /*max_forced_count=*/INT_MAX};  // cap set so high it cannot bind in this fixture
+        dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                                  cfg, /*max_iters=*/3, forced_b);
+
+        // Cascade runs; chunk 5 is forced by pass-1 and contains RT;
+        // RT at pos 3840 → chunk 60 forced via rare-token cascade.
+        const int chunk_rt_extra = 3840 / CHUNK;  // 60
+        REQUIRE(forced_b[(size_t)chunk_rt_extra] == 1);
+
+        std::printf("T5b PASS: gate open (min=0), cascade forced chunk %d via RT\n",
+                    chunk_rt_extra);
+    }
+}
+
+// T6: hard cap (max_forced_count) prevents runaway cascade.
+//
+// Layout (N=2048, chunk=64 → 32 chunks):
+//   Query contains 4-gram [TGR,TGR,TGR,TGR] which matches body chunk 0.
+//   Chunk 0 contains chain token C0 (freq=2): also appears in chunk 1.
+//   Chunk 1 contains chain token C1 (freq=2): also appears in chunk 2.
+//   ... 20 such chain links.
+//   Pass-1 forces chunk 0; cascade_min_anchor_count=0 keeps the gate open (same setting as T5 test B).
+//   Cascade rare-token worklist propagates: chunk 0→1→2→...→20 (20 more).
+//   max_forced_count=5 → the cascade must stop before the total exceeds 5, so forced <= 5.
+static void t6_hard_cap_prevents_runaway() {
+    static constexpr int32_t TGR = 7000;  // trigger token for 4-gram pass-1 match
+
+    const int N = 2048;
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;  // 32
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // body chunk 0 (pos 0): place 4-gram [TGR,TGR,TGR,TGR] so pass-1 forces it.
+    ids[0] = TGR; ids[1] = TGR; ids[2] = TGR; ids[3] = TGR;
+
+    // Rare-token chain: C_i appears in chunk i (at offset 8) and chunk i+1 (at offset 9).
+    // Offsets 8 and 9 within each chunk don't collide between consecutive tokens.
+    // Cascade worklist: chunk i forced → C_i found at offset 8 → chunk i+1 forced.
+    for (int i = 0; i < 20; ++i) {
+        int32_t tok = 7100 + i;  // C_i: a distinct token id per chain link
+        ids[(size_t)(i * 64 + 8)]           = tok;  // in chunk i, offset 8
+        ids[(size_t)((i + 1) * 64 + 9)]     = tok;  // in chunk i+1, offset 9
+    }
+
+    // Query suffix: contains [TGR,TGR,TGR,TGR] → pass-1 matches body chunk 0.
+    const int q0 = N - 64;
+    ids[(size_t)q0]     = TGR;
+    ids[(size_t)q0 + 1] = TGR;
+    ids[(size_t)q0 + 2] = TGR;
+    ids[(size_t)q0 + 3] = TGR;
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+
+    // Without cap: the chain could force chunks 0..20 (21 chunks total).
+    // With cap=5: at most 5 chunks may end up forced.
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4,
+                                     /*rare_token_max_freq=*/2,
+                                     /*cascade_min_anchor_count=*/0,
+                                     /*max_forced_count=*/5};
+    dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                              cfg, /*max_iters=*/25, forced);
+
+    int total_forced = 0;
+    for (int c = 0; c < n_chunks; ++c) total_forced += (int)forced[(size_t)c];  // flags are 0/1, so this counts forced chunks
+
+    REQUIRE(total_forced <= 5);
+    REQUIRE(forced[0] == 1);  // chunk 0 always forced by pass-1
+
+    std::printf("T6 PASS: hard cap engaged, forced=%d (cap=5, chain length=20)\n",
+                total_forced);
+}
+
+int main() {
+    using TestFn = void (*)();
+    const TestFn tests[] = {
+        t1_single_pass_match, t2_single_pass_misses_hops, t3_transitive_rescues_all,
+        t4_rare_token_bridges_different_context,
+        t5_gate_closes_when_pass1_finds_many, t6_hard_cap_prevents_runaway};
+    for (TestFn run : tests) run();  // T1..T6 in order; a failing REQUIRE exits with status 1
+    std::printf("\nAll anchor_transitive tests passed.\n");
+    return 0;
+}
diff --git a/server/test/test_drafter_early_exit_score_range.cpp b/server/test/test_drafter_early_exit_score_range.cpp
new file mode 100644
index 000000000..96e888e77
--- /dev/null
+++ b/server/test/test_drafter_early_exit_score_range.cpp
@@ -0,0 +1,108 @@
+// Unit tests for dflash::common::compute_score_range().
+// Plain int main(), no frameworks.
+//
+// Verifies that SCORE_LAYERS is interpreted relative to fwd_layer_limit
+// (the early-exit boundary) rather than the full model depth, so that
+// early_exit_n=7 + score_layers=7 produces the non-empty range [0,7)
+// instead of the phantom-empty [7,7) the old inline code produced.
+
+#include "score_range.h"
+
+#include <cstdio>
+#include <cstdlib>
+
+// REQUIRE survives -DNDEBUG (bare assert does not): it prints the failing expression and exits with status 1.
+#define REQUIRE(cond) \
+    do { if (!(cond)) { \
+        std::fprintf(stderr, "FAIL: %s line %d: %s\n", __FILE__, __LINE__, #cond); \
+        std::exit(1); \
+    } } while (0)
+
+using dflash::common::ScoreRange;
+using dflash::common::compute_score_range;
+
+// T1 — The exact bug scenario: early_exit_n=7, score_layers=7, n_layer=28.
+// OLD code: start = min(28-7, 7) = 7, end = 7 → empty loop.
+// NEW code: effective_n=7, want=min(7,7)=7, start=7-7=0, end=7 → [0,7).
+static void t1_bug_scenario() {
+    ScoreRange r = compute_score_range(/*n_layer=*/28,
+                                       /*score_layers=*/7,
+                                       /*fwd_layer_limit=*/7);
+    REQUIRE(r.start == 0 && "score_layer_start must be 0");
+    REQUIRE(r.end   == 7 && "score_layer_end must equal fwd_layer_limit");
+    REQUIRE(!r.empty()   && "range must be non-empty");
+    REQUIRE(r.count() == 7);
+    std::printf("T1 pass: early_exit_n=7 score_layers=7 n_layer=28 -> [%d,%d)\n",
+                r.start, r.end);  // std:: qualified: <cstdio> only guarantees the std:: name
+}
+
+// T2 — No early exit (fwd_layer_limit == n_layer).
+// score_layers=7 should pick the last 7 layers [21,28).
+static void t2_no_early_exit() {
+    ScoreRange r = compute_score_range(28, 7, 28);  // (n_layer, score_layers, fwd_layer_limit)
+    REQUIRE(r.start == 21);
+    REQUIRE(r.end   == 28);
+    REQUIRE(!r.empty());
+    REQUIRE(r.count() == 7);
+    std::printf("T2 pass: no early exit score_layers=7 -> [%d,%d)\n", r.start, r.end);
+}
+
+// T3 — score_layers == -1 (all layers) with no early exit: expect the full range [0,28).
+static void t3_all_layers_no_exit() {
+    ScoreRange r = compute_score_range(28, -1, 28);  // (n_layer, score_layers, fwd_layer_limit)
+    REQUIRE(r.start == 0);
+    REQUIRE(r.end   == 28);
+    REQUIRE(!r.empty());
+    std::printf("T3 pass: score_layers=-1 no exit -> [%d,%d)\n", r.start, r.end);
+}
+
+// T4 — All layers (score_layers == -1), with early exit at 14: expect [0,14).
+static void t4_all_layers_with_exit() {
+    ScoreRange r = compute_score_range(28, -1, 14);  // (n_layer, score_layers, fwd_layer_limit)
+    REQUIRE(r.start == 0);
+    REQUIRE(r.end   == 14);
+    REQUIRE(!r.empty());
+    std::printf("T4 pass: score_layers=-1 early_exit=14 -> [%d,%d)\n", r.start, r.end);
+}
+
+// T5 — SCORE_LAYERS larger than fwd_layer_limit: clamp to [0, fwd_layer_limit).
+static void t5_score_layers_exceeds_exit() {
+    // score_layers=14 but only 7 computed: want = min(14,7) = 7, start=0
+    ScoreRange r = compute_score_range(28, 14, 7);  // (n_layer, score_layers, fwd_layer_limit)
+    REQUIRE(r.start == 0);
+    REQUIRE(r.end   == 7);
+    REQUIRE(!r.empty());
+    std::printf("T5 pass: score_layers=14 early_exit=7 -> [%d,%d)\n", r.start, r.end);
+}
+
+// T6 — SCORE_LAYERS == n_layer (all layers) with no early exit.
+static void t6_score_layers_equals_n_layer() {
+    ScoreRange r = compute_score_range(28, 28, 28);  // (n_layer, score_layers, fwd_layer_limit)
+    // score_layers == n_layer → condition (score_layers < n_layer) is false → start=0
+    REQUIRE(r.start == 0);
+    REQUIRE(r.end   == 28);
+    REQUIRE(!r.empty());
+    std::printf("T6 pass: score_layers=n_layer=28 -> [%d,%d)\n", r.start, r.end);
+}
+
+// T7 — early_exit_n == 14, score_layers == 7: should produce [7,14), the last 7 of the 14 layers run.
+static void t7_partial_exit_partial_score() {
+    ScoreRange r = compute_score_range(28, 7, 14);  // (n_layer, score_layers, fwd_layer_limit)
+    REQUIRE(r.start == 7);
+    REQUIRE(r.end   == 14);
+    REQUIRE(!r.empty());
+    REQUIRE(r.count() == 7);
+    std::printf("T7 pass: early_exit=14 score_layers=7 -> [%d,%d)\n", r.start, r.end);
+}
+
+int main() {
+    t1_bug_scenario();  // each test exits with status 1 through REQUIRE on its first failed check
+    t2_no_early_exit();
+    t3_all_layers_no_exit();
+    t4_all_layers_with_exit();
+    t5_score_layers_exceeds_exit();
+    t6_score_layers_equals_n_layer();
+    t7_partial_exit_partial_score();
+    std::printf("\nAll score_range tests passed.\n");  // reached only when every test returned
+    return 0;
+}
diff --git a/server/test/test_drafter_tail_capture_guard.cpp b/server/test/test_drafter_tail_capture_guard.cpp
new file mode 100644
index 000000000..a00763e3e
--- /dev/null
+++ b/server/test/test_drafter_tail_capture_guard.cpp
@@ -0,0 +1,128 @@
+// Unit tests for the tail-capture chunk-boundary guard in qwen3_graph.cpp.
+// Reproduces Bug #42: ggml_view_3d overrun when S % chunk_size ∈ {1..7}
+// and n_lookahead == 8.
+//
+// Pure integer arithmetic — no ggml, no GPU, no server deps.
+//
+// Root cause (codex's diagnosis, confirmed by momus's data audit):
+//   tail_lo = S - n_lookahead
+//   When the sequence holds S = chunk_size + r tokens (r ∈ {1..7}), a second
+//   chunk is dispatched, but the first chunk's guard is still evaluated with
+//   cs=0, cl=chunk_size. tail_lo = chunk_size + r - n_lookahead = 4088 + r.
+//
+//   OLD guard:  tail_lo >= cs && tail_lo < cs + cl
+//     r=1..7: (4088+r) >= 0 && (4088+r) < 4096  → TRUE  ← BUG: tail overruns
+//
+//   NEW guard:  tail_lo >= cs && tail_lo + n_lookahead <= cs + cl
+//     r=1..7: (4088+r) + 8 <= 4096 → 4096+r <= 4096 → FALSE ← correct: skip
+//
+// TDD RED/GREEN:
+//   RED  (before patch): TAIL_GUARD_USE_NEW_FORMULA undefined → old guard inline → test FAILS.
+//   GREEN (after patch): TAIL_GUARD_USE_NEW_FORMULA defined via compiler flag → test PASSES.
+//   The patch to qwen3_graph.cpp changes the same 2 lines as this toggle.
+
+#include <cstdio>
+#include <cstdlib>
+
+#define REQUIRE(cond) \
+    do { if (!(cond)) { \
+        std::fprintf(stderr, "FAIL: %s line %d: %s\n", __FILE__, __LINE__, #cond); \
+        std::exit(1); \
+    } } while (0)  // prints file/line/expression to stderr and exits with status 1; does not depend on NDEBUG
+
+// Guard under test: TAIL_GUARD_USE_NEW_FORMULA selects the fixed predicate (GREEN), otherwise the Bug #42 one (RED).
+#ifdef TAIL_GUARD_USE_NEW_FORMULA
+static bool tail_fits(int tail_lo, int cs, int cl, int n_lookahead) {
+    return !(tail_lo < cs || tail_lo + n_lookahead > cs + cl);  // NEW (fix): whole window inside [cs, cs+cl)
+}
+#else
+static bool tail_fits(int tail_lo, int cs, int cl, int n_lookahead) {
+    (void)n_lookahead;
+    return !(tail_lo < cs || tail_lo >= cs + cl);  // OLD (Bug #42): only the window start is checked
+}
+#endif
+
+// T1: First chunk (cs=0, cl=4096), S = chunk_size + r for r ∈ {1..7}.
+// Tail straddles the chunk boundary: tail_lo ∈ [4089..4095], needs 8 tokens
+// → runs 1..7 tokens past the end → view must be SKIPPED.
+// CORRECT answer: false. Old guard returns true → BUG → RED test FAILS.
+static void t1_straddling_tail_must_be_skipped() {
+    const int chunk_size = 4096, n_lookahead = 8;
+    const int cs = 0, cl = chunk_size;  // first chunk: [0, 4096)
+
+    for (int r = 1; r <= 7; r++) {
+        const int S       = chunk_size + r;
+        const int tail_lo = S - n_lookahead;  // = 4088 + r ∈ [4089..4095]
+
+        const bool result = tail_fits(tail_lo, cs, cl, n_lookahead);
+        std::printf("T1 r=%d S=%d tail_lo=%d tail_hi=%d chunk=[%d,%d): fits=%d (expect 0)\n",
+                    r, S, tail_lo, tail_lo + n_lookahead, cs, cs + cl, (int)result);
+        REQUIRE(!result && "tail overruns chunk boundary — guard must return false");  // exits on the first failing r
+    }
+}
+
+// T2: r=0 (S == chunk_size exactly). tail_lo=4088, tail_hi=4096=chunk end. Fits exactly.
+// Both old and new guards agree: true.
+static void t2_tail_fits_exactly_at_chunk_end() {
+    const int chunk_size = 4096, n_lookahead = 8;
+    const int cs = 0, cl = chunk_size;  // first chunk: [0, 4096)
+    const int S       = chunk_size;
+    const int tail_lo = S - n_lookahead;  // 4088
+
+    const bool result = tail_fits(tail_lo, cs, cl, n_lookahead);
+    std::printf("T2 r=0 S=%d tail_lo=%d: fits=%d (expect 1)\n", S, tail_lo, (int)result);
+    REQUIRE(result && "tail fits exactly at chunk end — must return true");
+}
+
+// T3: r=8 (S = chunk_size + 8). tail_lo=4096 — at cs+cl boundary, outside chunk.
+// Both guards agree: false.
+static void t3_tail_starts_outside_chunk() {
+    const int chunk_size = 4096, n_lookahead = 8;
+    const int cs = 0, cl = chunk_size;  // first chunk: [0, 4096)
+    const int S       = chunk_size + 8;
+    const int tail_lo = S - n_lookahead;  // 4096
+
+    const bool result = tail_fits(tail_lo, cs, cl, n_lookahead);
+    std::printf("T3 r=8 S=%d tail_lo=%d: fits=%d (expect 0)\n", S, tail_lo, (int)result);
+    REQUIRE(!result && "tail starts at next chunk — must return false");
+}
+
+// T4: Second chunk (cs=4096, cl=4096), S=8192, tail fully inside.
+// tail_lo=8184, tail_hi=8192 == cs+cl. Both guards agree: true.
+static void t4_second_chunk_tail_fits_exactly() {
+    const int chunk_size = 4096, n_lookahead = 8;
+    const int cs = chunk_size, cl = chunk_size;  // second chunk: [4096, 8192)
+    const int S       = 2 * chunk_size;
+    const int tail_lo = S - n_lookahead;  // 8184
+
+    const bool result = tail_fits(tail_lo, cs, cl, n_lookahead);
+    std::printf("T4 second chunk S=%d tail_lo=%d cs=%d: fits=%d (expect 1)\n",
+                S, tail_lo, cs, (int)result);
+    REQUIRE(result && "tail fits exactly in second chunk — must return true");
+}
+
+// T5: Second chunk, r=3. tail straddles end of second chunk.
+// S = 2*4096 + 3 = 8195. tail_lo = 8187, tail_hi = 8195. cs+cl = 8192.
+// New guard: 8195 <= 8192 → false. Old guard: 8187 < 8192 → true (BUG).
+static void t5_second_chunk_straddling_tail_skipped() {
+    const int chunk_size = 4096, n_lookahead = 8;
+    const int cs = chunk_size, cl = chunk_size;  // second chunk [4096,8192)
+    const int r = 3;  // tokens past the end of the second chunk
+    const int S       = 2 * chunk_size + r;
+    const int tail_lo = S - n_lookahead;  // 8187
+
+    const bool result = tail_fits(tail_lo, cs, cl, n_lookahead);
+    std::printf("T5 second chunk r=%d S=%d tail_lo=%d: fits=%d (expect 0)\n",
+                r, S, tail_lo, (int)result);
+    REQUIRE(!result && "tail straddles end of second chunk — must return false");
+}
+
+int main() {
+    void (*const cases[])() = {
+        t1_straddling_tail_must_be_skipped, t2_tail_fits_exactly_at_chunk_end,
+        t3_tail_starts_outside_chunk, t4_second_chunk_tail_fits_exactly,
+        t5_second_chunk_straddling_tail_skipped};
+    for (auto run_case : cases) run_case();  // T1..T5 in order; a failing REQUIRE exits with status 1
+    std::printf("All tail_capture guard tests passed.\n");
+    return 0;
+}
diff --git a/server/test/test_drafter_warm_path_regression.cpp b/server/test/test_drafter_warm_path_regression.cpp
new file mode 100644
index 000000000..4a2015319
--- /dev/null
+++ b/server/test/test_drafter_warm_path_regression.cpp
@@ -0,0 +1,164 @@
+// Regression test: layer-subset warm-path buffer sizing fix.
+//
+// Root cause (addressed by the commit that adds this test): when PFLASH_DRAFTER_SCORE_LAYERS=7
+// with a 28-layer model, the old code allocated K_norope_v for ALL 28 layers
+// (~7.5 GB on RTX 3090 at S=128K) even though only 7 layers are read in scoring.
+// The extra 21 × 268 MB = 5.6 GB pushed total VRAM above 24 GB, causing GPU
+// page migration and a 5.4× A_compute regression on warm runs.
+//
+// The fix: size K_norope_v / Q_norope_v to n_score_layers (= score_range.count()),
+// which equals 7 rather than 28.  This test verifies the sizing formula via
+// compute_score_range without needing a GPU.
+
+#include "score_range.h"
+#undef NDEBUG  // every check below is assert(); keep it live in Release (-DNDEBUG) builds
+#include <cassert>
+#include <cstdio>
+
+using dflash::common::ScoreRange;
+using dflash::common::compute_score_range;
+
+// Helper: n_score_layers, derived the way this test expects the fixed allocator to derive it.
+static int score_layer_count(int n_layer, int score_layers_env, int early_exit_env) {
+    // An early-exit value only truncates the forward pass when it lies in (0, n_layer).
+    const bool truncated = early_exit_env > 0 && early_exit_env < n_layer;
+    const int  last_fwd  = truncated ? early_exit_env : n_layer;
+    return compute_score_range(n_layer, score_layers_env, last_fwd).count();
+}
+
+// T1: baseline case — SCORE_LAYERS unset (-1), no early exit (-1).
+// K_norope_v should have one entry per layer, i.e. n_layer = 28.
+static void t1_baseline_full_alloc() {
+    const int n = score_layer_count(28, -1, -1);
+    assert(n == 28 && "baseline: all 28 layers must be allocated");
+    std::printf("T1 pass: baseline n_score_layers=%d\n", n);  // std:: — only <cstdio> is included
+}
+
+// T2: L7 case — SCORE_LAYERS=7, no early exit (-1).
+// OLD: allocated 28 entries (5.6 GB wasted). NEW: 7 entries.
+static void t2_l7_trimmed_alloc() {
+    const int n = score_layer_count(28, 7, -1);
+    assert(n == 7 && "L7: only 7 K_norope entries must be allocated");
+    std::printf("T2 pass: L7 n_score_layers=%d (was 28 before fix)\n", n);  // std:: — only <cstdio> is included
+}
+
+// T3: early-exit=14, SCORE_LAYERS=7. Scoring range [7,14), 7 layers.
+static void t3_early_exit_with_score_layers() {
+    const int n = score_layer_count(28, 7, 14);
+    assert(n == 7 && "ee14 + score7: exactly 7 scoring layers must be allocated");
+    std::printf("T3 pass: early_exit=14 score_layers=7 -> n_score_layers=%d\n", n);
+}
+
+// T4: early-exit=7, SCORE_LAYERS=7 (the classic double-7 composition).
+// Range [0,7), 7 layers.
+static void t4_ee7_score7_composition() {
+    const int n = score_layer_count(28, 7, 7);
+    assert(n == 7 && "ee7 + score7: exactly 7 scoring layers must be allocated");
+    std::printf("T4 pass: ee7+score7 n_score_layers=%d\n", n);
+}
+
+// T5: SCORE_LAYERS not set (all layers), early-exit=14.
+// Scoring range [0,14), 14 layers needed.
+static void t5_all_score_with_early_exit() {
+    const int n = score_layer_count(28, -1, 14);
+    assert(n == 14 && "ee14 + score-all: exactly 14 scoring layers must be allocated");
+    std::printf("T5 pass: score_all early_exit=14 n_score_layers=%d\n", n);
+}
+
+// T6: the pre-alloc range (score_layer_start_pre) and the scoring-loop range
+// (score_layer_start) must be identical, otherwise buffer indexing goes wrong.
+static void t6_start_pre_matches_loop_start() {
+    // Replicate the pre-alloc computation.
+    const int n_layer = 28, score_layers_env = 7, early_exit_env = -1;
+    const int fwd_limit = (early_exit_env > 0 && early_exit_env < n_layer)
+        ? early_exit_env : n_layer;
+    const ScoreRange pre   = compute_score_range(n_layer, score_layers_env, fwd_limit);
+    // NOTE(review): same call, same arguments as `pre` — this pins determinism only, not the real call sites.
+    const ScoreRange loop  = compute_score_range(n_layer, score_layers_env, fwd_limit);
+    assert(pre.start == loop.start && "score_layer_start_pre must equal score_layer_start");
+    assert(pre.end   == loop.end && "pre-alloc range end must equal scoring-loop range end");
+    std::printf("T6 pass: pre_start=%d loop_start=%d (match)\n", pre.start, loop.start);
+}
+
+// T7: alloc loop boundary check — the alloc loop iterates 0..n_layer but must only
+// fill K_norope_v for layers in [score_layer_start_pre, fwd_layer_limit_pre).
+// This replicates the guard added to the alloc loop: il >= start AND il < fwd_limit.
+// Before the fix: il was only bounded below (il >= start), causing K_norope_v[si]
+// out-of-bounds when n_score_layers < n_layer (e.g. ee14: si 0..27 but vec size 14).
+static void t7_alloc_loop_upper_bound() {
+    struct FakeVec {  // stand-in for K_norope_v: bounds-checks every slot index written
+        int capacity;
+        int max_si_written = -1;
+        void write(int si) {
+            assert(si >= 0 && si < capacity && "si out of bounds");
+            if (si > max_si_written) max_si_written = si;
+        }
+    };
+
+    // Simulate ee14 (no SCORE_LAYERS, early_exit=14, n_layer=28).
+    {
+        const int n_layer = 28, score_layers = -1, early_exit = 14;
+        const int fwd_limit = early_exit;
+        ScoreRange r = compute_score_range(n_layer, score_layers, fwd_limit);
+        const int n_score = r.count();  // 14
+        FakeVec v{n_score};
+        int writes = 0;
+        for (int il = 0; il < n_layer; ++il) {
+            // Correct guard: il >= start AND il < fwd_limit (the fix)
+            if (il >= r.start && il < fwd_limit) {
+                v.write(il - r.start);
+                writes++;
+            }
+        }
+        assert(writes == n_score && "ee14: must write exactly n_score_layers entries");
+        std::printf("T7a pass: ee14 alloc writes=%d capacity=%d (no overflow)\n", writes, n_score);
+    }
+
+    // Simulate ee7 (SCORE_LAYERS=7, early_exit=7, n_layer=28).
+    {
+        const int n_layer = 28, score_layers = 7, early_exit = 7;
+        const int fwd_limit = early_exit;
+        ScoreRange r = compute_score_range(n_layer, score_layers, fwd_limit);
+        const int n_score = r.count();  // 7
+        FakeVec v{n_score};
+        int writes = 0;
+        for (int il = 0; il < n_layer; ++il) {
+            if (il >= r.start && il < fwd_limit) {
+                v.write(il - r.start);
+                writes++;
+            }
+        }
+        assert(writes == n_score && "ee7: must write exactly 7 entries");
+        std::printf("T7b pass: ee7 alloc writes=%d capacity=%d (no overflow)\n", writes, n_score);
+    }
+
+    // Simulate baseline (no ee, no score_layers).
+    {
+        const int n_layer = 28, score_layers = -1, early_exit = -1;
+        const int fwd_limit = (early_exit > 0 && early_exit < n_layer) ? early_exit : n_layer;
+        ScoreRange r = compute_score_range(n_layer, score_layers, fwd_limit);
+        const int n_score = r.count();  // 28
+        FakeVec v{n_score};
+        int writes = 0;
+        for (int il = 0; il < n_layer; ++il) {
+            if (il >= r.start && il < fwd_limit) {
+                v.write(il - r.start);
+                writes++;
+            }
+        }
+        assert(writes == n_score && "baseline: must write 28 entries");
+        std::printf("T7c pass: baseline alloc writes=%d capacity=%d (no overflow)\n", writes, n_score);
+    }
+}
+
+int main() {  // Runs T1..T7 in order; with assertions enabled a failed assert() aborts before the summary.
+    t1_baseline_full_alloc();
+    t2_l7_trimmed_alloc();
+    t3_early_exit_with_score_layers();
+    t4_ee7_score7_composition();
+    t5_all_score_with_early_exit();
+    t6_start_pre_matches_loop_start();
+    t7_alloc_loop_upper_bound();
+    std::printf("\nAll warm-path regression tests passed.\n");
+    return 0;
+}

From 94907a441726eb6249a6af1cc1b06d7d2194121e Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Wed, 27 May 2026 12:32:08 +0200
Subject: [PATCH 02/16] =?UTF-8?q?refactor(pflash):=20rename=20DFLASH=5FCOM?=
 =?UTF-8?q?PRESS=5F*=20=E2=86=92=20PFLASH=5FCOMPRESS=5F*=20(cascade=20env?=
 =?UTF-8?q?=20vars)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 server/src/qwen3/qwen3_drafter.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
index 589351f9c..67c9adc6a 100644
--- a/server/src/qwen3/qwen3_drafter.cpp
+++ b/server/src/qwen3/qwen3_drafter.cpp
@@ -559,11 +559,11 @@ static std::vector<int32_t> qwen35_score_and_compress(
     const int query_tokens        = env_int("DFLASH_COMPRESS_QUERY_TOKENS",   96);
     const int anchor_radius       = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS",   2);
     const int max_anchor_hits     = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
-    const int anchor_ngram        = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM",    4);
-    const int rare_token_max_freq = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ",   2);
+    const int anchor_ngram        = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM",    4);
+    const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ",   2);
 
-    const float cascade_min_anchor_frac = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f);
-    const float max_forced_ratio        = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO",        1.3f);
+    const float cascade_min_anchor_frac = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f);
+    const float max_forced_ratio        = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO",        1.3f);
 
     const int q0 = std::max(0, S - query_tokens);
     std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
@@ -575,8 +575,8 @@ static std::vector<int32_t> qwen35_score_and_compress(
     anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
     anchor_cfg.max_forced_count         = (int)(max_forced_ratio * n_keep);
 
-    const bool use_transitive = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0;
-    const int  max_iters      = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS",  3);
+    const bool use_transitive = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0;
+    const int  max_iters      = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS",  3);
     if (use_transitive) {
         dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
                                                   anchor_cfg, max_iters, forced);
@@ -752,11 +752,11 @@ std::vector<int32_t> drafter_score_and_compress(
     const int query_tokens        = env_int("DFLASH_COMPRESS_QUERY_TOKENS",   96);
     const int anchor_radius       = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS",   2);
     const int max_anchor_hits     = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
-    const int anchor_ngram        = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM",    4);
-    const int rare_token_max_freq = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ",   2);
+    const int anchor_ngram        = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM",    4);
+    const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ",   2);
 
-    const float cascade_min_anchor_frac = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f);
-    const float max_forced_ratio        = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO",        1.3f);
+    const float cascade_min_anchor_frac = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f);
+    const float max_forced_ratio        = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO",        1.3f);
 
     std::vector<uint8_t> selected_mask((size_t)n_chunks, 0);
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
@@ -772,8 +772,8 @@ std::vector<int32_t> drafter_score_and_compress(
         anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
         anchor_cfg.max_forced_count         = (int)(max_forced_ratio * n_keep);
 
-        const bool use_transitive = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0;
-        const int  max_iters      = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS",  3);
+        const bool use_transitive = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0;
+        const int  max_iters      = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS",  3);
         if (use_transitive) {
             dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
                                                       anchor_cfg, max_iters, forced);

From 99f6b38d68c6b2d65bb6499d45c4d5763224482b Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Wed, 27 May 2026 15:12:49 +0200
Subject: [PATCH 03/16] fix(pflash): adaptive anchor_radius eliminates 64K NIAH
 cliff
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At >=32K context the needle text is more likely to straddle multiple
chunks (chunk_size=32), and the fixed anchor_radius=2 window (5 chunks
~160 tokens) loses the back half of the needle digits — the model
retrieves '...is 4' but truncates/hallucinates the continuation.

Adaptive scaling based on n_chunks:
  <32K  context (<1024 chunks): radius=2,  max_anchor_hits=8   (unchanged)
  32-64K (1024-2047 chunks):    radius=4,  max_anchor_hits=16
  >=64K (>=2048 chunks):        radius=8,  max_anchor_hits=32

Override via PFLASH_COMPRESS_ANCHOR_RADIUS / PFLASH_COMPRESS_MAX_ANCHOR_HITS
env vars (legacy DFLASH_COMPRESS_* names still accepted).

Validated at 49K context: NIAH needle 'kowefada 1596346' correctly
retrieved (was: '1594' or hallucinated 'is 048394839483' before fix).
Resolves the long-standing 'project_64k_quality_cliff' memory entry.
---
 server/src/qwen3/qwen3_drafter.cpp | 58 +++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
index 67c9adc6a..2fbf5850f 100644
--- a/server/src/qwen3/qwen3_drafter.cpp
+++ b/server/src/qwen3/qwen3_drafter.cpp
@@ -557,8 +557,33 @@ static std::vector<int32_t> qwen35_score_and_compress(
     }
 
     const int query_tokens        = env_int("DFLASH_COMPRESS_QUERY_TOKENS",   96);
-    const int anchor_radius       = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS",   2);
-    const int max_anchor_hits     = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
+    // Anchor radius scales with n_chunks to prevent the 64K NIAH cliff:
+    // at long context the needle text is more likely to straddle multiple
+    // chunks, and a fixed radius=2 window (5 chunks ~160 tokens) loses the
+    // back half of the needle. Adaptive: <32K = 2, 32-64K = 4, >=64K = 8.
+    // Override via PFLASH_COMPRESS_ANCHOR_RADIUS env var (>= 0 wins).
+    int anchor_radius;
+    {
+        const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1);
+        const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1);
+        if (env_r >= 0)         anchor_radius = env_r;
+        else if (legacy_r >= 0) anchor_radius = legacy_r;
+        else if (n_chunks <  1024) anchor_radius = 2;
+        else if (n_chunks <  2048) anchor_radius = 4;
+        else                       anchor_radius = 8;
+    }
+    // max_anchor_hits scales the same way: at long context, distinctive
+    // anchors are sparser, so we can afford to keep more hits per qi.
+    int max_anchor_hits;
+    {
+        const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
+        const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
+        if (env_h >= 0)         max_anchor_hits = env_h;
+        else if (legacy_h >= 0) max_anchor_hits = legacy_h;
+        else if (n_chunks <  1024) max_anchor_hits = 8;
+        else if (n_chunks <  2048) max_anchor_hits = 16;
+        else                       max_anchor_hits = 32;
+    }
     const int anchor_ngram        = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM",    4);
     const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ",   2);
 
@@ -750,8 +775,33 @@ std::vector<int32_t> drafter_score_and_compress(
         tail_chunks = std::max(0, budget - head_chunks);
     }
     const int query_tokens        = env_int("DFLASH_COMPRESS_QUERY_TOKENS",   96);
-    const int anchor_radius       = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS",   2);
-    const int max_anchor_hits     = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
+    // Anchor radius scales with n_chunks to prevent the 64K NIAH cliff:
+    // at long context the needle text is more likely to straddle multiple
+    // chunks, and a fixed radius=2 window (5 chunks ~160 tokens) loses the
+    // back half of the needle. Adaptive: <32K = 2, 32-64K = 4, >=64K = 8.
+    // Override via PFLASH_COMPRESS_ANCHOR_RADIUS env var (>= 0 wins).
+    int anchor_radius;
+    {
+        const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1);
+        const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1);
+        if (env_r >= 0)         anchor_radius = env_r;
+        else if (legacy_r >= 0) anchor_radius = legacy_r;
+        else if (n_chunks <  1024) anchor_radius = 2;
+        else if (n_chunks <  2048) anchor_radius = 4;
+        else                       anchor_radius = 8;
+    }
+    // max_anchor_hits scales the same way: at long context, distinctive
+    // anchors are sparser, so we can afford to keep more hits per qi.
+    int max_anchor_hits;
+    {
+        const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
+        const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
+        if (env_h >= 0)         max_anchor_hits = env_h;
+        else if (legacy_h >= 0) max_anchor_hits = legacy_h;
+        else if (n_chunks <  1024) max_anchor_hits = 8;
+        else if (n_chunks <  2048) max_anchor_hits = 16;
+        else                       max_anchor_hits = 32;
+    }
     const int anchor_ngram        = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM",    4);
     const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ",   2);
 

From 766e46dd85fe8f5056fffaa6dd70ee2b892b2561 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 00:41:28 +0200
Subject: [PATCH 04/16] bench: add eval_quality_compare.py for MT-Bench
 reply-divergence detection

---
 dflash/scripts/eval_quality_compare.py | 166 +++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 dflash/scripts/eval_quality_compare.py

diff --git a/dflash/scripts/eval_quality_compare.py b/dflash/scripts/eval_quality_compare.py
new file mode 100644
index 000000000..cd4578e9e
--- /dev/null
+++ b/dflash/scripts/eval_quality_compare.py
@@ -0,0 +1,166 @@
+"""MT-Bench quality comparator.
+
+Reads all results_*.json in the given directory (or current dir),
+treats baseline_off as reference, and prints a markdown comparison table.
+
+Usage:
+    python eval_quality_compare.py [--dir PATH] [--out PATH]
+"""
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def load_results(path: Path) -> dict[tuple[int, int], str]:
+    """Load one results file as {(question_id, turn_num): reply}, turn_num in {1, 2}.
+
+    The file is decoded as UTF-8 instead of the platform's locale encoding.
+    Raises KeyError if a record lacks "question_id", "turn_1" or "turn_2".
+    """
+    with open(path, encoding="utf-8") as f:
+        records = json.load(f)
+    return {(r["question_id"], turn): r[f"turn_{turn}"]
+            for r in records for turn in (1, 2)}
+
+
+def lcp_ratio(a: str, b: str) -> float:
+    """Common-prefix length over the shorter length; 1.0 only if both are empty, else 0.0, when either is empty."""
+    shorter = min(len(a), len(b))
+    if not shorter:
+        return float(a == b)
+    # Positions where the two strings disagree, scanned left to right.
+    mismatches = (k for k, (x, y) in enumerate(zip(a, b)) if x != y)
+    prefix_len = next(mismatches, shorter)
+    return prefix_len / shorter
+
+
+def compare(ref: dict, cand: dict) -> dict:
+    """Score `cand` against `ref` over the (question_id, turn) keys they share.
+
+    Returns a dict with:
+      exact_match_rate    - fraction of shared keys whose replies are identical
+      mean_lcp_ratio      - mean of lcp_ratio(ref_reply, cand_reply)
+      divergence_count    - number of shared keys whose replies differ
+      total_pairs         - number of shared keys
+      first_5_divergences - up to five (qid, turn, ref[:50], cand[:50]) tuples,
+                            taken in sorted key order
+
+    Every numeric field is zero when the two maps share no key.
+    """
+    shared = sorted(ref.keys() & cand.keys())
+    total = len(shared)
+
+    # Keys whose replies differ, still in sorted order.
+    mismatched = [k for k in shared if ref[k] != cand[k]]
+    # Accumulate left-to-right from 0.0 so the float result matches a running sum.
+    lcp_total = sum((lcp_ratio(ref[k], cand[k]) for k in shared), 0.0)
+    samples = [(qid, turn, ref[(qid, turn)][:50], cand[(qid, turn)][:50])
+               for qid, turn in mismatched[:5]]
+
+    return {
+        "exact_match_rate":   (total - len(mismatched)) / total if total else 0.0,
+        "mean_lcp_ratio":     lcp_total / total if total else 0.0,
+        "divergence_count":   len(mismatched),
+        "total_pairs":        total,
+        "first_5_divergences": samples,
+    }
+
+
+def main() -> int:
+    """CLI entry point: compare every results_*.json in --dir against baseline_off.
+
+    Prints a markdown report to stdout and writes the same text to --out as UTF-8.
+    Returns 0 on success, 1 when no results file or no baseline_off file is found.
+    """
+    ap = argparse.ArgumentParser(description="MT-Bench quality comparator")
+    ap.add_argument("--dir", type=Path, default=Path("."),
+                    help="Directory containing results_*.json files")
+    ap.add_argument("--out", type=Path,
+                    default=Path(__file__).parent.parent / "eval/summary.md",
+                    help="Output markdown summary path")
+    args = ap.parse_args()
+
+    result_files = sorted(args.dir.glob("results_*.json"))
+    if not result_files:
+        print(f"ERROR: no results_*.json found in {args.dir}", file=sys.stderr)
+        return 1
+
+    # Map config name -> result file (name = file stem minus the "results_" prefix)
+    configs: dict[str, Path] = {f.stem[len("results_"):]: f for f in result_files}
+
+    if "baseline_off" not in configs:
+        print("ERROR: baseline_off results not found — cannot compare", file=sys.stderr)
+        return 1
+
+    ref = load_results(configs["baseline_off"])
+
+    rows = []
+    for name, path in configs.items():
+        cand = load_results(path)
+        m = compare(ref, cand)
+        m["config"] = name
+        rows.append(m)
+
+    # Sort: baseline_off first, then alphabetical
+    rows.sort(key=lambda r: (r["config"] != "baseline_off", r["config"]))
+
+    # Sanity check: baseline_off_2 vs baseline_off
+    sanity_row = next((r for r in rows if r["config"] == "baseline_off_2"), None)
+    sanity_warning = ""
+    if sanity_row and sanity_row["exact_match_rate"] < 0.99:
+        sanity_warning = (
+            f"WARNING: baseline_off_2 exact_match_rate={sanity_row['exact_match_rate']:.3f} "
+            f"< 0.99 — SERVER IS NONDETERMINISTIC. All other comparisons are suspect.\n\n"
+        )
+
+    # Build markdown table
+    lines = []
+    if sanity_warning:
+        lines.append(f"> {sanity_warning.strip()}\n")
+
+    lines.append("| config | exact_match_rate | mean_lcp_ratio | divergence_count | total_pairs |")
+    lines.append("|--------|-----------------|----------------|-----------------|-------------|")
+    for r in rows:
+        lines.append(
+            f"| {r['config']} "
+            f"| {r['exact_match_rate']:.3f} "
+            f"| {r['mean_lcp_ratio']:.3f} "
+            f"| {r['divergence_count']} "
+            f"| {r['total_pairs']} |"
+        )
+
+    lines.append("")
+    lines.append("## First 5 divergences per config (vs baseline_off)")
+    for r in rows:
+        if r["config"] == "baseline_off" or not r["first_5_divergences"]:
+            continue
+        lines.append(f"\n### {r['config']}")
+        lines.append("| qid | turn | ref (first 50) | cand (first 50) |")
+        lines.append("|-----|------|----------------|-----------------|")
+        for qid, turn, ref50, cand50 in r["first_5_divergences"]:
+            # Escape pipes AFTER repr(): repr() doubles backslashes, so escaping first
+            # yields "\\|", which a markdown table parses as an escaped backslash
+            # followed by a cell delimiter — splitting the cell.
+            ref50_s  = repr(ref50.replace("\n", " ")).replace("|", "\\|")
+            cand50_s = repr(cand50.replace("\n", " ")).replace("|", "\\|")
+            lines.append(f"| {qid} | {turn} | {ref50_s} | {cand50_s} |")
+
+    table = "\n".join(lines)
+
+    # Print to stdout
+    if sanity_warning:
+        print(f"\n{'!'*70}")
+        print(sanity_warning.strip())
+        print(f"{'!'*70}\n")
+    print(table)
+
+    # Write summary file
+    args.out.parent.mkdir(parents=True, exist_ok=True)
+    args.out.write_text(table + "\n", encoding="utf-8")
+    print(f"\nSummary written to {args.out}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 8c1d705ffafdba32ed7a7679d746b77f55353393 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 09:24:10 +0200
Subject: [PATCH 05/16] feat(qwen35): derive scalars from weights, assert vs
 GGUF metadata

Mirror the gemma4_backend.cpp:75-104 defensive pattern for the qwen35
target loader and the dflash decode draft loader. After loading weight
tensors, derive head_dim / n_head / n_head_kv from wq->ne[1] /
wk->ne[1] and compare against GGUF-declared values; set_last_error
and return false on mismatch.

Makes the 'stale scalar at graph-build time' bug class structurally
impossible. Load-time only, no runtime cost. Existing well-formed
GGUFs are unaffected (smoke verified).
---
 server/src/draft/draft_gguf_loader.cpp   | 57 ++++++++++++++++++++++++
 server/src/qwen35/gguf_target_loader.cpp | 45 +++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/server/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp
index fbec7263b..73a9c17bd 100644
--- a/server/src/draft/draft_gguf_loader.cpp
+++ b/server/src/draft/draft_gguf_loader.cpp
@@ -349,6 +349,63 @@ bool load_draft_gguf(const std::string & path,
 
     gguf_free(gctx);
 
+    // Structural defense: derive scalar dims from weight tensor shapes and
+    // assert against GGUF-declared metadata (Bug #2 class prevention).
+    // All draft layers have wq/wk (no deltanet mix), so use layer 0.
+    // wq is plain Q-only (no gate), so ne[1] = n_head * head_dim.
+    // fc is [n_target_layers*n_embd, n_embd], so ne[0] = n_target_layers*n_embd.
+    {
+        const DraftLayer & L0 = out.layers[0];
+        const int64_t derived_q_dim  = L0.wq->ne[1];
+        const int64_t derived_kv_dim = L0.wk->ne[1];
+        const int64_t expected_q_dim  = (int64_t)out.n_head * out.head_dim;
+        const int64_t expected_kv_dim = (int64_t)out.n_head_kv * out.head_dim;
+        if (derived_q_dim != expected_q_dim) {
+            char buf[256];
+            std::snprintf(buf, sizeof(buf),
+                "draft GGUF shape mismatch: blk.0.attn_q.weight->ne[1]=%lld "
+                "!= n_head*head_dim=%d*%d=%lld",
+                (long long)derived_q_dim,
+                out.n_head, out.head_dim, (long long)expected_q_dim);
+            set_last_error(buf);
+            return false;
+        }
+        if (derived_kv_dim != expected_kv_dim) {
+            char buf[256];
+            std::snprintf(buf, sizeof(buf),
+                "draft GGUF shape mismatch: blk.0.attn_k.weight->ne[1]=%lld "
+                "!= n_head_kv*head_dim=%d*%d=%lld",
+                (long long)derived_kv_dim,
+                out.n_head_kv, out.head_dim, (long long)expected_kv_dim);
+            set_last_error(buf);
+            return false;
+        }
+        const int64_t derived_n_embd = L0.wq->ne[0];
+        if (derived_n_embd != (int64_t)out.n_embd) {
+            char buf[256];
+            std::snprintf(buf, sizeof(buf),
+                "draft GGUF shape mismatch: blk.0.attn_q.weight->ne[0]=%lld != n_embd=%d",
+                (long long)derived_n_embd, out.n_embd);
+            set_last_error(buf);
+            return false;
+        }
+        // fc: [n_target_layers*n_embd, n_embd] — check fc->ne[0] against derived expectation
+        if (out.n_target_layers > 0) {
+            const int64_t derived_fc_in  = out.fc->ne[0];
+            const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd;
+            if (derived_fc_in != expected_fc_in) {
+                char buf[256];
+                std::snprintf(buf, sizeof(buf),
+                    "draft GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld "
+                    "!= n_target_layers*n_embd=%d*%d=%lld",
+                    (long long)derived_fc_in,
+                    out.n_target_layers, out.n_embd, (long long)expected_fc_in);
+                set_last_error(buf);
+                return false;
+            }
+        }
+    }
+
     char summary[192];
     std::snprintf(summary, sizeof(summary),
         "draft GGUF loaded: %" PRId64 " tensors, %.2f GiB on GPU",
diff --git a/server/src/qwen35/gguf_target_loader.cpp b/server/src/qwen35/gguf_target_loader.cpp
index 116ddafc0..9085c1a1b 100644
--- a/server/src/qwen35/gguf_target_loader.cpp
+++ b/server/src/qwen35/gguf_target_loader.cpp
@@ -738,6 +738,51 @@ bool load_target_gguf_partial(const std::string & path,
 
     gguf_free(gctx);
 
+    // Structural defense: derive scalar dims from weight tensor shapes and
+    // assert against GGUF-declared metadata. Catches stale/zero dw_ or w_
+    // scalars before they silently corrupt graph-build (Bug #2 class).
+    // Uses the first full-attention layer (il = fai-1) because deltanet
+    // layers don't carry wq/wk. wq packs Q+gate so ne[1] = n_head*kl*2.
+    {
+        const int fa_il = out.full_attention_interval - 1;  // first full-attn layer
+        const TargetLayer & fa = out.layers[(size_t)fa_il];
+        if (fa.wq && fa.wk) {
+            const int64_t derived_q_dim  = fa.wq->ne[1];  // n_head * head_dim * 2
+            const int64_t derived_kv_dim = fa.wk->ne[1];  // n_head_kv * head_dim
+            const int64_t expected_q_dim  = (int64_t)out.n_head * out.n_embd_head_k * 2;
+            const int64_t expected_kv_dim = (int64_t)out.n_head_kv * out.n_embd_head_k;
+            if (derived_q_dim != expected_q_dim) {
+                char buf[256];
+                std::snprintf(buf, sizeof(buf),
+                    "GGUF shape mismatch: blk.%d.attn_q.weight->ne[1]=%lld "
+                    "!= n_head*head_dim*2=%d*%d*2=%lld",
+                    fa_il, (long long)derived_q_dim,
+                    out.n_head, out.n_embd_head_k, (long long)expected_q_dim);
+                set_last_error(buf);
+                return false;
+            }
+            if (derived_kv_dim != expected_kv_dim) {
+                char buf[256];
+                std::snprintf(buf, sizeof(buf),
+                    "GGUF shape mismatch: blk.%d.attn_k.weight->ne[1]=%lld "
+                    "!= n_head_kv*head_dim=%d*%d=%lld",
+                    fa_il, (long long)derived_kv_dim,
+                    out.n_head_kv, out.n_embd_head_k, (long long)expected_kv_dim);
+                set_last_error(buf);
+                return false;
+            }
+            const int64_t derived_n_embd = fa.wq->ne[0];  // input dim = n_embd
+            if (derived_n_embd != (int64_t)out.n_embd) {
+                char buf[256];
+                std::snprintf(buf, sizeof(buf),
+                    "GGUF shape mismatch: blk.%d.attn_q.weight->ne[0]=%lld != n_embd=%d",
+                    fa_il, (long long)derived_n_embd, out.n_embd);
+                set_last_error(buf);
+                return false;
+            }
+        }
+    }
+
     if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) {
         set_last_error("token_embd.weight not found or invalid type");
         return false;

From 699bb5c925ad6ceed353d9cb0cf4a131d7b03411 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 09:24:19 +0200
Subject: [PATCH 06/16] feat(pflash): adaptive composition via per-request
 fa_window override

When pflash compresses, set gen_req.fa_window_override =
effective_prompt + 256 so spec-decode verify sees the entire
compressed prompt. Pflash already paid compute to pick which tokens
matter; verify never throws any of them away.

When the override would exceed 2 * cfg_.fa_window (spec-decode's
drafter cost stops earning its tok/J), the C2 gate in
qwen35_backend's generate() falls back to AR (fa_window=0, full
attention). AR sees every kept token at every context; we choose
mechanism, not visibility.

Zero new CLI flags. --draft remains the only knob for composition;
all per-request adaptation is internal.
---
 server/src/common/model_backend.h        |  4 ++
 server/src/qwen35/qwen35_backend.cpp     | 61 ++++++++++++++++++------
 server/src/qwen35/qwen35_dflash_target.h |  5 ++
 server/src/server/http_server.cpp        | 16 +++++++
 4 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h
index 182b50030..3af273ace 100644
--- a/server/src/common/model_backend.h
+++ b/server/src/common/model_backend.h
@@ -100,6 +100,10 @@ struct GenerateRequest {
     const std::vector<int32_t> * hint_tokens = nullptr;
     // Optional thinking-budget hook — see BudgetHook docs above.
     BudgetHook                 budget_hook;
+    // Per-request override for target spec-decode verify fa_window. Set by
+    // http_server when pflash compresses, so verify sees the entire compressed
+    // prompt (not just the last cfg_.fa_window positions). Zero = no override.
+    int                        fa_window_override = 0;
 };
 
 struct GenerateResult {
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index be83db452..471e8af07 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -553,6 +553,16 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
         sampler_rng_.seed(sampler_.seed);
     }
 
+    // Design 1: apply the per-request verify fa_window override (set by
+    // http_server when pflash compresses). Every generate() re-applies its own
+    // window here, so a later request without an override goes back to
+    // cfg_.fa_window. Calling dflash_target() lazily constructs it on first use.
+    const int eff_fa_window =
+        (req.fa_window_override > 0) ? req.fa_window_override : cfg_.fa_window;
+    if (auto * dt = dynamic_cast<Qwen35DFlashTarget *>(dflash_target())) {
+        dt->set_fa_window(eff_fa_window);
+    }
+
     // Zero delta-net recurrent state (SSM + conv) so a fresh prompt doesn't
     // inherit stale hidden state from the previous request. KV cache is
     // position-addressed and will be overwritten during prefill.
@@ -568,22 +578,45 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
     auto t_prefill_end = std::chrono::steady_clock::now();
     result.prefill_s = std::chrono::duration<double>(t_prefill_end - t_prefill_start).count();
 
-    // Decode (speculative)
+    // C2 adaptive-mechanism gate: pflash's override always reflects the
+    // FULL compressed-prompt size — we never cap visibility (would waste
+    // pflash's anchor-selection work). The gate here decides whether
+    // spec-decode's verify arithmetic still earns its drafter cost at
+    // that window size. Threshold 2× cfg_.fa_window (4096 when fa_window=2048):
+    //   override <= 4096 (32K → ~1.5K, 64K → ~3K compressed) → spec-decode
+    //   override >  4096 (128K → ~6.4K compressed)            → AR fallback
+    // AR uses fa_window=0 (full attention) so every kept token is visible
+    // regardless of which path runs. We choose mechanism, not visibility.
+    const bool fa_within_budget =
+        (req.fa_window_override == 0)
+     || (eff_fa_window <= 2 * cfg_.fa_window);
+
+    // Decode (speculative or AR)
     if (req.n_gen > 0) {
         auto t_decode_start = std::chrono::steady_clock::now();
-        // Pass the budget hook into spec-decode. When token count nears
-        // the budget edge, do_spec_decode breaks out and tails off via
-        // AR with the hook still active — force-close fires correctly
-        // without sacrificing spec-decode throughput for the bulk of
-        // generation. Most requests never hit the tail because the
-        // model closes </think> naturally well before the budget edge.
-        if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
-                             result.accept_rate, result.spec_decode_ran,
-                             req.hint_tokens, &req.budget_hook,
-                             &result.budget_forced_close,
-                             &result.degenerate_decode_close)) {
-            result.error = "decode";
-            return result;
+        if (!fa_within_budget) {
+            // AR fallback: fa_window override too wide for spec decode.
+            bool ok = do_ar_decode(committed, req.n_gen, result.tokens, out_io,
+                                    req.budget_hook,
+                                    &result.budget_forced_close,
+                                    &result.degenerate_decode_close);
+            out_io.emit(-1);
+            if (!ok) { result.error = "decode"; return result; }
+        } else {
+            // Pass the budget hook into spec-decode. When token count nears
+            // the budget edge, do_spec_decode breaks out and tails off via
+            // AR with the hook still active — force-close fires correctly
+            // without sacrificing spec-decode throughput for the bulk of
+            // generation. Most requests never hit the tail because the
+            // model closes </think> naturally well before the budget edge.
+            if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
+                                 result.accept_rate, result.spec_decode_ran,
+                                 req.hint_tokens, &req.budget_hook,
+                                 &result.budget_forced_close,
+                                 &result.degenerate_decode_close)) {
+                result.error = "decode";
+                return result;
+            }
         }
         result.decode_s = std::chrono::duration<double>(
             std::chrono::steady_clock::now() - t_decode_start).count();
diff --git a/server/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h
index 6a72e48b5..69e134f1c 100644
--- a/server/src/qwen35/qwen35_dflash_target.h
+++ b/server/src/qwen35/qwen35_dflash_target.h
@@ -53,6 +53,11 @@ class Qwen35DFlashTarget : public DFlashTarget {
     int mask_token_id() const override;
     const std::vector<int> & capture_layer_ids() const override;
 
+    // Per-call override for the verify-time flash-attention window. Set by
+    // Qwen35Backend::generate() before decoding to widen the window when pflash
+    // compression has shrunk the prompt — see GenerateRequest.fa_window_override.
+    void set_fa_window(int fa) { fa_window_ = fa; }
+
 private:
     TargetWeights & w_;
     TargetCache & cache_;
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index ab37805bf..465758b51 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -1267,6 +1267,22 @@ void HttpServer::worker_loop() {
         gen_req.sampler = req.sampler;
         gen_req.do_sample = req.sampler.needs_logit_processing();
         gen_req.stream = false;  // we handle streaming via on_token callback
+        // Design 1: when pflash compresses, widen the target spec-decode verify
+        // fa_window to cover the entire compressed prompt. Otherwise verify sees
+        // only the last cfg_.fa_window positions of the compressed sequence,
+        // losing needle context and truncating the answer at long ctx.
+        //
+        // Principle: pflash already paid compute to pick which tokens matter.
+        // Don't throw any of them away in verify by capping fa_window — that
+        // would waste pflash's work. Always request enough verify window to
+        // see the entire compressed prompt. The C2 gate in qwen35_backend.cpp
+        // then decides per request whether spec-decode arithmetic still beats
+        // AR at this window size; if not, AR fallback kicks in (which uses
+        // fa_window=0 → full attention over the compressed prompt). Either
+        // path sees every kept token. We choose mechanism, not visibility.
+        if (pflash_compressed) {
+            gen_req.fa_window_override = (int)effective_prompt.size() + 256;
+        }
 
         // Level 2 force-close: when thinking is opted in, the server is
         // configured with a hard-limit reply budget, and we resolved the

From a676161a8c8d01bf30eccdf449e6c7e5027e7899 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 13:47:10 +0200
Subject: [PATCH 07/16] feat(pflash): PFLASH_*/DFLASH_* env-var dual aliasing +
 transitive cascade default-on

Adds backwards-compat fallback wrappers for 6 cascade env vars in both
standard and bandit code paths, so harness scripts using either spelling
work against this binary. The PFLASH_* spelling wins when both are set.
Emits a WARN to stderr each time the legacy DFLASH_* spelling is honored
(once per affected variable per compression call, not once per process).

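Also flips the default for `use_transitive` from `false` to `true` because
the gated rare-token bridge improves multi-hop F1 with zero downside in
the cascade-already-firing case. Two more defaults change in the same
blocks: CASCADE_MIN_ANCHOR_FRAC 0.25 -> 0.0 (gate off, cascade always
runs) and MAX_FORCED_RATIO 1.3 -> 10.0. drafter_score_and_compress also
logs the derived cascade limits to stderr as "[drafter_cascade] ...".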
---
 server/src/qwen3/qwen3_drafter.cpp | 103 +++++++++++++++++++++++++----
 1 file changed, 89 insertions(+), 14 deletions(-)

diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
index 2fbf5850f..fb7cf7cf0 100644
--- a/server/src/qwen3/qwen3_drafter.cpp
+++ b/server/src/qwen3/qwen3_drafter.cpp
@@ -584,11 +584,35 @@ static std::vector<int32_t> qwen35_score_and_compress(
         else if (n_chunks <  2048) max_anchor_hits = 16;
         else                       max_anchor_hits = 32;
     }
-    const int anchor_ngram        = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM",    4);
-    const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ",   2);
-
-    const float cascade_min_anchor_frac = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f);
-    const float max_forced_ratio        = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO",        1.3f);
+    const int anchor_ngram = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM set without PFLASH_COMPRESS_ANCHOR_NGRAM; honoring legacy name (deprecated)\n"); return lv; }
+        return 4;
+    }();
+    const int rare_token_max_freq = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1);
+        const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ set without PFLASH_COMPRESS_RARE_MAX_FREQ; honoring legacy name (deprecated)\n"); return lv; }
+        return 2;
+    }();
+
+    const float cascade_min_anchor_frac = [&]{
+        const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
+        const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC set without PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC; honoring legacy name (deprecated)\n"); return lv; }
+        return 0.0f;  // gate off by default: always run cascade
+    }();
+    const float max_forced_ratio = [&]{
+        const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
+        const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO set without PFLASH_COMPRESS_MAX_FORCED_RATIO; honoring legacy name (deprecated)\n"); return lv; }
+        return 10.0f;  // generous cap: allows bridge to rescue multi-hop (original: ~6x n_keep)
+    }();
 
     const int q0 = std::max(0, S - query_tokens);
     std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
@@ -600,8 +624,20 @@ static std::vector<int32_t> qwen35_score_and_compress(
     anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
     anchor_cfg.max_forced_count         = (int)(max_forced_ratio * n_keep);
 
-    const bool use_transitive = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0;
-    const int  max_iters      = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS",  3);
+    const bool use_transitive = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
+        if (nv >= 0) return nv != 0;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE set without PFLASH_COMPRESS_ANCHOR_TRANSITIVE; honoring legacy name (deprecated)\n"); return lv != 0; }
+        return true;  // on by default: gated rare-token bridge improves multi-hop F1
+    }();
+    const int  max_iters = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS set without PFLASH_COMPRESS_ANCHOR_MAX_ITERS; honoring legacy name (deprecated)\n"); return lv; }
+        return 3;
+    }();
     if (use_transitive) {
         dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
                                                   anchor_cfg, max_iters, forced);
@@ -802,11 +838,35 @@ std::vector<int32_t> drafter_score_and_compress(
         else if (n_chunks <  2048) max_anchor_hits = 16;
         else                       max_anchor_hits = 32;
     }
-    const int anchor_ngram        = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM",    4);
-    const int rare_token_max_freq = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ",   2);
-
-    const float cascade_min_anchor_frac = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.25f);
-    const float max_forced_ratio        = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO",        1.3f);
+    const int anchor_ngram = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM set without PFLASH_COMPRESS_ANCHOR_NGRAM; honoring legacy name (deprecated)\n"); return lv; }
+        return 4;
+    }();
+    const int rare_token_max_freq = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1);
+        const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ set without PFLASH_COMPRESS_RARE_MAX_FREQ; honoring legacy name (deprecated)\n"); return lv; }
+        return 2;
+    }();
+
+    const float cascade_min_anchor_frac = [&]{
+        const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
+        const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC set without PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC; honoring legacy name (deprecated)\n"); return lv; }
+        return 0.0f;  // gate off by default: always run cascade
+    }();
+    const float max_forced_ratio = [&]{
+        const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
+        const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO set without PFLASH_COMPRESS_MAX_FORCED_RATIO; honoring legacy name (deprecated)\n"); return lv; }
+        return 10.0f;  // generous cap: allows bridge to rescue multi-hop (original: ~6x n_keep)
+    }();
 
     std::vector<uint8_t> selected_mask((size_t)n_chunks, 0);
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
@@ -821,9 +881,24 @@ std::vector<int32_t> drafter_score_and_compress(
                                                  rare_token_max_freq};
         anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
         anchor_cfg.max_forced_count         = (int)(max_forced_ratio * n_keep);
+        std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d ratio=%.2f\n",
+            n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count, max_forced_ratio);
+        std::fflush(stderr);
 
-        const bool use_transitive = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", 0) != 0;
-        const int  max_iters      = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS",  3);
+        const bool use_transitive = [&]{
+            const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
+            const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
+            if (nv >= 0) return nv != 0;
+            if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE set without PFLASH_COMPRESS_ANCHOR_TRANSITIVE; honoring legacy name (deprecated)\n"); return lv != 0; }
+            return true;  // on by default: gated rare-token bridge improves multi-hop F1
+        }();
+        const int  max_iters = [&]{
+            const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
+            const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
+            if (nv >= 0) return nv;
+            if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS set without PFLASH_COMPRESS_ANCHOR_MAX_ITERS; honoring legacy name (deprecated)\n"); return lv; }
+            return 3;
+        }();
         if (use_transitive) {
             dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
                                                       anchor_cfg, max_iters, forced);

From 6536b76df933682ff2038f5257215a54ddebda4a Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 16:00:20 +0200
Subject: [PATCH 08/16] refactor(pflash): extract compress_cfg_from_env, kill
 qwen35/qwen3 path drift

Single helper, compress_cfg_from_env, reads the 11 shared
PFLASH_*/DFLASH_* compression env vars in one place (once per call). Both
qwen35_score_and_compress and drafter_score_and_compress call it.
Removes two 70-LOC duplicate env-reading blocks and the duplicated
anchor-radius comment. The legacy-name WARN text is shortened to
"<DFLASH name> deprecated, use <PFLASH name>". Also removes dead
force_chunk_neighborhood (no callers) and collapses the 4-overload
load_drafter pyramid to one canonical implementation + 3 thin forwarders.
---
 server/src/qwen3/qwen3_drafter.cpp | 386 ++++++++++++-----------------
 1 file changed, 157 insertions(+), 229 deletions(-)

diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
index fb7cf7cf0..4261635f4 100644
--- a/server/src/qwen3/qwen3_drafter.cpp
+++ b/server/src/qwen3/qwen3_drafter.cpp
@@ -72,11 +72,110 @@ static float env_float(const char * name, float def) {
     return def;
 }
 
-static void force_chunk_neighborhood(std::vector<uint8_t> & forced, int n_chunks,
-                                     int chunk, int radius) {
-    int lo = std::max(0, chunk - radius);
-    int hi = std::min(n_chunks - 1, chunk + radius);
-    for (int c = lo; c <= hi; ++c) forced[(size_t)c] = 1;
+// Compression knobs read from PFLASH_*/DFLASH_* env vars, derived per request
+// by compress_cfg_from_env(). anchor_radius and max_anchor_hits use an adaptive
+// ladder keyed on n_chunks to prevent the 64K NIAH cliff; see
+// docs/pflash-compress-cfg.md. Override any ladder value via PFLASH_COMPRESS_*.
+struct CompressCfg {
+    int   query_tokens;    // trailing tokens treated as the query pool (q0 = S - query_tokens)
+    int   head_chunks;     // leading chunks always selected; rescaled when head+tail >= n_keep
+    int   tail_chunks;     // trailing chunks always selected; rescaled when head+tail >= n_keep
+    dflash::qwen3::AnchorScanCfg anchor;  // anchor-scan settings; chunk_size is filled in by the caller
+    bool  use_transitive;  // true -> scan_and_force_transitive, false -> scan_and_force
+    int   max_iters;       // passed to scan_and_force_transitive (presumably its iteration cap)
+};
+
+static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) {
+    CompressCfg c{};
+
+    c.query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96);
+
+    // head/tail forced chunks scale so top-K scoring always gets slots
+    const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8);
+    const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24);
+    c.head_chunks = h_raw;
+    c.tail_chunks = t_raw;
+    if (c.head_chunks + c.tail_chunks >= n_keep) {
+        const int budget = std::max(1, n_keep - 1);
+        c.head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw));
+        c.tail_chunks = std::max(0, budget - c.head_chunks);
+    }
+
+    // anchor_radius: adaptive ladder prevents 64K NIAH cliff
+    // (<32K=2, 32-64K=4, >=64K=8); override via PFLASH_COMPRESS_ANCHOR_RADIUS
+    {
+        const int env_r    = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1);
+        const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1);
+        if      (env_r    >= 0)    c.anchor.anchor_radius = env_r;
+        else if (legacy_r >= 0)    c.anchor.anchor_radius = legacy_r;
+        else if (n_chunks <  1024) c.anchor.anchor_radius = 2;
+        else if (n_chunks <  2048) c.anchor.anchor_radius = 4;
+        else                       c.anchor.anchor_radius = 8;
+    }
+
+    // max_anchor_hits: same ladder — sparser anchors at long context
+    {
+        const int env_h    = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
+        const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
+        if      (env_h    >= 0)    c.anchor.max_anchor_hits = env_h;
+        else if (legacy_h >= 0)    c.anchor.max_anchor_hits = legacy_h;
+        else if (n_chunks <  1024) c.anchor.max_anchor_hits = 8;
+        else if (n_chunks <  2048) c.anchor.max_anchor_hits = 16;
+        else                       c.anchor.max_anchor_hits = 32;
+    }
+
+    c.anchor.ngram = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM deprecated, use PFLASH_COMPRESS_ANCHOR_NGRAM\n"); return lv; }
+        return 4;
+    }();
+
+    c.anchor.rare_token_max_freq = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1);
+        const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ deprecated, use PFLASH_COMPRESS_RARE_MAX_FREQ\n"); return lv; }
+        return 2;
+    }();
+
+    const float cascade_min_anchor_frac = [&]{
+        const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
+        const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC deprecated, use PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC\n"); return lv; }
+        return 0.0f;
+    }();
+
+    const float max_forced_ratio = [&]{
+        const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
+        const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO deprecated, use PFLASH_COMPRESS_MAX_FORCED_RATIO\n"); return lv; }
+        return 10.0f;
+    }();
+
+    c.anchor.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
+    c.anchor.max_forced_count         = (int)(max_forced_ratio * n_keep);
+
+    c.use_transitive = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
+        if (nv >= 0) return nv != 0;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE deprecated, use PFLASH_COMPRESS_ANCHOR_TRANSITIVE\n"); return lv != 0; }
+        return true;  // on by default; see docs/anchor-transitive.md
+    }();
+
+    c.max_iters = [&]{
+        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
+        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
+        if (nv >= 0) return nv;
+        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS deprecated, use PFLASH_COMPRESS_ANCHOR_MAX_ITERS\n"); return lv; }
+        return 3;
+    }();
+
+    return c;
 }
 
 #if defined(DFLASH27B_BACKEND_HIP)
@@ -128,21 +227,6 @@ const char * drafter_arch_name(DrafterArch arch) {
     return "unknown";
 }
 
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, /*gpu=*/0, out);
-}
-
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  int gpu, DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, DrafterArch::Qwen3_0p6b, gpu, out);
-}
-
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  DrafterArch arch, DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, arch, /*gpu=*/0, out);
-}
-
 bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
                   DrafterArch arch, int gpu, DrafterContext & out) {
     if (gpu < 0) {
@@ -232,6 +316,22 @@ bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
     return true;
 }
 
+// Thin overloads for API compat; all forward to the canonical 5-parameter form.
+bool load_drafter(const std::string & gguf_path, int gpu_layers,
+                  DrafterContext & out) {
+    return load_drafter(gguf_path, gpu_layers, DrafterArch::Qwen3_0p6b, /*gpu=*/0, out);
+}
+
+bool load_drafter(const std::string & gguf_path, int gpu_layers,
+                  int gpu, DrafterContext & out) {
+    return load_drafter(gguf_path, gpu_layers, DrafterArch::Qwen3_0p6b, gpu, out);
+}
+
+bool load_drafter(const std::string & gguf_path, int gpu_layers,
+                  DrafterArch arch, DrafterContext & out) {
+    return load_drafter(gguf_path, gpu_layers, arch, /*gpu=*/0, out);
+}
+
 void free_drafter(DrafterContext & ctx) {
     free_drafter_weights(ctx);
     if (ctx.backend) {
@@ -513,24 +613,23 @@ static std::vector<int32_t> qwen35_score_and_compress(
 
     const int n_chunks = (S + chunk_size - 1) / chunk_size;
     const int n_keep = std::max(1, (int)((float)n_chunks * keep_ratio));
-    
-    std::vector<float> smooth_score = score;
-    // Caller pool_kernel takes precedence; if zero/negative, fall back to env or 5.
+
     const int pk = (pool_kernel > 0)
         ? pool_kernel
         : std::max(3, env_int("DFLASH_COMPRESS_POOL_KERNEL", 5));
-    std::vector<float> smoothed((size_t)S, 0.0f);
-    int half = pk / 2;
-    for (int j = 0; j < S; ++j) {
-        int lo = std::max(0, j - half);
-        int hi = std::min(S - 1, j + half);
-        float s = 0.0f;
-        int n = 0;
-        for (int k = lo; k <= hi; ++k) { s += score[(size_t)k]; ++n; }
-        smoothed[(size_t)j] = (n > 0) ? (s / (float)n) : 0.0f;
+    std::vector<float> smooth_score((size_t)S, 0.0f);
+    {
+        int half = pk / 2;
+        for (int j = 0; j < S; ++j) {
+            int lo = std::max(0, j - half);
+            int hi = std::min(S - 1, j + half);
+            float s = 0.0f;
+            int n = 0;
+            for (int k = lo; k <= hi; ++k) { s += score[(size_t)k]; ++n; }
+            smooth_score[(size_t)j] = (n > 0) ? (s / (float)n) : 0.0f;
+        }
     }
-    smooth_score.swap(smoothed);
-    
+
     std::vector<std::pair<float, int>> chunk_means;
     for (int c = 0; c < n_chunks; ++c) {
         int lo = c * chunk_size, hi = std::min(S, lo + chunk_size);
@@ -539,108 +638,24 @@ static std::vector<int32_t> qwen35_score_and_compress(
         chunk_means.push_back({s / std::max(1, hi - lo), c});
     }
     std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; });
-    
-    std::vector<uint8_t> selected((size_t)n_chunks, 0);
-    int count = 0;
-    // Scale head/tail forced chunks so they don't crowd out top-K scoring.
-    {
-        const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8);
-        const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24);
-        int h_n = h_raw, t_n = t_raw;
-        if (h_n + t_n >= n_keep) {
-            const int budget = std::max(1, n_keep - 1);
-            h_n = std::max(0, h_raw * budget / (h_raw + t_raw));
-            t_n = std::max(0, budget - h_n);
-        }
-        for (int c = 0; c < std::min(n_chunks, h_n); ++c) { selected[(size_t)c] = 1; ++count; }
-        for (int c = std::max(0, n_chunks - t_n); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; }
-    }
 
-    const int query_tokens        = env_int("DFLASH_COMPRESS_QUERY_TOKENS",   96);
-    // Anchor radius scales with n_chunks to prevent the 64K NIAH cliff:
-    // at long context the needle text is more likely to straddle multiple
-    // chunks, and a fixed radius=2 window (5 chunks ~160 tokens) loses the
-    // back half of the needle. Adaptive: <32K = 2, 32-64K = 4, >=64K = 8.
-    // Override via PFLASH_COMPRESS_ANCHOR_RADIUS env var (>= 0 wins).
-    int anchor_radius;
-    {
-        const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1);
-        const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1);
-        if (env_r >= 0)         anchor_radius = env_r;
-        else if (legacy_r >= 0) anchor_radius = legacy_r;
-        else if (n_chunks <  1024) anchor_radius = 2;
-        else if (n_chunks <  2048) anchor_radius = 4;
-        else                       anchor_radius = 8;
-    }
-    // max_anchor_hits scales the same way: at long context, distinctive
-    // anchors are sparser, so we can afford to keep more hits per qi.
-    int max_anchor_hits;
-    {
-        const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
-        const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
-        if (env_h >= 0)         max_anchor_hits = env_h;
-        else if (legacy_h >= 0) max_anchor_hits = legacy_h;
-        else if (n_chunks <  1024) max_anchor_hits = 8;
-        else if (n_chunks <  2048) max_anchor_hits = 16;
-        else                       max_anchor_hits = 32;
-    }
-    const int anchor_ngram = [&]{
-        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1);
-        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1);
-        if (nv >= 0) return nv;
-        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM set without PFLASH_COMPRESS_ANCHOR_NGRAM; honoring legacy name (deprecated)\n"); return lv; }
-        return 4;
-    }();
-    const int rare_token_max_freq = [&]{
-        const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1);
-        const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1);
-        if (nv >= 0) return nv;
-        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ set without PFLASH_COMPRESS_RARE_MAX_FREQ; honoring legacy name (deprecated)\n"); return lv; }
-        return 2;
-    }();
+    const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep);
 
-    const float cascade_min_anchor_frac = [&]{
-        const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
-        const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
-        if (nv >= 0.0f) return nv;
-        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC set without PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC; honoring legacy name (deprecated)\n"); return lv; }
-        return 0.0f;  // gate off by default: always run cascade
-    }();
-    const float max_forced_ratio = [&]{
-        const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
-        const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
-        if (nv >= 0.0f) return nv;
-        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO set without PFLASH_COMPRESS_MAX_FORCED_RATIO; honoring legacy name (deprecated)\n"); return lv; }
-        return 10.0f;  // generous cap: allows bridge to rescue multi-hop (original: ~6x n_keep)
-    }();
+    std::vector<uint8_t> selected((size_t)n_chunks, 0);
+    int count = 0;
+    for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) { selected[(size_t)c] = 1; ++count; }
+    for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; }
 
-    const int q0 = std::max(0, S - query_tokens);
+    const int q0 = std::max(0, S - cfg.query_tokens);
     std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
 
-    dflash::qwen3::AnchorScanCfg anchor_cfg{chunk_size, anchor_radius,
-                                             max_anchor_hits, anchor_ngram,
-                                             rare_token_max_freq};
-    anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
-    anchor_cfg.max_forced_count         = (int)(max_forced_ratio * n_keep);
+    dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor;
+    anchor_cfg.chunk_size = chunk_size;
 
-    const bool use_transitive = [&]{
-        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
-        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
-        if (nv >= 0) return nv != 0;
-        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE set without PFLASH_COMPRESS_ANCHOR_TRANSITIVE; honoring legacy name (deprecated)\n"); return lv != 0; }
-        return true;  // on by default: gated rare-token bridge improves multi-hop F1
-    }();
-    const int  max_iters = [&]{
-        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
-        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
-        if (nv >= 0) return nv;
-        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS set without PFLASH_COMPRESS_ANCHOR_MAX_ITERS; honoring legacy name (deprecated)\n"); return lv; }
-        return 3;
-    }();
-    if (use_transitive) {
+    if (cfg.use_transitive) {
         dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
-                                                  anchor_cfg, max_iters, forced);
+                                                  anchor_cfg, cfg.max_iters, forced);
     } else {
         dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced);
     }
@@ -652,16 +667,14 @@ static std::vector<int32_t> qwen35_score_and_compress(
         }
     }
 
-    // Global aggregation tasks often depend on repeated rare tokens that do
-    // not appear in the final query. Preserve high-frequency-but-not-filler
-    // token chunks before filling with model-score top-K.
+    // Global aggregation tasks: preserve high-frequency-but-not-filler token chunks.
     const int repeat_min = env_int("DFLASH_COMPRESS_REPEAT_MIN", 4);
     const int repeat_max = env_int("DFLASH_COMPRESS_REPEAT_MAX", 32);
     const int repeat_limit = env_int("DFLASH_COMPRESS_REPEAT_CHUNKS", n_keep);
     if (repeat_min > 1 && count < repeat_limit) {
         std::unordered_map<int32_t, int> freq;
         freq.reserve((size_t)S);
-        const int repeat_scan_end = std::max(0, S - query_tokens);
+        const int repeat_scan_end = std::max(0, S - cfg.query_tokens);
         for (int j = 0; j < repeat_scan_end; ++j) {
             ++freq[ids[(size_t)j]];
         }
@@ -689,12 +702,12 @@ static std::vector<int32_t> qwen35_score_and_compress(
             }
         }
     }
-    
+
     for (auto [_, c] : chunk_means) {
         if (count >= n_keep) break;
         if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; }
     }
-    
+
     std::vector<int32_t> out_ids;
     std::vector<int> selected_chunks;
     for (int c = 0; c < n_chunks; ++c) {
@@ -798,110 +811,25 @@ std::vector<int32_t> drafter_score_and_compress(
     std::sort(chunk_means.begin(), chunk_means.end(),
                       [](auto a, auto b) { return a.first > b.first; });
 
-    // Retrieval tasks often repeat a rare key in the final query and in the
-    // needle span. Exact scores alone can keep the query while dropping the
-    // neighboring answer chunk, so force a small token-only anchor neighborhood.
-    // Head/tail forced chunks scale with n_keep so top-K scoring always gets slots.
-    const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8);
-    const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24);
-    int head_chunks = h_raw, tail_chunks = t_raw;
-    if (head_chunks + tail_chunks >= n_keep) {
-        const int budget = std::max(1, n_keep - 1);
-        head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw));
-        tail_chunks = std::max(0, budget - head_chunks);
-    }
-    const int query_tokens        = env_int("DFLASH_COMPRESS_QUERY_TOKENS",   96);
-    // Anchor radius scales with n_chunks to prevent the 64K NIAH cliff:
-    // at long context the needle text is more likely to straddle multiple
-    // chunks, and a fixed radius=2 window (5 chunks ~160 tokens) loses the
-    // back half of the needle. Adaptive: <32K = 2, 32-64K = 4, >=64K = 8.
-    // Override via PFLASH_COMPRESS_ANCHOR_RADIUS env var (>= 0 wins).
-    int anchor_radius;
-    {
-        const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1);
-        const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1);
-        if (env_r >= 0)         anchor_radius = env_r;
-        else if (legacy_r >= 0) anchor_radius = legacy_r;
-        else if (n_chunks <  1024) anchor_radius = 2;
-        else if (n_chunks <  2048) anchor_radius = 4;
-        else                       anchor_radius = 8;
-    }
-    // max_anchor_hits scales the same way: at long context, distinctive
-    // anchors are sparser, so we can afford to keep more hits per qi.
-    int max_anchor_hits;
-    {
-        const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
-        const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1);
-        if (env_h >= 0)         max_anchor_hits = env_h;
-        else if (legacy_h >= 0) max_anchor_hits = legacy_h;
-        else if (n_chunks <  1024) max_anchor_hits = 8;
-        else if (n_chunks <  2048) max_anchor_hits = 16;
-        else                       max_anchor_hits = 32;
-    }
-    const int anchor_ngram = [&]{
-        const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1);
-        const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1);
-        if (nv >= 0) return nv;
-        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM set without PFLASH_COMPRESS_ANCHOR_NGRAM; honoring legacy name (deprecated)\n"); return lv; }
-        return 4;
-    }();
-    const int rare_token_max_freq = [&]{
-        const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1);
-        const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1);
-        if (nv >= 0) return nv;
-        if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ set without PFLASH_COMPRESS_RARE_MAX_FREQ; honoring legacy name (deprecated)\n"); return lv; }
-        return 2;
-    }();
-
-    const float cascade_min_anchor_frac = [&]{
-        const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
-        const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f);
-        if (nv >= 0.0f) return nv;
-        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC set without PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC; honoring legacy name (deprecated)\n"); return lv; }
-        return 0.0f;  // gate off by default: always run cascade
-    }();
-    const float max_forced_ratio = [&]{
-        const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
-        const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f);
-        if (nv >= 0.0f) return nv;
-        if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO set without PFLASH_COMPRESS_MAX_FORCED_RATIO; honoring legacy name (deprecated)\n"); return lv; }
-        return 10.0f;  // generous cap: allows bridge to rescue multi-hop (original: ~6x n_keep)
-    }();
+    const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep);
 
     std::vector<uint8_t> selected_mask((size_t)n_chunks, 0);
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
-    for (int c = 0; c < std::min(n_chunks, head_chunks); ++c) forced[(size_t)c] = 1;
-    for (int c = std::max(0, n_chunks - tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1;
+    for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) forced[(size_t)c] = 1;
+    for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1;
 
-    const int q0 = std::max(0, S - query_tokens);
+    const int q0 = std::max(0, S - cfg.query_tokens);
     {
         std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
-        dflash::qwen3::AnchorScanCfg anchor_cfg{chunk_size, anchor_radius,
-                                                 max_anchor_hits, anchor_ngram,
-                                                 rare_token_max_freq};
-        anchor_cfg.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep);
-        anchor_cfg.max_forced_count         = (int)(max_forced_ratio * n_keep);
-        std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d ratio=%.2f\n",
-            n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count, max_forced_ratio);
+        dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor;
+        anchor_cfg.chunk_size = chunk_size;
+        std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d\n",
+            n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count);
         std::fflush(stderr);
 
-        const bool use_transitive = [&]{
-            const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
-            const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
-            if (nv >= 0) return nv != 0;
-            if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE set without PFLASH_COMPRESS_ANCHOR_TRANSITIVE; honoring legacy name (deprecated)\n"); return lv != 0; }
-            return true;  // on by default: gated rare-token bridge improves multi-hop F1
-        }();
-        const int  max_iters = [&]{
-            const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
-            const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1);
-            if (nv >= 0) return nv;
-            if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS set without PFLASH_COMPRESS_ANCHOR_MAX_ITERS; honoring legacy name (deprecated)\n"); return lv; }
-            return 3;
-        }();
-        if (use_transitive) {
+        if (cfg.use_transitive) {
             dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
-                                                      anchor_cfg, max_iters, forced);
+                                                      anchor_cfg, cfg.max_iters, forced);
         } else {
             dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced);
         }

From b7dd89b0dbb69254d3066f4db2091ea10d0e073d Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 16:04:39 +0200
Subject: [PATCH 09/16] chore(pflash): move narrative comments to docs/, trim
 mega-blocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- qwen3_graph.cpp: collapse 18-line alg-note, trim VRAM prose (3 blocks),
  remove early_exit_n alias (inline early_exit_pre at call site)
- qwen35_backend.cpp: C2 gate 9-line → 2-line + docs ref;
  do_ar_decode budget-hook 15-line → 4-line + docs ref
- http_server.cpp: Design 1 rationale 13-line → 2-line + docs ref
- model_backend.h: BudgetHook 23-line essay → 3-line + docs ref
- gguf_target_loader.cpp: 4-line prose tail → 1-line
- .gitignore: ignore *.git-head / *.pre-pflash-rename workdir artifacts
- docs/: pflash-compress-cfg.md, pflash-adaptive-composition.md,
  anchor-transitive.md (consolidated rationale)
---
 .gitignore                               |  4 +++
 docs/anchor-transitive.md                | 15 ++++++++
 docs/pflash-adaptive-composition.md      | 18 ++++++++++
 docs/pflash-compress-cfg.md              | 46 ++++++++++++++++++++++++
 server/src/common/model_backend.h        | 31 ++--------------
 server/src/qwen3/qwen3_graph.cpp         | 42 +++++-----------------
 server/src/qwen35/gguf_target_loader.cpp |  5 +--
 server/src/qwen35/qwen35_backend.cpp     | 33 ++++-------------
 server/src/server/http_server.cpp        | 15 ++------
 9 files changed, 103 insertions(+), 106 deletions(-)
 create mode 100644 docs/anchor-transitive.md
 create mode 100644 docs/pflash-adaptive-composition.md
 create mode 100644 docs/pflash-compress-cfg.md

diff --git a/.gitignore b/.gitignore
index b400bb6de..63ba50a0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,3 +79,7 @@ fix-plan.md
 # Harness test artifacts
 .harness-work/
 health
+
+# Workdir editor backup suffixes
+*.git-head
+*.pre-pflash-rename
diff --git a/docs/anchor-transitive.md b/docs/anchor-transitive.md
new file mode 100644
index 000000000..6f1b02f89
--- /dev/null
+++ b/docs/anchor-transitive.md
@@ -0,0 +1,15 @@
+# anchor transitive scan
+
+`scan_and_force_transitive` (anchor_scan.cpp) expands the query pool with
+tokens from newly-forced chunks and re-runs `scan_and_force` until a fixed
+point is reached or max_iters (default 3) is exhausted.
+
+Improves multi-hop retrieval: enables discovery of intermediate context
+chunks whose tokens do not appear in the original query but connect
+query-to-needle via shared rare tokens.
+
+Empirical result: F1=0.628 on LongBench HotpotQA at ee7 + keep=0.15
+(vs uncompressed F1=0.697). This is the ceiling for attention-score-based
+prefill compression on this task; see bench/2026-05-25_longbench_hotpotqa/.
+
+On by default. Disable via PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0.
diff --git a/docs/pflash-adaptive-composition.md b/docs/pflash-adaptive-composition.md
new file mode 100644
index 000000000..1851dee1e
--- /dev/null
+++ b/docs/pflash-adaptive-composition.md
@@ -0,0 +1,18 @@
+# pflash adaptive composition (Design 1)
+
+When pflash compresses a prompt, the target spec-decode verify window must
+cover the entire compressed sequence — otherwise verify sees only the last
+fa_window positions and loses needle context.
+
+`http_server.cpp`: when pflash_compressed, sets
+`req.fa_window_override = effective_prompt.size() + 256`.
+This never caps visibility; pflash already paid compute to pick which tokens
+matter, so every kept token must be visible in verify.
+
+`qwen35_backend.cpp` C2 gate: after prefill, checks whether spec-decode
+arithmetic still earns its drafter cost at the override window size.
+
+- override <= 2 * cfg_.fa_window → spec-decode
+- override >  2 * cfg_.fa_window → AR fallback (fa_window=0, full attention)
+
+Both paths see every kept token. The gate chooses mechanism, not visibility.
diff --git a/docs/pflash-compress-cfg.md b/docs/pflash-compress-cfg.md
new file mode 100644
index 000000000..5755e3142
--- /dev/null
+++ b/docs/pflash-compress-cfg.md
@@ -0,0 +1,46 @@
+# pflash compression knobs
+
+Most PFLASH_COMPRESS_* and DFLASH_COMPRESS_* env vars are read once per request in
+`compress_cfg_from_env(n_chunks, n_keep)` in qwen3_drafter.cpp; the DFLASH_COMPRESS_REPEAT_* knobs are still read inline at their use site.
+
+## anchor_radius adaptive ladder
+
+Prevents the 64K NIAH cliff: at long context the needle text is more likely
+to straddle multiple chunks, and a fixed radius=2 window (5 chunks / ~160
+tokens) loses the back half of the needle.
+
+Default ladder (override via PFLASH_COMPRESS_ANCHOR_RADIUS):
+
+| n_chunks   | anchor_radius |
+|------------|---------------|
+| < 1024     | 2             |
+| 1024-2047  | 4             |
+| >= 2048    | 8             |
+
+## max_anchor_hits adaptive ladder
+
+Same breakpoints as anchor_radius. At long context anchors are sparser, so
+more hits per query token are affordable.
+
+| n_chunks   | max_anchor_hits |
+|------------|-----------------|
+| < 1024     | 8               |
+| 1024-2047  | 16              |
+| >= 2048    | 32              |
+
+## anchor_transitive
+
+On by default. The gated rare-token bridge expands the query pool with tokens
+from newly-forced chunks and re-runs the anchor scan until a fixed point or
+max_iters (default 3) is reached. Improves multi-hop F1 on LongBench HotpotQA
+(empirical ceiling F1=0.628 at ee7+anchor-transitive on RTX 3090 — see bench/2026-05-25_longbench_hotpotqa/).
+Set PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0 to disable.
+
+## head/tail chunk forcing
+
+Head and tail chunks are force-included before top-K scoring fills the
+remainder. The counts scale with n_keep so top-K always gets at least one
+slot even when head_raw + tail_raw >= n_keep.
+
+Defaults: head=8, tail=24 (override via DFLASH_COMPRESS_HEAD_CHUNKS /
+DFLASH_COMPRESS_TAIL_CHUNKS).
diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h
index 3af273ace..f2a863418 100644
--- a/server/src/common/model_backend.h
+++ b/server/src/common/model_backend.h
@@ -46,35 +46,10 @@ struct DaemonIO {
 
 // ─── Generate request/result ────────────────────────────────────────────
 
-// Thinking-budget force-close hook. Mirrors antirez/ds4 ds4_eval.c's
-// hard_limit_reply_budget semantics: when the budget remaining (n_gen
-// minus tokens committed so far) falls to hard_limit_remaining, the
-// next sampled tokens get overridden with close_token_ids in order,
-// giving the model the remaining budget to write a visible answer
-// after the injected close-tag sequence.
-//
-// Single vs multi-token close:
-//   Qwen3.6: </think> is one added_token (id 248069). close_token_ids
-//            has size 1. One override + budget_close_injected=true.
-//   DeepSeek/laguna: </think> tokenizes to 3 ordinary tokens
-//            ([1718, 37947, 32] for DS-V3). close_token_ids has
-//            size 3. Three consecutive overrides, then resume.
-//
-// This is "Level 2" of our thinking-budget migration: in-process
-// mid-stream force-close, KV-continuous. Beats Level 1's phase-2
-// reprompt because the model never sees a fresh prefill — its KV
-// state continues naturally after the injected close.
-//
-// Current implementation: AR-decode only. When budget_hook is set,
-// backends MAY route generation through their AR path (skipping spec
-// decode) — the perf trade-off is acceptable since this only kicks in
-// for thinking-enabled requests. Spec-decode integration is a follow-up.
+// Thinking-budget force-close hook; see docs/specs/thinking-budget.md.
+// When (n_gen - committed) drops to hard_limit_remaining, overrides sampled
+// tokens with close_token_ids (AR path only). Empty = disabled.
 struct BudgetHook {
-    // Multi-token close sequence injected when `(n_gen - committed)`
-    // drops to `hard_limit_remaining`. For Qwen3.x this is the
-    // canonical "Considering the limited time..." summarize-and-stop
-    // lead-in (tokenized at server startup); for non-qwen arches it's
-    // a single close-tag token. Empty = hook disabled.
     std::vector<int32_t> close_token_ids;
     int                  hard_limit_remaining = 0;
 };
diff --git a/server/src/qwen3/qwen3_graph.cpp b/server/src/qwen3/qwen3_graph.cpp
index 858bcd75e..c2715a356 100644
--- a/server/src/qwen3/qwen3_graph.cpp
+++ b/server/src/qwen3/qwen3_graph.cpp
@@ -5,23 +5,10 @@
 // buffers. Sliding-window flash-attention via ggml-cuda's tensor-core
 // `flash_attn_ext` keeps attention cost linear in S.
 //
-// **Algorithmic note vs blog**:
-//   The blog stack is Liu Q-hook tail scoring + FlashPrefill block-sparse FA.
-//   The Liu Q-hook is implemented with a NoPE fix: by default (DFLASH_FP_NOPE_TAIL=1)
-//   the tail score uses pre-RoPE K/Q, removing the RoPE distance decay that
-//   buries early-position needle chunks and was causing NIAH failures.
-//   Set DFLASH_FP_NOPE_TAIL=0 to revert to post-RoPE scoring.  The block-sparse FA is replaced
-//   with a sliding-window approximation here because (a) ggml-cuda's
-//   `flash_attn_ext` already gives tensor-core speed inside the ubatch
-//   graph, and (b) our own block-sparse CUDA kernel needs a tensor-core
-//   rewrite (mma.sync.aligned) to actually beat ggml's FA — see
-//   `src/flashprefill_kernels.cu` for the (slow) scalar reference path.
-//   At S=140K with W=512 sliding window the NIAH magic key still propagates
-//   through 28 layers and is recovered in the kept tokens, so this
-//   approximation passes the actual e2e correctness check the user cares
-//   about. The block-sparse FA upgrade remains the next deliverable for
-//   "match the article algorithmically", but is functionally equivalent
-//   for the deployed perf budget today.
+// Tail score uses pre-RoPE K/Q (DFLASH_FP_NOPE_TAIL=1 default) to remove
+// distance decay that buries early-position needle chunks (NIAH fix).
+// Block-sparse FA is replaced by a sliding-window approximation via ggml-cuda
+// flash_attn_ext; the block-sparse FA upgrade is tracked in src/flashprefill_kernels.cu.
 //
 // Memory at S=140K, B=1, H=16, Hk=8, D=128, hidden=1024, ff=3072:
 //   weights                                            ~1.5 GB
@@ -250,10 +237,8 @@ bool forward_qwen3_drafter_model(
     }
     running_max.assign((size_t)n_lookahead * S, -INFINITY);
 
-    // Compute score_layer_start early so we can avoid allocating K_norope/Q_norope
-    // for layers that will never be used in scoring.  At S=128K the full K_norope
-    // allocation is ~5.6 GB (21 unused layers × 268 MB) — skipping it keeps total
-    // VRAM under 24 GB and eliminates the warm-path regression (A_compute 5.4x).
+    // Pre-compute score range to skip K_norope alloc for non-scoring layers.
+    // At S=128K this trims ~5.6 GB (21 × 268 MB); see test_drafter_warm_path_regression.
     static const int score_layers_pre = []() -> int {
         const char * e = std::getenv("PFLASH_DRAFTER_SCORE_LAYERS");
         if (e) { int v = std::atoi(e); if (v > 0) return v; }
@@ -264,23 +249,16 @@ bool forward_qwen3_drafter_model(
         if (e) { int v = std::atoi(e); if (v > 0) return v; }
         return -1;
     }();
-    // fwd_layer_limit_pre mirrors the fwd_layer_limit computed later in the loop.
     const int fwd_layer_limit_pre = (early_exit_pre > 0 && early_exit_pre < w.n_layer)
         ? early_exit_pre : w.n_layer;
-    // Use compute_score_range (same formula as the scoring loop) so the pre-alloc
-    // boundary is guaranteed to match the actual scoring boundary.
     const ScoreRange pre_range = compute_score_range(w.n_layer, score_layers_pre, fwd_layer_limit_pre);
     const int score_layer_start_pre = pre_range.start;
-    // Number of layers that participate in scoring (and need K_norope/Q_norope).
     const int n_score_layers = pre_range.count();
 
     PersBuf hidden_buf, pos_buf, mask_tail_buf, Q_buf, attn_out_buf;
     std::vector<PersBuf> K_curr_v((size_t)w.n_layer);
     std::vector<PersBuf> V_curr_v((size_t)w.n_layer);
     std::vector<PersBuf> Q_last_v((size_t)w.n_layer);
-    // NoPE: only allocate K_norope/Q_norope for layers that will be scored.
-    // When score_layer_start_pre > 0 this trims up to 21 × 268 MB = 5.6 GB,
-    // preventing the VRAM overflow that causes the warm-path regression at 128K.
     std::vector<PersBuf> K_norope_v(nope_tail ? (size_t)n_score_layers : 0);
     std::vector<PersBuf> Q_norope_v(nope_tail ? (size_t)n_score_layers : 0);
     auto cleanup_all = [&]() {
@@ -380,10 +358,6 @@ bool forward_qwen3_drafter_model(
         ggml_free(gctx);
     }
 
-    // PFLASH_DRAFTER_EARLY_EXIT_N: already read into early_exit_pre above.
-    // Alias used in the forward-loop limit below.
-    const int & early_exit_n = early_exit_pre;
-
     // Per-layer A→FA→B loop.
     ggml_gallocr_t galloc = ggml_gallocr_new(
         ggml_backend_get_default_buffer_type(w.backend));
@@ -404,8 +378,8 @@ bool forward_qwen3_drafter_model(
     double t_b_warm = 0.0, t_b_setup = 0.0, t_b_alloc = 0.0, t_b_copy_in = 0.0, t_b_norm = 0.0, t_compute_b = 0.0, t_b_copy_out = 0.0;
     double t_fp = 0.0;
 
-    const int fwd_layer_limit = (early_exit_n > 0 && early_exit_n < w.n_layer)
-        ? early_exit_n : w.n_layer;
+    const int fwd_layer_limit = (early_exit_pre > 0 && early_exit_pre < w.n_layer)
+        ? early_exit_pre : w.n_layer;
 
     for (int il = 0; il < fwd_layer_limit; ++il) {
         const auto & L = w.layers[il];
diff --git a/server/src/qwen35/gguf_target_loader.cpp b/server/src/qwen35/gguf_target_loader.cpp
index 9085c1a1b..8628eb3ab 100644
--- a/server/src/qwen35/gguf_target_loader.cpp
+++ b/server/src/qwen35/gguf_target_loader.cpp
@@ -38,10 +38,7 @@
 //     ssm_out.weight                 [inner, hidden]           Q5_K
 //     ffn_gate/up/down              (same as full-attn)
 //
-// This loader reads the file via ggml's built-in GGUF API, which returns a
-// ggml_context pre-populated with tensors. We then wire that context onto
-// the CUDA backend (via ggml_backend_alloc_ctx_tensors) and copy each
-// tensor's bytes from the mmap'd file.
+// Loads via ggml GGUF API; tensors copied from mmap to CUDA backend.
 
 #include "internal.h"
 #include "common/layer_split_utils.h"
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index 471e8af07..feb21e7b2 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -578,15 +578,8 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
     auto t_prefill_end = std::chrono::steady_clock::now();
     result.prefill_s = std::chrono::duration<double>(t_prefill_end - t_prefill_start).count();
 
-    // C2 adaptive-mechanism gate: pflash's override always reflects the
-    // FULL compressed-prompt size — we never cap visibility (would waste
-    // pflash's anchor-selection work). The gate here decides whether
-    // spec-decode's verify arithmetic still earns its drafter cost at
-    // that window size. Threshold 2× cfg_.fa_window:
-    //   override <= 4096 (32K → ~1.5K, 64K → ~3K compressed) → spec-decode
-    //   override >  4096 (128K → ~6.4K compressed)            → AR fallback
-    // AR uses fa_window=0 (full attention) so every kept token is visible
-    // regardless of which path runs. We choose mechanism, not visibility.
+    // C2 gate: spec-decode when override <= 2x fa_window; AR fallback otherwise.
+    // Both paths see all kept tokens. See docs/pflash-adaptive-composition.md.
     const bool fa_within_budget =
         (req.fa_window_override == 0)
      || (eff_fa_window <= 2 * cfg_.fa_window);
@@ -847,26 +840,12 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
                                   const BudgetHook & budget_hook,
                                   bool * forced_close_out,
                                   bool * degenerate_close_out) {
-    // Budget hook state.
-    //   - budget_close_started: true once we've begun injecting the close
-    //     sequence. Prevents re-triggering on continued forward generation.
-    //   - close_inject_pos: index into budget_hook.close_token_ids for the
-    //     NEXT token to inject. While < close_token_ids.size(), each
-    //     iteration overrides the sampled token with the corresponding
-    //     close-sequence token (single-token close = 1 override and done;
-    //     multi-token close like DeepSeek/laguna [1718,37947,32] = 3
-    //     consecutive overrides). Once equal to close_token_ids.size(),
-    //     normal sampling resumes (model writes visible answer).
+    // budget_close_started: prevents re-triggering; close_inject_pos: next
+    // token index to inject from close_token_ids. See docs/specs/thinking-budget.md.
     bool budget_close_started = false;
     int  close_inject_pos     = 0;
-    // Capture entry KV position so the budget check is in the
-    // "generated since entry" frame, not the absolute KV frame.
-    // n_gen is the gen-only count (or the remaining-budget remap done by
-    // spec-decode tail-off); subtracting committed_now (absolute KV =
-    // prompt_len + tokens generated this call) directly would treat
-    // prompt-length tokens as if they were generated output, firing
-    // force-close prompt_len tokens early on prompted requests and
-    // potentially going negative after spec-decode tail-off.
+    // committed_at_entry: anchors budget check to "generated since entry" frame,
+    // not absolute KV (avoids firing prompt_len tokens early).
     const int committed_at_entry = committed;
     auto maybe_force_close = [&](int32_t & tok, int committed_now) {
         if (budget_hook.close_token_ids.empty()) return;
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 465758b51..346b9d8d4 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -1267,19 +1267,8 @@ void HttpServer::worker_loop() {
         gen_req.sampler = req.sampler;
         gen_req.do_sample = req.sampler.needs_logit_processing();
         gen_req.stream = false;  // we handle streaming via on_token callback
-        // Design 1: when pflash compresses, widen the target spec-decode verify
-        // fa_window to cover the entire compressed prompt. Otherwise verify sees
-        // only the last cfg_.fa_window positions of the compressed sequence,
-        // losing needle context and truncating the answer at long ctx.
-        //
-        // Principle: pflash already paid compute to pick which tokens matter.
-        // Don't throw any of them away in verify by capping fa_window — that
-        // would waste pflash's work. Always request enough verify window to
-        // see the entire compressed prompt. The C2 gate in qwen35_backend.cpp
-        // then decides per request whether spec-decode arithmetic still beats
-        // AR at this window size; if not, AR fallback kicks in (which uses
-        // fa_window=0 → full attention over the compressed prompt). Either
-        // path sees every kept token. We choose mechanism, not visibility.
+        // Widen verify window to cover the full compressed prompt; C2 gate in
+        // qwen35_backend.cpp selects spec-decode vs AR. See docs/pflash-adaptive-composition.md.
         if (pflash_compressed) {
             gen_req.fa_window_override = (int)effective_prompt.size() + 256;
         }

From ff0a6b9074a998302dc3bbfe88faeeb4709ce119 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 17:16:35 +0200
Subject: [PATCH 10/16] fix(server): append closed <think> prefill in Jinja
 renderer when thinking is off

The hard-coded renderer appends a closed think prefill when thinking is
disabled. Some Qwen3.6 Jinja templates omit that final assistant suffix,
leaving the model in the wrong decoding state for tool use. Mirror the
hard-coded behavior here when the rendered prompt ends with a bare
assistant generation prompt; tolerate trailing-whitespace variants
(single \n, double \n\n, trailing space).

Diagnosed by Round 5b D peer-chat showing dflash drafter accept_rate=0.0%:
the drafter was distilled with the closed-think suffix in its training
distribution; the Unsloth Qwen3-Coder template doesn't emit it, so target
and drafter disagree on what comes after <|im_start|>assistant\n.
---
 server/src/server/chat_template.cpp | 30 ++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp
index 1349109ad..4e972e495 100644
--- a/server/src/server/chat_template.cpp
+++ b/server/src/server/chat_template.cpp
@@ -411,7 +411,35 @@ std::string render_chat_template_jinja(
         jinja::runtime rt(ctx);
         jinja::value results = rt.execute(*prog);
         auto parts = jinja::runtime::gather_string_parts(results);
-        return parts->as_string().str();
+        std::string rendered = parts->as_string().str();
+
+        // The hard-coded Qwen renderer appends a closed think prefill when
+        // thinking is disabled. Some Qwen3.6 Jinja templates omit that final
+        // assistant suffix, which leaves the model in the wrong decoding state
+        // for tool use. Mirror the hard-coded behavior here when the rendered
+        // prompt ends with a bare assistant generation prompt.
+        if (!enable_thinking) {
+            // Tolerate template variants that emit extra trailing whitespace
+            // after the assistant marker (single \n, double \n\n, trailing
+            // space). Strategy: trim trailing whitespace, check for the BARE
+            // assistant marker (no newline), then re-emit marker + prefill.
+            static constexpr char kAssistantBare[]    = "<|im_start|>assistant";
+            static constexpr char kAssistantPrefill[] = "<|im_start|>assistant\n<think>\n\n</think>\n\n";
+            size_t trim_end = rendered.size();
+            while (trim_end > 0) {
+                char c = rendered[trim_end - 1];
+                if (c != ' ' && c != '\t' && c != '\n' && c != '\r') break;
+                --trim_end;
+            }
+            const size_t blen = sizeof(kAssistantBare) - 1;
+            if (trim_end >= blen &&
+                rendered.compare(trim_end - blen, blen, kAssistantBare) == 0) {
+                rendered.resize(trim_end - blen);
+                rendered += kAssistantPrefill;
+            }
+        }
+
+        return rendered;
     } catch (const std::exception & e) {
         throw std::runtime_error(std::string("jinja runtime: ") + e.what());
     }

From fc8c8e24c51fb1f7cd8197cc913544a4cf008448 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 17:20:37 +0200
Subject: [PATCH 11/16] fix(chat_template): gate closed-think prefill injection
 to Qwen3 arch only

The previous commit applied the closed-think suffix to all Jinja-rendered
prompts. Add arch_hint (ChatFormat) parameter to render_chat_template_jinja,
defaulting to QWEN3, and guard the post-processing block with
arch_hint == ChatFormat::QWEN3. Call site in http_server.cpp passes
chat_format_ so other archs (Laguna, Gemma4) are unaffected. qwen35moe
inherits ChatFormat::QWEN3 by design (matches drafter distillation).

5 unit tests cover: thinking-off appends, thinking-on no-append, non-Qwen3
arch no-append (Laguna + Gemma4), qwen35moe inherits QWEN3, no double-append
when template already closes the think block.

Diagnosis + verification protocol in docs/pflash-drafter-template-alignment.md.
---
 docs/pflash-drafter-template-alignment.md | 95 +++++++++++++++++++++++
 server/src/server/chat_template.cpp       | 17 ++--
 server/src/server/chat_template.h         |  5 +-
 server/src/server/http_server.cpp         |  3 +-
 server/test/test_server_unit.cpp          | 89 +++++++++++++++++++++
 5 files changed, 200 insertions(+), 9 deletions(-)
 create mode 100644 docs/pflash-drafter-template-alignment.md

diff --git a/docs/pflash-drafter-template-alignment.md b/docs/pflash-drafter-template-alignment.md
new file mode 100644
index 000000000..3669b5ed9
--- /dev/null
+++ b/docs/pflash-drafter-template-alignment.md
@@ -0,0 +1,95 @@
+# Drafter / target distribution alignment via closed-think prefill
+
+## Problem
+
+PR #274 (adaptive composition) shipped on `feat/pflash-drafter-ee7`, validating
+13× prefill TPS and +47% decode TPS at long context. It surfaced a load-bearing
+ceiling on the dflash decode side: spec-decode `accept_rate` was capped at
+13–21% on the opencode harness and went to 0.0% on a peer-chat call. Composition
+arm decode TPS (24.4 tok/s) therefore stayed below pflash-only (33.0 tok/s) —
+the drafter overhead wasn't amortizing through acceptance.
+
+## Diagnosis (the wrong hypothesis first)
+
+The peer-chat conversation suggested "drafter conditioned on a different chat
+template than the target." Three Phase-1 Explore agents traced the code and
+showed that framing is architecturally wrong:
+
+- Both target and drafter receive the **same** `effective_prompt` token IDs at
+  prefill. The chat template is applied **once** on the target side at
+  `server/src/server/http_server.cpp:996-1014`, tokenized with the target's
+  tokenizer at `:1014`, then flows to both target and drafter via
+  `gen_req.prompt = effective_prompt` at `:1265`.
+- The drafter `dflash-draft-3.6-q4_k_m.gguf` does **not** apply any chat
+  template at runtime. `server/src/draft/draft_gguf_loader.cpp` doesn't read
+  the `tokenizer.chat_template` GGUF metadata key.
+
+A `--draft-chat-template` flag would fix nothing — there is no drafter-side
+template-application code path to redirect.
+
+## Diagnosis (the actual root cause)
+
+The drafter GGUF **does** ship the official Qwen3.6 chat template as
+`tokenizer.chat_template` metadata. That template appends
+`<think>\n\n</think>\n\n` after `<|im_start|>assistant\n` when
+`enable_thinking=false`. The drafter was distilled with that closed-think
+suffix in its training distribution — every assistant turn it predicts
+expects the turn to open with that closed-think block.
+
+The target's Unsloth Qwen3-Coder template (`project_unsloth_jinja_template_solves_tool_call`
+in memory) does **not** append that suffix. So at the moment spec-decode
+predicts the next token after `<|im_start|>assistant\n`:
+
+- drafter's distribution expects `<think>` literal tokens
+- target's distribution expects the actual answer
+
+Drafter proposes `<think>...`, target rejects, falls back to AR. Repeat at
+every position. `accept_rate` ≈ 0%.
+
+## Fix
+
+Make the **target's render** match the drafter's training distribution.
+`render_chat_template_jinja` now appends `<think>\n\n</think>\n\n` after a
+bare `<|im_start|>assistant` marker when **all three** of these hold:
+
+1. `arch_hint == ChatFormat::QWEN3` (gated to the Qwen3 family — qwen3, qwen35,
+   qwen35moe; Laguna / Gemma4 don't use ChatML tokens and must not be touched)
+2. `!enable_thinking`
+3. The rendered prompt ends with the bare assistant marker (tolerant of
+   trailing whitespace variants: `\n`, `\n\n`, trailing space)
+
+Condition (3) prevents double-appending when a user-supplied template already
+emits the closed-think suffix.
+
+## Multi-arch safety
+
+`chat_format_for_arch()` in `server/src/server/chat_template.cpp` returns:
+- `ChatFormat::QWEN3` for `qwen3`, `qwen35`, `qwen35moe`
+- `ChatFormat::LAGUNA` for `laguna`
+- `ChatFormat::GEMMA4` for `gemma4`
+
+The suffix only fires for `QWEN3`. A new test
+(`test_chat_format_for_arch_qwen35moe_returns_qwen3`) locks the qwen35moe →
+QWEN3 inheritance so a future arch-enum addition doesn't silently flip
+behavior. Tests also lock the Laguna/Gemma4 no-append case and the
+no-double-append guard.
+
+## Expected impact
+
+- `accept_rate` rises above its current 13–21% (0% on peer-chat) on Qwen3.6
+  dense with the Unsloth Qwen3-Coder template. Threshold for declaring the fix
+  worked: non-zero peer-chat accept_rate AND opencode harness accept_rate ≥30%
+  on at least 2 of 3 turns from Round 5b D.
+- Composition arm decode TPS rises above pflash-only on long-generation
+  workloads (currently 24.4 vs 33.0; the gap exists because spec-decode
+  amortization is bounded by accept_rate).
+- davide221's qwen35moe `chat CACHE` hang (issue #280) likely has the same
+  root cause via the same code path — qwen35moe inherits ChatFormat::QWEN3
+  and the suffix will fire there too.
+
+## Out of scope
+
+The sibling commits on `fix/qwen36-claude-code-tool-calling` (target-side
+tool-format normalization, scrub/truncate, Anthropic→Qwen tool shape,
+param-name aliasing) ship as PR #276. They are not drafter alignment — they
+are independent target-side tool-formatting improvements.
diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp
index 4e972e495..33f4bd864 100644
--- a/server/src/server/chat_template.cpp
+++ b/server/src/server/chat_template.cpp
@@ -360,7 +360,8 @@ std::string render_chat_template_jinja(
     const std::string & eos_token,
     bool add_generation_prompt,
     bool enable_thinking,
-    const std::string & tools_json)
+    const std::string & tools_json,
+    ChatFormat arch_hint)
 {
     if (template_src.empty()) {
         throw std::runtime_error("render_chat_template_jinja: template_src is empty");
@@ -413,12 +414,14 @@ std::string render_chat_template_jinja(
         auto parts = jinja::runtime::gather_string_parts(results);
         std::string rendered = parts->as_string().str();
 
-        // The hard-coded Qwen renderer appends a closed think prefill when
-        // thinking is disabled. Some Qwen3.6 Jinja templates omit that final
-        // assistant suffix, which leaves the model in the wrong decoding state
-        // for tool use. Mirror the hard-coded behavior here when the rendered
-        // prompt ends with a bare assistant generation prompt.
-        if (!enable_thinking) {
+        // Qwen3/3.5/3.6 only: the hard-coded renderer appends a closed think
+        // prefill when thinking is disabled. Some Qwen3.6 Jinja templates omit
+        // that final assistant suffix, leaving the model in the wrong decoding
+        // state for tool use. Mirror the hard-coded behavior here when the
+        // rendered prompt ends with a bare assistant generation prompt.
+        // Other architectures (Laguna, Gemma4, ...) do not use ChatML tokens
+        // and must not be touched here.
+        if (arch_hint == ChatFormat::QWEN3 && !enable_thinking) {
             // Tolerate template variants that emit extra trailing whitespace
             // after the assistant marker (single \n, double \n\n, trailing
             // space). Strategy: trim trailing whitespace, check for the BARE
diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h
index ca7ef9db5..b544df245 100644
--- a/server/src/server/chat_template.h
+++ b/server/src/server/chat_template.h
@@ -63,6 +63,8 @@ ChatFormat chat_format_for_arch(const std::string & arch);
 //                {{bos_token}} / {{eos_token}}). Use empty strings if unknown.
 // `tools_json`   optional JSON array of tool definitions; when non-empty it
 //                is parsed and injected as `tools` into the template context.
+// `arch_hint`    model architecture (controls arch-specific post-processing;
+//                the closed-think prefill injection is Qwen3/3.5/3.6 only).
 //
 // Internally caches the most recently parsed program per thread (avoids
 // re-parsing the template on every request). Throws std::runtime_error on
@@ -74,6 +76,7 @@ std::string render_chat_template_jinja(
     const std::string & eos_token,
     bool add_generation_prompt = true,
     bool enable_thinking = false,
-    const std::string & tools_json = "");
+    const std::string & tools_json = "",
+    ChatFormat arch_hint = ChatFormat::QWEN3);
 
 }  // namespace dflash::common
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 346b9d8d4..22076c4b9 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -1000,7 +1000,8 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                     eos_str,
                     /*add_generation_prompt=*/true,
                     enable_thinking,
-                    tools_json);
+                    tools_json,
+                    chat_format_);
             } catch (const std::exception & e) {
                 send_error(fd, 500,
                     std::string("chat template (jinja) render failed: ") + e.what());
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index 1415aab30..fc54666c9 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -1133,6 +1133,90 @@ static void test_jinja_render_bad_tools_json_throws() {
     TEST_ASSERT(threw);
 }
 
+// ---------------------------------------------------------------------------
+// Drafter / target distribution alignment (closed <think> prefill on Qwen3).
+// The hard-coded Qwen renderer appends a closed think prefill when thinking is
+// disabled; some Qwen3.6 Jinja templates omit it. render_chat_template_jinja
+// mirrors the hard-coded behavior when arch_hint == QWEN3 && !enable_thinking
+// && the rendered prompt ends with a bare assistant generation marker.
+// ---------------------------------------------------------------------------
+
+static const char QWEN3_BARE_ASSISTANT_TPL[] =
+    "{%- for m in messages -%}"
+    "<|im_start|>{{ m.role }}\n{{ m.content }}<|im_end|>\n"
+    "{%- endfor -%}"
+    "{%- if add_generation_prompt -%}"
+    "<|im_start|>assistant\n"
+    "{%- endif -%}";
+
+static void test_jinja_render_qwen3_closes_think_when_thinking_off() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    std::string out = render_chat_template_jinja(
+        QWEN3_BARE_ASSISTANT_TPL, msgs, "", "",
+        /*add_gen=*/true, /*think=*/false, /*tools=*/"",
+        /*arch_hint=*/ChatFormat::QWEN3);
+    TEST_ASSERT(out.find("<|im_start|>assistant\n<think>\n\n</think>\n\n") != std::string::npos);
+}
+
+static void test_jinja_render_does_not_close_think_when_thinking_on() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    std::string out = render_chat_template_jinja(
+        QWEN3_BARE_ASSISTANT_TPL, msgs, "", "",
+        /*add_gen=*/true, /*think=*/true, /*tools=*/"",
+        /*arch_hint=*/ChatFormat::QWEN3);
+    TEST_ASSERT(out.find("</think>") == std::string::npos);
+}
+
+static void test_jinja_render_does_not_close_think_for_non_qwen3_arch() {
+    // Laguna and Gemma4 do not use ChatML tokens; the closed-think suffix
+    // must NOT be appended for them even if the rendered prompt happens to
+    // end with the same string.
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    std::string out_laguna = render_chat_template_jinja(
+        QWEN3_BARE_ASSISTANT_TPL, msgs, "", "",
+        /*add_gen=*/true, /*think=*/false, /*tools=*/"",
+        /*arch_hint=*/ChatFormat::LAGUNA);
+    TEST_ASSERT(out_laguna.find("</think>") == std::string::npos);
+    std::string out_gemma4 = render_chat_template_jinja(
+        QWEN3_BARE_ASSISTANT_TPL, msgs, "", "",
+        /*add_gen=*/true, /*think=*/false, /*tools=*/"",
+        /*arch_hint=*/ChatFormat::GEMMA4);
+    TEST_ASSERT(out_gemma4.find("</think>") == std::string::npos);
+}
+
+static void test_chat_format_for_arch_qwen35moe_returns_qwen3() {
+    // qwen35moe MUST inherit ChatFormat::QWEN3 — the closed-think prefill
+    // depends on it, and a future enum-add must not silently flip behavior.
+    TEST_ASSERT(chat_format_for_arch("qwen35moe") == ChatFormat::QWEN3);
+    TEST_ASSERT(chat_format_for_arch("qwen35")    == ChatFormat::QWEN3);
+    TEST_ASSERT(chat_format_for_arch("qwen3")     == ChatFormat::QWEN3);
+    TEST_ASSERT(chat_format_for_arch("laguna")    == ChatFormat::LAGUNA);
+    TEST_ASSERT(chat_format_for_arch("gemma4")    == ChatFormat::GEMMA4);
+}
+
+static void test_jinja_render_does_not_double_append_close_think() {
+    // A user-supplied template that already closes the think block must not
+    // get a second </think> suffix from the bare-marker post-processing.
+    static const char TPL_ALREADY_CLOSED[] =
+        "{%- for m in messages -%}"
+        "<|im_start|>{{ m.role }}\n{{ m.content }}<|im_end|>\n"
+        "{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+        "<|im_start|>assistant\n<think>\n\n</think>\n\n"
+        "{%- endif -%}";
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    std::string out = render_chat_template_jinja(
+        TPL_ALREADY_CLOSED, msgs, "", "",
+        /*add_gen=*/true, /*think=*/false, /*tools=*/"",
+        /*arch_hint=*/ChatFormat::QWEN3);
+    // Exactly one </think> — the one the template emitted itself.
+    size_t first  = out.find("</think>");
+    size_t second = (first == std::string::npos) ? std::string::npos
+                                                  : out.find("</think>", first + 1);
+    TEST_ASSERT(first  != std::string::npos);
+    TEST_ASSERT(second == std::string::npos);
+}
+
 static void test_normalize_responses_tool_followup_messages() {
     ToolMemory tool_memory;
     const std::string call_id = "call_exec_001";
@@ -2539,6 +2623,11 @@ int main() {
     RUN_TEST(test_jinja_render_empty_tools_skipped);
     RUN_TEST(test_jinja_render_bos_eos_threaded);
     RUN_TEST(test_jinja_render_empty_template_throws);
+    RUN_TEST(test_jinja_render_qwen3_closes_think_when_thinking_off);
+    RUN_TEST(test_jinja_render_does_not_close_think_when_thinking_on);
+    RUN_TEST(test_jinja_render_does_not_close_think_for_non_qwen3_arch);
+    RUN_TEST(test_chat_format_for_arch_qwen35moe_returns_qwen3);
+    RUN_TEST(test_jinja_render_does_not_double_append_close_think);
     RUN_TEST(test_jinja_render_bad_tools_json_throws);
     RUN_TEST(test_normalize_responses_tool_followup_messages);
 

From e64a2b80669f5da5e2e3c8dd3599fd4774ab387b Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 28 May 2026 19:04:09 +0200
Subject: [PATCH 12/16] refactor(c2-gate): wire c2_spec_decode_permitted into
 qwen35_backend

Extract the C2 spec-decode gate from an inline expression in
qwen35_backend.cpp into a pure predicate header c2_gate.h.

Zero behavior change. Identical math:
  (fa_window_override == 0) || (fa_window_override <= 2 * fa_window_cfg)

The new header documents the empirically-derived rationale: at
compressed KV sizes (pflash compression of long prompts), T_draft/T_target
ratio approaches 1, eliminating spec-decode's profit margin over AR.
Empirical at D_composition 128K replay: AR=27.5 tok/s vs forced
spec-decode=5.74 tok/s. The gate correctly blocks spec-decode when
eff_fa_window > 2*fa_window_cfg.

Adds 5 unit tests locking in the predicate's behavior with explicit
Round 5 4-arm matrix bench citations.

Files:
- server/src/qwen35/c2_gate.h (new)
- server/src/qwen35/qwen35_backend.cpp (+1 include, inline -> call)
- server/test/test_server_unit.cpp (+60 LOC, 5 tests)
---
 server/src/qwen35/c2_gate.h          | 31 ++++++++++++++
 server/src/qwen35/qwen35_backend.cpp |  6 ++-
 server/test/test_server_unit.cpp     | 60 ++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 server/src/qwen35/c2_gate.h

diff --git a/server/src/qwen35/c2_gate.h b/server/src/qwen35/c2_gate.h
new file mode 100644
index 000000000..51c644e2c
--- /dev/null
+++ b/server/src/qwen35/c2_gate.h
@@ -0,0 +1,31 @@
+// C2 gate predicate — pure function, no GPU/model deps.
+// Extracted from qwen35_backend.cpp for testability.
+//
+// Reasoning: when pflash compresses a 128K prompt to ~11K tokens, the
+// target KV at decode time = 11K (small). T_target is fast (small KV),
+// T_draft ≈ constant. r = T_draft/T_target ≈ 1, so spec-decode does NOT
+// win over AR. Empirical: D_composition 128K: AR=27.5 tok/s, spec=5.74 tok/s.
+// Gate blocks spec-decode when fa_window_override > 2*fa_window_cfg.
+#pragma once
+
+namespace dflash::common {
+
+// Returns true if spec-decode should be attempted.
+//   fa_window_override: 0 = no pflash; else = compressed_prompt_size + 256
+//   fa_window_cfg     : cfg_.fa_window (default 2048)
+//   kv_committed      : KV position after prefill (unused; kept for future use)
+//
+// Gate: permit spec-decode when fa_window_override <= 2 * fa_window_cfg.
+// For uncompressed (override==0): always permit.
+// For pflash-compressed: with the default fa_window_cfg of 2048 this permits
+// compressed_size <= 3840 tokens (3840 + 256 == 4096); larger prompts fall
+// back to AR (empirically: D_composition 128K AR=27.5 vs spec=5.74 tok/s).
+inline bool c2_spec_decode_permitted(int fa_window_override,
+                                     int fa_window_cfg,
+                                     int kv_committed) {
+    (void)kv_committed;
+    return (fa_window_override == 0)
+        || (fa_window_override <= 2 * fa_window_cfg);
+}
+
+} // namespace dflash::common
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index feb21e7b2..82d0ea6ca 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -6,6 +6,7 @@
 #include "common/dflash_draft_graph.h"
 #include "peer_access.h"
 #include "attn_masks.h"
+#include "qwen35/c2_gate.h"
 #include "common/sampler.h"
 #include "common/io_utils.h"
 #include "common/restore_delta.h"
@@ -581,8 +582,9 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
     // C2 gate: spec-decode when override <= 2x fa_window; AR fallback otherwise.
     // Both paths see all kept tokens. See docs/pflash-adaptive-composition.md.
     const bool fa_within_budget =
-        (req.fa_window_override == 0)
-     || (eff_fa_window <= 2 * cfg_.fa_window);
+        dflash::common::c2_spec_decode_permitted(req.fa_window_override,
+                                                 cfg_.fa_window,
+                                                 /*kv_committed*/ 0);
 
     // Decode (speculative or AR)
     if (req.n_gen > 0) {
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index fc54666c9..eb53299fc 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -23,6 +23,7 @@
 #include "placement/placement_config.h"
 #include "common/layer_split_backend.h"
 #include "common/layer_split_utils.h"
+#include "qwen35/c2_gate.h"
 #include <nlohmann/json.hpp>
 
 #include <cmath>
@@ -2532,6 +2533,58 @@ static void test_generate_result_accept_rate_zero_when_no_spec_decode() {
     TEST_ASSERT(r.accept_rate == 0.0f);
 }
 
+// ═══════════════════════════════════════════════════════════════════════
+// C2 gate: c2_spec_decode_permitted() unit tests
+//
+// Gate logic: permit spec-decode when eff_fa_window <= 2*fa_window_cfg.
+// eff_fa_window = fa_window_override when set, else fa_window_cfg.
+//
+// Empirical validation (Round 5 bench):
+// - D_composition 128K: effective_in=10988, eff_fa_window=11244 > 4096
+//   → gate BLOCKS spec-decode → AR at 27.5 tok/s (correct — spec at 5.74)
+// - D_composition short: eff_fa_window <= 4096 → gate permits spec-decode
+// ═══════════════════════════════════════════════════════════════════════
+
+static void test_c2_gate_no_override_always_permits() {
+    // fa_window_override == 0 → no pflash, always spec-decode permitted.
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 1));
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 4096));
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(0, 2048, 131072));
+}
+
+static void test_c2_gate_128k_compressed_blocks_spec() {
+    // Round 5 D 128K: effective_in=10988, fa_window_override=11244.
+    // 11244 > 2*2048=4096 → gate correctly BLOCKS spec-decode (AR wins empirically).
+    int fa_window_cfg = 2048;
+    int compressed_size = 10988;
+    int fa_window_override = compressed_size + 256;  // = 11244
+    TEST_ASSERT(!dflash::common::c2_spec_decode_permitted(
+        fa_window_override, fa_window_cfg, compressed_size));
+}
+
+static void test_c2_gate_65k_compressed_blocks_spec() {
+    // D 65K cell: effective_in≈5383, fa_window_override≈5639 > 4096 → blocks.
+    int compressed_size = 5383;
+    int fa_window_override = compressed_size + 256;
+    TEST_ASSERT(!dflash::common::c2_spec_decode_permitted(
+        fa_window_override, 2048, compressed_size));
+}
+
+static void test_c2_gate_small_compressed_permits_spec() {
+    // Small compressed KV (override <= 2*fa_window): spec-decode permitted.
+    // fa_window_override=3000 <= 4096 → permit
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(3000, 2048, 2744));
+    // fa_window_override=4096 == 2*2048 → permit (at boundary)
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(4096, 2048, 3840));
+}
+
+static void test_c2_gate_boundary_at_2x_fa_window() {
+    // At exactly 2*fa_window_cfg: permit (<=).
+    TEST_ASSERT(dflash::common::c2_spec_decode_permitted(4096, 2048, 3840));
+    // At 2*fa_window_cfg + 1: block.
+    TEST_ASSERT(!dflash::common::c2_spec_decode_permitted(4097, 2048, 3841));
+}
+
 int main() {
     std::fprintf(stderr, "══════════════════════════════════════════\n");
     std::fprintf(stderr, " Server Unit Tests\n");
@@ -2698,6 +2751,13 @@ int main() {
     RUN_TEST(test_generate_result_accept_rate_in_usage_anthropic);
     RUN_TEST(test_generate_result_accept_rate_zero_when_no_spec_decode);
 
+    std::fprintf(stderr, "\n── C2 gate (spec-decode gate) ──\n");
+    RUN_TEST(test_c2_gate_no_override_always_permits);
+    RUN_TEST(test_c2_gate_128k_compressed_blocks_spec);
+    RUN_TEST(test_c2_gate_65k_compressed_blocks_spec);
+    RUN_TEST(test_c2_gate_small_compressed_permits_spec);
+    RUN_TEST(test_c2_gate_boundary_at_2x_fa_window);
+
     std::fprintf(stderr, "\n══════════════════════════════════════════\n");
     std::fprintf(stderr, " Results: %d assertions, %d failures\n",
                  test_count, test_failures);

From 8c1f37db0a538b7e792ac0650a340b7df63d7001 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Fri, 29 May 2026 17:23:29 +0200
Subject: [PATCH 13/16] feat(pflash): effective-size admission gate +
 keep-ratio guard (keep default 0.10)

- Gate context-window admission on post-compression effective size, not raw, so
  >128K-raw prompts compress to fit max_ctx instead of hitting a 400 rejection or an oversized KV reservation.
- Pre-compression keep-ratio sanity guard (raw*keep+max_out>max_ctx); the real
  effective-size gate runs post-compression in worker_loop.
- Default prefill-keep-ratio 0.05 -> 0.10: real ~2x compression on agentic content
  (0.25 over-forces anchor-transitive to ~100% = no-op + rejects >128K).
- Evidence (RTX3090, agentic replay, keep=0.10): 167K raw admitted -> 71K eff (42.6%),
  prefill 145s vs 845s forced; 32-128K real compression; tool-parse intact; 1629 unit asserts green; 14-cell P/PD sweep zero crashes.
---
 server/src/server/http_server.cpp | 63 ++++++++++++++++++++++-
 server/src/server/http_server.h   | 19 ++++++-
 server/src/server/server_main.cpp |  2 +-
 server/test/test_server_unit.cpp  | 83 ++++++++++++++++++++++++++++++-
 4 files changed, 162 insertions(+), 5 deletions(-)

diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 22076c4b9..219bccf6d 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -77,6 +77,32 @@ static size_t json_array_size(const json & value) {
     return value.is_array() ? value.size() : 0;
 }
 
+// ─── Admission gate ──────────────────────────────────────────────────────
+// Pre-compression sanity guard uses first principles: reject only when even
+// best-case compression cannot fit — (double)raw*keep_ratio + max_output > max_ctx.
+// This is keep-ratio-derived, so it correctly admits large prompts at low
+// keep ratios rather than using a hardcoded 4× multiplier calibrated to 0.25.
+
+bool check_admission(int effective_size, int raw_size,
+                     int max_output, int max_ctx, bool pflash_on,
+                     float pflash_keep_ratio) {
+    if (max_ctx <= 0) return true;  // no limit configured
+    if (pflash_on) {
+        // Pre-compression guard: reject only when even best-case compression
+        // cannot fit. Skip when keep_ratio <= 0 (degenerate config; let the
+        // post-compression gate decide).
+        if (pflash_keep_ratio > 0.0f) {
+            if ((double)raw_size * pflash_keep_ratio + max_output > (double)max_ctx)
+                return false;
+        }
+        // Pre-compression guard passed: admit. The real effective-size gate
+        // runs post-compression (caller passes pflash_on=false after pflash).
+        return true;
+    }
+    // Direct fit check (non-pflash or post-compression); 64-bit sum so a huge max_output cannot overflow int.
+    return static_cast<long long>(effective_size) + max_output <= static_cast<long long>(max_ctx);
+}
+
 // Build the /props response body.
 //
 // Non-static so unit tests can call it directly (declared in http_server.h).
@@ -1027,8 +1053,27 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
         return true;  // handled (with error)
     }
 
-    // Check context length.
-    if ((int)req.prompt_tokens.size() + req.max_output > config_.max_ctx) {
+    // Pre-compression admission: reject non-pflash requests that can't fit,
+    // and pflash requests whose raw prompt cannot possibly compress to fit
+    // (first-principles guard: raw*keep_ratio + max_output > max_ctx).
+    // The real post-compression gate runs in worker_loop after pflash runs.
+    const int raw_size = (int)req.prompt_tokens.size();
+    const bool pflash_will_run =
+        config_.max_ctx > 0 &&
+        config_.pflash_mode != ServerConfig::PflashMode::OFF &&
+        drafter_tokenizer_ != nullptr &&
+        (config_.pflash_mode == ServerConfig::PflashMode::ALWAYS ||
+         raw_size >= config_.pflash_threshold);
+    if (!check_admission(raw_size, raw_size, req.max_output, config_.max_ctx,
+                         /*pflash_on=*/false) && !pflash_will_run) {
+        // Non-pflash path: raw is the effective size, reject immediately.
+        send_error(fd, 400, "prompt + max_tokens exceeds context window");
+        return true;
+    }
+    if (pflash_will_run &&
+        !check_admission(raw_size, raw_size, req.max_output, config_.max_ctx,
+                         /*pflash_on=*/true, config_.pflash_keep_ratio)) {
+        // Pre-compression guard: best-case compression still can't fit.
         send_error(fd, 400, "prompt + max_tokens exceeds context window");
         return true;
     }
@@ -1230,6 +1275,20 @@ void HttpServer::worker_loop() {
             }
         }
 
+        // Effective-size admission gate: check post-compression prompt fits max_ctx.
+        // For non-pflash requests this was already checked in route_request;
+        // for pflash requests the raw guard passed but the effective size may
+        // still be too large (unlikely but possible if compression ratio is poor).
+        // Use pflash_on=false here so the function directly checks effective size
+        // (pflash_on=true only runs the pre-compression guard, not useful here).
+        if (!check_admission((int)effective_prompt.size(), (int)req.prompt_tokens.size(),
+                             req.max_output, config_.max_ctx,
+                             /*pflash_on=*/false,
+                             config_.pflash_keep_ratio)) {
+            fail_request(400, "prompt + max_tokens exceeds context window");
+            continue;
+        }
+
         // Build generate request.
         //
         // Thinking-budget v2 (Level 2): when caller opts in via
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index 2fb3e4661..803cf74e8 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -143,7 +143,7 @@ struct ServerConfig {
     enum class PflashMode { OFF, AUTO, ALWAYS };
     PflashMode  pflash_mode      = PflashMode::OFF;
     int         pflash_threshold = 32000;   // token count threshold for AUTO mode
-    float       pflash_keep_ratio = 0.05f;  // fraction of tokens to keep
+    float       pflash_keep_ratio = 0.10f;  // fraction of tokens to keep
     std::string pflash_drafter_path;        // path to drafter GGUF (Qwen3-0.6B)
     int         pflash_drafter_gpu = 0;     // backend-local GPU for PFlash drafter
     bool        pflash_remote_drafter = false; // use IPC drafter for mixed backends
@@ -317,6 +317,23 @@ struct ServerJob {
     ServerJob *   next = nullptr;
 };
 
+// ─── Admission gate (pure, testable) ────────────────────────────────────
+// Returns true when the request should be admitted. Two modes via pflash_on:
+//   true  — pre-compression guard only: reject iff keep_ratio > 0 and
+//           raw_size*keep_ratio + max_output > max_ctx; otherwise admit.
+//   false — direct fit: admit iff effective_size + max_output <= max_ctx
+//           (also used post-compression, with the compressed size).
+// effective_size   : prompt token count used by the direct-fit check.
+// raw_size         : pre-compression token count (pre-compression guard only).
+// max_output       : request's requested generation tokens.
+// max_ctx          : server's configured context window (--max-ctx); <= 0 admits.
+// pflash_on        : selects the mode described above.
+// pflash_keep_ratio: configured keep fraction; drives the pre-compression guard.
+//                    Guard is skipped when <= 0.
+bool check_admission(int effective_size, int raw_size,
+                     int max_output, int max_ctx, bool pflash_on,
+                     float pflash_keep_ratio = 0.10f);
+
 // ─── Parse session_id from a chat-completion JSON body ──────────────────
 // Returns empty string when session_id is absent or not a string (int/null/array).
 // Checks extra_body.session_id first, then top-level session_id.
diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp
index 3dcb23a5a..156c84afe 100644
--- a/server/src/server/server_main.cpp
+++ b/server/src/server/server_main.cpp
@@ -205,7 +205,7 @@ static void print_usage(const char * prog) {
         "PFlash (speculative prefill compression):\n"
         "  --prefill-compression off|auto|always  (default: off)\n"
         "  --prefill-threshold <N>     Token threshold for auto mode (default: 32000)\n"
-        "  --prefill-keep-ratio <F>    Fraction of tokens to keep (default: 0.05)\n"
+        "  --prefill-keep-ratio <F>    Fraction of tokens to keep (default: 0.10)\n"
         "  --prefill-drafter <path>    Drafter GGUF for compression (Qwen3-0.6B)\n"
         "  --prefill-skip-park         Skip park/unpark (for >=32GB GPUs)\n"
         "  --lazy-draft                Park decode draft when idle to save VRAM\n"
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index eb53299fc..1e6a1bd39 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -890,7 +890,7 @@ static void test_pflash_config_defaults() {
     ServerConfig cfg;
     TEST_ASSERT(cfg.pflash_mode == ServerConfig::PflashMode::OFF);
     TEST_ASSERT(cfg.pflash_threshold == 32000);
-    TEST_ASSERT(cfg.pflash_keep_ratio > 0.04f && cfg.pflash_keep_ratio < 0.06f);
+    TEST_ASSERT(cfg.pflash_keep_ratio > 0.09f && cfg.pflash_keep_ratio < 0.11f);
     TEST_ASSERT(cfg.pflash_drafter_path.empty());
     TEST_ASSERT(!cfg.pflash_skip_park);
 }
@@ -954,6 +954,76 @@ static void test_pflash_threshold_always_mode() {
     TEST_ASSERT(should);
 }
 
+// ═══════════════════════════════════════════════════════════════════════
+// Admission gate tests (check_admission pure helper)
+// ═══════════════════════════════════════════════════════════════════════
+
+static void test_admission_pflash_raw_large_effective_fits() {
+    // pflash on, raw=170000, effective=65000, max_output=512, max_ctx=131072 → ADMITTED
+    TEST_ASSERT(check_admission(/*effective=*/65000, /*raw=*/170000,
+                                /*max_output=*/512, /*max_ctx=*/131072,
+                                /*pflash_on=*/true));
+}
+
+static void test_admission_pflash_effective_too_large() {
+    // Post-compression: effective still too large → REJECTED.
+    // The post-compression call uses pflash_on=false (direct effective check).
+    TEST_ASSERT(!check_admission(/*effective=*/131000, /*raw=*/170000,
+                                 /*max_output=*/512, /*max_ctx=*/131072,
+                                 /*pflash_on=*/false));
+}
+
+static void test_admission_no_pflash_raw_too_large() {
+    // pflash off, raw > max_ctx → REJECTED (unchanged from original behavior)
+    TEST_ASSERT(!check_admission(/*effective=*/100000, /*raw=*/100000,
+                                 /*max_output=*/512, /*max_ctx=*/8192,
+                                 /*pflash_on=*/false));
+}
+
+static void test_admission_small_request_admitted() {
+    // Normal small request → ADMITTED regardless of pflash flag
+    TEST_ASSERT(check_admission(/*effective=*/1000, /*raw=*/1000,
+                                /*max_output=*/512, /*max_ctx=*/8192,
+                                /*pflash_on=*/false));
+    TEST_ASSERT(check_admission(/*effective=*/1000, /*raw=*/1000,
+                                /*max_output=*/512, /*max_ctx=*/8192,
+                                /*pflash_on=*/true));
+}
+
+static void test_admission_pflash_raw_sanity_guard() {
+    // pflash on, keep_ratio=0.25 (explicit guard-test input), raw=32769:
+    // 32769*0.25 + 512 = 8704.25 > 8192 → REJECTED.
+    TEST_ASSERT(!check_admission(/*effective=*/1000, /*raw=*/32769,
+                                 /*max_output=*/512, /*max_ctx=*/8192,
+                                 /*pflash_on=*/true, /*keep_ratio=*/0.25f));
+}
+
+static void test_admission_no_max_ctx_always_admits() {
+    // max_ctx=0 means no limit: always admit
+    TEST_ASSERT(check_admission(/*effective=*/999999, /*raw=*/999999,
+                                /*max_output=*/9999, /*max_ctx=*/0,
+                                /*pflash_on=*/false));
+}
+
+static void test_admission_keep_ratio_derived_guard_admits_low_ratio() {
+    // keep_ratio=0.05, raw=65536 (8× max_ctx=8192):
+    // best-case effective = 65536*0.05 = 3276.8 tokens.
+    // 3276.8 + 512 = 3788.8 < 8192 → guard PASSES → ADMITTED.
+    // The old hardcoded 4× guard would have rejected (65536 > 4*8192=32768).
+    TEST_ASSERT(check_admission(/*effective=*/65536, /*raw=*/65536,
+                                /*max_output=*/512, /*max_ctx=*/8192,
+                                /*pflash_on=*/true, /*keep_ratio=*/0.05f));
+}
+
+static void test_admission_keep_ratio_derived_guard_rejects_impossible() {
+    // keep_ratio=0.05, raw=2_000_000, max_ctx=8192:
+    // best-case effective = 2000000*0.05 = 100000 tokens.
+    // 100000 + 512 = 100512 > 8192 → REJECTED.
+    TEST_ASSERT(!check_admission(/*effective=*/2000000, /*raw=*/2000000,
+                                 /*max_output=*/512, /*max_ctx=*/8192,
+                                 /*pflash_on=*/true, /*keep_ratio=*/0.05f));
+}
+
 static void test_pflash_placement_same_backend_local() {
     DevicePlacement target;
     target.backend = compiled_placement_backend();
@@ -2663,6 +2733,17 @@ int main() {
     RUN_TEST(test_pflash_compress_result_defaults);
     RUN_TEST(test_pflash_threshold_auto_mode);
     RUN_TEST(test_pflash_threshold_always_mode);
+
+    std::fprintf(stderr, "\n── Admission gate ──\n");
+    RUN_TEST(test_admission_pflash_raw_large_effective_fits);
+    RUN_TEST(test_admission_pflash_effective_too_large);
+    RUN_TEST(test_admission_no_pflash_raw_too_large);
+    RUN_TEST(test_admission_small_request_admitted);
+    RUN_TEST(test_admission_pflash_raw_sanity_guard);
+    RUN_TEST(test_admission_no_max_ctx_always_admits);
+    RUN_TEST(test_admission_keep_ratio_derived_guard_admits_low_ratio);
+    RUN_TEST(test_admission_keep_ratio_derived_guard_rejects_impossible);
+
     RUN_TEST(test_pflash_placement_same_backend_local);
     RUN_TEST(test_pflash_placement_mixed_backend_remote);
     RUN_TEST(test_pflash_placement_auto_draft_follows_target);

From fbc2d412ab1cf467a9b16212147b53b49c477dfd Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Sat, 30 May 2026 20:51:22 +0200
Subject: [PATCH 14/16] feat(pflash): adaptive compression-regime router
 (correct-by-construction, default-off)

Pure per-request router that picks pFlash compression per prompt instead of a
global keep. Ships DISABLED: the default policy is an exact no-op, so enabling
is opt-in and cannot regress recall.

- regime_router.h: pure decide_regime (v1) + decide_v2 (type-gate + sparse-prompt
  guard + recency floor) + recency_floor_for() size-adaptive helper. 172 unit
  tests, -Werror clean. Default policy returns FullCascade/full-keep for any input.
- qwen3_drafter.cpp: PFLASH_RECENCY_FLOOR_TOKENS mechanism (0 = off default;
  -1 = auto = min(1024, 0.04*S)); env_int negative-sentinel fix.
- CMakeLists: test_regime_router target.

Evidence (bench/2026-05-30_*): indexer-prune dead (frac_prune=0 across scorers);
the splitter is prompt TYPE, not cascade-expansion (rho=-0.27); sparse-plumbing
turns fail under compression but pass at full keep (OFF=100%), so route them to
full keep. Recency floor validated zero-sum at fixed keep (breaks controls),
kept as off-by-default infra, not the mechanism. Router path: agentic-rich ->
~25% keep (~3x prefill/wall), agentic-sparse -> full keep, retrieval -> cascade.
Not yet wired into the live request handler (follow-up).
---
 server/CMakeLists.txt              |   8 +
 server/src/common/regime_router.h  | 191 ++++++++
 server/src/qwen3/qwen3_drafter.cpp |  51 +-
 server/test/test_regime_router.cpp | 715 +++++++++++++++++++++++++++++
 4 files changed, 963 insertions(+), 2 deletions(-)
 create mode 100644 server/src/common/regime_router.h
 create mode 100644 server/test/test_regime_router.cpp

diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index cfdc22937..dd8812c0d 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -581,6 +581,14 @@ if(DFLASH27B_TESTS)
         add_test(NAME test_drafter_early_exit_score_range
             COMMAND test_drafter_early_exit_score_range)
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_regime_router.cpp")
+        add_executable(test_regime_router
+            test/test_regime_router.cpp)
+        target_include_directories(test_regime_router PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
+        add_test(NAME regime_router
+            COMMAND test_regime_router)
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_anchor_transitive.cpp")
         add_executable(test_anchor_transitive
             test/test_anchor_transitive.cpp
diff --git a/server/src/common/regime_router.h b/server/src/common/regime_router.h
new file mode 100644
index 000000000..7c9917eb7
--- /dev/null
+++ b/server/src/common/regime_router.h
@@ -0,0 +1,191 @@
+// Pure, correct-by-construction adaptive compression-regime router.
+// No IO, no globals, no GPU, no ggml/llama deps — header-only.
+//
+// decide_regime (v1): run the transitive anchor cascade at full expansion
+// (FullCascade, recall-preserving default) or throttle it (Throttle, fires ONLY
+// when expansion_ratio >= policy threshold). decide_v2: routes on prompt type.
+//
+// Build (standalone):
+//   g++-11 -std=gnu++17 -O2 -I. -o test_regime_router test/test_regime_router.cpp
+// CMake:  cmake --build build --target test_regime_router -j
+//         ctest -R regime_router --output-on-failure
+#pragma once
+
+#include <cmath>
+#include <limits>
+
+namespace dflash::common {
+
+// ─── Input ───────────────────────────────────────────────────────────────────
+
+// All inputs are cheap lexical counts already available in the cascade path.
+struct CascadeStats {
+    int n_chunks;
+    int forced_anchor_only;    // chunks forced by BASE anchors, pre-cascade
+    int forced_after_cascade;  // chunks forced AFTER transitive cascade
+    int prompt_tokens;         // S
+    int keep_floor_chunks;     // ceil(keep_ratio * n_chunks) budget (informational)
+};
+
+// ─── Policy ──────────────────────────────────────────────────────────────────
+
+struct RouterPolicy {
+    int    threshold_tokens         = 32000;           // below this: passthrough
+    double expansion_throttle_ratio = INFINITY;        // DEFAULT disabled
+    int    min_anchor_chunks        = 1;               // don't throttle if too few anchors
+};
+
+// ─── Output ──────────────────────────────────────────────────────────────────
+
+enum class Regime { FullCascade, Throttle };
+
+struct RegimeDecision {
+    Regime      regime;
+    double      expansion_ratio;
+    const char* reason;
+};
+
+// ─── Core function ───────────────────────────────────────────────────────────
+
+// decide_regime — pure, no IO, no globals.
+//
+// Expansion ratio R = forced_after_cascade / forced_anchor_only
+//   (defined as 1.0 when forced_anchor_only == 0 to avoid division by zero).
+//
+// Transition to Throttle ONLY on the last branch; every other path returns
+// FullCascade so the default deployment posture is recall-preserving.
+inline RegimeDecision decide_regime(const CascadeStats& s, const RouterPolicy& p) {
+    // Compute R first: every return below, including the degenerate guard's, reports it.
+    const double R = (s.forced_anchor_only > 0)
+        ? static_cast<double>(s.forced_after_cascade) / s.forced_anchor_only
+        : 1.0;
+
+    // Guard: degenerate inputs — return FullCascade, no further processing.
+    if (s.n_chunks <= 0 || s.forced_anchor_only < 0 || s.forced_after_cascade < 0)
+        return { Regime::FullCascade, R, "degenerate" };
+
+    // Passthrough: prompt too short to compress meaningfully.
+    if (s.prompt_tokens < p.threshold_tokens)
+        return { Regime::FullCascade, R, "below_threshold" };
+
+    // Guard: too few base anchors — throttle would be meaningless.
+    if (s.forced_anchor_only < p.min_anchor_chunks)
+        return { Regime::FullCascade, R, "too_few_anchors" };
+
+    // Only transition: cascade expanded beyond the policy limit.
+    if (R >= p.expansion_throttle_ratio)
+        return { Regime::Throttle, R, "cascade_over_expansion" };
+
+    return { Regime::FullCascade, R, "default_safe" };
+}
+
+// ─── V2 Router ───────────────────────────────────────────────────────────────
+//
+// Adaptive compression router v2.
+// Splits on prompt TYPE (agentic vs retrieval) rather than cascade expansion
+// ratio R (which was refuted as a keep predictor, Spearman ρ=-0.27).
+//
+// Additional guards:
+//   sparse_prompt_guard — skip compression when new_content_tokens is tiny
+//     (plumbing turns: recent orchestration continuity must not be dropped)
+//   recency_floor_turns — always keep the last K turns whole in the agentic path
+//
+// Sentinel for "keep all turns" recency in SAFE decisions:
+static constexpr int kRecencyKeepAll = (1 << 20);
+
+// Size-adaptive recency floor sentinel.
+// When recency_floor_tokens == kRecencyFloorAuto the compress path computes
+//   R = min(1024, ceil(0.04 * prompt_tokens))
+// instead of using a fixed token count.  0 = off (no-op default).
+static constexpr int kRecencyFloorAuto = -1;
+
+struct RequestFeatures {
+    bool is_agentic;           // tool schemas / tool_use|tool_result blocks present
+    int  prompt_tokens;        // total S
+    int  new_content_tokens;   // newest turn content size (sparse-plumbing detector)
+};
+
+struct RouterPolicyV2 {
+    bool   enabled                    = false;   // DEFAULT DISABLED → exact no-op
+    int    threshold_tokens           = 32000;   // below → passthrough
+    double agentic_keep_target        = 0.25;    // conservative floor, closes empty-failure tail
+    double full_keep_target           = 1.0;     // retrieval/QA & safe fallbacks
+    int    recency_floor_turns        = 2;       // keep last K turns whole (continuity)
+    int    sparse_new_content_tokens  = 256;     // below this → sparse_prompt_guard fires
+};
+
+// recency_floor_for — pure helper, no IO.
+//
+// Returns the concrete token floor for a given prompt size and policy:
+//   recency_floor_tokens == 0           → 0  (off, no-op)
+//   recency_floor_tokens == kRecencyFloorAuto (-1)
+//                                       → min(1024, ceil(0.04 * prompt_tokens))
+//   recency_floor_tokens  > 0           → recency_floor_tokens  (explicit override)
+//
+// "one turn equivalent" lower-bound: the agentic throttle path in decide_v2
+// already reserves recency_floor_turns whole turns; this helper computes the
+// token-count floor passed to the compress path for the token-budget guard.
+inline int recency_floor_for(int prompt_tokens, int recency_floor_tokens) {
+    if (recency_floor_tokens == 0)
+        return 0;
+    if (recency_floor_tokens == kRecencyFloorAuto) {
+        // min(1024, ceil(0.04 * S)) — scales with context, caps at 1024
+        const int adaptive = static_cast<int>(
+            std::ceil(0.04 * static_cast<double>(prompt_tokens < 0 ? 0 : prompt_tokens)));
+        return (adaptive < 1024) ? adaptive : 1024;
+    }
+    // Explicit positive override; any other negative value is treated as off (0).
+    return (recency_floor_tokens > 0) ? recency_floor_tokens : 0;
+}
+
+struct RouterDecisionV2 {
+    double      keep_target;
+    int         recency_floor_turns;
+    bool        cascade;
+    const char* reason;
+};
+
+// decide_v2 — pure, no IO, no globals.
+//
+// SAFE path: keep_target=full_keep_target, recency=kRecencyKeepAll, cascade=true.
+// Returns SAFE when:
+//   - p.enabled == false                        (deploy no-op, correct-by-construction)
+//   - f.prompt_tokens <= 0 || f.new_content_tokens < 0  (degenerate)
+//   - f.prompt_tokens < p.threshold_tokens      (below threshold)
+//   - f.new_content_tokens < p.sparse_new_content_tokens (sparse_prompt_guard)
+// Routed paths (only when all guards pass; only the agentic one reduces keep):
+//   - is_agentic → {agentic_keep_target, recency_floor_turns, cascade=false}
+//   - else       → {full_keep_target,    recency_floor_turns, cascade=true}
+inline RouterDecisionV2 decide_v2(const RequestFeatures& f,
+                                   const RouterPolicyV2&   p) {
+    // Helper: SAFE return (keep everything, cascade on, recency = keep-all).
+    const RouterDecisionV2 SAFE_disabled        = { p.full_keep_target, kRecencyKeepAll, true, "disabled_noop"       };
+    const RouterDecisionV2 SAFE_degenerate      = { p.full_keep_target, kRecencyKeepAll, true, "degenerate"          };
+    const RouterDecisionV2 SAFE_below_threshold = { p.full_keep_target, kRecencyKeepAll, true, "below_threshold"     };
+    const RouterDecisionV2 SAFE_sparse          = { p.full_keep_target, kRecencyKeepAll, true, "sparse_prompt_guard" };
+
+    // 1. Deploy no-op: disabled router is an exact no-op (correct-by-construction).
+    if (!p.enabled)
+        return SAFE_disabled;
+
+    // 2. Degenerate inputs: prompt_tokens <= 0 or new_content_tokens < 0.
+    if (f.prompt_tokens <= 0 || f.new_content_tokens < 0)
+        return SAFE_degenerate;
+
+    // 3. Below threshold: prompt too short to compress meaningfully.
+    if (f.prompt_tokens < p.threshold_tokens)
+        return SAFE_below_threshold;
+
+    // 4. Sparse-prompt guard: tiny new-content turn (plumbing class).
+    //    Compression would drop recent orchestration continuity with no anchor signal.
+    if (f.new_content_tokens < p.sparse_new_content_tokens)
+        return SAFE_sparse;
+
+    // 5. Throttling paths — all guards passed.
+    if (f.is_agentic)
+        return { p.agentic_keep_target, p.recency_floor_turns, false, "agentic_throttle" };
+
+    return { p.full_keep_target, p.recency_floor_turns, true, "retrieval_full" };
+}
+
+}  // namespace dflash::common
diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
index 4261635f4..833247b70 100644
--- a/server/src/qwen3/qwen3_drafter.cpp
+++ b/server/src/qwen3/qwen3_drafter.cpp
@@ -18,6 +18,7 @@
 #include "common/backend_precision.h"
 #include "internal.h"
 #include "anchor_scan.h"
+#include "regime_router.h"
 
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -80,6 +81,7 @@ struct CompressCfg {
     int   query_tokens;
     int   head_chunks;
     int   tail_chunks;
+    int   recency_floor_tokens; // PFLASH_RECENCY_FLOOR_TOKENS: force-keep last N tokens (0 = off)
     dflash::qwen3::AnchorScanCfg anchor;
     bool  use_transitive;
     int   max_iters;
@@ -175,6 +177,23 @@ static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) {
         return 3;
     }();
 
+    // Recency floor: unconditionally force-keep the last R tokens of the prompt
+    // body before anchor scoring.  DEFAULT 0 = no-op (unchanged behavior).
+    //   0  = off
+    //  -1  = auto: min(1024, ceil(0.04 * S))  [resolved at compress time when S is known]
+    //  >0  = explicit token count
+    // Note: env_int() rejects negatives, so read raw and parse to preserve -1.
+    // Rescues recent wiring-sequence turns dropped when anchors seed from a
+    // short/sparse tail (e.g. bare [tool_result] turns).
+    {
+        const char * rfv = std::getenv("PFLASH_RECENCY_FLOOR_TOKENS");
+        if (rfv) {
+            c.recency_floor_tokens = std::atoi(rfv);  // preserves -1 sentinel
+        } else {
+            c.recency_floor_tokens = 0;
+        }
+    }
+
     return c;
 }
 
@@ -650,6 +669,19 @@ static std::vector<int32_t> qwen35_score_and_compress(
     std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
 
+    // Recency floor: force-keep the last R tokens worth of chunks before anchor
+    // scoring so that recent wiring-sequence turns are never dropped regardless
+    // of anchor seed quality.  R=0 is a no-op (default).  R=-1 = auto.
+    {
+        const int R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens);
+        if (R > 0) {
+            const int floor_tok = std::min(S, R);
+            const int floor_start_tok = S - floor_tok;
+            const int floor_start_chunk = floor_start_tok / chunk_size;
+            for (int c = floor_start_chunk; c < n_chunks; ++c) forced[(size_t)c] = 1;
+        }
+    }
+
     dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor;
     anchor_cfg.chunk_size = chunk_size;
 
@@ -818,13 +850,28 @@ std::vector<int32_t> drafter_score_and_compress(
     for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) forced[(size_t)c] = 1;
     for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1;
 
+    // Recency floor: force-keep the last R tokens worth of chunks before anchor
+    // scoring so that recent wiring-sequence turns are never dropped regardless
+    // of anchor seed quality.  R=0 is a no-op (default).  R=-1 = auto.
+    {
+        const int R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens);
+        if (R > 0) {
+            const int floor_tok = std::min(S, R);
+            const int floor_start_tok = S - floor_tok;
+            const int floor_start_chunk = floor_start_tok / chunk_size;
+            for (int c = floor_start_chunk; c < n_chunks; ++c) forced[(size_t)c] = 1;
+        }
+    }
+
     const int q0 = std::max(0, S - cfg.query_tokens);
     {
+        const int resolved_R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens);
         std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
         dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor;
         anchor_cfg.chunk_size = chunk_size;
-        std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d\n",
-            n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count);
+        std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d recency_floor=%d (resolved=%d)\n",
+            n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count,
+            cfg.recency_floor_tokens, resolved_R);
         std::fflush(stderr);
 
         if (cfg.use_transitive) {
diff --git a/server/test/test_regime_router.cpp b/server/test/test_regime_router.cpp
new file mode 100644
index 000000000..f92a0b512
--- /dev/null
+++ b/server/test/test_regime_router.cpp
@@ -0,0 +1,715 @@
+// Unit tests for dflash::common::decide_regime() — pure function, no GPU.
+//
+// Build (standalone, from repo root):
+//   g++-11 -std=gnu++17 -O2 -I server/src/common
+//     -o /tmp/test_regime_router server/test/test_regime_router.cpp
+// CMake:
+//   cmake --build build --target test_regime_router -j
+//   ctest -R regime_router --output-on-failure
+
+#include "regime_router.h"
+
+#include <cmath>
+#include <cstdio>
+#include <limits>
+#include <string>
+
+using namespace dflash::common;
+
+// ─── Minimal test framework (mirrors test_adaptive_keep_ratio.cpp) ───────────
+
+static int test_failures = 0;
+static int test_count    = 0;
+
+#define TEST_ASSERT(expr) do { \
+    test_count++; \
+    if (!(expr)) { \
+        test_failures++; \
+        std::fprintf(stderr, "  FAIL: %s:%d: %s\n", __FILE__, __LINE__, #expr); \
+    } \
+} while (0)
+
+#define TEST_ASSERT_MSG(expr, msg) do { \
+    test_count++; \
+    if (!(expr)) { \
+        test_failures++; \
+        std::fprintf(stderr, "  FAIL: %s:%d: %s -- %s\n", \
+                     __FILE__, __LINE__, #expr, msg); \
+    } \
+} while (0)
+
+#define RUN_TEST(fn) do { \
+    std::fprintf(stderr, "  %s ...", #fn); \
+    int before = test_failures; \
+    fn(); \
+    if (test_failures == before) std::fprintf(stderr, " ok\n"); \
+    else std::fprintf(stderr, "\n"); \
+} while (0)
+
+static inline bool approx_eq(double a, double b, double eps = 1e-9) {
+    return std::fabs(a - b) < eps;
+}
+
+// ─── Helpers ─────────────────────────────────────────────────────────────────
+
+// Build a policy with expansion_throttle_ratio disabled (default safe).
+static RouterPolicy default_policy() { return {}; }
+
+// Build a policy that throttles at ratio >= r.
+static RouterPolicy throttle_policy(double r,
+                                    int threshold = 32000,
+                                    int min_anchor = 1) {
+    RouterPolicy p;
+    p.threshold_tokens         = threshold;
+    p.expansion_throttle_ratio = r;
+    p.min_anchor_chunks        = min_anchor;
+    return p;
+}
+
+static CascadeStats make_stats(int n_chunks,
+                                int anchor_only,
+                                int after_cascade,
+                                int prompt_tokens,
+                                int keep_floor = 0) {
+    return { n_chunks, anchor_only, after_cascade, prompt_tokens, keep_floor };
+}
+
+// ─── T1: DEPLOY-NO-OP ────────────────────────────────────────────────────────
+// With the DEFAULT RouterPolicy (ratio=INFINITY), decide_regime must return
+// FullCascade for ANY stats, including pathologically large expansion.
+
+static void t1_deploy_noop() {
+    RouterPolicy p = default_policy();
+
+    // Normal case
+    {
+        auto d = decide_regime(make_stats(100, 10, 20, 50000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T1a: default policy must always give FullCascade");
+    }
+    // Huge expansion: forced_anchor_only=10, forced_after_cascade=1000, prompt=100K
+    {
+        auto d = decide_regime(make_stats(500, 10, 1000, 100000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T1b: huge expansion with default policy must be FullCascade");
+    }
+    // Prompt below threshold
+    {
+        auto d = decide_regime(make_stats(50, 5, 500, 1000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T1c: short prompt with default policy must be FullCascade");
+    }
+    // Zero anchors
+    {
+        auto d = decide_regime(make_stats(100, 0, 0, 100000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T1d: zero anchors with default policy must be FullCascade");
+    }
+    // Sweep: 50 random-ish stat combinations
+    for (int i = 1; i <= 50; ++i) {
+        CascadeStats s = make_stats(i * 10,
+                                    i,
+                                    i * 100,          // R = 100, very high
+                                    i * 5000);
+        auto d = decide_regime(s, p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T1-sweep: default policy must be FullCascade for all stats");
+    }
+}
+
+// ─── T2: DEGENERATE ──────────────────────────────────────────────────────────
+// Degenerate inputs must not crash or div-by-zero, and must return FullCascade.
+
+static void t2_degenerate() {
+    RouterPolicy p = throttle_policy(2.0);   // would throttle if R >= 2
+
+    // n_chunks == 0
+    {
+        auto d = decide_regime(make_stats(0, 5, 10, 100000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T2a: n_chunks=0 must return FullCascade");
+        TEST_ASSERT_MSG(std::isfinite(d.expansion_ratio),
+                        "T2a: expansion_ratio must be finite when n_chunks=0");
+    }
+    // forced_anchor_only == 0 (no anchors before cascade) → R defaults to 1.0
+    {
+        auto d = decide_regime(make_stats(100, 0, 50, 100000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T2b: forced_anchor_only=0 must return FullCascade");
+        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0),
+                        "T2b: expansion_ratio must be 1.0 when forced_anchor_only=0");
+    }
+    // Negative forced_anchor_only
+    {
+        auto d = decide_regime(make_stats(100, -1, 50, 100000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T2c: negative forced_anchor_only must return FullCascade");
+        TEST_ASSERT_MSG(std::isfinite(d.expansion_ratio),
+                        "T2c: expansion_ratio must be finite for negative anchor count");
+    }
+    // Negative forced_after_cascade
+    {
+        auto d = decide_regime(make_stats(100, 5, -1, 100000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T2d: negative forced_after_cascade must return FullCascade");
+    }
+    // Both negative
+    {
+        auto d = decide_regime(make_stats(100, -3, -7, 100000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T2e: both counts negative must return FullCascade");
+    }
+}
+
+// ─── T3: BELOW-THRESHOLD ─────────────────────────────────────────────────────
+// prompt_tokens < threshold → FullCascade regardless of R and finite ratio.
+
+static void t3_below_threshold() {
+    RouterPolicy p = throttle_policy(1.5, /*threshold=*/32000, /*min_anchor=*/1);
+
+    // prompt = threshold - 1 (just below)
+    {
+        auto d = decide_regime(make_stats(100, 10, 1000, 31999), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T3a: prompt just below threshold must be FullCascade");
+        TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold",
+                        "T3a: reason must be 'below_threshold'");
+    }
+    // prompt = 0
+    {
+        auto d = decide_regime(make_stats(100, 10, 9999, 0), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T3b: prompt=0 must be FullCascade");
+    }
+    // Even with R = 1000 and finite ratio = 1.5, still FullCascade below threshold
+    {
+        auto d = decide_regime(make_stats(200, 5, 5000, 100), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T3c: tiny prompt, huge R, finite ratio -> FullCascade");
+    }
+}
+
+// ─── T4: TOO-FEW-ANCHORS ─────────────────────────────────────────────────────
+// forced_anchor_only < min_anchor_chunks → FullCascade.
+
+static void t4_too_few_anchors() {
+    RouterPolicy p = throttle_policy(2.0, /*threshold=*/32000, /*min_anchor=*/3);
+    // forced_anchor_only = 2 < min_anchor = 3
+    {
+        auto d = decide_regime(make_stats(100, 2, 1000, 50000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T4a: anchors below min must be FullCascade");
+    }
+    // forced_anchor_only = 0 < min_anchor = 3
+    {
+        auto d = decide_regime(make_stats(100, 0, 500, 50000), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T4b: zero anchors below min must be FullCascade");
+    }
+    // forced_anchor_only = 3 == min_anchor = 3: NOT too few → may throttle
+    {
+        auto d = decide_regime(make_stats(100, 3, 300, 50000), p);
+        // R = 300/3 = 100 >= 2.0 → should be Throttle
+        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
+                        "T4c: anchors == min AND R >= ratio must throttle");
+    }
+}
+
+// ─── T5: MONOTONE ────────────────────────────────────────────────────────────
+// With a finite ratio policy, once Throttle triggers at R it must stay Throttle
+// for all larger R.
+
+static void t5_monotone() {
+    // Policy: ratio=3.0, threshold=32000, min_anchor=1, prompt_tokens=50000
+    RouterPolicy p = throttle_policy(3.0, 32000, 1);
+    const int prompt = 50000;
+    const int anchor = 10;  // fixed; vary after_cascade to control R
+
+    // R = 2.9 → FullCascade
+    {
+        // after = anchor * R = 10 * 2.9 = 29
+        auto d = decide_regime(make_stats(100, anchor, 29, prompt), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T5a: R=2.9 < 3.0 must be FullCascade");
+    }
+    // R = 3.0 → Throttle (boundary: >= triggers)
+    {
+        // after = 10 * 3 = 30
+        auto d = decide_regime(make_stats(100, anchor, 30, prompt), p);
+        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
+                        "T5b: R=3.0 == ratio must be Throttle");
+    }
+    // R = 10.0 → Throttle
+    {
+        auto d = decide_regime(make_stats(100, anchor, 100, prompt), p);
+        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
+                        "T5c: R=10.0 >> ratio must be Throttle");
+    }
+    // Monotone sweep: for all integer R from 1 to 100, once Throttle appears
+    // it must not flip back to FullCascade.
+    bool seen_throttle = false;
+    bool monotone      = true;
+    for (int r_int = 1; r_int <= 100; ++r_int) {
+        // after = anchor * r_int → exact integer R
+        auto d = decide_regime(make_stats(200, anchor, anchor * r_int, prompt), p);
+        if (d.regime == Regime::Throttle) {
+            seen_throttle = true;
+        } else if (seen_throttle) {
+            // Flipped back to FullCascade after Throttle was seen: not monotone
+            monotone = false;
+            std::fprintf(stderr,
+                         "  MONOTONE VIOLATION at R=%d: Throttle then FullCascade\n",
+                         r_int);
+            break;
+        }
+    }
+    TEST_ASSERT_MSG(seen_throttle, "T5d: sweep must trigger Throttle at some R");
+    TEST_ASSERT_MSG(monotone,      "T5e: regime must be monotone (no FullCascade after Throttle)");
+}
+
+// ─── T6: BOUNDARY ────────────────────────────────────────────────────────────
+// R exactly == ratio → Throttle; R = ratio - epsilon → FullCascade.
+
+static void t6_boundary() {
+    const double ratio   = 5.0;
+    RouterPolicy p       = throttle_policy(ratio, 32000, 1);
+    const int    anchor  = 1000;   // use large anchor to get precise integer ratios
+    const int    prompt  = 50000;
+
+    // R exactly == ratio: after = anchor * ratio = 5000
+    {
+        auto d = decide_regime(make_stats(500, anchor, anchor * (int)ratio, prompt), p);
+        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
+                        "T6a: R exactly == ratio must be Throttle");
+    }
+
+    // R = ratio - epsilon where epsilon = 1/anchor (one less chunk → R < ratio)
+    {
+        // after = anchor * ratio - 1 = 4999 → R = 4.999 < 5.0
+        auto d = decide_regime(make_stats(500, anchor, anchor * (int)ratio - 1, prompt), p);
+        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
+                        "T6b: R just below ratio must be FullCascade");
+    }
+}
+
+// ─── T7: RATIO-VALUE ─────────────────────────────────────────────────────────
+// Check that expansion_ratio is computed correctly.
+
+static void t7_ratio_value() {
+    RouterPolicy p = default_policy();  // regime doesn't matter; check ratio value
+
+    // forced_anchor_only=10, forced_after_cascade=85 → R = 8.5
+    {
+        auto d = decide_regime(make_stats(100, 10, 85, 50000), p);
+        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 8.5),
+                        "T7a: R must be 85/10 = 8.5");
+    }
+    // forced_anchor_only=0 → R must be 1.0 (no div-by-zero)
+    {
+        auto d = decide_regime(make_stats(100, 0, 50, 50000), p);
+        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0),
+                        "T7b: forced_anchor_only=0 must give expansion_ratio=1.0");
+    }
+    // forced_anchor_only=5, forced_after_cascade=5 → R = 1.0
+    {
+        auto d = decide_regime(make_stats(100, 5, 5, 50000), p);
+        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0),
+                        "T7c: equal anchors before/after must give R=1.0");
+    }
+    // forced_anchor_only=7, forced_after_cascade=7 → R = 1.0 (no expansion)
+    {
+        auto d = decide_regime(make_stats(100, 7, 7, 50000), p);
+        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0),
+                        "T7d: no cascade expansion must give R=1.0");
+    }
+    // Verify ratio when throttle policy triggers: ratio value should still be correct
+    {
+        RouterPolicy tp = throttle_policy(3.0);
+        auto d = decide_regime(make_stats(100, 4, 20, 50000), tp);
+        // R = 20/4 = 5.0 → Throttle, ratio = 5.0
+        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
+                        "T7e: R=5.0 >= 3.0 must throttle");
+        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 5.0),
+                        "T7e: expansion_ratio must be 5.0");
+    }
+}
+
+// ─── V2 helpers ──────────────────────────────────────────────────────────────
+
+// Default v2 policy: disabled (deploy no-op).
+static RouterPolicyV2 default_v2_policy() { return {}; }
+
+// Enabled v2 policy with default field values.
+static RouterPolicyV2 enabled_v2_policy() {
+    RouterPolicyV2 p;
+    p.enabled = true;
+    return p;
+}
+
+static RequestFeatures make_features(bool is_agentic,
+                                      int  prompt_tokens,
+                                      int  new_content_tokens) {
+    return { is_agentic, prompt_tokens, new_content_tokens };
+}
+
+// ─── T8: DEPLOY-NO-OP (v2) ───────────────────────────────────────────────────
+// enabled=false → SAFE for every input, including is_agentic=true and huge prompts.
+// Correct-by-construction: disabled router must be an exact no-op.
+
+static void t8_v2_deploy_noop() {
+    RouterPolicyV2 p = default_v2_policy();   // enabled=false
+
+    // Baseline: normal agentic prompt, well above threshold.
+    {
+        auto d = decide_v2(make_features(true, 100000, 10000), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T8a: disabled->keep_target must be full_keep_target");
+        TEST_ASSERT_MSG(d.cascade,
+                        "T8a: disabled->cascade must be true");
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T8a: disabled->recency must be keep-all sentinel");
+    }
+    // Sweep: all combinations of is_agentic, varying prompt and new_content sizes.
+    for (int i = 0; i < 4; ++i) {
+        bool agentic   = (i & 1) != 0;
+        int  prompt    = (i & 2) ? 100000 : 500;
+        int  new_toks  = (i & 2) ? 10000 : 10;
+        auto d = decide_v2(make_features(agentic, prompt, new_toks), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T8-sweep: disabled->keep_target must be full_keep_target");
+        TEST_ASSERT_MSG(d.cascade,
+                        "T8-sweep: disabled->cascade must be true");
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T8-sweep: disabled->recency must be keep-all sentinel");
+    }
+    // Explicitly: is_agentic=true, large prompt, large new_content — must be SAFE.
+    {
+        auto d = decide_v2(make_features(true, 200000, 50000), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T8b: disabled, agentic, huge prompt -> SAFE");
+        TEST_ASSERT_MSG(d.cascade, "T8b: disabled -> cascade=true");
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T8b: disabled -> recency keep-all");
+    }
+}
+
+// ─── T9: SPARSE-PROMPT GUARD (failure-class fix) ─────────────────────────────
+// is_agentic=true, prompt above threshold, BUT new_content < sparse threshold.
+// This is the LONG_A-t11/LONG_B-t10 plumbing class: a tiny tool_result riding
+// on long history. Compression must NOT throttle here (would drop continuity).
+
+static void t9_sparse_prompt_guard() {
+    RouterPolicyV2 p = enabled_v2_policy();
+
+    // Canonical failure case: 3-word tool_result on 43K history.
+    {
+        auto d = decide_v2(make_features(true, 43000, 8), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T9a: sparse agentic turn must be SAFE (full keep), not throttled");
+        TEST_ASSERT_MSG(d.cascade,
+                        "T9a: sparse_prompt_guard must cascade=true (SAFE)");
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T9a: sparse_prompt_guard -> recency keep-all");
+        TEST_ASSERT_MSG(std::string(d.reason) == "sparse_prompt_guard",
+                        "T9a: reason must be 'sparse_prompt_guard'");
+    }
+    // new_content = sparse_new_content_tokens - 1 (just below the guard).
+    {
+        auto d = decide_v2(make_features(true, 50000, p.sparse_new_content_tokens - 1), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T9b: new_content just below sparse threshold -> SAFE");
+        TEST_ASSERT_MSG(std::string(d.reason) == "sparse_prompt_guard",
+                        "T9b: reason must be 'sparse_prompt_guard'");
+    }
+    // new_content = 0 is not degenerate (the degenerate check is new_content < 0);
+    // since 0 < sparse_new_content_tokens (256), the sparse guard is what fires.
+    {
+        auto d = decide_v2(make_features(true, 40000, 0), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T9c: new_content=0 -> SAFE (sparse guard or degenerate, both SAFE)");
+    }
+    // Confirm: new_content = sparse_new_content_tokens (AT the boundary → NOT sparse).
+    // is_agentic=true above threshold with enough new content → throttle kicks in.
+    {
+        auto d = decide_v2(make_features(true, 50000, p.sparse_new_content_tokens), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.agentic_keep_target),
+                        "T9d: new_content==sparse threshold -> agentic throttle applies");
+        TEST_ASSERT_MSG(!d.cascade,
+                        "T9d: agentic throttle -> cascade=false");
+    }
+}
+
+// ─── T10: AGENTIC-THROTTLE ───────────────────────────────────────────────────
+// enabled, is_agentic=true, prompt > threshold, new_content > sparse threshold
+// → keep_target=agentic_keep_target, cascade=false, recency >= 1.
+
+static void t10_agentic_throttle() {
+    RouterPolicyV2 p = enabled_v2_policy();
+
+    {
+        auto d = decide_v2(make_features(true, 40000, 3000), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.agentic_keep_target),
+                        "T10a: agentic throttle -> keep_target=agentic_keep_target");
+        TEST_ASSERT_MSG(!d.cascade,
+                        "T10a: agentic throttle -> cascade=false");
+        TEST_ASSERT_MSG(d.recency_floor_turns == p.recency_floor_turns,
+                        "T10a: agentic throttle -> recency matches policy");
+        TEST_ASSERT_MSG(d.recency_floor_turns >= 1,
+                        "T10a: recency_floor_turns must be >= 1 (continuity guaranteed)");
+        TEST_ASSERT_MSG(std::string(d.reason) == "agentic_throttle",
+                        "T10a: reason must be 'agentic_throttle'");
+    }
+    // Custom policy: verify fields propagate.
+    {
+        RouterPolicyV2 p2  = p;
+        p2.agentic_keep_target  = 0.30;
+        p2.recency_floor_turns  = 5;
+        auto d = decide_v2(make_features(true, 60000, 1000), p2);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, 0.30),
+                        "T10b: custom agentic_keep_target propagated");
+        TEST_ASSERT_MSG(d.recency_floor_turns == 5,
+                        "T10b: custom recency_floor_turns propagated");
+    }
+}
+
+// ─── T11: RETRIEVAL-FULL ─────────────────────────────────────────────────────
+// enabled, is_agentic=false, prompt > threshold, new_content > sparse threshold
+// → cascade=true, keep_target=full_keep_target.
+
+static void t11_retrieval_full() {
+    RouterPolicyV2 p = enabled_v2_policy();
+
+    {
+        auto d = decide_v2(make_features(false, 40000, 3000), p);
+        TEST_ASSERT_MSG(d.cascade,
+                        "T11a: retrieval -> cascade=true");
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T11a: retrieval -> keep_target=full_keep_target");
+        TEST_ASSERT_MSG(std::string(d.reason) == "retrieval_full",
+                        "T11a: reason must be 'retrieval_full'");
+    }
+    // Custom full_keep_target.
+    {
+        RouterPolicyV2 p2       = p;
+        p2.full_keep_target     = 0.80;
+        auto d = decide_v2(make_features(false, 50000, 5000), p2);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, 0.80),
+                        "T11b: custom full_keep_target propagated");
+        TEST_ASSERT_MSG(d.cascade, "T11b: retrieval -> cascade=true");
+    }
+}
+
+// ─── T12: BELOW-THRESHOLD (v2) ───────────────────────────────────────────────
+// prompt_tokens < threshold_tokens → SAFE regardless of is_agentic and new_content.
+
+static void t12_v2_below_threshold() {
+    RouterPolicyV2 p = enabled_v2_policy();
+
+    // Agentic, just below threshold, plenty of new content.
+    {
+        auto d = decide_v2(make_features(true, p.threshold_tokens - 1, 5000), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T12a: agentic, below threshold -> SAFE");
+        TEST_ASSERT_MSG(d.cascade,
+                        "T12a: below threshold -> cascade=true");
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T12a: below threshold -> recency keep-all");
+        TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold",
+                        "T12a: reason must be 'below_threshold'");
+    }
+    // Non-agentic, at threshold boundary - 1.
+    {
+        auto d = decide_v2(make_features(false, p.threshold_tokens - 1, 5000), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T12b: non-agentic, below threshold -> SAFE");
+    }
+    // Custom threshold.
+    {
+        RouterPolicyV2 p2     = p;
+        p2.threshold_tokens   = 10000;
+        auto d = decide_v2(make_features(true, 9999, 2000), p2);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p2.full_keep_target),
+                        "T12c: custom threshold, below it -> SAFE");
+        TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold",
+                        "T12c: reason must be 'below_threshold'");
+    }
+}
+
+// ─── T13: RECENCY-FLOOR INVARIANT ────────────────────────────────────────────
+// In every throttling decision (non-SAFE), recency_floor_turns >= 1.
+// In every SAFE decision, recency_floor_turns >= kRecencyKeepAll.
+
+static void t13_recency_floor_invariant() {
+    RouterPolicyV2 p = enabled_v2_policy();
+
+    // Throttle path (agentic): recency >= 1.
+    {
+        auto d = decide_v2(make_features(true, 50000, 1000), p);
+        TEST_ASSERT_MSG(!approx_eq(d.keep_target, p.full_keep_target) ||
+                         d.recency_floor_turns >= 1,
+                        "T13a: throttled decision must have recency >= 1");
+        TEST_ASSERT_MSG(d.recency_floor_turns >= 1,
+                        "T13a: agentic throttle recency_floor_turns >= 1 (continuity)");
+    }
+    // SAFE paths: recency must be keep-all.
+    // disabled
+    {
+        RouterPolicyV2 pd; pd.enabled = false;
+        auto d = decide_v2(make_features(true, 50000, 1000), pd);
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T13b: disabled SAFE recency must be keep-all");
+    }
+    // sparse_prompt_guard
+    {
+        auto d = decide_v2(make_features(true, 50000, 5), p);
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T13c: sparse_prompt_guard SAFE recency must be keep-all");
+    }
+    // below_threshold
+    {
+        auto d = decide_v2(make_features(true, 1000, 500), p);
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T13d: below_threshold SAFE recency must be keep-all");
+    }
+    // retrieval_full path: recency = policy value (not keep-all, it's a throttle-adjacent path)
+    {
+        auto d = decide_v2(make_features(false, 50000, 1000), p);
+        TEST_ASSERT_MSG(d.recency_floor_turns >= 1,
+                        "T13e: retrieval_full recency >= 1");
+    }
+    // Custom recency_floor_turns: verify agentic propagates it.
+    for (int k = 1; k <= 10; ++k) {
+        RouterPolicyV2 pk          = p;
+        pk.recency_floor_turns     = k;
+        auto d = decide_v2(make_features(true, 50000, 1000), pk);
+        TEST_ASSERT_MSG(d.recency_floor_turns == k,
+                        "T13f: agentic throttle recency must equal policy recency_floor_turns");
+    }
+}
+
+// ─── T14: DEGENERATE (v2) ────────────────────────────────────────────────────
+// prompt_tokens <= 0 or new_content_tokens < 0 → SAFE (no crash, no garbage).
+
+static void t14_v2_degenerate() {
+    RouterPolicyV2 p = enabled_v2_policy();
+
+    // prompt_tokens = 0
+    {
+        auto d = decide_v2(make_features(true, 0, 500), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T14a: prompt_tokens=0 -> SAFE");
+        TEST_ASSERT_MSG(d.cascade, "T14a: degenerate -> cascade=true");
+        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
+                        "T14a: degenerate -> recency keep-all");
+        TEST_ASSERT_MSG(std::string(d.reason) == "degenerate",
+                        "T14a: reason must be 'degenerate'");
+    }
+    // prompt_tokens < 0
+    {
+        auto d = decide_v2(make_features(false, -1, 100), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T14b: negative prompt_tokens -> SAFE");
+        TEST_ASSERT_MSG(std::string(d.reason) == "degenerate",
+                        "T14b: reason must be 'degenerate'");
+    }
+    // new_content_tokens < 0
+    {
+        auto d = decide_v2(make_features(true, 50000, -1), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T14c: negative new_content_tokens -> SAFE");
+        TEST_ASSERT_MSG(std::string(d.reason) == "degenerate",
+                        "T14c: reason must be 'degenerate'");
+    }
+    // Both degenerate
+    {
+        auto d = decide_v2(make_features(true, -5, -10), p);
+        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
+                        "T14d: both degenerate -> SAFE");
+    }
+}
+
+// ─── T15: RECENCY_FLOOR_FOR — off ────────────────────────────────────────────
+// recency_floor_tokens == 0 → always 0 regardless of prompt size.
+
+static void t15_recency_floor_off() {
+    // 0 → off
+    TEST_ASSERT_MSG(recency_floor_for(0,      0) == 0, "T15a: S=0 R=0 -> 0");
+    TEST_ASSERT_MSG(recency_floor_for(1000,   0) == 0, "T15b: S=1000 R=0 -> 0");
+    TEST_ASSERT_MSG(recency_floor_for(100000, 0) == 0, "T15c: S=100K R=0 -> 0");
+    // Negative R (shouldn't happen but must be safe)
+    TEST_ASSERT_MSG(recency_floor_for(10000, -2) == 0, "T15d: negative R (not sentinel) -> 0");
+}
+
+// ─── T16: RECENCY_FLOOR_FOR — auto ───────────────────────────────────────────
+// kRecencyFloorAuto (-1) → min(1024, ceil(0.04 * S)).
+
+static void t16_recency_floor_auto() {
+    const int A = kRecencyFloorAuto;
+
+    // S=0: ceil(0.04*0)=0
+    TEST_ASSERT_MSG(recency_floor_for(0, A) == 0,   "T16a: S=0 auto -> 0");
+    // S=1000: ceil(0.04*1000)=40
+    TEST_ASSERT_MSG(recency_floor_for(1000,  A) == 40,  "T16b: S=1000 auto -> 40");
+    // S=10000: ceil(0.04*10000)=400
+    TEST_ASSERT_MSG(recency_floor_for(10000, A) == 400, "T16c: S=10K auto -> 400");
+    // S=25000: ceil(0.04*25000)=1000
+    TEST_ASSERT_MSG(recency_floor_for(25000, A) == 1000, "T16d: S=25K auto -> 1000");
+    // S=25001: 0.04*25001=1000.04 → ceil=1001, which is below the 1024 cap,
+    // so the cap does not apply → 1001
+    TEST_ASSERT_MSG(recency_floor_for(25001, A) == 1001, "T16e: S=25001 auto -> 1001");
+    // S=25600: 0.04*25600=1024.0 → ceil=1024
+    TEST_ASSERT_MSG(recency_floor_for(25600, A) == 1024, "T16f: S=25600 auto -> 1024");
+    // S=26000: 0.04*26000=1040 → ceil=1040 but capped at 1024
+    TEST_ASSERT_MSG(recency_floor_for(26000, A) == 1024, "T16g: S=26000 auto -> cap 1024");
+    // S=100000: 0.04*100000=4000 → capped at 1024
+    TEST_ASSERT_MSG(recency_floor_for(100000, A) == 1024, "T16h: S=100K auto -> cap 1024");
+    // S=-1: negative prompt treated as 0 → 0
+    TEST_ASSERT_MSG(recency_floor_for(-1, A) == 0,  "T16i: S=-1 auto -> 0");
+}
+
+// ─── T17: RECENCY_FLOOR_FOR — explicit ───────────────────────────────────────
+// Any explicit positive value is returned unchanged (no prompt-size influence).
+
+static void t17_recency_floor_explicit() {
+    // Explicit override ignores prompt size
+    TEST_ASSERT_MSG(recency_floor_for(1000,  512)  == 512,  "T17a: explicit 512");
+    TEST_ASSERT_MSG(recency_floor_for(100000, 512) == 512,  "T17b: explicit 512, large S");
+    TEST_ASSERT_MSG(recency_floor_for(1000, 1024)  == 1024, "T17c: explicit 1024");
+    TEST_ASSERT_MSG(recency_floor_for(1000, 2048)  == 2048, "T17d: explicit 2048 > cap");
+    TEST_ASSERT_MSG(recency_floor_for(0,    256)   == 256,  "T17e: explicit 256, S=0");
+    // Monotone: explicit > auto at short prompts
+    const int A = kRecencyFloorAuto;
+    TEST_ASSERT_MSG(recency_floor_for(1000, 512) > recency_floor_for(1000, A),
+                    "T17f: explicit 512 > auto(1000)=40");
+}
+
+// ─── main ─────────────────────────────────────────────────────────────────────
+
+int main() {
+    std::fprintf(stderr, "=== test_regime_router ===\n");
+
+    RUN_TEST(t1_deploy_noop);
+    RUN_TEST(t2_degenerate);
+    RUN_TEST(t3_below_threshold);
+    RUN_TEST(t4_too_few_anchors);
+    RUN_TEST(t5_monotone);
+    RUN_TEST(t6_boundary);
+    RUN_TEST(t7_ratio_value);
+
+    std::fprintf(stderr, "--- v2 ---\n");
+    RUN_TEST(t8_v2_deploy_noop);
+    RUN_TEST(t9_sparse_prompt_guard);
+    RUN_TEST(t10_agentic_throttle);
+    RUN_TEST(t11_retrieval_full);
+    RUN_TEST(t12_v2_below_threshold);
+    RUN_TEST(t13_recency_floor_invariant);
+    RUN_TEST(t14_v2_degenerate);
+
+    std::fprintf(stderr, "--- recency_floor_for ---\n");
+    RUN_TEST(t15_recency_floor_off);
+    RUN_TEST(t16_recency_floor_auto);
+    RUN_TEST(t17_recency_floor_explicit);
+
+    std::fprintf(stderr, "\n%d tests, %d failures\n", test_count, test_failures);
+    return (test_failures == 0) ? 0 : 1;
+}

From b31544f0e160c2879a5ffd89dcd3989fcc6f1a87 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Sat, 30 May 2026 22:12:22 +0200
Subject: [PATCH 15/16] feat(pflash): wire type-gate router into live handler;
 prune disproven mechanisms

Wires the compression-regime router into the request path (default-OFF via
PFLASH_ROUTER_ENABLE, so this commit is a no-op until enabled) and prunes the
router down to only what this session validated. Net -433 lines.

Live path (when enabled):
- detect agentic vs retrieval at admission (tool schemas / tool_use|tool_result
  blocks / tool_calls present). JSON-walking lives at the handler boundary
  (http_server.cpp); the router header stays pure (stdlib-only).
- agentic -> cascade off, keep_target 0.25 (~3x prefill/wall, content permitting)
- retrieval -> cascade on, full keep
- per-request use_transitive threaded through the compress path.

Pruned (mechanisms disproven this session, see bench/2026-05-30_*):
- v1 cascade-expansion router (R): refuted as keep predictor, Spearman rho=-0.27.
- recency-token floor: validated zero-sum at fixed keep (displaces top-K middle
  context, breaks controls). Removed from header + drafter.
- decide_v2 sparse-prompt guard (new_content_tokens): does not separate
  fails-from-passes under compression.

regime_router.h: 271 -> 96 LOC, zero non-stdlib includes, standalone -Werror,
43 unit tests (decide_v2 type-gate + detect_request_type truth-table).

Known follow-ups (NOT in this commit): (1) hard empty/degenerate safety guard
(result.tokens.size()/degenerate_decode_close already at the update site) to
catch empty responses; (2) reconcile bandit keep range [0.025,0.20] vs router
floor 0.25 (bandit currently wins per-session); (3) confident-off-task closure
needs client/app feedback, not a label-free reward (Momus-confirmed).
---
 server/src/common/model_backend.h    |   4 +
 server/src/common/regime_router.h    | 195 +++-----
 server/src/qwen3/qwen3_backend.cpp   |   4 +-
 server/src/qwen3/qwen3_drafter.cpp   |  84 ++--
 server/src/qwen3/qwen3_drafter.h     |  24 +-
 server/src/qwen35/qwen35_backend.cpp |   4 +-
 server/src/server/http_server.cpp    |  72 ++-
 server/src/server/http_server.h      |   6 +
 server/src/server/server_main.cpp    |  16 +
 server/test/test_regime_router.cpp   | 644 +++++----------------------
 10 files changed, 310 insertions(+), 743 deletions(-)

diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h
index f2a863418..0d8a85d5b 100644
--- a/server/src/common/model_backend.h
+++ b/server/src/common/model_backend.h
@@ -177,6 +177,10 @@ struct ModelBackend {
         std::string          drafter_path;    // GGUF path (for lazy-load)
         int                  drafter_gpu = 0;  // backend-local GPU for PFlash drafter
         bool                 skip_park = false; // true on >=32GB GPUs
+        // Per-request transitive-cascade override (-1 = use env default).
+        // 0 = off (agentic path: suppress cascade to avoid anchor bloat).
+        // 1 = on  (retrieval path: full expansion, same as today).
+        int                  use_transitive = -1;
     };
 
     struct CompressResult {
diff --git a/server/src/common/regime_router.h b/server/src/common/regime_router.h
index 7c9917eb7..426c0c985 100644
--- a/server/src/common/regime_router.h
+++ b/server/src/common/regime_router.h
@@ -1,12 +1,13 @@
-// Pure, correct-by-construction adaptive compression-regime router.
-// No IO, no globals, no GPU, no ggml/llama deps — header-only.
+// Adaptive compression-regime router v2.
+// No IO, no globals, no GPU, no ggml/llama deps — header-only, stdlib-only.
 //
-// Decides whether the transitive anchor cascade should run at full expansion
-// (FullCascade, recall-preserving default) or be throttled
-// (Throttle, fires ONLY when expansion_ratio >= policy threshold).
+// Splits on prompt TYPE (agentic vs retrieval).
+// V1 R-router (cascade expansion ratio) was refuted as a keep predictor (ρ=-0.27).
+// Recency floor (validated zero-sum) and sparse-prompt guard (did not separate fails from passes) were removed.
 //
 // Build (standalone):
-//   g++-11 -std=gnu++17 -O2 -I. -o test_regime_router test/test_regime_router.cpp
+//   g++-11 -std=gnu++17 -O2 -I server/src/common
+//     -o /tmp/test_regime_router server/test/test_regime_router.cpp
 // CMake:  cmake --build build --target test_regime_router -j
 //         ctest -R regime_router --output-on-failure
 #pragma once
@@ -16,176 +17,80 @@
 
 namespace dflash::common {
 
-// ─── Input ───────────────────────────────────────────────────────────────────
-
-// All inputs are cheap lexical counts already available in the cascade path.
-struct CascadeStats {
-    int n_chunks;
-    int forced_anchor_only;    // chunks forced by BASE anchors, pre-cascade
-    int forced_after_cascade;  // chunks forced AFTER transitive cascade
-    int prompt_tokens;         // S
-    int keep_floor_chunks;     // ceil(keep_ratio * n_chunks) budget (informational)
-};
-
-// ─── Policy ──────────────────────────────────────────────────────────────────
-
-struct RouterPolicy {
-    int    threshold_tokens         = 32000;           // below this: passthrough
-    double expansion_throttle_ratio = INFINITY;        // DEFAULT disabled
-    int    min_anchor_chunks        = 1;               // don't throttle if too few anchors
-};
-
-// ─── Output ──────────────────────────────────────────────────────────────────
-
-enum class Regime { FullCascade, Throttle };
-
-struct RegimeDecision {
-    Regime      regime;
-    double      expansion_ratio;
-    const char* reason;
-};
-
-// ─── Core function ───────────────────────────────────────────────────────────
-
-// decide_regime — pure, no IO, no globals.
-//
-// Expansion ratio R = forced_after_cascade / forced_anchor_only
-//   (defined as 1.0 when forced_anchor_only == 0 to avoid division by zero).
-//
-// Transition to Throttle ONLY on the last branch; every other path returns
-// FullCascade so the default deployment posture is recall-preserving.
-inline RegimeDecision decide_regime(const CascadeStats& s, const RouterPolicy& p) {
-    // Compute R first (needed for degenerate guard + return value).
-    const double R = (s.forced_anchor_only > 0)
-        ? static_cast<double>(s.forced_after_cascade) / s.forced_anchor_only
-        : 1.0;
-
-    // Guard: degenerate inputs — return FullCascade, no further processing.
-    if (s.n_chunks <= 0 || s.forced_anchor_only < 0 || s.forced_after_cascade < 0)
-        return { Regime::FullCascade, R, "degenerate" };
-
-    // Passthrough: prompt too short to compress meaningfully.
-    if (s.prompt_tokens < p.threshold_tokens)
-        return { Regime::FullCascade, R, "below_threshold" };
-
-    // Guard: too few base anchors — throttle would be meaningless.
-    if (s.forced_anchor_only < p.min_anchor_chunks)
-        return { Regime::FullCascade, R, "too_few_anchors" };
-
-    // Only transition: cascade expanded beyond the policy limit.
-    if (R >= p.expansion_throttle_ratio)
-        return { Regime::Throttle, R, "cascade_over_expansion" };
-
-    return { Regime::FullCascade, R, "default_safe" };
-}
-
 // ─── V2 Router ───────────────────────────────────────────────────────────────
-//
-// Adaptive compression router v2.
-// Splits on prompt TYPE (agentic vs retrieval) rather than cascade expansion
-// ratio R (which was refuted as a keep predictor, Spearman ρ=-0.27).
-//
-// Additional guards:
-//   sparse_prompt_guard — skip compression when new_content_tokens is tiny
-//     (plumbing turns: recent orchestration continuity must not be dropped)
-//   recency_floor_turns — always keep the last K turns whole in the agentic path
-//
-// Sentinel for "keep all turns" recency in SAFE decisions:
-static constexpr int kRecencyKeepAll = (1 << 20);
-
-// Size-adaptive recency floor sentinel.
-// When recency_floor_tokens == kRecencyFloorAuto the compress path computes
-//   R = min(1024, ceil(0.04 * prompt_tokens))
-// instead of using a fixed token count.  0 = off (no-op default).
-static constexpr int kRecencyFloorAuto = -1;
 
 struct RequestFeatures {
-    bool is_agentic;           // tool schemas / tool_use|tool_result blocks present
-    int  prompt_tokens;        // total S
-    int  new_content_tokens;   // newest turn content size (sparse-plumbing detector)
+    bool is_agentic;    // tool schemas / tool_use|tool_result blocks present
+    int  prompt_tokens; // total S
 };
 
 struct RouterPolicyV2 {
-    bool   enabled                    = false;   // DEFAULT DISABLED → exact no-op
-    int    threshold_tokens           = 32000;   // below → passthrough
-    double agentic_keep_target        = 0.25;    // conservative floor, closes empty-failure tail
-    double full_keep_target           = 1.0;     // retrieval/QA & safe fallbacks
-    int    recency_floor_turns        = 2;       // keep last K turns whole (continuity)
-    int    sparse_new_content_tokens  = 256;     // below this → sparse_prompt_guard fires
+    bool   enabled             = false;  // DEFAULT DISABLED → exact no-op
+    int    threshold_tokens    = 32000;  // below → passthrough
+    double agentic_keep_target = 0.25;   // conservative floor, agentic path
+    double full_keep_target    = 1.0;    // retrieval/QA & safe fallbacks
 };
 
-// recency_floor_for — pure helper, no IO.
-//
-// Returns the concrete token floor for a given prompt size and policy:
-//   recency_floor_tokens == 0           → 0  (off, no-op)
-//   recency_floor_tokens == kRecencyFloorAuto (-1)
-//                                       → min(1024, ceil(0.04 * prompt_tokens))
-//   recency_floor_tokens  > 0           → recency_floor_tokens  (explicit override)
-//
-// "one turn equivalent" lower-bound: the agentic throttle path in decide_v2
-// already reserves recency_floor_turns whole turns; this helper computes the
-// token-count floor passed to the compress path for the token-budget guard.
-inline int recency_floor_for(int prompt_tokens, int recency_floor_tokens) {
-    if (recency_floor_tokens == 0)
-        return 0;
-    if (recency_floor_tokens == kRecencyFloorAuto) {
-        // min(1024, ceil(0.04 * S)) — scales with context, caps at 1024
-        const int adaptive = static_cast<int>(
-            std::ceil(0.04 * static_cast<double>(prompt_tokens < 0 ? 0 : prompt_tokens)));
-        return (adaptive < 1024) ? adaptive : 1024;
-    }
-    // Explicit positive override.
-    return (recency_floor_tokens > 0) ? recency_floor_tokens : 0;
-}
-
 struct RouterDecisionV2 {
     double      keep_target;
-    int         recency_floor_turns;
     bool        cascade;
     const char* reason;
 };
 
 // decide_v2 — pure, no IO, no globals.
 //
-// SAFE path: keep_target=full_keep_target, recency=kRecencyKeepAll, cascade=true.
+// SAFE path: keep_target=full_keep_target, cascade=true.
 // Returns SAFE when:
-//   - p.enabled == false                        (deploy no-op, correct-by-construction)
-//   - f.prompt_tokens <= 0 || f.new_content_tokens < 0  (degenerate)
-//   - f.prompt_tokens < p.threshold_tokens      (below threshold)
-//   - f.new_content_tokens < p.sparse_new_content_tokens (sparse_prompt_guard)
+//   - p.enabled == false                    (deploy no-op, correct-by-construction)
+//   - f.prompt_tokens <= 0                  (degenerate)
+//   - f.prompt_tokens < p.threshold_tokens  (below threshold)
 // Throttling path (only when all guards pass):
-//   - is_agentic → {agentic_keep_target, recency_floor_turns, cascade=false}
-//   - else       → {full_keep_target,    recency_floor_turns, cascade=true}
+//   - is_agentic → {agentic_keep_target, cascade=false, "agentic_throttle"}
+//   - else       → {full_keep_target,    cascade=true,  "retrieval_full"}
 inline RouterDecisionV2 decide_v2(const RequestFeatures& f,
                                    const RouterPolicyV2&   p) {
-    // Helper: SAFE return (keep everything, cascade on, recency = keep-all).
-    const RouterDecisionV2 SAFE_disabled        = { p.full_keep_target, kRecencyKeepAll, true, "disabled_noop"       };
-    const RouterDecisionV2 SAFE_degenerate      = { p.full_keep_target, kRecencyKeepAll, true, "degenerate"          };
-    const RouterDecisionV2 SAFE_below_threshold = { p.full_keep_target, kRecencyKeepAll, true, "below_threshold"     };
-    const RouterDecisionV2 SAFE_sparse          = { p.full_keep_target, kRecencyKeepAll, true, "sparse_prompt_guard" };
+    const RouterDecisionV2 SAFE_disabled        = { p.full_keep_target, true, "disabled_noop"   };
+    const RouterDecisionV2 SAFE_degenerate      = { p.full_keep_target, true, "degenerate"      };
+    const RouterDecisionV2 SAFE_below_threshold = { p.full_keep_target, true, "below_threshold" };
 
-    // 1. Deploy no-op: disabled router is an exact no-op (correct-by-construction).
     if (!p.enabled)
         return SAFE_disabled;
 
-    // 2. Degenerate inputs: prompt_tokens <= 0 or new_content_tokens < 0.
-    if (f.prompt_tokens <= 0 || f.new_content_tokens < 0)
+    if (f.prompt_tokens <= 0)
         return SAFE_degenerate;
 
-    // 3. Below threshold: prompt too short to compress meaningfully.
     if (f.prompt_tokens < p.threshold_tokens)
         return SAFE_below_threshold;
 
-    // 4. Sparse-prompt guard: tiny new-content turn (plumbing class).
-    //    Compression would drop recent orchestration continuity with no anchor signal.
-    if (f.new_content_tokens < p.sparse_new_content_tokens)
-        return SAFE_sparse;
-
-    // 5. Throttling paths — all guards passed.
     if (f.is_agentic)
-        return { p.agentic_keep_target, p.recency_floor_turns, false, "agentic_throttle" };
+        return { p.agentic_keep_target, false, "agentic_throttle" };
+
+    return { p.full_keep_target, true, "retrieval_full" };
+}
 
-    return { p.full_keep_target, p.recency_floor_turns, true, "retrieval_full" };
+// ─── TYPE GATE ───────────────────────────────────────────────────────────────
+//
+// Coarse request-type classifier.  Pure function — no IO, no globals, no JSON.
+//
+// Agentic signals (any one is sufficient):
+//   1. has_tools          — tools array was non-null and non-empty
+//   2. has_tool_use_blocks — any message content contained a tool_use or
+//                           tool_result block  (Anthropic style)
+//   3. has_tool_calls     — any assistant message had a non-empty tool_calls
+//                           array  (OpenAI style)
+//
+// The caller is responsible for extracting these bools from the wire format.
+// Default: Retrieval (safe — never compresses more than intended).
+
+enum class RequestType { Agentic, Retrieval };
+
+// detect_request_type — pure, stdlib-only, no IO.
+inline RequestType detect_request_type(bool has_tools,
+                                        bool has_tool_use_blocks,
+                                        bool has_tool_calls) {
+    if (has_tools || has_tool_use_blocks || has_tool_calls)
+        return RequestType::Agentic;
+    return RequestType::Retrieval;
 }
 
 }  // namespace dflash::common
diff --git a/server/src/qwen3/qwen3_backend.cpp b/server/src/qwen3/qwen3_backend.cpp
index e2adc7f65..bc0e9178b 100644
--- a/server/src/qwen3/qwen3_backend.cpp
+++ b/server/src/qwen3/qwen3_backend.cpp
@@ -952,7 +952,9 @@ ModelBackend::CompressResult Qwen3Backend::compress(const CompressRequest & req)
     }
 
     result.compressed_ids = drafter_score_and_compress(
-        drafter_ctx_, req.input_ids, req.keep_ratio);
+        drafter_ctx_, req.input_ids, req.keep_ratio,
+        /*chunk_size=*/32, /*n_lookahead=*/8, /*pool_kernel=*/13,
+        req.use_transitive);
     result.ok = true;
 
     if (!req.skip_park && !was_parked) unpark("target");
diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
index 833247b70..852fc96e1 100644
--- a/server/src/qwen3/qwen3_drafter.cpp
+++ b/server/src/qwen3/qwen3_drafter.cpp
@@ -18,7 +18,6 @@
 #include "common/backend_precision.h"
 #include "internal.h"
 #include "anchor_scan.h"
-#include "regime_router.h"
 
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -81,13 +80,13 @@ struct CompressCfg {
     int   query_tokens;
     int   head_chunks;
     int   tail_chunks;
-    int   recency_floor_tokens; // PFLASH_RECENCY_FLOOR_TOKENS: force-keep last N tokens (0 = off)
     dflash::qwen3::AnchorScanCfg anchor;
     bool  use_transitive;
     int   max_iters;
 };
 
-static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) {
+static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep,
+                                          int use_transitive_override = -1) {
     CompressCfg c{};
 
     c.query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96);
@@ -162,6 +161,10 @@ static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) {
     c.anchor.max_forced_count         = (int)(max_forced_ratio * n_keep);
 
     c.use_transitive = [&]{
+        // Per-request override (0=off, 1=on) from router decision takes precedence.
+        if (use_transitive_override == 0) return false;
+        if (use_transitive_override == 1) return true;
+        // Any other value (canonically -1): read from env, same behaviour as before.
         const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
         const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1);
         if (nv >= 0) return nv != 0;
@@ -177,23 +180,6 @@ static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep) {
         return 3;
     }();
 
-    // Recency floor: unconditionally force-keep the last R tokens of the prompt
-    // body before anchor scoring.  DEFAULT 0 = no-op (unchanged behavior).
-    //   0  = off
-    //  -1  = auto: min(1024, ceil(0.04 * S))  [resolved at compress time when S is known]
-    //  >0  = explicit token count
-    // Note: env_int() rejects negatives, so read raw and parse to preserve -1.
-    // Rescues recent wiring-sequence turns dropped when anchors seed from a
-    // short/sparse tail (e.g. bare [tool_result] turns).
-    {
-        const char * rfv = std::getenv("PFLASH_RECENCY_FLOOR_TOKENS");
-        if (rfv) {
-            c.recency_floor_tokens = std::atoi(rfv);  // preserves -1 sentinel
-        } else {
-            c.recency_floor_tokens = 0;
-        }
-    }
-
     return c;
 }
 
@@ -381,7 +367,8 @@ static std::vector<int32_t> qwen35_score_and_compress(
     float keep_ratio,
     int chunk_size,
     int n_lookahead,
-    int pool_kernel) {
+    int pool_kernel,
+    int use_transitive_override = -1) {
 
     const int S = (int)ids.size();
     const int hidden = w.n_embd;
@@ -658,7 +645,7 @@ static std::vector<int32_t> qwen35_score_and_compress(
     }
     std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; });
 
-    const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep);
+    const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep, use_transitive_override);
 
     std::vector<uint8_t> selected((size_t)n_chunks, 0);
     int count = 0;
@@ -669,19 +656,6 @@ static std::vector<int32_t> qwen35_score_and_compress(
     std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
 
-    // Recency floor: force-keep the last R tokens worth of chunks before anchor
-    // scoring so that recent wiring-sequence turns are never dropped regardless
-    // of anchor seed quality.  R=0 is a no-op (default).  R=-1 = auto.
-    {
-        const int R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens);
-        if (R > 0) {
-            const int floor_tok = std::min(S, R);
-            const int floor_start_tok = S - floor_tok;
-            const int floor_start_chunk = floor_start_tok / chunk_size;
-            for (int c = floor_start_chunk; c < n_chunks; ++c) forced[(size_t)c] = 1;
-        }
-    }
-
     dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor;
     anchor_cfg.chunk_size = chunk_size;
 
@@ -775,7 +749,8 @@ std::vector<int32_t> drafter_score_and_compress(
     float keep_ratio,
     int chunk_size,
     int n_lookahead,
-    int pool_kernel) {
+    int pool_kernel,
+    int use_transitive_override) {
     if (!ctx.loaded) {
         set_last_error("drafter not loaded");
         return {};
@@ -786,7 +761,7 @@ std::vector<int32_t> drafter_score_and_compress(
             return {};
         }
         auto * st = static_cast<Qwen35DrafterState *>(ctx.arch_state);
-        return qwen35_score_and_compress(st->weights, ids, keep_ratio, chunk_size, n_lookahead, pool_kernel);
+        return qwen35_score_and_compress(st->weights, ids, keep_ratio, chunk_size, n_lookahead, pool_kernel, use_transitive_override);
     }
     const int S = (int)ids.size();
     if (S < n_lookahead + 1) {
@@ -843,35 +818,20 @@ std::vector<int32_t> drafter_score_and_compress(
     std::sort(chunk_means.begin(), chunk_means.end(),
                       [](auto a, auto b) { return a.first > b.first; });
 
-    const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep);
+    const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep, use_transitive_override);
 
     std::vector<uint8_t> selected_mask((size_t)n_chunks, 0);
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
     for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) forced[(size_t)c] = 1;
     for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1;
 
-    // Recency floor: force-keep the last R tokens worth of chunks before anchor
-    // scoring so that recent wiring-sequence turns are never dropped regardless
-    // of anchor seed quality.  R=0 is a no-op (default).  R=-1 = auto.
-    {
-        const int R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens);
-        if (R > 0) {
-            const int floor_tok = std::min(S, R);
-            const int floor_start_tok = S - floor_tok;
-            const int floor_start_chunk = floor_start_tok / chunk_size;
-            for (int c = floor_start_chunk; c < n_chunks; ++c) forced[(size_t)c] = 1;
-        }
-    }
-
     const int q0 = std::max(0, S - cfg.query_tokens);
     {
-        const int resolved_R = dflash::common::recency_floor_for(S, cfg.recency_floor_tokens);
         std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
         dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor;
         anchor_cfg.chunk_size = chunk_size;
-        std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d recency_floor=%d (resolved=%d)\n",
-            n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count,
-            cfg.recency_floor_tokens, resolved_R);
+        std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d\n",
+            n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count);
         std::fflush(stderr);
 
         if (cfg.use_transitive) {
@@ -935,4 +895,18 @@ std::vector<int32_t> drafter_score_and_compress(
     return out;
 }
 
+// Legacy 6-argument entry point, kept so callers built against the old
+// signature still resolve; forwards with the env-driven cascade setting.
+std::vector<int32_t> drafter_score_and_compress(
+    DrafterContext & ctx,
+    const std::vector<int32_t> & ids,
+    float keep_ratio,
+    int chunk_size,
+    int n_lookahead,
+    int pool_kernel) {
+    constexpr int kTransitiveFromEnv = -1;  // -1: take the cascade on/off choice from env
+    return drafter_score_and_compress(ctx, ids, keep_ratio, chunk_size,
+                                      n_lookahead, pool_kernel, kTransitiveFromEnv);
+}
+
 } // namespace dflash::common
diff --git a/server/src/qwen3/qwen3_drafter.h b/server/src/qwen3/qwen3_drafter.h
index e5424f9dd..08aed3e9b 100644
--- a/server/src/qwen3/qwen3_drafter.h
+++ b/server/src/qwen3/qwen3_drafter.h
@@ -66,13 +66,27 @@ void free_drafter_weights(DrafterContext & ctx);
 // Score importance per token via Liu Q-hook tail attention, then chunk-top-K
 // span merge. Returns surviving token IDs (drafter vocab).
 //
-//   ids          input token IDs of length S
-//   keep_ratio   fraction of `chunk_size`-token chunks to keep
-//   chunk_size   span granularity (default 32)
-//   n_lookahead  trailing Q tokens used for tail attention (default 8)
-//   pool_kernel  AvgPool kernel for score smoothing (default 13)
+//   ids                    input token IDs of length S
+//   keep_ratio             fraction of `chunk_size`-token chunks to keep
+//   chunk_size             span granularity (default 32)
+//   n_lookahead            trailing Q tokens used for tail attention (default 8)
+//   pool_kernel            AvgPool kernel for score smoothing (default 13)
+//   use_transitive_override  -1 = read from env (what the 6-arg overload passes; no behaviour change)
+//                             0 = cascade off (agentic path)
+//                             1 = cascade on  (retrieval path)
 //
 // On failure returns empty vector + sets last_error.
+std::vector<int32_t> drafter_score_and_compress(
+    DrafterContext & ctx,
+    const std::vector<int32_t> & ids,
+    float  keep_ratio,
+    int    chunk_size,
+    int    n_lookahead,
+    int    pool_kernel,
+    int    use_transitive_override);
+
+// Backward-compatible 6-arg overload — ABI-stable wrapper, defined in qwen3_drafter.cpp.
+// Old callers compiled against the 6-arg signature continue to link without recompile.
 std::vector<int32_t> drafter_score_and_compress(
     DrafterContext & ctx,
     const std::vector<int32_t> & ids,
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index 82d0ea6ca..9859cffae 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -396,7 +396,9 @@ ModelBackend::CompressResult Qwen35Backend::compress(const CompressRequest & req
     }
 
     result.compressed_ids = drafter_score_and_compress(
-        drafter_ctx_, req.input_ids, req.keep_ratio);
+        drafter_ctx_, req.input_ids, req.keep_ratio,
+        /*chunk_size=*/32, /*n_lookahead=*/8, /*pool_kernel=*/13,
+        req.use_transitive);
     result.ok = !result.compressed_ids.empty();
     if (result.ok) {
         std::fprintf(stderr, "[compress] %zu -> %zu tokens\n",
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 219bccf6d..f2518ba1e 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -1222,10 +1222,74 @@ void HttpServer::worker_loop() {
                         // 3. Compress via typed API
                         ModelBackend::CompressRequest creq;
                         creq.input_ids = std::move(drafter_ids);
-                        // Bandit: use per-session keep_ratio if session_id provided.
-                        creq.keep_ratio = req.session_id.empty()
-                            ? config_.pflash_keep_ratio
-                            : sessions_.get_keep_ratio(req.session_id);
+
+                        // TYPE-GATE router (default-off via pflash_router.enabled).
+                        // When enabled, detect request type and override keep_ratio +
+                        // cascade per the v2 policy.  When disabled → exact no-op.
+                        {
+                            // Extract agentic-signal bools from the parsed JSON
+                            // (json-walking belongs at the handler boundary, not
+                            //  in the pure router header).
+                            const bool _has_tools =
+                                req.tools.is_array() && !req.tools.empty();
+                            bool _has_tool_use_blocks = false;
+                            bool _has_tool_calls      = false;
+                            if (req.messages.is_array()) {
+                                for (const auto & _msg : req.messages) {
+                                    if (!_msg.is_object()) continue;
+                                    if (_msg.contains("tool_calls")) {
+                                        const auto & _tc = _msg["tool_calls"];
+                                        if (_tc.is_array() && !_tc.empty())
+                                            _has_tool_calls = true;
+                                    }
+                                    if (_msg.contains("content")) {
+                                        const auto & _c = _msg["content"];
+                                        if (_c.is_array()) {
+                                            for (const auto & _b : _c) {
+                                                if (!_b.is_object()) continue;
+                                                const std::string _bt = _b.value("type", "");
+                                                if (_bt == "tool_use" || _bt == "tool_result")
+                                                    _has_tool_use_blocks = true;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                            const bool is_agentic = (detect_request_type(
+                                _has_tools, _has_tool_use_blocks, _has_tool_calls)
+                                    == RequestType::Agentic);
+                            const RequestFeatures rf {
+                                is_agentic,
+                                n_prompt
+                            };
+                            const RouterDecisionV2 rd = decide_v2(rf, config_.pflash_router);
+                            if (config_.pflash_router.enabled) {
+                                // Router is on: apply per-request keep + cascade override.
+                                // Bandit keeps winning if session_id is present — bandit
+                                // is the M2 lever for agentic keep level tuning.
+                                // For M1 the TYPE decision overrides keep_ratio when no
+                                // session bandit is active.
+                                if (req.session_id.empty()) {
+                                    creq.keep_ratio = (float)rd.keep_target;
+                                } else {
+                                    creq.keep_ratio = sessions_.get_keep_ratio(req.session_id);
+                                }
+                                // cascade = use_transitive: 0 = off, 1 = on, -1 = env default
+                                creq.use_transitive = rd.cascade ? 1 : 0;
+                                std::fprintf(stderr,
+                                    "[pflash-router] type=%s keep=%.3f cascade=%s reason=%s\n",
+                                    is_agentic ? "agentic" : "retrieval",
+                                    creq.keep_ratio,
+                                    rd.cascade ? "on" : "off",
+                                    rd.reason);
+                            } else {
+                                // Router disabled: legacy keep_ratio path, no change.
+                                creq.keep_ratio = req.session_id.empty()
+                                    ? config_.pflash_keep_ratio
+                                    : sessions_.get_keep_ratio(req.session_id);
+                                // use_transitive stays at -1 (env default).
+                            }
+                        }
                         creq.drafter_path = config_.pflash_drafter_path;
                         creq.drafter_gpu = config_.pflash_drafter_gpu;
                         creq.skip_park = config_.pflash_skip_park;
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index 803cf74e8..f4fe57316 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -12,6 +12,7 @@
 #pragma once
 
 #include "common/model_backend.h"
+#include "common/regime_router.h"
 #include "tokenizer.h"
 #include "chat_template.h"
 #include "tool_memory.h"
@@ -151,6 +152,11 @@ struct ServerConfig {
     bool        pflash_skip_park = false;   // skip park/unpark for >=32GB GPUs
     bool        lazy_draft      = false;   // park decode draft when idle to save VRAM
 
+    // TYPE-gate compression router (v2).
+    // Default: disabled (exact no-op, correct-by-construction).
+    // Enable at server startup via env var PFLASH_ROUTER_ENABLE (any non-empty value except "0").
+    RouterPolicyV2 pflash_router;          // enabled=false by default
+
     // Disk prefix cache
     std::string disk_cache_dir;             // empty = disabled
     size_t      disk_cache_budget_mb = 4096; // max disk usage in MB
diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp
index 156c84afe..21b8379f3 100644
--- a/server/src/server/server_main.cpp
+++ b/server/src/server/server_main.cpp
@@ -540,6 +540,21 @@ int main(int argc, char ** argv) {
                      sconfig.pflash_threshold, sconfig.pflash_keep_ratio,
                      sconfig.pflash_drafter_gpu,
                      (int)sconfig.pflash_skip_park);
+        // TYPE-gate router: opt-in via env var, default-off.
+        {
+            const char * router_env = std::getenv("PFLASH_ROUTER_ENABLE");
+            if (router_env && *router_env && std::strcmp(router_env, "0") != 0) {
+                sconfig.pflash_router.enabled = true;
+                // Inherit pflash threshold so the router fires at the same
+                // token count as the compression admission gate.
+                sconfig.pflash_router.threshold_tokens = sconfig.pflash_threshold;
+                std::fprintf(stderr,
+                    "[server] pflash-router: ENABLED (type-gate v2) "
+                    "threshold=%d agentic_keep=%.3f\n",
+                    sconfig.pflash_router.threshold_tokens,
+                    sconfig.pflash_router.agentic_keep_target);
+            }
+        }
     }
 
     // Create backend.
@@ -771,6 +786,7 @@ int main(int argc, char ** argv) {
         std::fprintf(stderr, "[server] │  pflash_skip_park= %s\n", sconfig.pflash_skip_park ? "ON" : "off");
         std::fprintf(stderr, "[server] │  fp_use_bsa      = %s\n", getenv("DFLASH_FP_USE_BSA") ? "ON" : "off");
         std::fprintf(stderr, "[server] │  fp_alpha        = %s\n", getenv("DFLASH_FP_ALPHA") ? getenv("DFLASH_FP_ALPHA") : "0.12 (default)");
+        std::fprintf(stderr, "[server] │  pflash_router   = %s\n", sconfig.pflash_router.enabled ? "ON" : "off");
     }
     if (bargs.draft_path) {
         std::fprintf(stderr, "[server] │  lazy_draft      = %s\n", sconfig.lazy_draft ? "ON" : "off");
diff --git a/server/test/test_regime_router.cpp b/server/test/test_regime_router.cpp
index f92a0b512..b45370f99 100644
--- a/server/test/test_regime_router.cpp
+++ b/server/test/test_regime_router.cpp
@@ -1,7 +1,16 @@
-// Unit tests for dflash::common::decide_regime() — pure function, no GPU.
+// Unit tests for the pflash regime router v2 — pure function, no GPU.
+//
+// Tests kept: t8 (deploy-noop), t10 (agentic-throttle), t11 (retrieval-full),
+//             t12 (below-threshold), t14 (degenerate), t18 (detect_request_type).
+//
+// Tests removed:
+//   t1-t7  — v1 R-router (decide_regime), refuted (ρ=-0.27), deleted.
+//   t9     — sparse_prompt_guard, validated zero-sum, deleted.
+//   t13    — recency_floor_invariant, deleted with recency floor feature.
+//   t15-t17 — recency_floor_for, deleted with recency floor feature.
 //
 // Build (standalone, from repo root):
-//   g++-11 -std=gnu++17 -O2 -I server/src/common
+//   g++-11 -std=gnu++17 -O2 -Wall -Wextra -Werror -I server/src/common
 //     -o /tmp/test_regime_router server/test/test_regime_router.cpp
 // CMake:
 //   cmake --build build --target test_regime_router -j
@@ -11,12 +20,11 @@
 
 #include <cmath>
 #include <cstdio>
-#include <limits>
 #include <string>
 
 using namespace dflash::common;
 
-// ─── Minimal test framework (mirrors test_adaptive_keep_ratio.cpp) ───────────
+// ─── Minimal test framework ───────────────────────────────────────────────────
 
 static int test_failures = 0;
 static int test_count    = 0;
@@ -52,437 +60,86 @@ static inline bool approx_eq(double a, double b, double eps = 1e-9) {
 
 // ─── Helpers ─────────────────────────────────────────────────────────────────
 
-// Build a policy with expansion_throttle_ratio disabled (default safe).
-static RouterPolicy default_policy() { return {}; }
-
-// Build a policy that throttles at ratio >= r.
-static RouterPolicy throttle_policy(double r,
-                                    int threshold = 32000,
-                                    int min_anchor = 1) {
-    RouterPolicy p;
-    p.threshold_tokens         = threshold;
-    p.expansion_throttle_ratio = r;
-    p.min_anchor_chunks        = min_anchor;
-    return p;
-}
-
-static CascadeStats make_stats(int n_chunks,
-                                int anchor_only,
-                                int after_cascade,
-                                int prompt_tokens,
-                                int keep_floor = 0) {
-    return { n_chunks, anchor_only, after_cascade, prompt_tokens, keep_floor };
-}
-
-// ─── T1: DEPLOY-NO-OP ────────────────────────────────────────────────────────
-// With the DEFAULT RouterPolicy (ratio=INFINITY), decide_regime must return
-// FullCascade for ANY stats, including pathologically large expansion.
-
-static void t1_deploy_noop() {
-    RouterPolicy p = default_policy();
-
-    // Normal case
-    {
-        auto d = decide_regime(make_stats(100, 10, 20, 50000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T1a: default policy must always give FullCascade");
-    }
-    // Huge expansion: forced_anchor_only=10, forced_after_cascade=1000, prompt=100K
-    {
-        auto d = decide_regime(make_stats(500, 10, 1000, 100000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T1b: huge expansion with default policy must be FullCascade");
-    }
-    // Prompt below threshold
-    {
-        auto d = decide_regime(make_stats(50, 5, 500, 1000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T1c: short prompt with default policy must be FullCascade");
-    }
-    // Zero anchors
-    {
-        auto d = decide_regime(make_stats(100, 0, 0, 100000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T1d: zero anchors with default policy must be FullCascade");
-    }
-    // Sweep: 50 random-ish stat combinations
-    for (int i = 1; i <= 50; ++i) {
-        CascadeStats s = make_stats(i * 10,
-                                    i,
-                                    i * 100,          // R = 100, very high
-                                    i * 5000);
-        auto d = decide_regime(s, p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T1-sweep: default policy must be FullCascade for all stats");
-    }
-}
-
-// ─── T2: DEGENERATE ──────────────────────────────────────────────────────────
-// Degenerate inputs must not crash or div-by-zero, and must return FullCascade.
-
-static void t2_degenerate() {
-    RouterPolicy p = throttle_policy(2.0);   // would throttle if R >= 2
-
-    // n_chunks == 0
-    {
-        auto d = decide_regime(make_stats(0, 5, 10, 100000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T2a: n_chunks=0 must return FullCascade");
-        TEST_ASSERT_MSG(std::isfinite(d.expansion_ratio),
-                        "T2a: expansion_ratio must be finite when n_chunks=0");
-    }
-    // forced_anchor_only == 0 (no anchors before cascade) → R defaults to 1.0
-    {
-        auto d = decide_regime(make_stats(100, 0, 50, 100000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T2b: forced_anchor_only=0 must return FullCascade");
-        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0),
-                        "T2b: expansion_ratio must be 1.0 when forced_anchor_only=0");
-    }
-    // Negative forced_anchor_only
-    {
-        auto d = decide_regime(make_stats(100, -1, 50, 100000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T2c: negative forced_anchor_only must return FullCascade");
-        TEST_ASSERT_MSG(std::isfinite(d.expansion_ratio),
-                        "T2c: expansion_ratio must be finite for negative anchor count");
-    }
-    // Negative forced_after_cascade
-    {
-        auto d = decide_regime(make_stats(100, 5, -1, 100000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T2d: negative forced_after_cascade must return FullCascade");
-    }
-    // Both negative
-    {
-        auto d = decide_regime(make_stats(100, -3, -7, 100000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T2e: both counts negative must return FullCascade");
-    }
-}
-
-// ─── T3: BELOW-THRESHOLD ─────────────────────────────────────────────────────
-// prompt_tokens < threshold → FullCascade regardless of R and finite ratio.
-
-static void t3_below_threshold() {
-    RouterPolicy p = throttle_policy(1.5, /*threshold=*/32000, /*min_anchor=*/1);
-
-    // prompt = threshold - 1 (just below)
-    {
-        auto d = decide_regime(make_stats(100, 10, 1000, 31999), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T3a: prompt just below threshold must be FullCascade");
-        TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold",
-                        "T3a: reason must be 'below_threshold'");
-    }
-    // prompt = 0
-    {
-        auto d = decide_regime(make_stats(100, 10, 9999, 0), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T3b: prompt=0 must be FullCascade");
-    }
-    // Even with R = 1000 and finite ratio = 2.0, still FullCascade below threshold
-    {
-        auto d = decide_regime(make_stats(200, 5, 5000, 100), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T3c: tiny prompt, huge R, finite ratio -> FullCascade");
-    }
-}
-
-// ─── T4: TOO-FEW-ANCHORS ─────────────────────────────────────────────────────
-// forced_anchor_only < min_anchor_chunks → FullCascade.
-
-static void t4_too_few_anchors() {
-    RouterPolicy p = throttle_policy(2.0, /*threshold=*/32000, /*min_anchor=*/3);
-    // forced_anchor_only = 2 < min_anchor = 3
-    {
-        auto d = decide_regime(make_stats(100, 2, 1000, 50000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T4a: anchors below min must be FullCascade");
-    }
-    // forced_anchor_only = 0 < min_anchor = 3
-    {
-        auto d = decide_regime(make_stats(100, 0, 500, 50000), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T4b: zero anchors below min must be FullCascade");
-    }
-    // forced_anchor_only = 3 == min_anchor = 3: NOT too few → may throttle
-    {
-        auto d = decide_regime(make_stats(100, 3, 300, 50000), p);
-        // R = 300/3 = 100 >= 2.0 → should be Throttle
-        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
-                        "T4c: anchors == min AND R >= ratio must throttle");
-    }
-}
-
-// ─── T5: MONOTONE ────────────────────────────────────────────────────────────
-// With a finite ratio policy, once Throttle triggers at R it must stay Throttle
-// for all larger R.
-
-static void t5_monotone() {
-    // Policy: ratio=3.0, threshold=32000, min_anchor=1, prompt_tokens=50000
-    RouterPolicy p = throttle_policy(3.0, 32000, 1);
-    const int prompt = 50000;
-    const int anchor = 10;  // fixed; vary after_cascade to control R
-
-    // R = 2.9 → FullCascade
-    {
-        // after = anchor * R = 10 * 2.9 = 29
-        auto d = decide_regime(make_stats(100, anchor, 29, prompt), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T5a: R=2.9 < 3.0 must be FullCascade");
-    }
-    // R = 3.0 → Throttle (boundary: >= triggers)
-    {
-        // after = 10 * 3 = 30
-        auto d = decide_regime(make_stats(100, anchor, 30, prompt), p);
-        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
-                        "T5b: R=3.0 == ratio must be Throttle");
-    }
-    // R = 10.0 → Throttle
-    {
-        auto d = decide_regime(make_stats(100, anchor, 100, prompt), p);
-        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
-                        "T5c: R=10.0 >> ratio must be Throttle");
-    }
-    // Monotone sweep: for all integer R from 1 to 100, once Throttle appears
-    // it must not flip back to FullCascade.
-    bool seen_throttle = false;
-    bool monotone      = true;
-    for (int r_int = 1; r_int <= 100; ++r_int) {
-        // after = anchor * r_int → exact integer R
-        auto d = decide_regime(make_stats(200, anchor, anchor * r_int, prompt), p);
-        if (d.regime == Regime::Throttle) {
-            seen_throttle = true;
-        } else if (seen_throttle) {
-            // Flipped back to FullCascade after Throttle was seen: not monotone
-            monotone = false;
-            std::fprintf(stderr,
-                         "  MONOTONE VIOLATION at R=%d: Throttle then FullCascade\n",
-                         r_int);
-            break;
-        }
-    }
-    TEST_ASSERT_MSG(seen_throttle, "T5d: sweep must trigger Throttle at some R");
-    TEST_ASSERT_MSG(monotone,      "T5e: regime must be monotone (no FullCascade after Throttle)");
-}
-
-// ─── T6: BOUNDARY ────────────────────────────────────────────────────────────
-// R exactly == ratio → Throttle; R = ratio - epsilon → FullCascade.
-
-static void t6_boundary() {
-    const double ratio   = 5.0;
-    RouterPolicy p       = throttle_policy(ratio, 32000, 1);
-    const int    anchor  = 1000;   // use large anchor to get precise integer ratios
-    const int    prompt  = 50000;
-
-    // R exactly == ratio: after = anchor * ratio = 5000
-    {
-        auto d = decide_regime(make_stats(500, anchor, anchor * (int)ratio, prompt), p);
-        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
-                        "T6a: R exactly == ratio must be Throttle");
-    }
-
-    // R = ratio - epsilon where epsilon = 0.5/anchor (one less chunk → R < ratio)
-    {
-        // after = anchor * ratio - 1 = 4999 → R = 4.999 < 5.0
-        auto d = decide_regime(make_stats(500, anchor, anchor * (int)ratio - 1, prompt), p);
-        TEST_ASSERT_MSG(d.regime == Regime::FullCascade,
-                        "T6b: R just below ratio must be FullCascade");
-    }
-}
-
-// ─── T7: RATIO-VALUE ─────────────────────────────────────────────────────────
-// Check that expansion_ratio is computed correctly.
-
-static void t7_ratio_value() {
-    RouterPolicy p = default_policy();  // regime doesn't matter; check ratio value
-
-    // forced_anchor_only=10, forced_after_cascade=85 → R = 8.5
-    {
-        auto d = decide_regime(make_stats(100, 10, 85, 50000), p);
-        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 8.5),
-                        "T7a: R must be 85/10 = 8.5");
-    }
-    // forced_anchor_only=0 → R must be 1.0 (no div-by-zero)
-    {
-        auto d = decide_regime(make_stats(100, 0, 50, 50000), p);
-        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0),
-                        "T7b: forced_anchor_only=0 must give expansion_ratio=1.0");
-    }
-    // forced_anchor_only=5, forced_after_cascade=5 → R = 1.0
-    {
-        auto d = decide_regime(make_stats(100, 5, 5, 50000), p);
-        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0),
-                        "T7c: equal anchors before/after must give R=1.0");
-    }
-    // forced_anchor_only=7, forced_after_cascade=7 → R = 1.0 (no expansion)
-    {
-        auto d = decide_regime(make_stats(100, 7, 7, 50000), p);
-        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 1.0),
-                        "T7d: no cascade expansion must give R=1.0");
-    }
-    // Verify ratio when throttle policy triggers: ratio value should still be correct
-    {
-        RouterPolicy tp = throttle_policy(3.0);
-        auto d = decide_regime(make_stats(100, 4, 20, 50000), tp);
-        // R = 20/4 = 5.0 → Throttle, ratio = 5.0
-        TEST_ASSERT_MSG(d.regime == Regime::Throttle,
-                        "T7e: R=5.0 >= 3.0 must throttle");
-        TEST_ASSERT_MSG(approx_eq(d.expansion_ratio, 5.0),
-                        "T7e: expansion_ratio must be 5.0");
-    }
-}
-
-// ─── V2 helpers ──────────────────────────────────────────────────────────────
-
-// Default v2 policy: disabled (deploy no-op).
 static RouterPolicyV2 default_v2_policy() { return {}; }
 
-// Enabled v2 policy with default field values.
 static RouterPolicyV2 enabled_v2_policy() {
     RouterPolicyV2 p;
     p.enabled = true;
     return p;
 }
 
-static RequestFeatures make_features(bool is_agentic,
-                                      int  prompt_tokens,
-                                      int  new_content_tokens) {
-    return { is_agentic, prompt_tokens, new_content_tokens };
+static RequestFeatures make_features(bool is_agentic, int prompt_tokens) {
+    return { is_agentic, prompt_tokens };
 }
 
-// ─── T8: DEPLOY-NO-OP (v2) ───────────────────────────────────────────────────
+// ─── T8: DEPLOY-NO-OP ────────────────────────────────────────────────────────
 // enabled=false → SAFE for every input, including is_agentic=true and huge prompts.
-// Correct-by-construction: disabled router must be an exact no-op.
 
 static void t8_v2_deploy_noop() {
     RouterPolicyV2 p = default_v2_policy();   // enabled=false
 
-    // Baseline: normal agentic prompt, well above threshold.
     {
-        auto d = decide_v2(make_features(true, 100000, 10000), p);
+        auto d = decide_v2(make_features(true, 100000), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
                         "T8a: disabled->keep_target must be full_keep_target");
-        TEST_ASSERT_MSG(d.cascade,
-                        "T8a: disabled->cascade must be true");
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T8a: disabled->recency must be keep-all sentinel");
+        TEST_ASSERT_MSG(d.cascade, "T8a: disabled->cascade must be true");
+        TEST_ASSERT_MSG(std::string(d.reason) == "disabled_noop",
+                        "T8a: disabled->reason must be 'disabled_noop'");
     }
-    // Sweep: all combinations of is_agentic, varying prompt and new_content sizes.
+    // Sweep all combinations of is_agentic and prompt sizes.
     for (int i = 0; i < 4; ++i) {
-        bool agentic   = (i & 1) != 0;
-        int  prompt    = (i & 2) ? 100000 : 500;
-        int  new_toks  = (i & 2) ? 10000 : 10;
-        auto d = decide_v2(make_features(agentic, prompt, new_toks), p);
+        bool agentic = (i & 1) != 0;
+        int  prompt  = (i & 2) ? 100000 : 500;
+        auto d = decide_v2(make_features(agentic, prompt), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
                         "T8-sweep: disabled->keep_target must be full_keep_target");
-        TEST_ASSERT_MSG(d.cascade,
-                        "T8-sweep: disabled->cascade must be true");
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T8-sweep: disabled->recency must be keep-all sentinel");
+        TEST_ASSERT_MSG(d.cascade, "T8-sweep: disabled->cascade must be true");
     }
-    // Explicitly: is_agentic=true, large prompt, large new_content — must be SAFE.
+    // Explicitly: is_agentic=true, large prompt — must be SAFE.
     {
-        auto d = decide_v2(make_features(true, 200000, 50000), p);
+        auto d = decide_v2(make_features(true, 200000), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
                         "T8b: disabled, agentic, huge prompt -> SAFE");
         TEST_ASSERT_MSG(d.cascade, "T8b: disabled -> cascade=true");
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T8b: disabled -> recency keep-all");
-    }
-}
-
-// ─── T9: SPARSE-PROMPT GUARD (failure-class fix) ─────────────────────────────
-// is_agentic=true, prompt above threshold, BUT new_content < sparse threshold.
-// This is the LONG_A-t11/LONG_B-t10 plumbing class: a tiny tool_result riding
-// on long history. Compression must NOT throttle here (would drop continuity).
-
-static void t9_sparse_prompt_guard() {
-    RouterPolicyV2 p = enabled_v2_policy();
-
-    // Canonical failure case: 3-word tool_result on 43K history.
-    {
-        auto d = decide_v2(make_features(true, 43000, 8), p);
-        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
-                        "T9a: sparse agentic turn must be SAFE (full keep), not throttled");
-        TEST_ASSERT_MSG(d.cascade,
-                        "T9a: sparse_prompt_guard must cascade=true (SAFE)");
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T9a: sparse_prompt_guard -> recency keep-all");
-        TEST_ASSERT_MSG(std::string(d.reason) == "sparse_prompt_guard",
-                        "T9a: reason must be 'sparse_prompt_guard'");
-    }
-    // new_content = sparse_new_content_tokens - 1 (just below the guard).
-    {
-        auto d = decide_v2(make_features(true, 50000, p.sparse_new_content_tokens - 1), p);
-        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
-                        "T9b: new_content just below sparse threshold -> SAFE");
-        TEST_ASSERT_MSG(std::string(d.reason) == "sparse_prompt_guard",
-                        "T9b: reason must be 'sparse_prompt_guard'");
-    }
-    // new_content = 0 (degenerate new turn, still sparse guard NOT degenerate path).
-    // Note: 0 < sparse_new_content_tokens (256) so sparse guard fires first.
-    {
-        auto d = decide_v2(make_features(true, 40000, 0), p);
-        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
-                        "T9c: new_content=0 -> SAFE (sparse guard or degenerate, both SAFE)");
-    }
-    // Confirm: new_content = sparse_new_content_tokens (AT the boundary → NOT sparse).
-    // is_agentic=true above threshold with enough new content → throttle kicks in.
-    {
-        auto d = decide_v2(make_features(true, 50000, p.sparse_new_content_tokens), p);
-        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.agentic_keep_target),
-                        "T9d: new_content==sparse threshold -> agentic throttle applies");
-        TEST_ASSERT_MSG(!d.cascade,
-                        "T9d: agentic throttle -> cascade=false");
     }
 }
 
 // ─── T10: AGENTIC-THROTTLE ───────────────────────────────────────────────────
-// enabled, is_agentic=true, prompt > threshold, new_content > sparse threshold
-// → keep_target=agentic_keep_target, cascade=false, recency >= 1.
+// enabled, is_agentic=true, prompt > threshold
+// → keep_target=agentic_keep_target, cascade=false.
 
 static void t10_agentic_throttle() {
     RouterPolicyV2 p = enabled_v2_policy();
 
     {
-        auto d = decide_v2(make_features(true, 40000, 3000), p);
+        auto d = decide_v2(make_features(true, 40000), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.agentic_keep_target),
                         "T10a: agentic throttle -> keep_target=agentic_keep_target");
-        TEST_ASSERT_MSG(!d.cascade,
-                        "T10a: agentic throttle -> cascade=false");
-        TEST_ASSERT_MSG(d.recency_floor_turns == p.recency_floor_turns,
-                        "T10a: agentic throttle -> recency matches policy");
-        TEST_ASSERT_MSG(d.recency_floor_turns >= 1,
-                        "T10a: recency_floor_turns must be >= 1 (continuity guaranteed)");
+        TEST_ASSERT_MSG(!d.cascade, "T10a: agentic throttle -> cascade=false");
         TEST_ASSERT_MSG(std::string(d.reason) == "agentic_throttle",
                         "T10a: reason must be 'agentic_throttle'");
     }
-    // Custom policy: verify fields propagate.
+    // Custom agentic_keep_target.
     {
-        RouterPolicyV2 p2  = p;
-        p2.agentic_keep_target  = 0.30;
-        p2.recency_floor_turns  = 5;
-        auto d = decide_v2(make_features(true, 60000, 1000), p2);
+        RouterPolicyV2 p2 = p;
+        p2.agentic_keep_target = 0.30;
+        auto d = decide_v2(make_features(true, 60000), p2);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, 0.30),
                         "T10b: custom agentic_keep_target propagated");
-        TEST_ASSERT_MSG(d.recency_floor_turns == 5,
-                        "T10b: custom recency_floor_turns propagated");
+        TEST_ASSERT_MSG(!d.cascade, "T10b: agentic -> cascade=false");
     }
 }
 
 // ─── T11: RETRIEVAL-FULL ─────────────────────────────────────────────────────
-// enabled, is_agentic=false, prompt > threshold, new_content > sparse threshold
+// enabled, is_agentic=false, prompt > threshold
 // → cascade=true, keep_target=full_keep_target.
 
 static void t11_retrieval_full() {
     RouterPolicyV2 p = enabled_v2_policy();
 
     {
-        auto d = decide_v2(make_features(false, 40000, 3000), p);
-        TEST_ASSERT_MSG(d.cascade,
-                        "T11a: retrieval -> cascade=true");
+        auto d = decide_v2(make_features(false, 40000), p);
+        TEST_ASSERT_MSG(d.cascade, "T11a: retrieval -> cascade=true");
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
                         "T11a: retrieval -> keep_target=full_keep_target");
         TEST_ASSERT_MSG(std::string(d.reason) == "retrieval_full",
@@ -490,44 +147,41 @@ static void t11_retrieval_full() {
     }
     // Custom full_keep_target.
     {
-        RouterPolicyV2 p2       = p;
-        p2.full_keep_target     = 0.80;
-        auto d = decide_v2(make_features(false, 50000, 5000), p2);
+        RouterPolicyV2 p2   = p;
+        p2.full_keep_target = 0.80;
+        auto d = decide_v2(make_features(false, 50000), p2);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, 0.80),
                         "T11b: custom full_keep_target propagated");
         TEST_ASSERT_MSG(d.cascade, "T11b: retrieval -> cascade=true");
     }
 }
 
-// ─── T12: BELOW-THRESHOLD (v2) ───────────────────────────────────────────────
-// prompt_tokens < threshold_tokens → SAFE regardless of is_agentic and new_content.
+// ─── T12: BELOW-THRESHOLD ────────────────────────────────────────────────────
+// prompt_tokens < threshold_tokens → SAFE regardless of is_agentic.
 
 static void t12_v2_below_threshold() {
     RouterPolicyV2 p = enabled_v2_policy();
 
-    // Agentic, just below threshold, plenty of new content.
+    // Agentic, just below threshold.
     {
-        auto d = decide_v2(make_features(true, p.threshold_tokens - 1, 5000), p);
+        auto d = decide_v2(make_features(true, p.threshold_tokens - 1), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
                         "T12a: agentic, below threshold -> SAFE");
-        TEST_ASSERT_MSG(d.cascade,
-                        "T12a: below threshold -> cascade=true");
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T12a: below threshold -> recency keep-all");
+        TEST_ASSERT_MSG(d.cascade, "T12a: below threshold -> cascade=true");
         TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold",
                         "T12a: reason must be 'below_threshold'");
     }
-    // Non-agentic, at threshold boundary - 1.
+    // Non-agentic, just below threshold.
     {
-        auto d = decide_v2(make_features(false, p.threshold_tokens - 1, 5000), p);
+        auto d = decide_v2(make_features(false, p.threshold_tokens - 1), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
                         "T12b: non-agentic, below threshold -> SAFE");
     }
     // Custom threshold.
     {
-        RouterPolicyV2 p2     = p;
-        p2.threshold_tokens   = 10000;
-        auto d = decide_v2(make_features(true, 9999, 2000), p2);
+        RouterPolicyV2 p2   = p;
+        p2.threshold_tokens = 10000;
+        auto d = decide_v2(make_features(true, 9999), p2);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p2.full_keep_target),
                         "T12c: custom threshold, below it -> SAFE");
         TEST_ASSERT_MSG(std::string(d.reason) == "below_threshold",
@@ -535,152 +189,91 @@ static void t12_v2_below_threshold() {
     }
 }
 
-// ─── T13: RECENCY-FLOOR INVARIANT ────────────────────────────────────────────
-// In every throttling decision (non-SAFE), recency_floor_turns >= 1.
-// In every SAFE decision, recency_floor_turns >= kRecencyKeepAll.
-
-static void t13_recency_floor_invariant() {
-    RouterPolicyV2 p = enabled_v2_policy();
-
-    // Throttle path (agentic): recency >= 1.
-    {
-        auto d = decide_v2(make_features(true, 50000, 1000), p);
-        TEST_ASSERT_MSG(!approx_eq(d.keep_target, p.full_keep_target) ||
-                         d.recency_floor_turns >= 1,
-                        "T13a: throttled decision must have recency >= 1");
-        TEST_ASSERT_MSG(d.recency_floor_turns >= 1,
-                        "T13a: agentic throttle recency_floor_turns >= 1 (continuity)");
-    }
-    // SAFE paths: recency must be keep-all.
-    // disabled
-    {
-        RouterPolicyV2 pd; pd.enabled = false;
-        auto d = decide_v2(make_features(true, 50000, 1000), pd);
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T13b: disabled SAFE recency must be keep-all");
-    }
-    // sparse_prompt_guard
-    {
-        auto d = decide_v2(make_features(true, 50000, 5), p);
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T13c: sparse_prompt_guard SAFE recency must be keep-all");
-    }
-    // below_threshold
-    {
-        auto d = decide_v2(make_features(true, 1000, 500), p);
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T13d: below_threshold SAFE recency must be keep-all");
-    }
-    // retrieval_full path: recency = policy value (not keep-all, it's a throttle-adjacent path)
-    {
-        auto d = decide_v2(make_features(false, 50000, 1000), p);
-        TEST_ASSERT_MSG(d.recency_floor_turns >= 1,
-                        "T13e: retrieval_full recency >= 1");
-    }
-    // Custom recency_floor_turns: verify agentic propagates it.
-    for (int k = 1; k <= 10; ++k) {
-        RouterPolicyV2 pk          = p;
-        pk.recency_floor_turns     = k;
-        auto d = decide_v2(make_features(true, 50000, 1000), pk);
-        TEST_ASSERT_MSG(d.recency_floor_turns == k,
-                        "T13f: agentic throttle recency must equal policy recency_floor_turns");
-    }
-}
-
-// ─── T14: DEGENERATE (v2) ────────────────────────────────────────────────────
-// prompt_tokens <= 0 or new_content_tokens < 0 → SAFE (no crash, no garbage).
+// ─── T14: DEGENERATE ─────────────────────────────────────────────────────────
+// prompt_tokens <= 0 → SAFE (no crash, no garbage).
 
 static void t14_v2_degenerate() {
     RouterPolicyV2 p = enabled_v2_policy();
 
     // prompt_tokens = 0
     {
-        auto d = decide_v2(make_features(true, 0, 500), p);
+        auto d = decide_v2(make_features(true, 0), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
                         "T14a: prompt_tokens=0 -> SAFE");
         TEST_ASSERT_MSG(d.cascade, "T14a: degenerate -> cascade=true");
-        TEST_ASSERT_MSG(d.recency_floor_turns >= kRecencyKeepAll,
-                        "T14a: degenerate -> recency keep-all");
         TEST_ASSERT_MSG(std::string(d.reason) == "degenerate",
                         "T14a: reason must be 'degenerate'");
     }
     // prompt_tokens < 0
     {
-        auto d = decide_v2(make_features(false, -1, 100), p);
+        auto d = decide_v2(make_features(false, -1), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
                         "T14b: negative prompt_tokens -> SAFE");
         TEST_ASSERT_MSG(std::string(d.reason) == "degenerate",
                         "T14b: reason must be 'degenerate'");
     }
-    // new_content_tokens < 0
-    {
-        auto d = decide_v2(make_features(true, 50000, -1), p);
-        TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
-                        "T14c: negative new_content_tokens -> SAFE");
-        TEST_ASSERT_MSG(std::string(d.reason) == "degenerate",
-                        "T14c: reason must be 'degenerate'");
-    }
     // Both degenerate
     {
-        auto d = decide_v2(make_features(true, -5, -10), p);
+        auto d = decide_v2(make_features(true, -5), p);
         TEST_ASSERT_MSG(approx_eq(d.keep_target, p.full_keep_target),
-                        "T14d: both degenerate -> SAFE");
+                        "T14c: negative agentic -> SAFE");
     }
 }
 
-// ─── T15: RECENCY_FLOOR_FOR — off ────────────────────────────────────────────
-// recency_floor_tokens == 0 → always 0 regardless of prompt size.
-
-static void t15_recency_floor_off() {
-    // 0 → off
-    TEST_ASSERT_MSG(recency_floor_for(0,      0) == 0, "T15a: S=0 R=0 -> 0");
-    TEST_ASSERT_MSG(recency_floor_for(1000,   0) == 0, "T15b: S=1000 R=0 -> 0");
-    TEST_ASSERT_MSG(recency_floor_for(100000, 0) == 0, "T15c: S=100K R=0 -> 0");
-    // Negative R (shouldn't happen but must be safe)
-    TEST_ASSERT_MSG(recency_floor_for(10000, -2) == 0, "T15d: negative R (not sentinel) -> 0");
-}
-
-// ─── T16: RECENCY_FLOOR_FOR — auto ───────────────────────────────────────────
-// kRecencyFloorAuto (-1) → min(1024, ceil(0.04 * S)).
-
-static void t16_recency_floor_auto() {
-    const int A = kRecencyFloorAuto;
-
-    // S=0: ceil(0.04*0)=0
-    TEST_ASSERT_MSG(recency_floor_for(0, A) == 0,   "T16a: S=0 auto -> 0");
-    // S=1000: ceil(0.04*1000)=40
-    TEST_ASSERT_MSG(recency_floor_for(1000,  A) == 40,  "T16b: S=1000 auto -> 40");
-    // S=10000: ceil(0.04*10000)=400
-    TEST_ASSERT_MSG(recency_floor_for(10000, A) == 400, "T16c: S=10K auto -> 400");
-    // S=25000: ceil(0.04*25000)=1000
-    TEST_ASSERT_MSG(recency_floor_for(25000, A) == 1000, "T16d: S=25K auto -> 1000");
-    // S=25001: ceil(0.04*25001)=1001 but capped at 1024
-    // actually 0.04*25001=1000.04 → ceil=1001 < 1024 → 1001
-    TEST_ASSERT_MSG(recency_floor_for(25001, A) == 1001, "T16e: S=25001 auto -> 1001");
-    // S=25600: 0.04*25600=1024.0 → ceil=1024
-    TEST_ASSERT_MSG(recency_floor_for(25600, A) == 1024, "T16f: S=25600 auto -> 1024");
-    // S=26000: 0.04*26000=1040 → ceil=1040 but capped at 1024
-    TEST_ASSERT_MSG(recency_floor_for(26000, A) == 1024, "T16g: S=26000 auto -> cap 1024");
-    // S=100000: 0.04*100000=4000 → capped at 1024
-    TEST_ASSERT_MSG(recency_floor_for(100000, A) == 1024, "T16h: S=100K auto -> cap 1024");
-    // S=-1: negative prompt treated as 0 → 0
-    TEST_ASSERT_MSG(recency_floor_for(-1, A) == 0,  "T16i: S=-1 auto -> 0");
-}
+// ─── T18: detect_request_type — bool truth-table ─────────────────────────────
+//
+// Exhaustive 3-bit truth table: any true → Agentic, all false → Retrieval.
+// No JSON dependency; the caller extracts bools at the handler boundary.
 
-// ─── T17: RECENCY_FLOOR_FOR — explicit ───────────────────────────────────────
-// Any explicit positive value is returned unchanged (no prompt-size influence).
-
-static void t17_recency_floor_explicit() {
-    // Explicit override ignores prompt size
-    TEST_ASSERT_MSG(recency_floor_for(1000,  512)  == 512,  "T17a: explicit 512");
-    TEST_ASSERT_MSG(recency_floor_for(100000, 512) == 512,  "T17b: explicit 512, large S");
-    TEST_ASSERT_MSG(recency_floor_for(1000, 1024)  == 1024, "T17c: explicit 1024");
-    TEST_ASSERT_MSG(recency_floor_for(1000, 2048)  == 2048, "T17d: explicit 2048 > cap");
-    TEST_ASSERT_MSG(recency_floor_for(0,    256)   == 256,  "T17e: explicit 256, S=0");
-    // Monotone: explicit > auto at short prompts
-    const int A = kRecencyFloorAuto;
-    TEST_ASSERT_MSG(recency_floor_for(1000, 512) > recency_floor_for(1000, A),
-                    "T17f: explicit 512 > auto(1000)=40");
+static void t18_detect_request_type() {
+    // All-false → Retrieval (safe default).
+    {
+        auto type = detect_request_type(false, false, false);
+        TEST_ASSERT_MSG(type == RequestType::Retrieval,
+                        "T18a: all false -> Retrieval");
+    }
+    // has_tools only → Agentic.
+    {
+        auto type = detect_request_type(true, false, false);
+        TEST_ASSERT_MSG(type == RequestType::Agentic,
+                        "T18b: has_tools=true -> Agentic");
+    }
+    // has_tool_use_blocks only → Agentic.
+    {
+        auto type = detect_request_type(false, true, false);
+        TEST_ASSERT_MSG(type == RequestType::Agentic,
+                        "T18c: has_tool_use_blocks=true -> Agentic");
+    }
+    // has_tool_calls only → Agentic.
+    {
+        auto type = detect_request_type(false, false, true);
+        TEST_ASSERT_MSG(type == RequestType::Agentic,
+                        "T18d: has_tool_calls=true -> Agentic");
+    }
+    // has_tools + has_tool_use_blocks → Agentic.
+    {
+        auto type = detect_request_type(true, true, false);
+        TEST_ASSERT_MSG(type == RequestType::Agentic,
+                        "T18e: has_tools + has_tool_use_blocks -> Agentic");
+    }
+    // has_tools + has_tool_calls → Agentic.
+    {
+        auto type = detect_request_type(true, false, true);
+        TEST_ASSERT_MSG(type == RequestType::Agentic,
+                        "T18f: has_tools + has_tool_calls -> Agentic");
+    }
+    // has_tool_use_blocks + has_tool_calls → Agentic.
+    {
+        auto type = detect_request_type(false, true, true);
+        TEST_ASSERT_MSG(type == RequestType::Agentic,
+                        "T18g: has_tool_use_blocks + has_tool_calls -> Agentic");
+    }
+    // All true → Agentic.
+    {
+        auto type = detect_request_type(true, true, true);
+        TEST_ASSERT_MSG(type == RequestType::Agentic,
+                        "T18h: all true -> Agentic");
+    }
 }
 
 // ─── main ─────────────────────────────────────────────────────────────────────
@@ -688,27 +281,14 @@ static void t17_recency_floor_explicit() {
 int main() {
     std::fprintf(stderr, "=== test_regime_router ===\n");
 
-    RUN_TEST(t1_deploy_noop);
-    RUN_TEST(t2_degenerate);
-    RUN_TEST(t3_below_threshold);
-    RUN_TEST(t4_too_few_anchors);
-    RUN_TEST(t5_monotone);
-    RUN_TEST(t6_boundary);
-    RUN_TEST(t7_ratio_value);
-
-    std::fprintf(stderr, "--- v2 ---\n");
     RUN_TEST(t8_v2_deploy_noop);
-    RUN_TEST(t9_sparse_prompt_guard);
     RUN_TEST(t10_agentic_throttle);
     RUN_TEST(t11_retrieval_full);
     RUN_TEST(t12_v2_below_threshold);
-    RUN_TEST(t13_recency_floor_invariant);
     RUN_TEST(t14_v2_degenerate);
 
-    std::fprintf(stderr, "--- recency_floor_for ---\n");
-    RUN_TEST(t15_recency_floor_off);
-    RUN_TEST(t16_recency_floor_auto);
-    RUN_TEST(t17_recency_floor_explicit);
+    std::fprintf(stderr, "--- detect_request_type ---\n");
+    RUN_TEST(t18_detect_request_type);
 
     std::fprintf(stderr, "\n%d tests, %d failures\n", test_count, test_failures);
     return (test_failures == 0) ? 0 : 1;

From 8fc961b54f44a98d697899e47df27c04753c1ac6 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Sat, 30 May 2026 22:43:16 +0200
Subject: [PATCH 16/16] feat(pflash): empty-response guard + bandit floor
 reconciliation (task #10)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes out the two validated pieces of the adaptive-keep path (the label-free
quality-reward idea was dropped — Momus confirmed it can't catch confident
but off-task responses). Default-OFF; the router gates both pieces to
agentic-routed requests.

- regime_router.h: two pure helpers (stdlib-only, TDD'd) —
  clamp_keep_to_floor(bandit_keep, router_floor, agentic): for agentic
    requests the effective keep = max(bandit_keep, router_floor), so the
    bandit's 0.20 ceiling can no longer silently undercut the router's 0.25
    floor; non-agentic requests pass bandit_keep through unchanged.
  compression_failed(response_tokens, degenerate_close, agentic_compressed,
    min_tokens=8): true when an agentic compressed turn produced fewer than
    min_tokens tokens or closed degenerately; always false otherwise.
- adaptive_keep_ratio.h: per-session recover_full_next flag (+ set/consume).
- http_server.cpp: floor clamp at keep-apply; at the post-generate update site,
  on compression_failed → skip the bandit update (failure noise) and set the
  session to full keep for the next turn (deterministic recovery from the
  empty-response failure class, e.g. LONG_B t10). PFLASH_GUARD_MIN_TOKENS env
  (default 8) tunes the guard threshold.
- 59 standalone unit tests, -Werror.

LIVE-VALIDATED on RTX 3090 (server up on :18097, 34K-token prompts):
- type-gate: agentic→keep 0.250/cascade-off, retrieval→cascade-on.
- guard recovery loop: turn1 compression_failed→full-keep-next (resp_tokens=13,
  bandit update skipped); turn2 same session recover_full_next consumed→keep 1.0.
- floor clamp fired: agentic bandit 0.100 < floor 0.250 → 0.250.
Launch config (24GB): GGML_CUDA_NO_VMM=1 + --max-ctx 49152 (139264 KV OOMs the
3090 — that was the pre-existing bad_alloc, not this change). Still default-OFF
via PFLASH_ROUTER_ENABLE.
---
 server/src/common/regime_router.h       |  32 +++++++
 server/src/server/adaptive_keep_ratio.h |  38 ++++++++-
 server/src/server/http_server.cpp       |  69 ++++++++++++---
 server/test/test_regime_router.cpp      | 106 ++++++++++++++++++++++++
 4 files changed, 230 insertions(+), 15 deletions(-)

diff --git a/server/src/common/regime_router.h b/server/src/common/regime_router.h
index 426c0c985..4c03eff8f 100644
--- a/server/src/common/regime_router.h
+++ b/server/src/common/regime_router.h
@@ -68,6 +68,38 @@ inline RouterDecisionV2 decide_v2(const RequestFeatures& f,
     return { p.full_keep_target, true, "retrieval_full" };
 }
 
+// ─── PIECE 1: floor clamp ────────────────────────────────────────────────────
+//
+// When the router routed a request as agentic, the bandit must not compress
+// harder than the router's agentic_keep_target floor.  Non-agentic sessions
+// are passed through unchanged (bandit drives retrieval sessions freely).
+//
+// Pure, stdlib-only, no IO.
+inline double clamp_keep_to_floor(double bandit_keep,
+                                   double router_floor,
+                                   bool   agentic) {
+    if (!agentic) return bandit_keep;
+    return bandit_keep >= router_floor ? bandit_keep : router_floor;
+}
+
+// ─── PIECE 2: compression failure guard ──────────────────────────────────────
+//
+// Returns true when a compressed agentic turn produced an empty or degenerate
+// response.  Used to skip the bandit update (failure noise) and schedule a
+// full-keep recovery for the next turn.
+//
+// Fires ONLY on the agentic+compressed path — non-compressed failures are not
+// our fault and do not need recovery.
+//
+// Pure, stdlib-only, no IO.
+inline bool compression_failed(int  response_tokens,
+                                bool degenerate_close,
+                                bool agentic_compressed,
+                                int  min_tokens = 8) {
+    if (!agentic_compressed) return false;
+    return response_tokens < min_tokens || degenerate_close;
+}
+
 // ─── TYPE GATE ───────────────────────────────────────────────────────────────
 //
 // Coarse request-type classifier.  Pure function — no IO, no globals, no JSON.
diff --git a/server/src/server/adaptive_keep_ratio.h b/server/src/server/adaptive_keep_ratio.h
index 959b87bce..36a815917 100644
--- a/server/src/server/adaptive_keep_ratio.h
+++ b/server/src/server/adaptive_keep_ratio.h
@@ -9,9 +9,10 @@
 namespace dflash::common {
 
 struct AdaptiveKeepRatioState {
-    float ema        = 0.0f;
-    float last_keep  = 0.10f;
-    int   turn_count = 0;
+    float ema              = 0.0f;
+    float last_keep        = 0.10f;
+    int   turn_count       = 0;
+    bool  recover_full_next = false;  // set by compression_failed guard; cleared after one turn
 };
 
 constexpr float kBanditEmaAlpha   = 0.7f;
@@ -90,6 +91,37 @@ class HttpServerSessions {
         return it->second.state.turn_count;
     }
 
+    // Schedule full-keep recovery for the next turn of this session.
+    // Called by the compression_failed guard when an agentic compressed turn
+    // produced an empty or degenerate response.  Creates the session entry if
+    // it does not exist yet (guard may fire before any bandit update).
+    void set_recover_full_next(const std::string& session_id) {
+        std::lock_guard<std::mutex> lock(mu_);
+        auto it = map_.find(session_id);
+        if (it == map_.end()) {
+            evict_if_full_locked();
+            lru_.push_front(session_id);
+            AdaptiveKeepRatioState s{};
+            s.recover_full_next = true;
+            map_.emplace(session_id, Entry{s, lru_.begin()});
+        } else {
+            it->second.state.recover_full_next = true;
+            lru_.splice(lru_.begin(), lru_, it->second.lru_it);
+        }
+    }
+
+    // Returns true and clears the flag if recovery was scheduled; false otherwise
+    // (one-shot). A known session is moved to the LRU front even when the flag is unset.
+    bool consume_recover_full_next(const std::string& session_id) {
+        std::lock_guard<std::mutex> lock(mu_);
+        auto it = map_.find(session_id);
+        if (it == map_.end()) return false;
+        lru_.splice(lru_.begin(), lru_, it->second.lru_it);
+        if (!it->second.state.recover_full_next) return false;
+        it->second.state.recover_full_next = false;
+        return true;
+    }
+
     size_t size() const {
         std::lock_guard<std::mutex> lock(mu_);
         return map_.size();
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index f2518ba1e..5818dfd1f 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -1187,6 +1187,7 @@ void HttpServer::worker_loop() {
         // If pflash is enabled and prompt exceeds threshold, compress.
         std::vector<int32_t> effective_prompt = req.prompt_tokens;
         bool pflash_compressed = false;
+        bool pflash_is_agentic = false;  // hoisted for post-generate guard
 
         if (config_.pflash_mode != ServerConfig::PflashMode::OFF &&
             drafter_tokenizer_ != nullptr)
@@ -1258,6 +1259,7 @@ void HttpServer::worker_loop() {
                             const bool is_agentic = (detect_request_type(
                                 _has_tools, _has_tool_use_blocks, _has_tool_calls)
                                     == RequestType::Agentic);
+                            pflash_is_agentic = is_agentic;  // hoist for post-generate guard
                             const RequestFeatures rf {
                                 is_agentic,
                                 n_prompt
@@ -1272,7 +1274,32 @@ void HttpServer::worker_loop() {
                                 if (req.session_id.empty()) {
                                     creq.keep_ratio = (float)rd.keep_target;
                                 } else {
-                                    creq.keep_ratio = sessions_.get_keep_ratio(req.session_id);
+                                    // PIECE 2: recover_full_next — one-shot full-keep recovery
+                                    // after a compression_failed turn.  Consumed here (one turn).
+                                    if (!req.session_id.empty() &&
+                                        sessions_.consume_recover_full_next(req.session_id)) {
+                                        creq.keep_ratio = (float)config_.pflash_router.full_keep_target;
+                                        std::fprintf(stderr,
+                                            "[pflash-guard] recover_full_next consumed — "
+                                            "session=%s full_keep=%.3f\n",
+                                            req.session_id.c_str(), creq.keep_ratio);
+                                    } else {
+                                        // PIECE 1: floor clamp — bandit must not undercut
+                                        // the router's agentic floor.
+                                        float raw_keep = sessions_.get_keep_ratio(req.session_id);
+                                        creq.keep_ratio = (float)clamp_keep_to_floor(
+                                            raw_keep,
+                                            config_.pflash_router.agentic_keep_target,
+                                            is_agentic);
+                                        if (is_agentic && creq.keep_ratio > raw_keep) {
+                                            std::fprintf(stderr,
+                                                "[pflash-router] floor-clamp: "
+                                                "agentic bandit %.3f < floor %.3f → %.3f\n",
+                                                raw_keep,
+                                                config_.pflash_router.agentic_keep_target,
+                                                creq.keep_ratio);
+                                        }
+                                    }
                                 }
                                 // cascade = use_transitive: 0 = off, 1 = on, -1 = env default
                                 creq.use_transitive = rd.cascade ? 1 : 0;
@@ -1620,18 +1647,36 @@ void HttpServer::worker_loop() {
         // doesn't grow monotonically across requests with different sizes.
         backend_.release_scratch();
 
-        // Bandit: update when spec decode actually ran — including 0-accept case,
-        // which signals the current keep_ratio is too low.
-        if (!req.session_id.empty() && result.spec_decode_ran) {
-            float old_keep = sessions_.get_keep_ratio(req.session_id);
-            int   old_turn = sessions_.turn_count(req.session_id);
-            sessions_.update(req.session_id, result.accept_rate);
-            float new_keep = sessions_.get_keep_ratio(req.session_id);
-            float ema      = sessions_.get_ema(req.session_id);
+        // PIECE 2: compression failure guard — deterministic recovery.
+        // When an agentic compressed turn produces an empty or degenerate response:
+        //   (a) skip the bandit update (failure noise — don't reward/penalise)
+        //   (b) schedule full-keep recovery for the next turn of this session
+        const bool agentic_compressed = pflash_is_agentic && pflash_compressed;
+        const int  n_response_tokens  = (int)result.tokens.size();
+        if (!req.session_id.empty() &&
+            compression_failed(n_response_tokens, result.degenerate_decode_close,
+                               agentic_compressed)) {
             std::fprintf(stderr,
-                "[pflash-bandit] session=%s turn=%d keep=%.4f->%.4f ema=%.3f accept=%.3f\n",
-                req.session_id.c_str(), old_turn + 1,
-                old_keep, new_keep, ema, result.accept_rate);
+                "[pflash-guard] compression_failed → full-keep next: "
+                "session=%s resp_tokens=%d degenerate=%s\n",
+                req.session_id.c_str(), n_response_tokens,
+                result.degenerate_decode_close ? "true" : "false");
+            sessions_.set_recover_full_next(req.session_id);
+            // Bandit update (else-branch below) is deliberately skipped, even if spec_decode_ran.
+        } else {
+            // Bandit: update when spec decode actually ran — including 0-accept case,
+            // which signals the current keep_ratio is too low.
+            if (!req.session_id.empty() && result.spec_decode_ran) {
+                float old_keep = sessions_.get_keep_ratio(req.session_id);
+                int   old_turn = sessions_.turn_count(req.session_id);
+                sessions_.update(req.session_id, result.accept_rate);
+                float new_keep = sessions_.get_keep_ratio(req.session_id);
+                float ema      = sessions_.get_ema(req.session_id);
+                std::fprintf(stderr,
+                    "[pflash-bandit] session=%s turn=%d keep=%.4f->%.4f ema=%.3f accept=%.3f\n",
+                    req.session_id.c_str(), old_turn + 1,
+                    old_keep, new_keep, ema, result.accept_rate);
+            }
         }
 
 
diff --git a/server/test/test_regime_router.cpp b/server/test/test_regime_router.cpp
index b45370f99..215145f90 100644
--- a/server/test/test_regime_router.cpp
+++ b/server/test/test_regime_router.cpp
@@ -276,6 +276,108 @@ static void t18_detect_request_type() {
     }
 }
 
+// ─── T19: clamp_keep_to_floor ────────────────────────────────────────────────
+// Contract pinned here, for clamp_keep_to_floor(bandit_keep, router_floor, agentic):
+//   agentic=true  → result is max(bandit_keep, router_floor)
+//   agentic=false → result is bandit_keep, whether above or below the floor
+
+static void t19_clamp_keep_to_floor() {
+    // T19a: agentic, bandit keep below the floor → raised to the floor.
+    {
+        double result = clamp_keep_to_floor(0.10, 0.25, /*agentic=*/true);
+        TEST_ASSERT_MSG(approx_eq(result, 0.25),
+                        "T19a: agentic, bandit 0.10 < floor 0.25 -> clamped to 0.25");
+    }
+    // T19b: agentic, bandit keep exactly at the floor → boundary value is returned as-is.
+    {
+        double result = clamp_keep_to_floor(0.25, 0.25, /*agentic=*/true);
+        TEST_ASSERT_MSG(approx_eq(result, 0.25),
+                        "T19b: agentic, bandit == floor -> 0.25");
+    }
+    // T19c: agentic, bandit keep above the floor → the floor never lowers the bandit's value.
+    {
+        double result = clamp_keep_to_floor(0.30, 0.25, /*agentic=*/true);
+        TEST_ASSERT_MSG(approx_eq(result, 0.30),
+                        "T19c: agentic, bandit 0.30 > floor 0.25 -> 0.30 (bandit wins)");
+    }
+    // T19d: non-agentic, bandit keep below the floor → floor is not applied.
+    {
+        double result = clamp_keep_to_floor(0.05, 0.25, /*agentic=*/false);
+        TEST_ASSERT_MSG(approx_eq(result, 0.05),
+                        "T19d: non-agentic -> 0.05 passed through unchanged");
+    }
+    // T19e: non-agentic, bandit keep above the floor → passed through unchanged.
+    {
+        double result = clamp_keep_to_floor(0.50, 0.25, /*agentic=*/false);
+        TEST_ASSERT_MSG(approx_eq(result, 0.50),
+                        "T19e: non-agentic, bandit above floor -> 0.50 passed through");
+    }
+    // T19f: agentic, bandit keep of 0.0 (extreme low input) → raised to the floor.
+    {
+        double result = clamp_keep_to_floor(0.0, 0.25, /*agentic=*/true);
+        TEST_ASSERT_MSG(approx_eq(result, 0.25),
+                        "T19f: agentic, bandit=0.0 -> clamped to floor 0.25");
+    }
+}
+
+// ─── T20: compression_failed truth table ─────────────────────────────────────
+// compression_failed(...) must be true iff agentic_compressed &&
+// (response_tokens < min_tokens || degenerate_close); T20i/T20j cover the default min_tokens.
+
+static void t20_compression_failed() {
+    // T20a: agentic_compressed, response well under min_tokens → failed.
+    {
+        bool result = compression_failed(/*response_tokens=*/3, /*degenerate_close=*/false,
+                                         /*agentic_compressed=*/true, /*min_tokens=*/8);
+        TEST_ASSERT_MSG(result, "T20a: agentic, 3 tokens < 8 min -> failed=true");
+    }
+    // T20b: agentic_compressed, response one token short of min_tokens → failed.
+    {
+        bool result = compression_failed(7, false, true, 8);
+        TEST_ASSERT_MSG(result, "T20b: agentic, 7 < 8 -> failed=true");
+    }
+    // T20c: agentic_compressed, response exactly min_tokens → NOT failed (strict '<').
+    {
+        bool result = compression_failed(8, false, true, 8);
+        TEST_ASSERT_MSG(!result, "T20c: agentic, 8 == 8 -> failed=false");
+    }
+    // T20d: agentic_compressed, response far above min_tokens → NOT failed (normal turn).
+    {
+        bool result = compression_failed(100, false, true, 8);
+        TEST_ASSERT_MSG(!result, "T20d: agentic, 100 tokens, normal -> failed=false");
+    }
+    // T20e: agentic_compressed, degenerate_close set despite enough tokens → failed.
+    {
+        bool result = compression_failed(50, /*degenerate_close=*/true, true, 8);
+        TEST_ASSERT_MSG(result, "T20e: agentic, degenerate_close -> failed=true");
+    }
+    // T20f: agentic_compressed, empty response and degenerate_close together → failed.
+    {
+        bool result = compression_failed(0, true, true, 8);
+        TEST_ASSERT_MSG(result, "T20f: agentic, 0 tokens + degenerate -> failed=true");
+    }
+    // T20g: not agentic_compressed, empty response → NOT failed (guard does not apply).
+    {
+        bool result = compression_failed(0, false, /*agentic_compressed=*/false, 8);
+        TEST_ASSERT_MSG(!result, "T20g: not agentic_compressed, empty -> failed=false");
+    }
+    // T20h: not agentic_compressed, degenerate_close set → NOT failed (guard does not apply).
+    {
+        bool result = compression_failed(0, true, false, 8);
+        TEST_ASSERT_MSG(!result, "T20h: not agentic_compressed, degenerate -> failed=false");
+    }
+    // T20i: min_tokens omitted, 5 response tokens → failed, as with an explicit 8.
+    {
+        bool result = compression_failed(5, false, true);
+        TEST_ASSERT_MSG(result, "T20i: agentic, 5<8 with default min_tokens -> failed=true");
+    }
+    // T20j: min_tokens omitted, 8 response tokens → NOT failed, as with an explicit 8.
+    {
+        bool result = compression_failed(8, false, true);
+        TEST_ASSERT_MSG(!result, "T20j: agentic, 8 tokens with default min_tokens -> failed=false");
+    }
+}
+
 // ─── main ─────────────────────────────────────────────────────────────────────
 
 int main() {
@@ -290,6 +392,10 @@ int main() {
     std::fprintf(stderr, "--- detect_request_type ---\n");
     RUN_TEST(t18_detect_request_type);
 
+    std::fprintf(stderr, "--- floor clamp + compression_failed ---\n");
+    RUN_TEST(t19_clamp_keep_to_floor);
+    RUN_TEST(t20_compression_failed);
+
     std::fprintf(stderr, "\n%d tests, %d failures\n", test_count, test_failures);
     return (test_failures == 0) ? 0 : 1;
 }