From 38f92c4841e1de42d86ff3b0bfbe257548230fef Mon Sep 17 00:00:00 2001
From: Omar Baradei <omarbaradei21@gmail.com>
Date: Sat, 30 May 2026 11:17:25 -0700
Subject: [PATCH 1/3] fix(qwen35): fall back to AR decode when spec-decode
 emits empty output

dflash speculative decoding emits EOS as the very first token on certain
agentic "decision" turns, returning an empty completion (0 tokens,
finish=stop, no tool_call). The agent loop then stalls with nothing to
run -- the user-visible symptom is "dflash stops the moment it needs to
do something." At temperature 0, spec-decode must match AR greedy
decoding, so this is a spec-decode correctness bug (the batched target
verification diverges from AR at the first emitted position for these
contexts).

The failure was isolated to the spec-decode path with a reproducible eval
(stateless dflash-nocache lane, two full passes byte-identical, Jaccard
1.0): 9 of 55 real captured agentic turns produce empty output under
spec-decode, deterministically. The SAME turns produce correct non-empty
output on the AR path (no draft/ddtree) and on stock llama.cpp Q6 -- so
the bug is specific to the dflash spec-decode path, not the prompt or the
model weights.

Fix: at the two do_spec_decode call sites, if the call returns success
but emits zero tokens, fall back to do_ar_decode. The trigger is strictly
"0 tokens emitted", so healthy turns (which emit >=1 token) never reach
the fallback -- the blast radius is exactly the turns that currently fail
100% of the time. do_ar_decode is the existing autoregressive path,
verified correct here.

Validated (all turns replayed in isolation on the reproducible lane):
- 9/9 empty turns now produce non-empty output
- those 9 outputs are BYTE-IDENTICAL to the AR lane (fallback resumes
  from correct state)
- full 55-turn sweep: empty count 9 -> 0, zero regressions to empty,
  tool-call set unchanged
- prefix-cache oracle still 6/6 bit-identical
- 12-turn consecutive sequence on the stateful cache-on lane: 0 empties
  (no cross-turn state poisoning)

Spec-decode speed is retained for every non-degenerate turn; the slower
AR path runs only on the rare empty case (which was already failing).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 server/src/qwen35/qwen35_backend.cpp | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index 234ad374..894de93e 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -577,14 +577,23 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
         // without sacrificing spec-decode throughput for the bulk of
         // generation. Most requests never hit the tail because the
         // model closes </think> naturally well before the budget edge.
-        if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
+        {
+        bool _sd_ok = do_spec_decode(committed, req.n_gen, result.tokens, out_io,
                              result.accept_rate, result.spec_decode_ran,
                              req.hint_tokens, &req.budget_hook,
                              &result.budget_forced_close,
-                             &result.degenerate_decode_close)) {
+                             &result.degenerate_decode_close);
+        if (_sd_ok && result.tokens.empty()) {
+            // FIX: spec-decode degenerate empty (EOS as first token) on certain
+            // agentic turns -> fall back to AR decode, which is verified to produce
+            // correct non-empty output for exactly these contexts (temp-0 parity).
+            _sd_ok = do_ar_decode(committed, req.n_gen, result.tokens, out_io, req.budget_hook, &result.budget_forced_close, &result.degenerate_decode_close);
+        }
+        if (!_sd_ok) {
             result.error = "decode";
             return result;
         }
+        }
         result.decode_s = std::chrono::duration<double>(
             std::chrono::steady_clock::now() - t_decode_start).count();
     }
@@ -668,14 +677,23 @@ GenerateResult Qwen35Backend::restore_and_generate(int slot,
         // without sacrificing spec-decode throughput for the bulk of
         // generation. Most requests never hit the tail because the
         // model closes </think> naturally well before the budget edge.
-        if (!do_spec_decode(committed, req.n_gen, result.tokens, out_io,
+        {
+        bool _sd_ok = do_spec_decode(committed, req.n_gen, result.tokens, out_io,
                              result.accept_rate, result.spec_decode_ran,
                              req.hint_tokens, &req.budget_hook,
                              &result.budget_forced_close,
-                             &result.degenerate_decode_close)) {
+                             &result.degenerate_decode_close);
+        if (_sd_ok && result.tokens.empty()) {
+            // FIX: spec-decode degenerate empty (EOS as first token) on certain
+            // agentic turns -> fall back to AR decode, which is verified to produce
+            // correct non-empty output for exactly these contexts (temp-0 parity).
+            _sd_ok = do_ar_decode(committed, req.n_gen, result.tokens, out_io, req.budget_hook, &result.budget_forced_close, &result.degenerate_decode_close);
+        }
+        if (!_sd_ok) {
             result.error = "decode";
             return result;
         }
+        }
         result.decode_s = std::chrono::duration<double>(
             std::chrono::steady_clock::now() - t_decode_start).count();
     }

From 6c8db53ffedb96117bd6f29216cac5ef19a009b1 Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Sat, 30 May 2026 17:57:20 -0700
Subject: [PATCH 2/3] fix(qwen35): recover spec-decode agent stalls

---
 server/src/common/model_backend.h    |   6 ++
 server/src/qwen35/qwen35_backend.cpp | 146 ++++++++++++++++++++++++++-
 server/src/qwen35/qwen35_backend.h   |   3 +
 server/src/server/http_server.cpp    |  35 +++++++
 4 files changed, 185 insertions(+), 5 deletions(-)

diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h
index 182b5003..6182bf47 100644
--- a/server/src/common/model_backend.h
+++ b/server/src/common/model_backend.h
@@ -98,6 +98,12 @@ struct GenerateRequest {
     // When non-null, the spec decode loop uses these as draft overrides,
     // bypassing draft model computation for covered positions.
     const std::vector<int32_t> * hint_tokens = nullptr;
+    // Optional env-gated dflash stall recovery: when spec decode is about to
+    // emit early EOS after an action preamble, inject a bare tool-call XML
+    // prefix and continue in AR with KV state intact.
+    const std::vector<int32_t> * stall_tool_prefix_tokens = nullptr;
+    const std::vector<int32_t> * stall_action_suffix_tokens = nullptr;
+    const std::vector<int32_t> * stall_skip_tokens = nullptr;
     // Optional thinking-budget hook — see BudgetHook docs above.
     BudgetHook                 budget_hook;
 };
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index 894de93e..0364d91c 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -34,6 +34,27 @@ static float bf16_bits_to_f32(uint16_t bits) {
     v.u = (uint32_t)bits << 16;
     return v.f;
 }
+
+static bool tokens_contain(const std::vector<int32_t> & tokens,
+                           const std::vector<int32_t> & needle) {
+    if (needle.empty() || tokens.size() < needle.size()) return false;
+    return std::search(tokens.begin(), tokens.end(),
+                       needle.begin(), needle.end()) != tokens.end();
+}
+
+static bool tokens_have_recent_any(const std::vector<int32_t> & tokens,
+                                   const std::vector<int32_t> & candidates,
+                                   size_t max_trailing) {
+    if (tokens.empty() || candidates.empty()) return false;
+    for (size_t trailing = 0; trailing <= max_trailing; ++trailing) {
+        if (tokens.size() <= trailing) break;
+        const int32_t tok = tokens[tokens.size() - 1 - trailing];
+        if (std::find(candidates.begin(), candidates.end(), tok) != candidates.end()) {
+            return true;
+        }
+    }
+    return false;
+}
 }  // namespace
 
 #define IS_EOS_TOK(tok, w)                                         \
@@ -580,7 +601,11 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
         {
         bool _sd_ok = do_spec_decode(committed, req.n_gen, result.tokens, out_io,
                              result.accept_rate, result.spec_decode_ran,
-                             req.hint_tokens, &req.budget_hook,
+                             req.hint_tokens,
+                             req.stall_tool_prefix_tokens,
+                             req.stall_action_suffix_tokens,
+                             req.stall_skip_tokens,
+                             &req.budget_hook,
                              &result.budget_forced_close,
                              &result.degenerate_decode_close);
         if (_sd_ok && result.tokens.empty()) {
@@ -680,7 +705,11 @@ GenerateResult Qwen35Backend::restore_and_generate(int slot,
         {
         bool _sd_ok = do_spec_decode(committed, req.n_gen, result.tokens, out_io,
                              result.accept_rate, result.spec_decode_ran,
-                             req.hint_tokens, &req.budget_hook,
+                             req.hint_tokens,
+                             req.stall_tool_prefix_tokens,
+                             req.stall_action_suffix_tokens,
+                             req.stall_skip_tokens,
+                             &req.budget_hook,
                              &result.budget_forced_close,
                              &result.degenerate_decode_close);
         if (_sd_ok && result.tokens.empty()) {
@@ -1018,6 +1047,27 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
             }
         }
 
+        // MIN_TOKENS_BEFORE_EOS (env DFLASH_MIN_TOKENS, default 0=off): if the
+        // model tries to stop before producing N tokens in this decode call,
+        // suppress EOS and take the best NON-eos token instead. Targets the Q4
+        // 'preamble then stop, no tool_call' agentic stall. Env-gated so the
+        // default production lane is byte-for-byte unchanged.
+        {
+            static const int _min_floor = []{ const char* e = std::getenv("DFLASH_MIN_TOKENS"); return e ? std::atoi(e) : 0; }();
+            if (_min_floor > 0 && (int)out_tokens.size() < _min_floor && IS_EOS_TOK(next_tok, w_)) {
+                int alt = -1; float altbest = -1e30f;
+                for (int v = 0; v < vocab; v++) {
+                    if (IS_EOS_TOK(v, w_)) continue;
+                    if (logits_buf[v] > altbest) { altbest = logits_buf[v]; alt = v; }
+                }
+                if (alt >= 0) {
+                    FILE* _d = std::fopen("/tmp/dflash_floor.log", "a");
+                    if (_d) { std::fprintf(_d, "[floor] eos@%d -> alt=%d\n", (int)out_tokens.size(), alt); std::fclose(_d); }
+                    next_tok = alt;
+                }
+            }
+        }
+
         maybe_force_close(next_tok, committed);
 
         out_tokens.push_back(next_tok);
@@ -1118,6 +1168,9 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
                                     float & out_accept_rate,
                                     bool & out_spec_ran,
                                     const std::vector<int32_t> * hint_tokens,
+                                    const std::vector<int32_t> * stall_tool_prefix_tokens,
+                                    const std::vector<int32_t> * stall_action_suffix_tokens,
+                                    const std::vector<int32_t> * stall_skip_tokens,
                                     const BudgetHook * budget_hook,
                                     bool * forced_close_out,
                                     bool * degenerate_close_out) {
@@ -1156,6 +1209,10 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
     }
 
     out_spec_ran = true;
+    static const int _min_floor = []{
+        const char* e = std::getenv("DFLASH_MIN_TOKENS");
+        return e ? std::atoi(e) : 0;
+    }();
 
     // ── DFlash spec-decode: draft → verify → accept → replay ──────────
 
@@ -1360,7 +1417,6 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
             step_graph_destroy(draft_sg);
             return false;
         }
-        last_tok = replay_last_tok;
 
         // 7. Sync features for replayed range to mirror (needed for next draft step)
         if (use_remote_draft && cache_.target_feat) {
@@ -1375,20 +1431,100 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
 
         // 8. Emit committed tokens (stop at EOS)
         bool hit_eos = false;
+        bool floor_to_ar = false;
+        bool inject_tool_prefix = false;
         int emitted = 0;
         for (int i = 0; i < commit_n; i++) {
+            if (_min_floor > 0 && (int)out_tokens.size() < _min_floor &&
+                IS_EOS_TOK(replay_tok[i], w_)) {
+                const bool can_inject_tool =
+                    stall_tool_prefix_tokens && !stall_tool_prefix_tokens->empty() &&
+                    stall_action_suffix_tokens && !stall_action_suffix_tokens->empty() &&
+                    tokens_have_recent_any(out_tokens, *stall_action_suffix_tokens, 4) &&
+                    !(stall_skip_tokens && tokens_contain(out_tokens, *stall_skip_tokens));
+                if (can_inject_tool) {
+                    FILE* _d = std::fopen("/tmp/dflash_floor.log", "a");
+                    if (_d) {
+                        std::fprintf(_d,
+                            "[spec-tool-floor] eos@%d committed=%d emitted=%d prefix=%zu -> ar\n",
+                            (int)out_tokens.size(), committed, emitted,
+                            stall_tool_prefix_tokens->size());
+                        std::fclose(_d);
+                    }
+                    floor_to_ar = true;
+                    inject_tool_prefix = true;
+                    break;
+                }
+            }
             out_tokens.push_back(replay_tok[i]);
             io.emit(replay_tok[i]);
             emitted++;
             if (io.cancelled) break;
             if (IS_EOS_TOK(replay_tok[i], w_)) { hit_eos = true; break; }
         }
-        committed   += emitted;
+        int injected = 0;
+        if (floor_to_ar) {
+            if (!target->restore_kv()) {
+                step_graph_destroy(draft_sg);
+                return false;
+            }
+            cache_.cur_pos = committed;
+            if (emitted > 0) {
+                std::vector<int32_t> replay_prefix(replay_tok.begin(),
+                                                   replay_tok.begin() + emitted);
+                int prefix_last_tok = -1;
+                if (!target->verify_batch(replay_prefix, committed,
+                                          prefix_last_tok, nullptr)) {
+                    std::fprintf(stderr, "spec-decode: floor prefix replay failed\n");
+                    step_graph_destroy(draft_sg);
+                    return false;
+                }
+            }
+            committed += emitted;
+            cache_.cur_pos = committed;
+            if (inject_tool_prefix) {
+                int tool_prefix_last_tok = -1;
+                if (!target->verify_batch(*stall_tool_prefix_tokens, committed,
+                                          tool_prefix_last_tok, nullptr)) {
+                    std::fprintf(stderr, "spec-decode: tool prefix replay failed\n");
+                    step_graph_destroy(draft_sg);
+                    return false;
+                }
+                for (int32_t tok : *stall_tool_prefix_tokens) {
+                    out_tokens.push_back(tok);
+                    io.emit(tok);
+                }
+                injected = (int)stall_tool_prefix_tokens->size();
+                committed += injected;
+                cache_.cur_pos = committed;
+            }
+        } else {
+            last_tok = replay_last_tok;
+            committed += emitted;
+        }
         cache_.cur_pos = committed;
-        n_generated += emitted;
+        n_generated += emitted + injected;
         n_accept_sum += std::min(accept_n, emitted);
         n_draft_steps++;
         if (io.cancelled) break;
+        if (floor_to_ar) {
+            step_graph_destroy(draft_sg);
+            cache_.last_tok = out_tokens.empty() ? last_tok : out_tokens.back();
+            const int total_draft_pos = std::max(1, n_draft_steps * q_len);
+            out_accept_rate =
+                (float)((double)n_accept_sum / (double)total_draft_pos);
+            const int ar_n_gen = n_gen - n_generated;
+            if (ar_n_gen <= 0) {
+                io.emit(-1);
+                return true;
+            }
+            BudgetHook tail_hook = budget_hook ? *budget_hook : BudgetHook{};
+            bool ok = do_ar_decode(committed, ar_n_gen, out_tokens, io,
+                                    tail_hook, forced_close_out,
+                                    degenerate_close_out);
+            io.emit(-1);
+            return ok;
+        }
         if (hit_eos) break;
     }
 
diff --git a/server/src/qwen35/qwen35_backend.h b/server/src/qwen35/qwen35_backend.h
index fb9b8f60..f0884c38 100644
--- a/server/src/qwen35/qwen35_backend.h
+++ b/server/src/qwen35/qwen35_backend.h
@@ -227,6 +227,9 @@ class Qwen35Backend : public ModelBackend {
                         float & out_accept_rate,
                         bool & out_spec_ran,
                         const std::vector<int32_t> * hint_tokens = nullptr,
+                        const std::vector<int32_t> * stall_tool_prefix_tokens = nullptr,
+                        const std::vector<int32_t> * stall_action_suffix_tokens = nullptr,
+                        const std::vector<int32_t> * stall_skip_tokens = nullptr,
                         const BudgetHook * budget_hook = nullptr,
                         bool * forced_close_out = nullptr,
                         bool * degenerate_close_out = nullptr);
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index a89309dd..6bc8766f 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -1332,6 +1332,41 @@ void HttpServer::worker_loop() {
                 gen_req.hint_tokens = &hint_tokens_storage;
             }
         }
+        std::vector<int32_t> stall_tool_prefix_tokens_storage;
+        std::vector<int32_t> stall_action_suffix_tokens_storage;
+        std::vector<int32_t> stall_skip_tokens_storage;
+        if (!req.tools.empty() && std::getenv("DFLASH_STALL_TOOL_PREFIX")) {
+            bool has_terminal_tool = false;
+            for (const auto & tool : req.tools) {
+                if (!tool.contains("function") || !tool["function"].is_object()) continue;
+                if (tool["function"].value("name", "") == "terminal") {
+                    has_terminal_tool = true;
+                    break;
+                }
+            }
+            stall_tool_prefix_tokens_storage = tokenizer_.encode(
+                has_terminal_tool
+                    ? "\n<function=terminal>\n<parameter=command>\n"
+                    : "\n<function=");
+            stall_action_suffix_tokens_storage = tokenizer_.encode(":");
+            auto add_suffix_terminal = [&](const std::string & text) {
+                auto ids = tokenizer_.encode(text);
+                if (ids.empty()) return;
+                int32_t tok = ids.back();
+                if (std::find(stall_action_suffix_tokens_storage.begin(),
+                              stall_action_suffix_tokens_storage.end(), tok) ==
+                    stall_action_suffix_tokens_storage.end()) {
+                    stall_action_suffix_tokens_storage.push_back(tok);
+                }
+            };
+            add_suffix_terminal("`:");
+            add_suffix_terminal("):");
+            add_suffix_terminal("\":");
+            stall_skip_tokens_storage = tokenizer_.encode(" done");
+            gen_req.stall_tool_prefix_tokens = &stall_tool_prefix_tokens_storage;
+            gen_req.stall_action_suffix_tokens = &stall_action_suffix_tokens_storage;
+            gen_req.stall_skip_tokens = &stall_skip_tokens_storage;
+        }
 
         // Prefix cache: check for cached KV state.
         auto [cache_slot, prefix_len] = prefix_cache_.lookup(effective_prompt);

From 3ba401f0eefeac4d98ff2f9e177dca73757d3d5c Mon Sep 17 00:00:00 2001
From: Omar Baradei <omar@kostudios.io>
Date: Sat, 30 May 2026 20:20:48 -0700
Subject: [PATCH 3/3] fix(qwen35): bound residual dflash stall loops

---
 server/src/qwen35/qwen35_backend.cpp |  81 ++++++++++++++++++--
 server/src/server/http_server.cpp    | 106 +++++++++++++++++++++++----
 2 files changed, 167 insertions(+), 20 deletions(-)

diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index 0364d91c..eaa7f49a 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -35,11 +35,19 @@ static float bf16_bits_to_f32(uint16_t bits) {
     return v.f;
 }
 
-static bool tokens_contain(const std::vector<int32_t> & tokens,
-                           const std::vector<int32_t> & needle) {
+static bool tokens_contain_recent_sequence(const std::vector<int32_t> & tokens,
+                                           const std::vector<int32_t> & needle,
+                                           size_t max_trailing) {
     if (needle.empty() || tokens.size() < needle.size()) return false;
-    return std::search(tokens.begin(), tokens.end(),
-                       needle.begin(), needle.end()) != tokens.end();
+    const size_t last_start = tokens.size() - needle.size();
+    const size_t first_start =
+        last_start > max_trailing ? last_start - max_trailing : 0;
+    for (size_t start = first_start; start <= last_start; ++start) {
+        if (std::equal(needle.begin(), needle.end(), tokens.begin() + start)) {
+            return true;
+        }
+    }
+    return false;
 }
 
 static bool tokens_have_recent_any(const std::vector<int32_t> & tokens,
@@ -55,6 +63,13 @@ static bool tokens_have_recent_any(const std::vector<int32_t> & tokens,
     }
     return false;
 }
+
+static int env_int_or_default(const char * name, int fallback) {
+    if (const char * raw = std::getenv(name)) {
+        if (*raw) return std::atoi(raw);
+    }
+    return fallback;
+}
 }  // namespace
 
 #define IS_EOS_TOK(tok, w)                                         \
@@ -969,6 +984,13 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
 
     auto t_dec0_ar = std::chrono::steady_clock::now();
     const size_t out_tokens_at_entry = out_tokens.size();
+    static const int _min_floor = env_int_or_default("DFLASH_MIN_TOKENS", 0);
+    static const int _repeat_guard = []{
+        const int explicit_guard =
+            env_int_or_default("DFLASH_DEGENERATE_RUN_TOKENS", -1);
+        if (explicit_guard >= 0) return explicit_guard;
+        return env_int_or_default("DFLASH_MIN_TOKENS", 0) > 0 ? 32 : 0;
+    }();
 
     const int hidden = w_.n_embd;
     const int vocab  = w_.n_vocab;
@@ -1053,7 +1075,6 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
         // 'preamble then stop, no tool_call' agentic stall. Env-gated so the
         // default production lane is byte-for-byte unchanged.
         {
-            static const int _min_floor = []{ const char* e = std::getenv("DFLASH_MIN_TOKENS"); return e ? std::atoi(e) : 0; }();
             if (_min_floor > 0 && (int)out_tokens.size() < _min_floor && IS_EOS_TOK(next_tok, w_)) {
                 int alt = -1; float altbest = -1e30f;
                 for (int v = 0; v < vocab; v++) {
@@ -1078,6 +1099,22 @@ bool Qwen35Backend::do_ar_decode(int committed, int n_gen,
 
         if (IS_EOS_TOK(next_tok, w_)) break;
 
+        if (_repeat_guard > 0 && (int)out_tokens.size() >= _repeat_guard) {
+            int run = 1;
+            for (int j = (int)out_tokens.size() - 2; j >= 0; --j) {
+                if (out_tokens[j] != next_tok) break;
+                run++;
+            }
+            if (run >= _repeat_guard) {
+                std::fprintf(stderr,
+                    "[degenerate-decode] token %d repeated %d times - "
+                    "breaking AR loop at committed=%d\n",
+                    next_tok, run, committed);
+                if (degenerate_close_out) *degenerate_close_out = true;
+                break;
+            }
+        }
+
         // Degenerate-decode watchdog. Once we're past the budget-hook's
         // close sequence (model in post-`</think>` content phase), watch
         // for repetition loops. The aime2025-02 case at think_max=4k
@@ -1283,6 +1320,26 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
             }
         }
 
+        if (last_tok < 0 && !out_tokens.empty()) {
+            std::fprintf(stderr,
+                "[spec-decode] invalid draft seed %d after %d emitted tokens; "
+                "switching to AR\n",
+                last_tok, (int)out_tokens.size());
+            step_graph_destroy(draft_sg);
+            cache_.last_tok = out_tokens.back();
+            const int ar_n_gen = n_gen - n_generated;
+            if (ar_n_gen <= 0) {
+                io.emit(-1);
+                return true;
+            }
+            BudgetHook tail_hook = budget_hook ? *budget_hook : BudgetHook{};
+            bool ok = do_ar_decode(committed, ar_n_gen, out_tokens, io,
+                                    tail_hook, forced_close_out,
+                                    degenerate_close_out);
+            io.emit(-1);
+            return ok;
+        }
+
         // 1. Build noise input for draft
         noise_ids[0] = last_tok;
         for (int i = 1; i < q_len; i++) noise_ids[i] = target->mask_token_id();
@@ -1433,15 +1490,25 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
         bool hit_eos = false;
         bool floor_to_ar = false;
         bool inject_tool_prefix = false;
+        constexpr size_t kActionSuffixLookback = 16;
+        constexpr size_t kSkipSequenceLookback = 64;
         int emitted = 0;
         for (int i = 0; i < commit_n; i++) {
             if (_min_floor > 0 && (int)out_tokens.size() < _min_floor &&
                 IS_EOS_TOK(replay_tok[i], w_)) {
+                // Action preambles often end as "I'll check:\n\n" before EOS.
+                // Tokenization makes the colon several tokens back, so keep a
+                // modest trailing window while still requiring a recent action
+                // suffix and no nearby completion phrase.
                 const bool can_inject_tool =
                     stall_tool_prefix_tokens && !stall_tool_prefix_tokens->empty() &&
                     stall_action_suffix_tokens && !stall_action_suffix_tokens->empty() &&
-                    tokens_have_recent_any(out_tokens, *stall_action_suffix_tokens, 4) &&
-                    !(stall_skip_tokens && tokens_contain(out_tokens, *stall_skip_tokens));
+                    tokens_have_recent_any(out_tokens, *stall_action_suffix_tokens,
+                                           kActionSuffixLookback) &&
+                    !(stall_skip_tokens &&
+                      tokens_contain_recent_sequence(out_tokens,
+                                                     *stall_skip_tokens,
+                                                     kSkipSequenceLookback));
                 if (can_inject_tool) {
                     FILE* _d = std::fopen("/tmp/dflash_floor.log", "a");
                     if (_d) {
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 6bc8766f..85e22ac8 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -10,7 +10,9 @@
 #include <algorithm>
 #include <cerrno>
 #include <chrono>
+#include <cctype>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 
 #include <arpa/inet.h>
@@ -77,6 +79,90 @@ static size_t json_array_size(const json & value) {
     return value.is_array() ? value.size() : 0;
 }
 
+static bool env_flag_enabled(const char * name) {
+    const char * raw = std::getenv(name);
+    if (!raw || !*raw) return false;
+    std::string value(raw);
+    std::transform(value.begin(), value.end(), value.begin(),
+                   [](unsigned char c) { return (char)std::tolower(c); });
+    return value != "0" && value != "false" && value != "no" &&
+           value != "off";
+}
+
+static const json * find_tool_function(const json & tools,
+                                       const std::string & name) {
+    if (!tools.is_array() || name.empty()) return nullptr;
+    for (const auto & tool : tools) {
+        if (!tool.contains("function") || !tool["function"].is_object()) {
+            continue;
+        }
+        const json & fn = tool["function"];
+        if (fn.value("name", "") == name) return &fn;
+    }
+    return nullptr;
+}
+
+static std::string first_tool_parameter_name(const json & function_def) {
+    const auto & params = function_def.value("parameters", json::object());
+    if (params.contains("required") && params["required"].is_array()) {
+        for (const auto & name : params["required"]) {
+            if (name.is_string()) return name.get<std::string>();
+        }
+    }
+    if (params.contains("properties") && params["properties"].is_object()) {
+        for (const auto & item : params["properties"].items()) {
+            return item.key();
+        }
+    }
+    return "";
+}
+
+static const json * select_stall_recovery_function(const json & tools,
+                                                   const json & tool_choice) {
+    if (!tools.is_array() || tools.empty()) return nullptr;
+
+    if (tool_choice.is_object() && tool_choice.contains("function") &&
+        tool_choice["function"].is_object()) {
+        const std::string forced_name =
+            tool_choice["function"].value("name", "");
+        // If the request forced a concrete function, recovery must honor it;
+        // falling back to terminal here would synthesize invalid tool XML.
+        return find_tool_function(tools, forced_name);
+    }
+
+    if (tool_choice.is_string() && tool_choice.get<std::string>() == "required" &&
+        tools.size() == 1 && tools[0].contains("function") &&
+        tools[0]["function"].is_object()) {
+        return &tools[0]["function"];
+    }
+
+    if (const json * terminal = find_tool_function(tools, "terminal")) {
+        return terminal;
+    }
+    if (tools.size() == 1 && tools[0].contains("function") &&
+        tools[0]["function"].is_object()) {
+        return &tools[0]["function"];
+    }
+    return nullptr;
+}
+
+static std::string build_stall_tool_prefix(const json & tools,
+                                           const json & tool_choice) {
+    const json * function_def =
+        select_stall_recovery_function(tools, tool_choice);
+    if (!function_def) return "\n<function=";
+
+    const std::string name = function_def->value("name", "");
+    if (name.empty()) return "\n<function=";
+
+    std::string prefix = "\n<function=" + name + ">\n";
+    std::string param = first_tool_parameter_name(*function_def);
+    if (!param.empty()) {
+        prefix += "<parameter=" + param + ">\n";
+    }
+    return prefix;
+}
+
 // Build the /props response body.
 //
 // Non-static so unit tests can call it directly (declared in http_server.h).
@@ -1335,19 +1421,10 @@ void HttpServer::worker_loop() {
         std::vector<int32_t> stall_tool_prefix_tokens_storage;
         std::vector<int32_t> stall_action_suffix_tokens_storage;
         std::vector<int32_t> stall_skip_tokens_storage;
-        if (!req.tools.empty() && std::getenv("DFLASH_STALL_TOOL_PREFIX")) {
-            bool has_terminal_tool = false;
-            for (const auto & tool : req.tools) {
-                if (!tool.contains("function") || !tool["function"].is_object()) continue;
-                if (tool["function"].value("name", "") == "terminal") {
-                    has_terminal_tool = true;
-                    break;
-                }
-            }
-            stall_tool_prefix_tokens_storage = tokenizer_.encode(
-                has_terminal_tool
-                    ? "\n<function=terminal>\n<parameter=command>\n"
-                    : "\n<function=");
+        if (!req.tools.empty() && env_flag_enabled("DFLASH_STALL_TOOL_PREFIX")) {
+            stall_tool_prefix_tokens_storage =
+                tokenizer_.encode(build_stall_tool_prefix(req.tools,
+                                                          req.tool_choice));
             stall_action_suffix_tokens_storage = tokenizer_.encode(":");
             auto add_suffix_terminal = [&](const std::string & text) {
                 auto ids = tokenizer_.encode(text);
@@ -1740,6 +1817,9 @@ void HttpServer::worker_loop() {
                         effective_finish_reason = "length";
                     }
                 }
+                if (result.degenerate_decode_close) {
+                    effective_finish_reason = "length";
+                }
                 json choice = {
                     {"index", 0}, {"message", msg},
                     {"finish_reason", effective_finish_reason}