diff --git a/.gitattributes b/.gitattributes
index 592097d1..45e8d6ec 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -5,3 +5,4 @@ assets/banner.png -filter -diff -merge -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
 *.mp4 filter=lfs diff=lfs merge=lfs -text
 *.webm filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2a87f88a..72eb9cce 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,6 +26,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: recursive
+          lfs: true
           token: ${{ secrets.SUBMODULE_PAT || secrets.GITHUB_TOKEN }}
 
       - uses: Jimver/cuda-toolkit@v0.2.35
@@ -55,7 +56,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release
           cmake --build build --target \
             test_dflash test_generate test_flash_attn_sparse \
-            dflash_server test_server_unit \
+            dflash_server test_server_unit replay_http_server \
             -j$(nproc)
 
       - name: Run C++ server unit tests
@@ -70,6 +71,19 @@ jobs:
         # in the optional `megakernel` extra so its build does NOT run yet.
         run: uv sync --frozen
 
+      - name: Run CPU integration tests (stub backend, no GPU)
+        # End-to-end exercise of HttpServer + render_chat_template +
+        # SseEmitter with a deterministic stub model backend. No GPU
+        # required: the spike driver runs under CUDA_VISIBLE_DEVICES=""
+        # and the tokenizer fixture is a stripped Qwen3.6 GGUF (metadata
+        # only). Covers the regression class from PR #308 end-to-end —
+        # streaming and non-streaming, OpenAI and Anthropic formats.
+        env:
+          CUDA_VISIBLE_DEVICES: ""
+        run: |
+          uv run --frozen --with pytest --with requests \
+            pytest -v server/test/test_stub_integration.py
+
       - name: Build megakernel via uv sync (sm_75)
         env:
           CUDA_HOME: ${{ env.CUDA_PATH }}
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index 71298ff6..25c7063d 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -769,6 +769,39 @@ if(DFLASH27B_TESTS)
         endif()
     endif()
 
+    # ─── replay_http_server: CPU-only HttpServer test driver ────────────
+    # Wires Tokenizer + ScenarioStore + StubModelBackend + HttpServer to
+    # serve real HTTP requests on the wire, replaying scripted token
+    # streams from JSON scenario files. Links dflash_common (which
+    # includes CUDA-compiled TUs) but never instantiates a real
+    # ModelBackend, so CUDA_VISIBLE_DEVICES="" is supported. Driven by
+    # test_stub_integration.py.
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/replay_http_server.cpp")
+        add_executable(replay_http_server
+            test/replay_http_server.cpp
+            test/scenario_store.cpp
+            test/stub_model_backend.cpp
+            src/server/http_server.cpp
+            src/server/model_card.cpp)
+        target_include_directories(replay_http_server PRIVATE
+            ${DFLASH27B_SRC_INCLUDE_DIRS}
+            ${CMAKE_CURRENT_SOURCE_DIR}/test)
+        if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
+            target_compile_definitions(replay_http_server PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
+        else()
+            target_compile_definitions(replay_http_server PRIVATE
+                DFLASH27B_BACKEND_CUDA=1
+                DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
+        endif()
+        target_link_libraries(replay_http_server PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread)
+        if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
+            find_package(CUDAToolkit REQUIRED)
+            target_link_libraries(replay_http_server PRIVATE CUDA::cudart)
+        else()
+            target_link_libraries(replay_http_server PRIVATE hip::host)
+        endif()
+    endif()
+
     # ─── Unit tests (no GPU, no model files) ────────────────────────────
     enable_testing()
 
diff --git a/server/scripts/test_server_integration.py b/server/scripts/test_server_integration.py
index 08da6ced..43535203 100644
--- a/server/scripts/test_server_integration.py
+++ b/server/scripts/test_server_integration.py
@@ -490,7 +490,12 @@ def test_thinking_disabled_by_default(self):
 
     @pytest.mark.slow
     def test_thinking_enabled_via_chat_template_kwargs(self):
-        """Enabling thinking should produce reasoning_content."""
+        """Enabling thinking must route reasoning into reasoning_content,
+        not leak it into content. Regression guard for the Qwen3.6/Laguna
+        pre-opened-<think> bug: the chat template appends `<think>` to the
+        prompt suffix, so the model emits reasoning directly with no
+        opening tag. If the renderer→emitter wiring drops, reasoning_content
+        stays empty and the raw reasoning text appears in content."""
         r = post_json("/v1/chat/completions", {
             "model": MODEL_NAME,
             "messages": [{"role": "user", "content": "What is 15 * 17?"}],
@@ -500,13 +505,26 @@ def test_thinking_enabled_via_chat_template_kwargs(self):
         })
         assert r.status_code == 200
         msg = r.json()["choices"][0]["message"]
-        assert msg["content"]
-        # With thinking enabled, model may produce reasoning_content
-        # (not guaranteed for short prompts, so we just check it doesn't crash)
+        reasoning = msg.get("reasoning_content") or ""
+        content = msg.get("content") or ""
+        assert reasoning, (
+            f"reasoning_content empty with enable_thinking=True — "
+            f"renderer→emitter wiring likely broken. content={content[:200]!r}"
+        )
+        assert "<think>" not in reasoning and "</think>" not in reasoning, (
+            f"raw think tags leaked into reasoning_content: {reasoning[:200]!r}"
+        )
+        assert "<think>" not in content and "</think>" not in content, (
+            f"think tags leaked into content channel: {content[:200]!r}"
+        )
+        assert content, "content channel empty — model never closed </think>"
 
     @pytest.mark.slow
     def test_thinking_enabled_via_reasoning_effort(self):
-        """OpenAI Responses-style reasoning.effort field."""
+        """OpenAI Responses-style reasoning.effort=high must also route
+        reasoning to reasoning_content. Same regression class as above
+        but reached through a different request shape (effort→template
+        kwargs translation in http_server.cpp)."""
         r = post_json("/v1/chat/completions", {
             "model": MODEL_NAME,
             "messages": [{"role": "user", "content": "What is 15 * 17?"}],
@@ -516,7 +534,15 @@ def test_thinking_enabled_via_reasoning_effort(self):
         })
         assert r.status_code == 200
         msg = r.json()["choices"][0]["message"]
-        assert msg["content"]
+        reasoning = msg.get("reasoning_content") or ""
+        content = msg.get("content") or ""
+        assert reasoning, (
+            f"reasoning_content empty with reasoning.effort=high — "
+            f"renderer→emitter wiring likely broken. content={content[:200]!r}"
+        )
+        assert "<think>" not in reasoning and "</think>" not in reasoning
+        assert "<think>" not in content and "</think>" not in content
+        assert content
 
 
 # ═══════════════════════════════════════════════════════════════════
diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp
index 1349109a..d4bf3e03 100644
--- a/server/src/server/chat_template.cpp
+++ b/server/src/server/chat_template.cpp
@@ -51,7 +51,7 @@ ChatFormat chat_format_for_arch(const std::string & arch) {
     return ChatFormat::QWEN3;
 }
 
-std::string render_chat_template(
+PromptRenderResult render_chat_template(
     const std::vector<ChatMessage> & messages,
     ChatFormat format,
     bool add_generation_prompt,
@@ -59,6 +59,10 @@ std::string render_chat_template(
     const std::string & tools_json)
 {
     std::string result;
+    // `started_in_thinking` is derived deterministically from the template
+    // branch + render flags below. Set per format inside the switch so a
+    // future format addition can't silently miss the wiring.
+    bool started_in_thinking = false;
     bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null";
 
     switch (format) {
@@ -141,6 +145,14 @@ std::string render_chat_template(
                 // even when the client opts in, defeating the thinking-budget
                 // mechanism entirely.
                 result += "<think>\n";
+                // The prompt suffix pre-opens `<think>` — the model's very
+                // first generated token is reasoning, never preceded by an
+                // explicit `<think>` opener in the stream. Callers must
+                // start the SSE state machine in REASONING mode and pass
+                // `started_in_thinking=true` to parse_reasoning() so that
+                // reasoning text routes to reasoning_content instead of
+                // leaking into content.
+                started_in_thinking = true;
             }
         }
         break;
@@ -224,6 +236,11 @@ std::string render_chat_template(
             result += "<assistant>\n";
             if (enable_thinking) {
                 result += "<think>";
+                // Same situation as Qwen3.6: Laguna XS.2's enable_thinking
+                // generation prompt ends with `<think>` so the model starts
+                // emitting reasoning tokens with no explicit opener in the
+                // stream. Route subsequent tokens to the reasoning channel.
+                started_in_thinking = true;
             } else {
                 // Empty think block — model jumps straight to answer.
                 result += "</think>";
@@ -311,11 +328,17 @@ std::string render_chat_template(
                 result += "<|channel>thought\n<channel|>";
             }
         }
+        // Gemma4 does NOT pre-open `<think>` from the prompt; its
+        // reasoning channel is opened by the model emitting `<|channel>`
+        // which http_server forwards into the SseEmitter as the text
+        // `<think>` — so the emitter's existing CONTENT→REASONING
+        // transition fires on that synthesized opener. started_in_thinking
+        // stays false (initial CONTENT mode is correct).
         break;
     }
     }
 
-    return result;
+    return PromptRenderResult{std::move(result), started_in_thinking};
 }
 
 // ─── Jinja path ─────────────────────────────────────────────────────────
@@ -353,7 +376,29 @@ static std::shared_ptr<jinja::program> get_or_parse(const std::string & template
 
 }  // namespace
 
-std::string render_chat_template_jinja(
+// Sniff a rendered prompt for a trailing `<think>` opener so the caller
+// can route subsequent stream tokens to the reasoning channel. Accepts
+// optional whitespace after the opener (Qwen3.6 emits `<think>\n`).
+// True positive ⇒ caller should treat the prompt as having pre-opened
+// the reasoning channel (and the renderer warns loudly so a model-card
+// mismatch is visible at runtime).
+static bool prompt_ends_with_think_open(const std::string & s) {
+    static const std::string OPEN = "<think>";
+    // Walk back over trailing ASCII whitespace.
+    size_t end = s.size();
+    while (end > 0) {
+        char c = s[end - 1];
+        if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
+            end--;
+        } else {
+            break;
+        }
+    }
+    if (end < OPEN.size()) return false;
+    return s.compare(end - OPEN.size(), OPEN.size(), OPEN) == 0;
+}
+
+PromptRenderResult render_chat_template_jinja(
     const std::string & template_src,
     const std::vector<ChatMessage> & messages,
     const std::string & bos_token,
@@ -407,14 +452,37 @@ std::string render_chat_template_jinja(
         throw std::runtime_error(std::string("jinja global_from_json: ") + e.what());
     }
 
+    std::string rendered;
     try {
         jinja::runtime rt(ctx);
         jinja::value results = rt.execute(*prog);
         auto parts = jinja::runtime::gather_string_parts(results);
-        return parts->as_string().str();
+        rendered = parts->as_string().str();
     } catch (const std::exception & e) {
         throw std::runtime_error(std::string("jinja runtime: ") + e.what());
     }
+
+    // Jinja path: we don't know which template family the caller passed
+    // in, so derive `started_in_thinking` by sniffing the rendered tail
+    // for a `<think>` opener. This catches the common Qwen3.6 / Laguna
+    // chat templates that end with `<think>\n` when enable_thinking is
+    // honored, plus any custom template that follows the same convention.
+    //
+    // Warn loudly when sniffing decides true so a template/model-card
+    // mismatch (e.g. enable_thinking=false but template hard-codes
+    // `<think>` anyway) surfaces in server logs.
+    bool started_in_thinking =
+        enable_thinking && add_generation_prompt &&
+        prompt_ends_with_think_open(rendered);
+    if (started_in_thinking) {
+        std::fprintf(stderr,
+            "[WARN] render_chat_template_jinja: rendered prompt ends with "
+            "`<think>` opener — treating as started_in_thinking=true. If "
+            "this is unexpected, check the template's enable_thinking "
+            "branch or the model card's reasoning configuration.\n");
+    }
+
+    return PromptRenderResult{std::move(rendered), started_in_thinking};
 }
 
 }  // namespace dflash::common
diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h
index ca7ef9db..770e65a4 100644
--- a/server/src/server/chat_template.h
+++ b/server/src/server/chat_template.h
@@ -27,6 +27,23 @@ enum class ChatFormat {
     GEMMA4,    // <bos><|turn>role\n...<turn|>\n
 };
 
+// Provenance for a rendered prompt. `text` is the byte string that gets
+// tokenized; `started_in_thinking` records whether the prompt suffix
+// pre-opens a `<think>` block (or equivalent reasoning-channel marker)
+// that the model is expected to continue into.
+//
+// Callers route this into the SseEmitter's initial mode and into
+// parse_reasoning()'s `started_in_thinking` argument so reasoning text
+// emitted before any explicit `<think>` opener is still attributed to
+// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna
+// enable_thinking prompts (which pre-open `<think>\n` in the assistant
+// turn) cause the model to emit reasoning straight into the content
+// channel, leaving `reasoning_content` empty.
+struct PromptRenderResult {
+    std::string text;            // rendered prompt text, ready to tokenize
+    bool started_in_thinking;    // prompt suffix opens reasoning channel
+};
+
 // Render chat messages into the model-specific prompt string.
 // The result is plain text ready to be tokenized.
 //
@@ -40,7 +57,7 @@ enum class ChatFormat {
 // `tools_json` is an optional JSON string containing the tool definitions
 // array. When non-empty, the Qwen3/3.5 template injects a tool preamble
 // into the system message instructing the model how to emit <tool_call> tags.
-std::string render_chat_template(
+PromptRenderResult render_chat_template(
     const std::vector<ChatMessage> & messages,
     ChatFormat format,
     bool add_generation_prompt = true,
@@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch);
 // Internally caches the most recently parsed program per thread (avoids
 // re-parsing the template on every request). Throws std::runtime_error on
 // lexer/parser/runtime failure (caller should surface a 500 response).
-std::string render_chat_template_jinja(
+PromptRenderResult render_chat_template_jinja(
     const std::string & template_src,
     const std::vector<ChatMessage> & messages,
     const std::string & bos_token,
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index a89309dd..311a5102 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -1009,7 +1009,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
             tools_json = req.tools.dump();
         }
 
-        std::string rendered;
+        PromptRenderResult render_result;
         if (!config_.chat_template_src.empty()) {
             // Jinja path: caller supplied a chat template file via
             // --chat-template-file. Override the hardcoded QWEN3/LAGUNA
@@ -1026,7 +1026,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                 ? tokenizer_.raw_token(tokenizer_.eos_id())
                 : std::string();
             try {
-                rendered = render_chat_template_jinja(
+                render_result = render_chat_template_jinja(
                     config_.chat_template_src,
                     chat_msgs,
                     bos_str,
@@ -1040,11 +1040,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                 return true;
             }
         } else {
-            rendered = render_chat_template(chat_msgs, chat_format_,
-                                            true, enable_thinking,
-                                            tools_json);
+            render_result = render_chat_template(chat_msgs, chat_format_,
+                                                 true, enable_thinking,
+                                                 tools_json);
         }
-        req.prompt_tokens = tokenizer_.encode(rendered);
+        // Propagate prompt provenance so the SseEmitter's initial mode
+        // matches the template's pre-opened reasoning channel (Qwen3.6 /
+        // Laguna enable_thinking case). Without this, reasoning text
+        // leaks into the content channel and `reasoning_content` stays
+        // empty — see fix(server): route Qwen3.6/Laguna think-mode
+        // reasoning to reasoning_content channel.
+        req.started_in_thinking = render_result.started_in_thinking;
+        req.prompt_tokens = tokenizer_.encode(render_result.text);
 
         // count_tokens: short-circuit after tokenization. Skip generation
         // entirely — Anthropic's contract is just `{"input_tokens": N}`.
@@ -1149,11 +1156,20 @@ void HttpServer::worker_loop() {
             }
         }
 
-        // Create SSE emitter for streaming state machine.
+        // Create SSE emitter for streaming state machine. `initial_mode`
+        // tracks whether the chat-template prompt pre-opened a `<think>`
+        // block (Qwen3.6 / Laguna enable_thinking path). When true, the
+        // emitter starts in REASONING so the model's first generated
+        // token routes to reasoning_content even though no explicit
+        // `<think>` opener appears in the token stream.
+        const StreamMode initial_mode = req.started_in_thinking
+            ? StreamMode::REASONING
+            : StreamMode::CONTENT;
         SseEmitter emitter(req.format, req.response_id, req.model,
                            (int)req.prompt_tokens.size(), req.tools,
                            &tool_memory_,
-                           req.stop_sequences);
+                           req.stop_sequences,
+                           initial_mode);
 
         // Emit initial SSE events.
         if (req.stream) {
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index 999eb5d9..bf5477f6 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -200,6 +200,12 @@ struct ParsedRequest {
     std::vector<std::string>  stop_sequences;
     // Bandit: per-session adaptive keep_ratio opt-in
     std::string               session_id;
+    // Set by the chat-template renderer when the rendered prompt suffix
+    // pre-opens a `<think>` block (Qwen3.6 / Laguna enable_thinking path).
+    // Drives the SseEmitter's initial mode so reasoning tokens emitted
+    // before any explicit `<think>` opener route to reasoning_content
+    // instead of leaking into content.
+    bool                      started_in_thinking = false;
 };
 
 // Build the /props response body. Exposed (non-static) so unit tests
diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp
index 604f11a7..beae530b 100644
--- a/server/src/server/sse_emitter.cpp
+++ b/server/src/server/sse_emitter.cpp
@@ -76,15 +76,16 @@ SseEmitter::SseEmitter(ApiFormat format,
                        int prompt_tokens,
                        const json & tools,
                        ToolMemory * tool_memory,
-                       const std::vector<std::string> & stop_sequences)
+                       const std::vector<std::string> & stop_sequences,
+                       StreamMode initial_mode)
     : format_(format)
     , request_id_(request_id)
     , model_name_(model_name)
     , prompt_tokens_(prompt_tokens)
     , tools_(tools)
     , tool_memory_(tool_memory)
-    , mode_(StreamMode::CONTENT)
-    , active_kind_("text")
+    , mode_(initial_mode)
+    , active_kind_(initial_mode == StreamMode::REASONING ? "thinking" : "text")
     , stop_sequences_(stop_sequences)
     , created_at_(unix_timestamp())
     , msg_item_id_(gen_item_id())
@@ -93,6 +94,12 @@ SseEmitter::SseEmitter(ApiFormat format,
     for (const auto & s : stop_sequences_) {
         if (s.size() > stop_holdback_) stop_holdback_ = s.size();
     }
+    // NOTE on `checked_think_prefix_`: we deliberately leave the default
+    // (false) here even when initial_mode == REASONING. The emitter has a
+    // one-time guard in emit_token() that strips a redundantly-emitted
+    // leading `<think>` if the model emits one anyway (model-card /
+    // template-mismatch edge case). Pre-setting the flag to true would
+    // skip that strip and leak the duplicate opener into reasoning_text.
 }
 
 // ─── SSE formatting helpers ─────────────────────────────────────────────
diff --git a/server/src/server/sse_emitter.h b/server/src/server/sse_emitter.h
index 4710b8d4..fcc40d9c 100644
--- a/server/src/server/sse_emitter.h
+++ b/server/src/server/sse_emitter.h
@@ -54,13 +54,27 @@ nlohmann::json build_timings_json(const GenTimings & t, int completion_tokens);
 // Manages SSE streaming for a single request.
 class SseEmitter {
 public:
+    // `initial_mode` defaults to CONTENT for backward compatibility. Pass
+    // StreamMode::REASONING when the chat-template prompt suffix pre-opens
+    // a `<think>` block (Qwen3.6 / Laguna enable_thinking path): the
+    // model's first generated token is reasoning, never preceded by an
+    // explicit `<think>` opener in the stream. Without this hint the
+    // emitter would route reasoning text to the content channel and
+    // reasoning_content would stay empty.
+    //
+    // Note: the leading-`<think>` strip guard (`checked_think_prefix_`)
+    // remains active when we start in REASONING mode — if the model
+    // *does* emit a redundant `<think>` opener anyway, the guard still
+    // strips it. Pre-setting checked_think_prefix_=true here would let a
+    // duplicate `<think>` leak into reasoning_text in that edge case.
     SseEmitter(ApiFormat format,
                const std::string & request_id,
                const std::string & model_name,
                int prompt_tokens,
                const json & tools,
                ToolMemory * tool_memory,
-               const std::vector<std::string> & stop_sequences = {});
+               const std::vector<std::string> & stop_sequences = {},
+               StreamMode initial_mode = StreamMode::CONTENT);
 
     // Emit the initial SSE events (role delta, message_start, etc.)
     // Returns the formatted SSE strings to send.
diff --git a/server/test/fixtures/qwen3.6-tokenizer.gguf b/server/test/fixtures/qwen3.6-tokenizer.gguf
new file mode 100644
index 00000000..967827d3
--- /dev/null
+++ b/server/test/fixtures/qwen3.6-tokenizer.gguf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b420704f204792f14b919b3d52effa839cc7a0739844c2fd2887f6400b045aa
+size 10941620
diff --git a/server/test/replay_http_server.cpp b/server/test/replay_http_server.cpp
new file mode 100644
index 00000000..174bba87
--- /dev/null
+++ b/server/test/replay_http_server.cpp
@@ -0,0 +1,120 @@
+// CPU-only test driver for dflash HttpServer.
+//
+// Boots the server with the real Qwen3.6 tokenizer (loaded from a GGUF —
+// vocab/metadata only, never any GPU weights) and a deterministic
+// StubModelBackend whose generate() replays scripted token streams from
+// JSON scenario files. The chain
+//   real chat template renderer → real ParsedRequest plumbing →
+//   real SseEmitter wiring → real socket writes
+// is fully exercised; only the per-token sample comes from the stub.
+//
+// Runs without a GPU: link against dflash_common (CUDA TUs included), but
+// because no real ModelBackend is instantiated, ggml_cuda_init() is never
+// called. CUDA_VISIBLE_DEVICES="" is the supported test configuration.
+//
+// Usage:
+//   CUDA_VISIBLE_DEVICES="" ./replay_http_server \
+//       <vocab.gguf> --scenarios <dir> [--port 9999]
+//
+// See server/test/scenarios/*.json for the scenario file schema.
+
+#include "server/http_server.h"
+#include "server/tokenizer.h"
+#include "server/chat_template.h"
+#include "common/model_backend.h"
+#include "scenario_store.h"
+#include "stub_model_backend.h"
+
+#include <csignal>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+using namespace dflash::common;
+
+namespace {
+
+HttpServer * g_server = nullptr;
+
+void signal_handler(int) {
+    if (g_server) g_server->request_stop();
+}
+
+}  // namespace
+
+int main(int argc, char ** argv) {
+    const char * gguf_path     = nullptr;
+    const char * scenarios_dir = nullptr;
+    int          port          = 9999;
+    int          max_ctx       = 4096;
+
+    for (int i = 1; i < argc; ++i) {
+        if (std::strcmp(argv[i], "--scenarios") == 0 && i + 1 < argc) {
+            scenarios_dir = argv[++i];
+        } else if (std::strcmp(argv[i], "--port") == 0 && i + 1 < argc) {
+            port = std::atoi(argv[++i]);
+        } else if (std::strcmp(argv[i], "--max-ctx") == 0 && i + 1 < argc) {
+            max_ctx = std::atoi(argv[++i]);
+        } else if (argv[i][0] != '-' && !gguf_path) {
+            gguf_path = argv[i];
+        } else {
+            std::fprintf(stderr,
+                "usage: %s <vocab.gguf> --scenarios <dir> [--port N] [--max-ctx N]\n",
+                argv[0]);
+            return 1;
+        }
+    }
+    if (!gguf_path) {
+        std::fprintf(stderr, "[driver] missing positional <vocab.gguf>\n");
+        return 1;
+    }
+
+    const char * cvd = std::getenv("CUDA_VISIBLE_DEVICES");
+    std::fprintf(stderr, "[driver] CUDA_VISIBLE_DEVICES=%s\n",
+                 cvd ? cvd : "(unset)");
+
+    Tokenizer tokenizer;
+    if (!tokenizer.load_from_gguf(gguf_path)) {
+        std::fprintf(stderr, "[driver] tokenizer load failed: %s\n", gguf_path);
+        return 2;
+    }
+    std::fprintf(stderr, "[driver] tokenizer loaded: vocab=%d\n",
+                 tokenizer.vocab_size());
+
+    test::ScenarioStore store;
+    if (scenarios_dir) {
+        if (!store.load_directory(scenarios_dir)) {
+            std::fprintf(stderr,
+                "[driver] one or more scenarios failed to load — aborting\n");
+            return 3;
+        }
+        std::fprintf(stderr, "[driver] loaded %zu scenarios from %s\n",
+            store.size(), scenarios_dir);
+    } else {
+        std::fprintf(stderr,
+            "[driver] no --scenarios dir given; every request will 500\n");
+    }
+
+    test::StubModelBackend backend(store, tokenizer);
+
+    ServerConfig cfg;
+    cfg.host       = "127.0.0.1";
+    cfg.port       = port;
+    cfg.model_name = "dflash";    // matches existing test-suite default
+    cfg.max_ctx    = max_ctx;
+
+    HttpServer server(backend, tokenizer, cfg);
+    server.set_chat_format(ChatFormat::QWEN3);
+
+    g_server = &server;
+    std::signal(SIGTERM, signal_handler);
+    std::signal(SIGINT,  signal_handler);
+
+    std::fprintf(stderr,
+        "[driver] HttpServer listening on http://%s:%d (SIGINT/SIGTERM to stop)\n",
+        cfg.host.c_str(), cfg.port);
+    int rc = server.run();
+    std::fprintf(stderr, "[driver] exit rc=%d\n", rc);
+    return rc;
+}
diff --git a/server/test/scenario_store.cpp b/server/test/scenario_store.cpp
new file mode 100644
index 00000000..1badd7d1
--- /dev/null
+++ b/server/test/scenario_store.cpp
@@ -0,0 +1,167 @@
+#include "scenario_store.h"
+
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <cstdio>
+#include <dirent.h>
+#include <fstream>
+#include <sstream>
+#include <sys/stat.h>
+
+namespace dflash::common::test {
+
+using json = nlohmann::json;
+
+namespace {
+
+bool parse_token(const json & j, ScenarioToken & out, const std::string & file) {
+    if (j.is_string()) {
+        out.text    = j.get<std::string>();
+        out.special = false;
+        return true;
+    }
+    if (j.is_object()) {
+        if (!j.contains("text") || !j["text"].is_string()) {
+            std::fprintf(stderr,
+                "[scenario] %s: token object missing 'text' string\n",
+                file.c_str());
+            return false;
+        }
+        out.text    = j["text"].get<std::string>();
+        out.special = j.value("special", false);
+        return true;
+    }
+    std::fprintf(stderr,
+        "[scenario] %s: token must be a string or {text,special} object\n",
+        file.c_str());
+    return false;
+}
+
+bool parse_scenario(const json & j, Scenario & out, const std::string & file) {
+    if (!j.is_object()) {
+        std::fprintf(stderr, "[scenario] %s: top level must be object\n", file.c_str());
+        return false;
+    }
+    out.name        = j.value("name", file);
+    out.description = j.value("description", "");
+
+    if (!j.contains("match") || !j["match"].is_object()) {
+        std::fprintf(stderr, "[scenario] %s: missing 'match' object\n", file.c_str());
+        return false;
+    }
+    const auto & m = j["match"];
+    if (!m.contains("prompt_suffix") || !m["prompt_suffix"].is_string()) {
+        std::fprintf(stderr,
+            "[scenario] %s: match.prompt_suffix must be a string\n", file.c_str());
+        return false;
+    }
+    out.match_prompt_suffix = m["prompt_suffix"].get<std::string>();
+    if (out.match_prompt_suffix.empty()) {
+        std::fprintf(stderr,
+            "[scenario] %s: match.prompt_suffix is empty — would match every prompt\n",
+            file.c_str());
+        return false;
+    }
+
+    if (!j.contains("response") || !j["response"].is_object()) {
+        std::fprintf(stderr, "[scenario] %s: missing 'response' object\n", file.c_str());
+        return false;
+    }
+    const auto & r = j["response"];
+    out.response.ok            = r.value("ok", true);
+    out.response.error         = r.value("error", "");
+    out.response.finish_reason = r.value("finish_reason", "stop");
+    out.response.decode_us     = r.value("decode_us", 0);
+
+    if (out.response.ok) {
+        if (!r.contains("tokens") || !r["tokens"].is_array()) {
+            std::fprintf(stderr,
+                "[scenario] %s: response.tokens must be an array when ok=true\n",
+                file.c_str());
+            return false;
+        }
+        out.response.tokens.reserve(r["tokens"].size());
+        for (const auto & jt : r["tokens"]) {
+            ScenarioToken tok;
+            if (!parse_token(jt, tok, file)) return false;
+            out.response.tokens.push_back(std::move(tok));
+        }
+    }
+    return true;
+}
+
+bool ends_with(const std::string & s, const std::string & suffix) {
+    if (suffix.size() > s.size()) return false;
+    return std::equal(suffix.rbegin(), suffix.rend(), s.rbegin());
+}
+
+}  // namespace
+
+bool ScenarioStore::load_file(const std::string & path) {
+    std::ifstream f(path);
+    if (!f.is_open()) {
+        std::fprintf(stderr, "[scenario] cannot open: %s\n", path.c_str());
+        return false;
+    }
+    std::stringstream buf;
+    buf << f.rdbuf();
+    json j;
+    try {
+        j = json::parse(buf.str());
+    } catch (const std::exception & e) {
+        std::fprintf(stderr, "[scenario] %s: JSON parse error: %s\n",
+            path.c_str(), e.what());
+        return false;
+    }
+    Scenario sc;
+    if (!parse_scenario(j, sc, path)) return false;
+    scenarios_.push_back(std::move(sc));
+    std::fprintf(stderr, "[scenario] loaded %s (suffix=%zub, tokens=%zu)\n",
+        scenarios_.back().name.c_str(),
+        scenarios_.back().match_prompt_suffix.size(),
+        scenarios_.back().response.tokens.size());
+    return true;
+}
+
+bool ScenarioStore::load_directory(const std::string & dir) {
+    DIR * d = opendir(dir.c_str());
+    if (!d) {
+        std::fprintf(stderr, "[scenario] cannot open dir: %s\n", dir.c_str());
+        return false;
+    }
+    std::vector<std::string> files;
+    while (struct dirent * de = readdir(d)) {
+        std::string name = de->d_name;
+        if (name.size() < 6) continue;
+        if (name.compare(name.size() - 5, 5, ".json") != 0) continue;
+        files.push_back(dir + "/" + name);
+    }
+    closedir(d);
+    std::sort(files.begin(), files.end());  // deterministic load order
+    bool all_ok = true;
+    for (const auto & p : files) {
+        if (!load_file(p)) all_ok = false;
+    }
+    return all_ok;
+}
+
+const Scenario * ScenarioStore::match(const std::string & rendered_prompt) const {
+    const Scenario * best = nullptr;
+    for (const auto & sc : scenarios_) {
+        if (!ends_with(rendered_prompt, sc.match_prompt_suffix)) continue;
+        if (!best ||
+            sc.match_prompt_suffix.size() > best->match_prompt_suffix.size()) {
+            best = &sc;
+        } else if (sc.match_prompt_suffix.size() == best->match_prompt_suffix.size()) {
+            std::fprintf(stderr,
+                "[scenario] tie on suffix length %zu between '%s' and '%s' — "
+                "load-order earlier wins ('%s'); add a longer suffix to disambiguate\n",
+                sc.match_prompt_suffix.size(),
+                best->name.c_str(), sc.name.c_str(), best->name.c_str());
+        }
+    }
+    return best;
+}
+
+}  // namespace dflash::common::test
diff --git a/server/test/scenario_store.h b/server/test/scenario_store.h
new file mode 100644
index 00000000..9cc2fc10
--- /dev/null
+++ b/server/test/scenario_store.h
@@ -0,0 +1,84 @@
+// Scenario store — loads JSON scenario files and matches them against
+// rendered prompts for the StubModelBackend.
+//
+// A scenario file is a JSON object describing one (prompt → token stream)
+// pairing:
+//
+//   {
+//     "name":        "qwen3_enable_thinking_basic",
+//     "description": "Qwen3.6 enable_thinking emits reasoning before </think>",
+//     "match": {
+//       "prompt_suffix": "What is 2+2?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+//     },
+//     "response": {
+//       "finish_reason": "stop",
+//       "decode_us":     0,
+//       "tokens": [
+//         "Let me think. ",
+//         "2",
+//         "+",
+//         "2",
+//         " = 4.",
+//         { "text": "</think>", "special": true },
+//         "\n\nThe answer is 4."
+//       ]
+//     }
+//   }
+//
+// Match semantics: longest matching `prompt_suffix` wins. File load order
+// breaks ties (with a stderr warning).
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace dflash::common::test {
+
+struct ScenarioToken {
+    std::string text;
+    // When true, the stub backend emits this as a single special token
+    // looked up via Tokenizer::token_to_id(). When false, the text is
+    // BPE-encoded and emitted as the resulting sequence of IDs.
+    bool        special = false;
+};
+
+struct ScenarioResponse {
+    bool                       ok            = true;
+    std::string                error;          // only used when ok=false
+    std::string                finish_reason = "stop";  // "stop", "length", "tool_calls"
+    int                        decode_us     = 0;  // optional inter-token delay
+    std::vector<ScenarioToken> tokens;
+};
+
+struct Scenario {
+    std::string      name;
+    std::string      description;
+    std::string      match_prompt_suffix;
+    ScenarioResponse response;
+};
+
+class ScenarioStore {
+public:
+    // Load every *.json file under `dir`. Returns false if directory does
+    // not exist or any file fails to parse (errors logged to stderr).
+    bool load_directory(const std::string & dir);
+
+    // Load a single scenario file. Returns false on parse error.
+    bool load_file(const std::string & path);
+
+    // Find the scenario whose `match_prompt_suffix` is the longest suffix
+    // of `rendered_prompt`. Returns nullptr if none match.
+    const Scenario * match(const std::string & rendered_prompt) const;
+
+    std::size_t size() const { return scenarios_.size(); }
+
+    // For diagnostic logging.
+    const std::vector<Scenario> & scenarios() const { return scenarios_; }
+
+private:
+    std::vector<Scenario> scenarios_;
+};
+
+}  // namespace dflash::common::test
diff --git a/server/test/scenarios/qwen3_enable_thinking_basic.json b/server/test/scenarios/qwen3_enable_thinking_basic.json
new file mode 100644
index 00000000..7f5de2c3
--- /dev/null
+++ b/server/test/scenarios/qwen3_enable_thinking_basic.json
@@ -0,0 +1,19 @@
+{
+  "name": "qwen3_enable_thinking_basic",
+  "description": "Qwen3.6 chat template pre-opens <think> when enable_thinking=true. Model emits reasoning directly, closes with </think>, then writes the answer.",
+  "match": {
+    "prompt_suffix": "What is 2+2?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+  },
+  "response": {
+    "finish_reason": "stop",
+    "tokens": [
+      "Let me compute. ",
+      "2",
+      "+",
+      "2",
+      " equals 4.",
+      { "text": "</think>", "special": true },
+      "\n\nThe answer is 4."
+    ]
+  }
+}
diff --git a/server/test/scripts/strip_gguf_to_tokenizer.py b/server/test/scripts/strip_gguf_to_tokenizer.py
new file mode 100644
index 00000000..cabc3638
--- /dev/null
+++ b/server/test/scripts/strip_gguf_to_tokenizer.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""Strip a GGUF to tokenizer metadata only — no tensor data.
+
+The dflash Tokenizer (server/src/server/tokenizer.cpp) only reads KV metadata:
+  tokenizer.ggml.tokens, .merges, .token_type, .model, .pre, .bos_token_id,
+  .eos_token_id, .eot_token_id (best-effort), .chat_template
+
+Everything else — including all 851+ tensor weights — can be dropped. The
+result is small enough to commit as a CI test fixture for the CPU-only
+HttpServer driver.
+
+Usage:
+  python strip_gguf_to_tokenizer.py <input.gguf> <output.gguf>
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import gguf
+
+
+KEEP_PREFIXES = (
+    "tokenizer.",
+    # general.architecture is read by backend_factory's detect_arch() to pick
+    # the chat-format enum. Cheap to keep and useful for any caller that
+    # peeks at the file.
+    "general.architecture",
+    "general.name",
+)
+
+
+def main() -> int:
+    if len(sys.argv) != 3:
+        print(__doc__)
+        return 1
+    src = Path(sys.argv[1])
+    dst = Path(sys.argv[2])
+
+    reader = gguf.GGUFReader(str(src))
+    arch = reader.fields["general.architecture"].contents()
+    print(f"[strip] reading {src} (arch={arch}, {len(reader.tensors)} tensors)")
+
+    writer = gguf.GGUFWriter(str(dst), arch)
+
+    kept = 0
+    for key, field in reader.fields.items():
+        if not key.startswith(KEEP_PREFIXES):
+            continue
+        # GGUFReader does not expose a "copy field to writer" helper, so we
+        # have to translate the field's value type back through GGUFWriter's
+        # typed API. For our tokenizer-only subset that's just strings,
+        # uint32 scalars, bool scalars, and arrays of strings / int32.
+        val = field.contents()
+        ftype = field.types[0] if field.types else None
+        if ftype == gguf.GGUFValueType.STRING:
+            writer.add_string(key, val)
+        elif ftype == gguf.GGUFValueType.UINT32:
+            writer.add_uint32(key, int(val))
+        elif ftype == gguf.GGUFValueType.INT32:
+            writer.add_int32(key, int(val))
+        elif ftype == gguf.GGUFValueType.BOOL:
+            writer.add_bool(key, bool(val))
+        elif ftype == gguf.GGUFValueType.ARRAY:
+            # Array element type is types[1].
+            elem = field.types[1]
+            if elem == gguf.GGUFValueType.STRING:
+                writer.add_array(key, list(val))
+            elif elem in (gguf.GGUFValueType.UINT32, gguf.GGUFValueType.INT32):
+                writer.add_array(key, [int(x) for x in val])
+            else:
+                print(f"[strip] skipping array key {key} (elem type {elem})")
+                continue
+        else:
+            print(f"[strip] skipping key {key} (unsupported type {ftype})")
+            continue
+        kept += 1
+
+    writer.write_header_to_file()
+    writer.write_kv_data_to_file()
+    # write_tensors_to_file() is omitted — we have no tensors.
+    writer.close()
+
+    out_size = dst.stat().st_size
+    print(f"[strip] wrote {dst} (kept {kept} KV pairs, {out_size:,} bytes)")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/server/test/stub_model_backend.cpp b/server/test/stub_model_backend.cpp
new file mode 100644
index 00000000..c5796bdc
--- /dev/null
+++ b/server/test/stub_model_backend.cpp
@@ -0,0 +1,73 @@
+#include "stub_model_backend.h"
+
+#include "server/tokenizer.h"
+
+#include <chrono>
+#include <cstdio>
+#include <thread>
+
+namespace dflash::common::test {
+
+GenerateResult StubModelBackend::generate(const GenerateRequest & req,
+                                          const DaemonIO & io) {
+    GenerateResult r;
+
+    const std::string rendered = tokenizer_.decode(req.prompt);
+    const Scenario * sc = store_.match(rendered);
+    if (!sc) {
+        r.ok = false;
+        r.error = "stub: no scenario matches prompt (size="
+                  + std::to_string(rendered.size()) + ")";
+        std::fprintf(stderr,
+            "[stub] no scenario match. last 120b of prompt: %s\n",
+            rendered.substr(rendered.size() > 120 ? rendered.size() - 120 : 0).c_str());
+        return r;
+    }
+
+    if (!sc->response.ok) {
+        r.ok = false;
+        r.error = sc->response.error.empty()
+            ? std::string("stub: scenario '") + sc->name + "' declared failure"
+            : sc->response.error;
+        return r;
+    }
+
+    std::fprintf(stderr, "[stub] match=%s emitting %zu scripted tokens\n",
+        sc->name.c_str(), sc->response.tokens.size());
+
+    r.ok = true;
+    bool aborted = false;
+
+    auto emit = [&](int32_t id) -> bool {
+        if (req.on_token && !req.on_token(id)) return false;
+        if (io.on_token  && !io.on_token(id))  return false;
+        r.tokens.push_back(id);
+        return true;
+    };
+
+    for (const auto & t : sc->response.tokens) {
+        if (t.special) {
+            int32_t id = tokenizer_.token_to_id(t.text);
+            if (id < 0) {
+                r.ok = false;
+                r.error = "stub: special token not in vocab: " + t.text;
+                return r;
+            }
+            if (!emit(id)) { aborted = true; break; }
+        } else {
+            auto ids = tokenizer_.encode(t.text);
+            for (int32_t id : ids) {
+                if (!emit(id)) { aborted = true; break; }
+            }
+            if (aborted) break;
+        }
+        if (sc->response.decode_us > 0) {
+            std::this_thread::sleep_for(
+                std::chrono::microseconds(sc->response.decode_us));
+        }
+    }
+
+    return r;
+}
+
+}  // namespace dflash::common::test
diff --git a/server/test/stub_model_backend.h b/server/test/stub_model_backend.h
new file mode 100644
index 00000000..83525b55
--- /dev/null
+++ b/server/test/stub_model_backend.h
@@ -0,0 +1,63 @@
+// StubModelBackend — deterministic ModelBackend driven by a ScenarioStore.
+//
+// generate() decodes the request prompt back to text using the real
+// tokenizer, looks up the matching scenario (longest prompt_suffix wins),
+// then streams the scenario's tokens through the production callbacks
+// (req.on_token / io.on_token). Streaming behavior is inherited from the
+// production code path — no separate streaming machine.
+//
+// Token-stream construction:
+//   - Plain text tokens are BPE-encoded by the real tokenizer, so a
+//     scripted "Let me think. " expands to the same Qwen3.6 token IDs
+//     the real model would have emitted.
+//   - Special tokens (e.g. </think>) are looked up via token_to_id()
+//     and emitted as a single ID. This matters: Qwen3.6's </think>
+//     is one added token (id 248069); for SseEmitter to recognize it
+//     as a channel-close, it must arrive as the single special ID,
+//     not as the BPE-decomposed sequence.
+
+#pragma once
+
+#include "common/model_backend.h"
+#include "scenario_store.h"
+
+namespace dflash::common {
+class Tokenizer;
+}
+
+namespace dflash::common::test {
+
+class StubModelBackend : public ModelBackend {
+public:
+    StubModelBackend(const ScenarioStore & store, const Tokenizer & tokenizer)
+        : store_(store), tokenizer_(tokenizer) {}
+
+    void print_ready_banner() const override {}
+    bool park(const std::string &) override { return true; }
+    bool unpark(const std::string &) override { return true; }
+    bool is_target_parked() const override { return false; }
+
+    GenerateResult generate(const GenerateRequest & req,
+                            const DaemonIO & io) override;
+
+    bool snapshot_save(int) override { return false; }
+    void snapshot_free(int) override {}
+    bool snapshot_used(int) const override { return false; }
+    int  snapshot_cur_pos(int) const override { return 0; }
+    GenerateResult restore_and_generate(int, const GenerateRequest & req,
+                                        const DaemonIO & io) override {
+        return generate(req, io);
+    }
+
+    bool handle_compress(const std::string &, const DaemonIO &) override {
+        return false;
+    }
+    void free_drafter() override {}
+    void shutdown() override {}
+
+private:
+    const ScenarioStore & store_;
+    const Tokenizer &     tokenizer_;
+};
+
+}  // namespace dflash::common::test
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index 147ddbce..fc983c2d 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -1058,11 +1058,11 @@ static void test_jinja_render_basic() {
         {"system", "you are helpful", ""},
         {"user",   "hi",              ""},
     };
-    std::string out = render_chat_template_jinja(
+    auto out = render_chat_template_jinja(
         MINI_JINJA_TEMPLATE, msgs,
         /*bos=*/"<s>", /*eos=*/"</s>",
         /*add_gen=*/true, /*think=*/false,
-        /*tools=*/"");
+        /*tools=*/"").text;
     TEST_ASSERT(out.find("<|system|>you are helpful") != std::string::npos);
     TEST_ASSERT(out.find("<|user|>hi")               != std::string::npos);
     TEST_ASSERT(out.find("<|assistant|>")            != std::string::npos);
@@ -1070,9 +1070,9 @@ static void test_jinja_render_basic() {
 
 static void test_jinja_render_no_gen_prompt() {
     std::vector<ChatMessage> msgs = {{"user", "ping", ""}};
-    std::string out = render_chat_template_jinja(
+    auto out = render_chat_template_jinja(
         MINI_JINJA_TEMPLATE, msgs, "", "",
-        /*add_gen=*/false, /*think=*/false, "");
+        /*add_gen=*/false, /*think=*/false, "").text;
     TEST_ASSERT(out.find("<|user|>ping") != std::string::npos);
     TEST_ASSERT(out.find("<|assistant|>") == std::string::npos);
 }
@@ -1084,8 +1084,8 @@ static void test_jinja_render_tools_injected() {
         "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}";
     std::vector<ChatMessage> msgs = {{"user", "?", ""}};
     std::string tools = R"([{"name":"my_tool","description":"test"}])";
-    std::string out = render_chat_template_jinja(
-        TPL, msgs, "", "", false, false, tools);
+    auto out = render_chat_template_jinja(
+        TPL, msgs, "", "", false, false, tools).text;
     TEST_ASSERT(out.find("TOOLS_PRESENT:my_tool") != std::string::npos);
 }
 
@@ -1094,8 +1094,8 @@ static void test_jinja_render_empty_tools_skipped() {
     static const char TPL[] =
         "{%- if tools -%}TOOLS_PRESENT{%- else -%}NO_TOOLS{%- endif -%}";
     std::vector<ChatMessage> msgs = {{"user", "?", ""}};
-    std::string out = render_chat_template_jinja(
-        TPL, msgs, "", "", false, false, "[]");
+    auto out = render_chat_template_jinja(
+        TPL, msgs, "", "", false, false, "[]").text;
     TEST_ASSERT(out.find("NO_TOOLS")        != std::string::npos);
     TEST_ASSERT(out.find("TOOLS_PRESENT")   == std::string::npos);
 }
@@ -1104,8 +1104,8 @@ static void test_jinja_render_bos_eos_threaded() {
     // {{ bos_token }} and {{ eos_token }} must reach the template.
     static const char TPL[] = "{{ bos_token }}HI{{ eos_token }}";
     std::vector<ChatMessage> msgs;
-    std::string out = render_chat_template_jinja(
-        TPL, msgs, "<BOS>", "<EOS>", false, false, "");
+    auto out = render_chat_template_jinja(
+        TPL, msgs, "<BOS>", "<EOS>", false, false, "").text;
     TEST_ASSERT(out == "<BOS>HI<EOS>");
 }
 
@@ -1133,6 +1133,311 @@ static void test_jinja_render_bad_tools_json_throws() {
     TEST_ASSERT(threw);
 }
 
+// ─── started_in_thinking provenance ─────────────────────────────────────
+//
+// Regression suite for the Qwen3.6 / Laguna think-mode channel-routing
+// bug: the rendered prompt suffix pre-opens `<think>` so the model
+// starts emitting reasoning tokens with no explicit opener. Callers
+// route PromptRenderResult.started_in_thinking → SseEmitter initial
+// mode so reasoning text lands in reasoning_content, not content.
+
+static void test_chat_template_qwen3_enable_thinking_pre_opens() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT(result.started_in_thinking);
+    // Sanity: rendered suffix ends with `<think>\n` per the Qwen3.6
+    // chat_template.jinja's enable_thinking branch.
+    TEST_ASSERT(result.text.size() >= 8);
+    TEST_ASSERT(result.text.compare(result.text.size() - 8, 8, "<think>\n") == 0);
+}
+
+static void test_chat_template_qwen3_disable_thinking_does_not_pre_open() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/false,
+                                       /*tools=*/"");
+    TEST_ASSERT(!result.started_in_thinking);
+    // The disabled branch emits `<think>\n\n</think>\n\n` — closes
+    // immediately, so the reasoning channel is NOT left open.
+    TEST_ASSERT(result.text.find("</think>") != std::string::npos);
+}
+
+static void test_chat_template_qwen3_no_gen_prompt_does_not_pre_open() {
+    // Without add_generation_prompt the assistant turn isn't appended
+    // and there's nothing to pre-open.
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/false,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT(!result.started_in_thinking);
+}
+
+static void test_chat_template_laguna_enable_thinking_pre_opens() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::LAGUNA,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT(result.started_in_thinking);
+    TEST_ASSERT(result.text.size() >= 7);
+    TEST_ASSERT(result.text.compare(result.text.size() - 7, 7, "<think>") == 0);
+}
+
+static void test_chat_template_laguna_disable_thinking_does_not_pre_open() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::LAGUNA,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/false,
+                                       /*tools=*/"");
+    TEST_ASSERT(!result.started_in_thinking);
+}
+
+static void test_chat_template_gemma4_does_not_pre_open() {
+    // Gemma4's reasoning channel is opened by the model's `<|channel>`
+    // token (which http_server forwards into the emitter as `<think>`).
+    // The prompt itself never pre-opens `<think>` regardless of
+    // enable_thinking, so started_in_thinking must stay false.
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto enabled = render_chat_template(msgs, ChatFormat::GEMMA4,
+                                        /*add_gen=*/true,
+                                        /*enable_thinking=*/true,
+                                        /*tools=*/"");
+    TEST_ASSERT(!enabled.started_in_thinking);
+    auto disabled = render_chat_template(msgs, ChatFormat::GEMMA4,
+                                         /*add_gen=*/true,
+                                         /*enable_thinking=*/false,
+                                         /*tools=*/"");
+    TEST_ASSERT(!disabled.started_in_thinking);
+}
+
+// Jinja path: suffix-sniff detection. The renderer should set
+// started_in_thinking=true when the rendered prompt ends with `<think>`
+// (optionally followed by whitespace) AND enable_thinking is honored.
+static void test_jinja_render_suffix_sniff_sets_started_in_thinking() {
+    static const char TPL[] =
+        "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+        "<|assistant|>{%- if enable_thinking -%}<think>\n{%- endif -%}"
+        "{%- endif -%}";
+    std::vector<ChatMessage> msgs = {{"user", "?", ""}};
+    auto r = render_chat_template_jinja(
+        TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, "");
+    TEST_ASSERT(r.started_in_thinking);
+}
+
+static void test_jinja_render_suffix_sniff_negative() {
+    // Template doesn't end with `<think>` → started_in_thinking=false
+    // even with enable_thinking=true.
+    static const char TPL[] =
+        "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"
+        "{%- if add_generation_prompt -%}<|assistant|>{%- endif -%}";
+    std::vector<ChatMessage> msgs = {{"user", "?", ""}};
+    auto r = render_chat_template_jinja(
+        TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, "");
+    TEST_ASSERT(!r.started_in_thinking);
+}
+
+// ─── SseEmitter initial_mode=REASONING ──────────────────────────────────
+//
+// Regression: when constructed with initial_mode=REASONING (the
+// Qwen3.6/Laguna enable_thinking path), the emitter must route the
+// model's first generated tokens to reasoning_content until a natural
+// `</think>` is seen, even though no explicit `<think>` opener appears
+// in the stream.
+
+static void test_emitter_initial_mode_reasoning_routes_to_reasoning_content() {
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "req-1", "test-model", 10,
+                  json::array(), nullptr,
+                  /*stops=*/{},
+                  StreamMode::REASONING);
+    em.emit_start();
+
+    // Model emits reasoning tokens directly with no leading `<think>`
+    // (because the prompt suffix already opened the channel), then
+    // closes with `</think>` and emits the answer.
+    em.emit_token("alpha ");
+    em.emit_token("beta ");
+    em.emit_token("</think>\n\nAnswer: 4");
+    em.emit_finish(4);
+
+    TEST_ASSERT(em.reasoning_text().find("alpha")  != std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("beta")   != std::string::npos);
+    // No spurious <think> tag leaked into reasoning or content.
+    TEST_ASSERT(em.reasoning_text().find("<think>")  == std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("</think>") == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("<think>")  == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("</think>") == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("Answer: 4") != std::string::npos);
+}
+
+static void test_emitter_initial_mode_reasoning_unclosed_stays_reasoning() {
+    // No </think> close — everything stays in reasoning_content, content
+    // stays empty. Matches parse_reasoning(started_in_thinking=true)
+    // behavior for the non-streaming path.
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "req-2", "test-model", 10,
+                  json::array(), nullptr,
+                  /*stops=*/{},
+                  StreamMode::REASONING);
+    em.emit_start();
+    em.emit_token("still thinking");
+    em.emit_token(" more thinking");
+    em.emit_finish(3);
+
+    TEST_ASSERT(em.reasoning_text().find("still thinking") != std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("more thinking")  != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().empty());
+}
+
+static void test_emitter_initial_mode_reasoning_strips_redundant_think_opener() {
+    // Edge case: prompt pre-opened <think>, but the model also emits a
+    // leading <think> anyway (template/model-card mismatch). The
+    // emitter's strip guard (checked_think_prefix_) must still trip
+    // because we deliberately leave it at its default (false) in the
+    // constructor — otherwise the duplicate opener would leak into
+    // reasoning_text.
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "req-3", "test-model", 10,
+                  json::array(), nullptr,
+                  /*stops=*/{},
+                  StreamMode::REASONING);
+    em.emit_start();
+    em.emit_token("<think>actual reasoning</think>answer");
+    em.emit_finish(3);
+
+    TEST_ASSERT(em.reasoning_text().find("<think>") == std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("actual reasoning") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("answer") != std::string::npos);
+}
+
+static void test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking() {
+    // Anthropic format: when starting in REASONING mode, the very first
+    // content_block_start must be `thinking`, not `text`. Otherwise the
+    // emitter would open a text block, then have to stop+restart it as
+    // thinking on the first reasoning delta — wasteful and visible to
+    // SDK clients as a spurious empty text block.
+    SseEmitter em(ApiFormat::ANTHROPIC, "req-4", "test-model", 10,
+                  json::array(), nullptr,
+                  /*stops=*/{},
+                  StreamMode::REASONING);
+    auto start = em.emit_start();
+    std::string all;
+    for (const auto & c : start) all += c;
+    // First content block must be a thinking block. nlohmann::json sorts
+    // keys alphabetically on dump(), so the inner block serializes as
+    // `{"thinking":"","type":"thinking"}` (NOT type-first). Assert on
+    // the unique `"thinking":""` opener which only appears in the
+    // thinking-kind serialization.
+    TEST_ASSERT(all.find("\"thinking\":\"\",\"type\":\"thinking\"")
+                != std::string::npos);
+    // And the initial text-block opener must NOT appear (regression: if
+    // active_kind_ defaulted to "text", emit_start would have emitted
+    // `{"text":"","type":"text"}` here instead).
+    TEST_ASSERT(all.find("\"text\":\"\",\"type\":\"text\"")
+                == std::string::npos);
+}
+
+// ─── Integration: render_chat_template → SseEmitter wiring ──────────────
+//
+// The original bug was an integration gap: render_chat_template correctly
+// reported started_in_thinking=true, but no caller routed it into the
+// SseEmitter's initial_mode, so reasoning text leaked into content and
+// reasoning_content stayed empty. Each end of the wire has its own unit
+// tests above; these chain the two ends so a future refactor that drops
+// the propagation cannot pass without an assertion failure here.
+//
+// The body mirrors the production wiring in
+// server/src/server/http_server.cpp (the `started_in_thinking →
+// initial_mode → SseEmitter` chain). Keep these in sync if that wiring
+// moves.
+
+static void test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning() {
+    std::vector<ChatMessage> msgs = {{"user", "What is 2+2?", ""}};
+    auto render = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT_MSG(render.started_in_thinking,
+        "renderer end of wire: QWEN3 enable_thinking must pre-open <think>");
+
+    const StreamMode initial_mode = render.started_in_thinking
+        ? StreamMode::REASONING : StreamMode::CONTENT;
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-q", "test-model", 10,
+                  json::array(), nullptr, /*stops=*/{}, initial_mode);
+    em.emit_start();
+    em.emit_token("Let me compute. ");
+    em.emit_token("2+2 equals 4.");
+    em.emit_token("</think>\n\nThe answer is 4.");
+    em.emit_finish(5);
+
+    TEST_ASSERT_MSG(!em.reasoning_text().empty(),
+        "wiring broken: reasoning_content empty despite started_in_thinking=true");
+    TEST_ASSERT(em.reasoning_text().find("Let me compute")    != std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("<think>")           == std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("</think>")          == std::string::npos);
+    TEST_ASSERT_MSG(em.accumulated_text().find("Let me compute") == std::string::npos,
+        "wiring broken: reasoning text leaked into content channel");
+    TEST_ASSERT(em.accumulated_text().find("The answer is 4") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("<think>")         == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("</think>")        == std::string::npos);
+}
+
+static void test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning() {
+    std::vector<ChatMessage> msgs = {{"user", "Solve 7*8.", ""}};
+    auto render = render_chat_template(msgs, ChatFormat::LAGUNA,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT_MSG(render.started_in_thinking,
+        "renderer end of wire: LAGUNA enable_thinking must pre-open <think>");
+
+    const StreamMode initial_mode = render.started_in_thinking
+        ? StreamMode::REASONING : StreamMode::CONTENT;
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-l", "test-model", 10,
+                  json::array(), nullptr, /*stops=*/{}, initial_mode);
+    em.emit_start();
+    em.emit_token("Working through it: ");
+    em.emit_token("7*8 = 56.");
+    em.emit_token("</think>\n\n56.");
+    em.emit_finish(4);
+
+    TEST_ASSERT_MSG(!em.reasoning_text().empty(),
+        "wiring broken: reasoning_content empty despite started_in_thinking=true");
+    TEST_ASSERT(em.reasoning_text().find("Working through it") != std::string::npos);
+    TEST_ASSERT_MSG(em.accumulated_text().find("Working through it") == std::string::npos,
+        "wiring broken: reasoning text leaked into content channel");
+    TEST_ASSERT(em.accumulated_text().find("56.") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("<think>")  == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("</think>") == std::string::npos);
+}
+
+static void test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content() {
+    // Inverse direction: when enable_thinking=false the renderer must not
+    // pre-open and the emitter must start in CONTENT, so the model's
+    // tokens land in content from the first byte. Guards against the
+    // opposite regression of unconditionally starting in REASONING.
+    std::vector<ChatMessage> msgs = {{"user", "Hi.", ""}};
+    auto render = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/false,
+                                       /*tools=*/"");
+    TEST_ASSERT(!render.started_in_thinking);
+
+    const StreamMode initial_mode = render.started_in_thinking
+        ? StreamMode::REASONING : StreamMode::CONTENT;
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-n", "test-model", 10,
+                  json::array(), nullptr, /*stops=*/{}, initial_mode);
+    em.emit_start();
+    em.emit_token("Hello there.");
+    em.emit_finish(2);
+
+    TEST_ASSERT(em.reasoning_text().empty());
+    TEST_ASSERT(em.accumulated_text().find("Hello there") != std::string::npos);
+}
+
 static void test_normalize_responses_tool_followup_messages() {
     ToolMemory tool_memory;
     const std::string call_id = "call_exec_001";
@@ -2538,6 +2843,10 @@ int main() {
     RUN_TEST(test_emitter_streaming_openai_has_done);
     RUN_TEST(test_emitter_nonstreaming_accumulates);
     RUN_TEST(test_emitter_anthropic_thinking_blocks);
+    RUN_TEST(test_emitter_initial_mode_reasoning_routes_to_reasoning_content);
+    RUN_TEST(test_emitter_initial_mode_reasoning_unclosed_stays_reasoning);
+    RUN_TEST(test_emitter_initial_mode_reasoning_strips_redundant_think_opener);
+    RUN_TEST(test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking);
 
     std::fprintf(stderr, "\n── Stop sequences ──\n");
     RUN_TEST(test_stop_sequence_basic);
@@ -2579,6 +2888,17 @@ int main() {
     RUN_TEST(test_jinja_render_bos_eos_threaded);
     RUN_TEST(test_jinja_render_empty_template_throws);
     RUN_TEST(test_jinja_render_bad_tools_json_throws);
+    RUN_TEST(test_chat_template_qwen3_enable_thinking_pre_opens);
+    RUN_TEST(test_chat_template_qwen3_disable_thinking_does_not_pre_open);
+    RUN_TEST(test_chat_template_qwen3_no_gen_prompt_does_not_pre_open);
+    RUN_TEST(test_chat_template_laguna_enable_thinking_pre_opens);
+    RUN_TEST(test_chat_template_laguna_disable_thinking_does_not_pre_open);
+    RUN_TEST(test_chat_template_gemma4_does_not_pre_open);
+    RUN_TEST(test_jinja_render_suffix_sniff_sets_started_in_thinking);
+    RUN_TEST(test_jinja_render_suffix_sniff_negative);
+    RUN_TEST(test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning);
+    RUN_TEST(test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning);
+    RUN_TEST(test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content);
     RUN_TEST(test_normalize_responses_tool_followup_messages);
 
     std::fprintf(stderr, "\n── Placement config ──\n");
diff --git a/server/test/test_stub_integration.py b/server/test/test_stub_integration.py
new file mode 100644
index 00000000..496d877f
--- /dev/null
+++ b/server/test/test_stub_integration.py
@@ -0,0 +1,241 @@
+"""End-to-end integration test for the dflash HttpServer, driven by a
+deterministic stub backend (no GPU, no model weights).
+
+Runs the replay_http_server binary with:
+  - the tokenizer-only Qwen3.6 GGUF fixture under server/test/fixtures/
+  - the JSON scenario files under server/test/scenarios/
+
+Then sends real HTTP requests at it via the same `requests` calls that
+test_server_integration.py uses against a live GPU server. Because the
+chat-template renderer, request parser, SseEmitter, and SSE socket
+writes are the production code path, this test exercises the exact wire
+that broke in the original Qwen3.6 think-channel bug — but on a CPU-only
+runner with no model file.
+
+Run locally:
+  ./server/build/replay_http_server (built by cmake)
+  uv run pytest server/test/test_stub_integration.py -v
+
+The test starts and stops the driver itself; no separate server required.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import socket
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+import requests
+
+
+REPO_ROOT     = Path(__file__).resolve().parents[2]
+BUILD_DIR     = REPO_ROOT / "server" / "build"
+DRIVER_BIN    = BUILD_DIR / "replay_http_server"
+TOKENIZER_GGUF = REPO_ROOT / "server" / "test" / "fixtures" / "qwen3.6-tokenizer.gguf"
+SCENARIOS_DIR = REPO_ROOT / "server" / "test" / "scenarios"
+
+
+def _free_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+@pytest.fixture(scope="module")
+def stub_server():
+    """Spawn the stub-driven HttpServer for the duration of this module."""
+    assert DRIVER_BIN.is_file(), (
+        f"driver binary missing: {DRIVER_BIN} — "
+        "build target replay_http_server first")
+    assert TOKENIZER_GGUF.is_file(), (
+        f"tokenizer fixture missing: {TOKENIZER_GGUF} — "
+        "is git-lfs configured for *.gguf?")
+
+    port = _free_port()
+    env = {**os.environ, "CUDA_VISIBLE_DEVICES": ""}
+    proc = subprocess.Popen(
+        [str(DRIVER_BIN),
+         str(TOKENIZER_GGUF),
+         "--scenarios", str(SCENARIOS_DIR),
+         "--port", str(port)],
+        env=env,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+    )
+    base = f"http://127.0.0.1:{port}"
+
+    # Wait up to 10s for /health to come up.
+    deadline = time.time() + 10
+    while time.time() < deadline:
+        try:
+            r = requests.get(f"{base}/health", timeout=1)
+            if r.status_code == 200:
+                break
+        except requests.RequestException:
+            pass
+        if proc.poll() is not None:
+            out = proc.stdout.read().decode() if proc.stdout else ""
+            raise RuntimeError(f"driver exited early. output:\n{out}")
+        time.sleep(0.1)
+    else:
+        proc.terminate()
+        out = proc.stdout.read().decode() if proc.stdout else ""
+        raise RuntimeError(f"driver did not become healthy in 10s. output:\n{out}")
+
+    yield base
+
+    proc.terminate()
+    try:
+        proc.wait(timeout=5)
+    except subprocess.TimeoutExpired:
+        proc.kill()
+        proc.wait(timeout=2)
+
+
+# ─── Tests ─────────────────────────────────────────────────────────────
+
+
+class TestQwen3EnableThinkingNonStreaming:
+    """Regression guard for the original PR #308 bug: Qwen3.6 enable_thinking
+    must route reasoning into reasoning_content, not leak it into content."""
+
+    def test_openai_chat_completions(self, stub_server):
+        r = requests.post(f"{stub_server}/v1/chat/completions",
+            json={
+                "model": "dflash",
+                "messages": [{"role": "user", "content": "What is 2+2?"}],
+                "chat_template_kwargs": {"enable_thinking": True},
+                "stream": False,
+                "max_tokens": 256,
+            },
+            timeout=10)
+        assert r.status_code == 200, r.text
+        msg = r.json()["choices"][0]["message"]
+        reasoning = msg.get("reasoning_content") or ""
+        content   = msg.get("content") or ""
+        assert reasoning, (
+            f"reasoning_content empty — render→emit wiring broken. "
+            f"content={content!r}")
+        assert "Let me compute" in reasoning
+        assert "<think>"  not in reasoning
+        assert "</think>" not in reasoning
+        assert "<think>"  not in content
+        assert "</think>" not in content
+        assert "The answer is 4." in content
+        assert "Let me compute" not in content, (
+            f"reasoning text leaked into content: {content!r}")
+
+    def test_anthropic_messages(self, stub_server):
+        r = requests.post(f"{stub_server}/v1/messages",
+            json={
+                "model": "dflash",
+                "messages": [{"role": "user", "content": "What is 2+2?"}],
+                "thinking": {"type": "enabled"},
+                "stream": False,
+                "max_tokens": 256,
+            },
+            timeout=10)
+        assert r.status_code == 200, r.text
+        blocks = r.json().get("content", [])
+        types = [b.get("type") for b in blocks]
+        assert "thinking" in types, f"no thinking block; types={types}"
+        assert "text" in types,     f"no text block; types={types}"
+        thinking = next(b["thinking"] for b in blocks if b["type"] == "thinking")
+        text     = next(b["text"]     for b in blocks if b["type"] == "text")
+        assert "Let me compute"  in thinking
+        assert "The answer is 4" in text
+        assert "<think>"  not in thinking
+        assert "</think>" not in thinking
+
+
+class TestQwen3EnableThinkingStreaming:
+    """Same bug class but through the streaming SSE code path."""
+
+    def test_openai_chat_completions_streaming(self, stub_server):
+        r = requests.post(f"{stub_server}/v1/chat/completions",
+            json={
+                "model": "dflash",
+                "messages": [{"role": "user", "content": "What is 2+2?"}],
+                "chat_template_kwargs": {"enable_thinking": True},
+                "stream": True,
+                "max_tokens": 256,
+            },
+            stream=True, timeout=10)
+        assert r.status_code == 200
+
+        reasoning_text = ""
+        content_text = ""
+        saw_reasoning_delta = False
+        saw_content_delta = False
+        saw_done = False
+        for line in r.iter_lines(decode_unicode=True):
+            if not line:
+                continue
+            if line == "data: [DONE]":
+                saw_done = True
+                break
+            assert line.startswith("data: "), line
+            chunk = json.loads(line[len("data: "):])
+            choices = chunk.get("choices") or []
+            if not choices:
+                continue   # usage-only tail chunk; no delta
+            delta = choices[0].get("delta", {})
+            if "reasoning_content" in delta:
+                saw_reasoning_delta = True
+                reasoning_text += delta["reasoning_content"]
+            if "content" in delta:
+                saw_content_delta = True
+                content_text += delta["content"]
+        assert saw_done
+        assert saw_reasoning_delta, "no reasoning_content deltas emitted"
+        assert saw_content_delta,   "no content deltas emitted"
+        assert "Let me compute"  in reasoning_text
+        assert "The answer is 4" in content_text
+        assert "<think>"  not in reasoning_text
+        assert "</think>" not in reasoning_text
+        assert "<think>"  not in content_text
+        assert "</think>" not in content_text
+
+    def test_anthropic_messages_streaming(self, stub_server):
+        """First content_block_start must be `thinking`, not `text` — the
+        Anthropic-side half of the PR #308 fix."""
+        r = requests.post(f"{stub_server}/v1/messages",
+            json={
+                "model": "dflash",
+                "messages": [{"role": "user", "content": "What is 2+2?"}],
+                "thinking": {"type": "enabled"},
+                "stream": True,
+                "max_tokens": 256,
+            },
+            stream=True, timeout=10)
+        assert r.status_code == 200
+
+        events = []
+        event_type = None
+        for line in r.iter_lines(decode_unicode=True):
+            if line is None:
+                continue
+            if line.startswith("event: "):
+                event_type = line[len("event: "):]
+            elif line.startswith("data: ") and event_type:
+                events.append((event_type, json.loads(line[len("data: "):])))
+                event_type = None
+        types = [t for t, _ in events]
+        assert "message_start"      in types
+        first_start = next(d for t, d in events if t == "content_block_start")
+        assert first_start["content_block"]["type"] == "thinking", first_start
+        # At least one thinking delta and one text delta.
+        thinking_deltas = [d for t, d in events
+                           if t == "content_block_delta"
+                           and d["delta"]["type"] == "thinking_delta"]
+        text_deltas = [d for t, d in events
+                       if t == "content_block_delta"
+                       and d["delta"]["type"] == "text_delta"]
+        assert thinking_deltas, f"no thinking_delta events; types={types}"
+        assert text_deltas,     f"no text_delta events; types={types}"
+        assert "Let me compute" in "".join(d["delta"]["thinking"] for d in thinking_deltas)
+        assert "The answer is 4" in "".join(d["delta"]["text"] for d in text_deltas)