Luce-Org · easel · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -5,3 +5,4 @@ assets/banner.png -filter -diff -merge -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
 *.mp4 filter=lfs diff=lfs merge=lfs -text
 *.webm filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -26,6 +26,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: recursive
+          lfs: true
           token: ${{ secrets.SUBMODULE_PAT || secrets.GITHUB_TOKEN }}
 
       - uses: Jimver/cuda-toolkit@v0.2.35
@@ -55,7 +56,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release
           cmake --build build --target \
             test_dflash test_generate test_flash_attn_sparse \
-            dflash_server test_server_unit \
+            dflash_server test_server_unit replay_http_server \
             -j$(nproc)
 
       - name: Run C++ server unit tests
@@ -70,6 +71,19 @@ jobs:
         # in the optional `megakernel` extra so its build does NOT run yet.
         run: uv sync --frozen
 
+      - name: Run CPU integration tests (stub backend, no GPU)
+        # End-to-end exercise of HttpServer + render_chat_template +
+        # SseEmitter with a deterministic stub model backend. No GPU
+        # required: replay_http_server runs under CUDA_VISIBLE_DEVICES=""
+        # and the tokenizer fixture is a stripped Qwen3.6 GGUF (metadata
+        # only). Covers the regression class from PR #308 end-to-end —
+        # streaming and non-streaming, OpenAI and Anthropic formats.
+        env:
+          CUDA_VISIBLE_DEVICES: ""
+        run: |
+          uv run --frozen --with pytest --with requests \
+            pytest -v server/test/test_stub_integration.py
+
       - name: Build megakernel via uv sync (sm_75)
         env:
           CUDA_HOME: ${{ env.CUDA_PATH }}

diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -769,6 +769,39 @@ if(DFLASH27B_TESTS)
         endif()
     endif()
 
+    # ─── replay_http_server: CPU-only HttpServer test driver ────────────
+    # Wires Tokenizer + ScenarioStore + StubModelBackend + HttpServer to
+    # serve real HTTP requests on the wire, replaying scripted token
+    # streams from JSON scenario files. Links dflash_common (which
+    # includes CUDA-compiled TUs) but never instantiates a real
+    # ModelBackend, so CUDA_VISIBLE_DEVICES="" is supported. Driven by
+    # test_stub_integration.py.
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/replay_http_server.cpp")
+        add_executable(replay_http_server
+            test/replay_http_server.cpp
+            test/scenario_store.cpp
+            test/stub_model_backend.cpp
+            src/server/http_server.cpp
+            src/server/model_card.cpp)
+        target_include_directories(replay_http_server PRIVATE
+            ${DFLASH27B_SRC_INCLUDE_DIRS}
+            ${CMAKE_CURRENT_SOURCE_DIR}/test)
+        find_package(Threads REQUIRED)
+        target_link_libraries(replay_http_server PRIVATE
+            dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} Threads::Threads)
+        # Single backend switch for both the compile definitions and the runtime
+        # library so they cannot disagree: "hip" selects HIP, anything else CUDA.
+        if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
+            target_compile_definitions(replay_http_server PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
+            target_link_libraries(replay_http_server PRIVATE hip::host)
+        else()
+            target_compile_definitions(replay_http_server PRIVATE
+                DFLASH27B_BACKEND_CUDA=1 DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
+            find_package(CUDAToolkit REQUIRED)
+            target_link_libraries(replay_http_server PRIVATE CUDA::cudart)
+        endif()
+    endif()
+
     # ─── Unit tests (no GPU, no model files) ────────────────────────────
     enable_testing()
 

diff --git a/server/scripts/test_server_integration.py b/server/scripts/test_server_integration.py
@@ -490,7 +490,12 @@ def test_thinking_disabled_by_default(self):
 
     @pytest.mark.slow
     def test_thinking_enabled_via_chat_template_kwargs(self):
-        """Enabling thinking should produce reasoning_content."""
+        """Enabling thinking must route reasoning into reasoning_content,
+        not leak it into content. Regression guard for the Qwen3.6/Laguna
+        pre-opened-<think> bug: the chat template appends `<think>` to the
+        prompt suffix, so the model emits reasoning directly with no
+        opening tag. If the renderer→emitter wiring drops, reasoning_content
+        stays empty and the raw reasoning text appears in content."""
         r = post_json("/v1/chat/completions", {
             "model": MODEL_NAME,
             "messages": [{"role": "user", "content": "What is 15 * 17?"}],
@@ -500,13 +505,26 @@ def test_thinking_enabled_via_chat_template_kwargs(self):
         })
         assert r.status_code == 200
         msg = r.json()["choices"][0]["message"]
-        assert msg["content"]
-        # With thinking enabled, model may produce reasoning_content
-        # (not guaranteed for short prompts, so we just check it doesn't crash)
+        reasoning = msg.get("reasoning_content") or ""
+        content = msg.get("content") or ""
+        assert reasoning, (
+            f"reasoning_content empty with enable_thinking=True — "
+            f"renderer→emitter wiring likely broken. content={content[:200]!r}"
+        )
+        assert "<think>" not in reasoning and "</think>" not in reasoning, (
+            f"raw think tags leaked into reasoning_content: {reasoning[:200]!r}"
+        )
+        assert "<think>" not in content and "</think>" not in content, (
+            f"think tags leaked into content channel: {content[:200]!r}"
+        )
+        assert content, "content channel empty — model never closed </think>"
 
     @pytest.mark.slow
     def test_thinking_enabled_via_reasoning_effort(self):
-        """OpenAI Responses-style reasoning.effort field."""
+        """OpenAI Responses-style reasoning.effort=high must also route
+        reasoning to reasoning_content. Same regression class as above
+        but reached through a different request shape (effort→template
+        kwargs translation in http_server.cpp)."""
         r = post_json("/v1/chat/completions", {
             "model": MODEL_NAME,
             "messages": [{"role": "user", "content": "What is 15 * 17?"}],
@@ -516,7 +534,15 @@ def test_thinking_enabled_via_reasoning_effort(self):
         })
         assert r.status_code == 200
         msg = r.json()["choices"][0]["message"]
-        assert msg["content"]
+        reasoning = msg.get("reasoning_content") or ""
+        content = msg.get("content") or ""
+        assert reasoning, (
+            f"reasoning_content empty with reasoning.effort=high — "
+            f"renderer→emitter wiring likely broken. content={content[:200]!r}"
+        )
+        assert "<think>" not in reasoning and "</think>" not in reasoning
+        assert "<think>" not in content and "</think>" not in content
+        assert content
 
 
 # ═══════════════════════════════════════════════════════════════════

diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp
@@ -51,14 +51,18 @@ ChatFormat chat_format_for_arch(const std::string & arch) {
     return ChatFormat::QWEN3;
 }
 
-std::string render_chat_template(
+PromptRenderResult render_chat_template(
     const std::vector<ChatMessage> & messages,
     ChatFormat format,
     bool add_generation_prompt,
     bool enable_thinking,
     const std::string & tools_json)
 {
     std::string result;
+    // `started_in_thinking` is derived deterministically from the template
+    // branch + render flags below. Set per format inside the switch so a
+    // future format addition can't silently miss the wiring.
+    bool started_in_thinking = false;
     bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null";
 
     switch (format) {
@@ -141,6 +145,14 @@ std::string render_chat_template(
                 // even when the client opts in, defeating the thinking-budget
                 // mechanism entirely.
                 result += "<think>\n";
+                // The prompt suffix pre-opens `<think>` — the model's very
+                // first generated token is reasoning, never preceded by an
+                // explicit `<think>` opener in the stream. Callers must
+                // start the SSE state machine in REASONING mode and pass
+                // `started_in_thinking=true` to parse_reasoning() so that
+                // reasoning text routes to reasoning_content instead of
+                // leaking into content.
+                started_in_thinking = true;
             }
         }
         break;
@@ -224,6 +236,11 @@ std::string render_chat_template(
             result += "<assistant>\n";
             if (enable_thinking) {
                 result += "<think>";
+                // Same situation as Qwen3.6: Laguna XS.2's enable_thinking
+                // generation prompt ends with `<think>` so the model starts
+                // emitting reasoning tokens with no explicit opener in the
+                // stream. Route subsequent tokens to the reasoning channel.
+                started_in_thinking = true;
             } else {
                 // Empty think block — model jumps straight to answer.
                 result += "</think>";
@@ -311,11 +328,17 @@ std::string render_chat_template(
                 result += "<|channel>thought\n<channel|>";
             }
         }
+        // Gemma4 does NOT pre-open `<think>` from the prompt; its
+        // reasoning channel is opened by the model emitting `<|channel>`
+        // which http_server forwards into the SseEmitter as the text
+        // `<think>` — so the emitter's existing CONTENT→REASONING
+        // transition fires on that synthesized opener. started_in_thinking
+        // stays false (initial CONTENT mode is correct).
         break;
     }
     }
 
-    return result;
+    return PromptRenderResult{std::move(result), started_in_thinking};
 }
 
 // ─── Jinja path ─────────────────────────────────────────────────────────
@@ -353,7 +376,29 @@ static std::shared_ptr<jinja::program> get_or_parse(const std::string & template
 
 }  // namespace
 
-std::string render_chat_template_jinja(
+// Reports whether `s` ends with the literal `<think>` once any trailing
+// run of space, tab, CR or LF bytes is ignored — so both `<think>` and
+// `<think>\n` match. Returns false for an empty or whitespace-only
+// string and for any string shorter than the opener. Purely a suffix
+// test: it carries no knowledge of which template produced `s`; what a
+// positive result means for stream routing is the caller's decision.
+static bool prompt_ends_with_think_open(const std::string & s) {
+    // Opener literal the rendered prompt tail is compared against.
+    static const std::string opener = "<think>";
+    // Trailing bytes to ignore: space, LF, CR and TAB.
+    static const char * const trailing_ws = " \n\r\t";
+
+    // Position of the last non-whitespace byte; npos means the prompt is
+    // empty or whitespace-only, so it cannot end with the opener.
+    const size_t last = s.find_last_not_of(trailing_ws);
+    if (last == std::string::npos) return false;
+
+    const size_t trimmed_len = last + 1;
+    if (trimmed_len < opener.size()) return false;
+    return s.compare(trimmed_len - opener.size(), opener.size(), opener) == 0;
+}
+
+PromptRenderResult render_chat_template_jinja(
     const std::string & template_src,
     const std::vector<ChatMessage> & messages,
     const std::string & bos_token,
@@ -407,14 +452,37 @@ std::string render_chat_template_jinja(
         throw std::runtime_error(std::string("jinja global_from_json: ") + e.what());
     }
 
+    std::string rendered;
     try {
         jinja::runtime rt(ctx);
         jinja::value results = rt.execute(*prog);
         auto parts = jinja::runtime::gather_string_parts(results);
-        return parts->as_string().str();
+        rendered = parts->as_string().str();
     } catch (const std::exception & e) {
         throw std::runtime_error(std::string("jinja runtime: ") + e.what());
     }
+
+    // Jinja path: we don't know which template family the caller passed
+    // in, so derive `started_in_thinking` by sniffing the rendered tail
+    // for a `<think>` opener. This catches the common Qwen3.6 / Laguna
+    // chat templates that end with `<think>\n` when enable_thinking is
+    // honored, plus any custom template that follows the same convention.
+    //
+    // Log when the sniff decides true so the routing decision is visible
+    // in server logs. The enable_thinking gate below means a template that
+    // hard-codes `<think>` while enable_thinking=false is NOT flagged here.
+    bool started_in_thinking =
+        enable_thinking && add_generation_prompt &&
+        prompt_ends_with_think_open(rendered);
+    if (started_in_thinking) {
+        std::fprintf(stderr,
+            "[WARN] render_chat_template_jinja: rendered prompt ends with "
+            "`<think>` opener — treating as started_in_thinking=true. If "
+            "this is unexpected, check the template's enable_thinking "
+            "branch or the model card's reasoning configuration.\n");
+    }
+
+    return PromptRenderResult{std::move(rendered), started_in_thinking};
 }
 
 }  // namespace dflash::common
diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h
@@ -27,6 +27,23 @@ enum class ChatFormat {
     GEMMA4,    // <bos><|turn>role\n...<turn|>\n
 };
 
+// Provenance for a rendered prompt. `text` is the byte string that gets
+// tokenized; `started_in_thinking` records whether the prompt suffix
+// pre-opens a `<think>` block (or equivalent reasoning-channel marker)
+// that the model is expected to continue into.
+//
+// Callers route this into the SseEmitter's initial mode and into
+// parse_reasoning()'s `started_in_thinking` argument so reasoning text
+// emitted before any explicit `<think>` opener is still attributed to
+// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna
+// enable_thinking prompts (which pre-open `<think>\n` in the assistant
+// turn) cause the model to emit reasoning straight into the content
+// channel, leaving `reasoning_content` empty.
+struct PromptRenderResult {
+    std::string text;  // rendered prompt text, ready to tokenize
+    bool started_in_thinking = false;  // prompt suffix opens reasoning channel; false unless a renderer sets it
+};
+
 // Render chat messages into the model-specific prompt string.
 // The result is plain text ready to be tokenized.
 //
@@ -40,7 +57,7 @@ enum class ChatFormat {
 // `tools_json` is an optional JSON string containing the tool definitions
 // array. When non-empty, the Qwen3/3.5 template injects a tool preamble
 // into the system message instructing the model how to emit <tool_call> tags.
-std::string render_chat_template(
+PromptRenderResult render_chat_template(
     const std::vector<ChatMessage> & messages,
     ChatFormat format,
     bool add_generation_prompt = true,
@@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch);
 // Internally caches the most recently parsed program per thread (avoids
 // re-parsing the template on every request). Throws std::runtime_error on
 // lexer/parser/runtime failure (caller should surface a 500 response).
-std::string render_chat_template_jinja(
+PromptRenderResult render_chat_template_jinja(
     const std::string & template_src,
     const std::vector<ChatMessage> & messages,
     const std::string & bos_token,

diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
@@ -1009,7 +1009,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
             tools_json = req.tools.dump();
         }
 
-        std::string rendered;
+        PromptRenderResult render_result;
         if (!config_.chat_template_src.empty()) {
             // Jinja path: caller supplied a chat template file via
             // --chat-template-file. Override the hardcoded QWEN3/LAGUNA
@@ -1026,7 +1026,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                 ? tokenizer_.raw_token(tokenizer_.eos_id())
                 : std::string();
             try {
-                rendered = render_chat_template_jinja(
+                render_result = render_chat_template_jinja(
                     config_.chat_template_src,
                     chat_msgs,
                     bos_str,
@@ -1040,11 +1040,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                 return true;
             }
         } else {
-            rendered = render_chat_template(chat_msgs, chat_format_,
-                                            true, enable_thinking,
-                                            tools_json);
+            render_result = render_chat_template(chat_msgs, chat_format_,
+                                                 true, enable_thinking,
+                                                 tools_json);
         }
-        req.prompt_tokens = tokenizer_.encode(rendered);
+        // Propagate prompt provenance so the SseEmitter's initial mode
+        // matches the template's pre-opened reasoning channel (Qwen3.6 /
+        // Laguna enable_thinking case). Without this, reasoning text
+        // leaks into the content channel and `reasoning_content` stays
+        // empty — see fix(server): route Qwen3.6/Laguna think-mode
+        // reasoning to reasoning_content channel.
+        req.started_in_thinking = render_result.started_in_thinking;
+        req.prompt_tokens = tokenizer_.encode(render_result.text);
 
         // count_tokens: short-circuit after tokenization. Skip generation
         // entirely — Anthropic's contract is just `{"input_tokens": N}`.
@@ -1149,11 +1156,20 @@ void HttpServer::worker_loop() {
             }
         }
 
-        // Create SSE emitter for streaming state machine.
+        // Create SSE emitter for streaming state machine. `initial_mode`
+        // tracks whether the chat-template prompt pre-opened a `<think>`
+        // block (Qwen3.6 / Laguna enable_thinking path). When true, the
+        // emitter starts in REASONING so the model's first generated
+        // token routes to reasoning_content even though no explicit
+        // `<think>` opener appears in the token stream.
+        const StreamMode initial_mode = req.started_in_thinking
+            ? StreamMode::REASONING
+            : StreamMode::CONTENT;
         SseEmitter emitter(req.format, req.response_id, req.model,
                            (int)req.prompt_tokens.size(), req.tools,
                            &tool_memory_,
-                           req.stop_sequences);
+                           req.stop_sequences,
+                           initial_mode);
 
         // Emit initial SSE events.
         if (req.stream) {

diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
@@ -200,6 +200,12 @@ struct ParsedRequest {
     std::vector<std::string>  stop_sequences;
     // Bandit: per-session adaptive keep_ratio opt-in
     std::string               session_id;
+    // Set by the chat-template renderer when the rendered prompt suffix
+    // pre-opens a `<think>` block (Qwen3.6 / Laguna enable_thinking path).
+    // Drives the SseEmitter's initial mode so reasoning tokens emitted
+    // before any explicit `<think>` opener route to reasoning_content
+    // instead of leaking into content.
+    bool                      started_in_thinking = false;
 };
 
 // Build the /props response body. Exposed (non-static) so unit tests