From db79a7db1d029d54b9d8f252f1b059bd1955572d Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Fri, 29 May 2026 11:25:08 -0400 Subject: [PATCH 1/4] fix(server): route Qwen3.6/Laguna think-mode reasoning to reasoning_content channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SseEmitter hard-started in StreamMode::CONTENT and only transitioned to REASONING when it saw `` in the generated stream. But Qwen3.6 / Laguna chat templates append `\n` to the prompt suffix when enable_thinking is honored, so the model emits reasoning tokens directly with no opening tag — the emitter never transitioned and reasoning text leaked into `content` while `reasoning_content` stayed empty. ds4-eval pass rate: 14.1% (think) vs 71.7% (no-think) for Qwen3.6-27B Q4_K_M. The plumbing was already there: parse_reasoning() supports started_in_thinking=true (reasoning.h:17-19) but no caller passed it. Fix: 1. chat_template.h: render_chat_template / render_chat_template_jinja now return a PromptRenderResult { text, started_in_thinking }. The built-in QWEN3 and LAGUNA branches set started_in_thinking deterministically when enable_thinking && add_generation_prompt; GEMMA4 stays false (its reasoning channel is opened by the model emitting `<|channel>`, which http_server forwards into the emitter as ``). The Jinja path suffix-sniffs the rendered prompt for a trailing `` opener and emits a [WARN] log when sniffing decides true so a template/model-card mismatch surfaces at runtime. 2. SseEmitter: add `initial_mode = StreamMode::CONTENT` defaulted parameter. When constructed with REASONING, active_kind_ initializes to "thinking" so the Anthropic first content_block is `thinking` instead of `text` (avoids a spurious empty text-block stop+restart on the first reasoning delta). Deliberately leaves checked_think_prefix_ at its default (false) so the existing one-time `` strip guard still trips if a template/model-card mismatch causes the model to emit a redundant opener. 3. http_server.cpp: thread render_result.started_in_thinking through ParsedRequest into the SseEmitter's initial_mode. Both streaming and non-streaming paths feed tokens through the same emitter, so the fix covers both response shapes. Tests: add 12 unit tests under test_server_unit (assertion count 1608 → 1637): SseEmitter initial_mode=REASONING routing for OPENAI_CHAT and ANTHROPIC formats (closed, unclosed, redundant-opener-strip cases) plus PromptRenderResult.started_in_thinking provenance for QWEN3 / LAGUNA / GEMMA4 (enable/disable/no-gen-prompt) and the Jinja suffix-sniff positive/negative cases. Smoke-tested manually against Qwen3.6-27B Q4_K_M; non-streaming `/v1/chat/completions` with `thinking:{type:enabled}` now populates reasoning_content and never leaks `` into content. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/server/chat_template.cpp | 76 ++++++++- server/src/server/chat_template.h | 21 ++- server/src/server/http_server.cpp | 32 +++- server/src/server/http_server.h | 6 + server/src/server/sse_emitter.cpp | 13 +- server/src/server/sse_emitter.h | 16 +- server/test/test_server_unit.cpp | 239 ++++++++++++++++++++++++++-- 7 files changed, 375 insertions(+), 28 deletions(-) diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp index 1349109ad..d4bf3e039 100644 --- a/server/src/server/chat_template.cpp +++ b/server/src/server/chat_template.cpp @@ -51,7 +51,7 @@ ChatFormat chat_format_for_arch(const std::string & arch) { return ChatFormat::QWEN3; } -std::string render_chat_template( +PromptRenderResult render_chat_template( const std::vector & messages, ChatFormat format, bool add_generation_prompt, @@ -59,6 +59,10 @@ std::string render_chat_template( const std::string & tools_json) { std::string result; + // `started_in_thinking` is derived deterministically from the template + // branch + render flags below. Set per format inside the switch so a + // future format addition can't silently miss the wiring. + bool started_in_thinking = false; bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null"; switch (format) { @@ -141,6 +145,14 @@ std::string render_chat_template( // even when the client opts in, defeating the thinking-budget // mechanism entirely. result += "\n"; + // The prompt suffix pre-opens `` — the model's very + // first generated token is reasoning, never preceded by an + // explicit `` opener in the stream. Callers must + // start the SSE state machine in REASONING mode and pass + // `started_in_thinking=true` to parse_reasoning() so that + // reasoning text routes to reasoning_content instead of + // leaking into content. + started_in_thinking = true; } } break; @@ -224,6 +236,11 @@ std::string render_chat_template( result += "\n"; if (enable_thinking) { result += ""; + // Same situation as Qwen3.6: Laguna XS.2's enable_thinking + // generation prompt ends with `` so the model starts + // emitting reasoning tokens with no explicit opener in the + // stream. Route subsequent tokens to the reasoning channel. + started_in_thinking = true; } else { // Empty think block — model jumps straight to answer. result += ""; @@ -311,11 +328,17 @@ std::string render_chat_template( result += "<|channel>thought\n"; } } + // Gemma4 does NOT pre-open `` from the prompt; its + // reasoning channel is opened by the model emitting `<|channel>` + // which http_server forwards into the SseEmitter as the text + // `` — so the emitter's existing CONTENT→REASONING + // transition fires on that synthesized opener. started_in_thinking + // stays false (initial CONTENT mode is correct). break; } } - return result; + return PromptRenderResult{std::move(result), started_in_thinking}; } // ─── Jinja path ───────────────────────────────────────────────────────── @@ -353,7 +376,29 @@ static std::shared_ptr get_or_parse(const std::string & template } // namespace -std::string render_chat_template_jinja( +// Sniff a rendered prompt for a trailing `` opener so the caller +// can route subsequent stream tokens to the reasoning channel. Accepts +// optional whitespace after the opener (Qwen3.6 emits `\n`). +// True positive ⇒ caller should treat the prompt as having pre-opened +// the reasoning channel (and the renderer warns loudly so a model-card +// mismatch is visible at runtime). +static bool prompt_ends_with_think_open(const std::string & s) { + static const std::string OPEN = ""; + // Walk back over trailing ASCII whitespace. + size_t end = s.size(); + while (end > 0) { + char c = s[end - 1]; + if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { + end--; + } else { + break; + } + } + if (end < OPEN.size()) return false; + return s.compare(end - OPEN.size(), OPEN.size(), OPEN) == 0; +} + +PromptRenderResult render_chat_template_jinja( const std::string & template_src, const std::vector & messages, const std::string & bos_token, @@ -407,14 +452,37 @@ std::string render_chat_template_jinja( throw std::runtime_error(std::string("jinja global_from_json: ") + e.what()); } + std::string rendered; try { jinja::runtime rt(ctx); jinja::value results = rt.execute(*prog); auto parts = jinja::runtime::gather_string_parts(results); - return parts->as_string().str(); + rendered = parts->as_string().str(); } catch (const std::exception & e) { throw std::runtime_error(std::string("jinja runtime: ") + e.what()); } + + // Jinja path: we don't know which template family the caller passed + // in, so derive `started_in_thinking` by sniffing the rendered tail + // for a `` opener. This catches the common Qwen3.6 / Laguna + // chat templates that end with `\n` when enable_thinking is + // honored, plus any custom template that follows the same convention. + // + // Warn loudly when sniffing decides true so a template/model-card + // mismatch (e.g. enable_thinking=false but template hard-codes + // `` anyway) surfaces in server logs. + bool started_in_thinking = + enable_thinking && add_generation_prompt && + prompt_ends_with_think_open(rendered); + if (started_in_thinking) { + std::fprintf(stderr, + "[WARN] render_chat_template_jinja: rendered prompt ends with " + "`` opener — treating as started_in_thinking=true. If " + "this is unexpected, check the template's enable_thinking " + "branch or the model card's reasoning configuration.\n"); + } + + return PromptRenderResult{std::move(rendered), started_in_thinking}; } } // namespace dflash::common diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h index ca7ef9db5..770e65a42 100644 --- a/server/src/server/chat_template.h +++ b/server/src/server/chat_template.h @@ -27,6 +27,23 @@ enum class ChatFormat { GEMMA4, // <|turn>role\n...\n }; +// Provenance for a rendered prompt. `text` is the byte string that gets +// tokenized; `started_in_thinking` records whether the prompt suffix +// pre-opens a `` block (or equivalent reasoning-channel marker) +// that the model is expected to continue into. +// +// Callers route this into the SseEmitter's initial mode and into +// parse_reasoning()'s `started_in_thinking` argument so reasoning text +// emitted before any explicit `` opener is still attributed to +// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna +// enable_thinking prompts (which pre-open `\n` in the assistant +// turn) cause the model to emit reasoning straight into the content +// channel, leaving `reasoning_content` empty. +struct PromptRenderResult { + std::string text; // rendered prompt text, ready to tokenize + bool started_in_thinking; // prompt suffix opens reasoning channel +}; + // Render chat messages into the model-specific prompt string. // The result is plain text ready to be tokenized. // @@ -40,7 +57,7 @@ enum class ChatFormat { // `tools_json` is an optional JSON string containing the tool definitions // array. When non-empty, the Qwen3/3.5 template injects a tool preamble // into the system message instructing the model how to emit tags. -std::string render_chat_template( +PromptRenderResult render_chat_template( const std::vector & messages, ChatFormat format, bool add_generation_prompt = true, @@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch); // Internally caches the most recently parsed program per thread (avoids // re-parsing the template on every request). Throws std::runtime_error on // lexer/parser/runtime failure (caller should surface a 500 response). -std::string render_chat_template_jinja( +PromptRenderResult render_chat_template_jinja( const std::string & template_src, const std::vector & messages, const std::string & bos_token, diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index a89309dd3..311a51025 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -1009,7 +1009,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { tools_json = req.tools.dump(); } - std::string rendered; + PromptRenderResult render_result; if (!config_.chat_template_src.empty()) { // Jinja path: caller supplied a chat template file via // --chat-template-file. Override the hardcoded QWEN3/LAGUNA @@ -1026,7 +1026,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { ? tokenizer_.raw_token(tokenizer_.eos_id()) : std::string(); try { - rendered = render_chat_template_jinja( + render_result = render_chat_template_jinja( config_.chat_template_src, chat_msgs, bos_str, @@ -1040,11 +1040,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { return true; } } else { - rendered = render_chat_template(chat_msgs, chat_format_, - true, enable_thinking, - tools_json); + render_result = render_chat_template(chat_msgs, chat_format_, + true, enable_thinking, + tools_json); } - req.prompt_tokens = tokenizer_.encode(rendered); + // Propagate prompt provenance so the SseEmitter's initial mode + // matches the template's pre-opened reasoning channel (Qwen3.6 / + // Laguna enable_thinking case). Without this, reasoning text + // leaks into the content channel and `reasoning_content` stays + // empty — see fix(server): route Qwen3.6/Laguna think-mode + // reasoning to reasoning_content channel. + req.started_in_thinking = render_result.started_in_thinking; + req.prompt_tokens = tokenizer_.encode(render_result.text); // count_tokens: short-circuit after tokenization. Skip generation // entirely — Anthropic's contract is just `{"input_tokens": N}`. @@ -1149,11 +1156,20 @@ void HttpServer::worker_loop() { } } - // Create SSE emitter for streaming state machine. + // Create SSE emitter for streaming state machine. `initial_mode` + // tracks whether the chat-template prompt pre-opened a `` + // block (Qwen3.6 / Laguna enable_thinking path). When true, the + // emitter starts in REASONING so the model's first generated + // token routes to reasoning_content even though no explicit + // `` opener appears in the token stream. + const StreamMode initial_mode = req.started_in_thinking + ? StreamMode::REASONING + : StreamMode::CONTENT; SseEmitter emitter(req.format, req.response_id, req.model, (int)req.prompt_tokens.size(), req.tools, &tool_memory_, - req.stop_sequences); + req.stop_sequences, + initial_mode); // Emit initial SSE events. if (req.stream) { diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 999eb5d99..bf5477f67 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -200,6 +200,12 @@ struct ParsedRequest { std::vector stop_sequences; // Bandit: per-session adaptive keep_ratio opt-in std::string session_id; + // Set by the chat-template renderer when the rendered prompt suffix + // pre-opens a `` block (Qwen3.6 / Laguna enable_thinking path). + // Drives the SseEmitter's initial mode so reasoning tokens emitted + // before any explicit `` opener route to reasoning_content + // instead of leaking into content. + bool started_in_thinking = false; }; // Build the /props response body. Exposed (non-static) so unit tests diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp index 604f11a73..beae530bf 100644 --- a/server/src/server/sse_emitter.cpp +++ b/server/src/server/sse_emitter.cpp @@ -76,15 +76,16 @@ SseEmitter::SseEmitter(ApiFormat format, int prompt_tokens, const json & tools, ToolMemory * tool_memory, - const std::vector & stop_sequences) + const std::vector & stop_sequences, + StreamMode initial_mode) : format_(format) , request_id_(request_id) , model_name_(model_name) , prompt_tokens_(prompt_tokens) , tools_(tools) , tool_memory_(tool_memory) - , mode_(StreamMode::CONTENT) - , active_kind_("text") + , mode_(initial_mode) + , active_kind_(initial_mode == StreamMode::REASONING ? "thinking" : "text") , stop_sequences_(stop_sequences) , created_at_(unix_timestamp()) , msg_item_id_(gen_item_id()) @@ -93,6 +94,12 @@ SseEmitter::SseEmitter(ApiFormat format, for (const auto & s : stop_sequences_) { if (s.size() > stop_holdback_) stop_holdback_ = s.size(); } + // NOTE on `checked_think_prefix_`: we deliberately leave the default + // (false) here even when initial_mode == REASONING. The emitter has a + // one-time guard in emit_token() that strips a redundantly-emitted + // leading `` if the model emits one anyway (model-card / + // template-mismatch edge case). Pre-setting the flag to true would + // skip that strip and leak the duplicate opener into reasoning_text. } // ─── SSE formatting helpers ───────────────────────────────────────────── diff --git a/server/src/server/sse_emitter.h b/server/src/server/sse_emitter.h index 4710b8d45..fcc40d9c3 100644 --- a/server/src/server/sse_emitter.h +++ b/server/src/server/sse_emitter.h @@ -54,13 +54,27 @@ nlohmann::json build_timings_json(const GenTimings & t, int completion_tokens); // Manages SSE streaming for a single request. class SseEmitter { public: + // `initial_mode` defaults to CONTENT for backward compatibility. Pass + // StreamMode::REASONING when the chat-template prompt suffix pre-opens + // a `` block (Qwen3.6 / Laguna enable_thinking path): the + // model's first generated token is reasoning, never preceded by an + // explicit `` opener in the stream. Without this hint the + // emitter would route reasoning text to the content channel and + // reasoning_content would stay empty. + // + // Note: the leading-`` strip guard (`checked_think_prefix_`) + // remains active when we start in REASONING mode — if the model + // *does* emit a redundant `` opener anyway, the guard still + // strips it. Pre-setting checked_think_prefix_=true here would let a + // duplicate `` leak into reasoning_text in that edge case. SseEmitter(ApiFormat format, const std::string & request_id, const std::string & model_name, int prompt_tokens, const json & tools, ToolMemory * tool_memory, - const std::vector & stop_sequences = {}); + const std::vector & stop_sequences = {}, + StreamMode initial_mode = StreamMode::CONTENT); // Emit the initial SSE events (role delta, message_start, etc.) // Returns the formatted SSE strings to send. diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index 147ddbce9..d16c4b43e 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -1058,11 +1058,11 @@ static void test_jinja_render_basic() { {"system", "you are helpful", ""}, {"user", "hi", ""}, }; - std::string out = render_chat_template_jinja( + auto out = render_chat_template_jinja( MINI_JINJA_TEMPLATE, msgs, /*bos=*/"", /*eos=*/"", /*add_gen=*/true, /*think=*/false, - /*tools=*/""); + /*tools=*/"").text; TEST_ASSERT(out.find("<|system|>you are helpful") != std::string::npos); TEST_ASSERT(out.find("<|user|>hi") != std::string::npos); TEST_ASSERT(out.find("<|assistant|>") != std::string::npos); @@ -1070,9 +1070,9 @@ static void test_jinja_render_basic() { static void test_jinja_render_no_gen_prompt() { std::vector msgs = {{"user", "ping", ""}}; - std::string out = render_chat_template_jinja( + auto out = render_chat_template_jinja( MINI_JINJA_TEMPLATE, msgs, "", "", - /*add_gen=*/false, /*think=*/false, ""); + /*add_gen=*/false, /*think=*/false, "").text; TEST_ASSERT(out.find("<|user|>ping") != std::string::npos); TEST_ASSERT(out.find("<|assistant|>") == std::string::npos); } @@ -1084,8 +1084,8 @@ static void test_jinja_render_tools_injected() { "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"; std::vector msgs = {{"user", "?", ""}}; std::string tools = R"([{"name":"my_tool","description":"test"}])"; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, tools); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, tools).text; TEST_ASSERT(out.find("TOOLS_PRESENT:my_tool") != std::string::npos); } @@ -1094,8 +1094,8 @@ static void test_jinja_render_empty_tools_skipped() { static const char TPL[] = "{%- if tools -%}TOOLS_PRESENT{%- else -%}NO_TOOLS{%- endif -%}"; std::vector msgs = {{"user", "?", ""}}; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, "[]"); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, "[]").text; TEST_ASSERT(out.find("NO_TOOLS") != std::string::npos); TEST_ASSERT(out.find("TOOLS_PRESENT") == std::string::npos); } @@ -1104,8 +1104,8 @@ static void test_jinja_render_bos_eos_threaded() { // {{ bos_token }} and {{ eos_token }} must reach the template. static const char TPL[] = "{{ bos_token }}HI{{ eos_token }}"; std::vector msgs; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, ""); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, "").text; TEST_ASSERT(out == "HI"); } @@ -1133,6 +1133,213 @@ static void test_jinja_render_bad_tools_json_throws() { TEST_ASSERT(threw); } +// ─── started_in_thinking provenance ───────────────────────────────────── +// +// Regression suite for the Qwen3.6 / Laguna think-mode channel-routing +// bug: the rendered prompt suffix pre-opens `` so the model +// starts emitting reasoning tokens with no explicit opener. Callers +// route PromptRenderResult.started_in_thinking → SseEmitter initial +// mode so reasoning text lands in reasoning_content, not content. + +static void test_chat_template_qwen3_enable_thinking_pre_opens() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(result.started_in_thinking); + // Sanity: rendered suffix ends with `\n` per the Qwen3.6 + // chat_template.jinja's enable_thinking branch. + TEST_ASSERT(result.text.size() >= 8); + TEST_ASSERT(result.text.compare(result.text.size() - 8, 8, "\n") == 0); +} + +static void test_chat_template_qwen3_disable_thinking_does_not_pre_open() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); + // The disabled branch emits `\n\n\n\n` — closes + // immediately, so the reasoning channel is NOT left open. + TEST_ASSERT(result.text.find("") != std::string::npos); +} + +static void test_chat_template_qwen3_no_gen_prompt_does_not_pre_open() { + // Without add_generation_prompt the assistant turn isn't appended + // and there's nothing to pre-open. + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/false, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); +} + +static void test_chat_template_laguna_enable_thinking_pre_opens() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(result.started_in_thinking); + TEST_ASSERT(result.text.size() >= 7); + TEST_ASSERT(result.text.compare(result.text.size() - 7, 7, "") == 0); +} + +static void test_chat_template_laguna_disable_thinking_does_not_pre_open() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); +} + +static void test_chat_template_gemma4_does_not_pre_open() { + // Gemma4's reasoning channel is opened by the model's `<|channel>` + // token (which http_server forwards into the emitter as ``). + // The prompt itself never pre-opens `` regardless of + // enable_thinking, so started_in_thinking must stay false. + std::vector msgs = {{"user", "hi", ""}}; + auto enabled = render_chat_template(msgs, ChatFormat::GEMMA4, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(!enabled.started_in_thinking); + auto disabled = render_chat_template(msgs, ChatFormat::GEMMA4, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!disabled.started_in_thinking); +} + +// Jinja path: suffix-sniff detection. The renderer should set +// started_in_thinking=true when the rendered prompt ends with `` +// (optionally followed by whitespace) AND enable_thinking is honored. +static void test_jinja_render_suffix_sniff_sets_started_in_thinking() { + static const char TPL[] = + "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|assistant|>{%- if enable_thinking -%}\n{%- endif -%}" + "{%- endif -%}"; + std::vector msgs = {{"user", "?", ""}}; + auto r = render_chat_template_jinja( + TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, ""); + TEST_ASSERT(r.started_in_thinking); +} + +static void test_jinja_render_suffix_sniff_negative() { + // Template doesn't end with `` → started_in_thinking=false + // even with enable_thinking=true. + static const char TPL[] = + "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}" + "{%- if add_generation_prompt -%}<|assistant|>{%- endif -%}"; + std::vector msgs = {{"user", "?", ""}}; + auto r = render_chat_template_jinja( + TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, ""); + TEST_ASSERT(!r.started_in_thinking); +} + +// ─── SseEmitter initial_mode=REASONING ────────────────────────────────── +// +// Regression: when constructed with initial_mode=REASONING (the +// Qwen3.6/Laguna enable_thinking path), the emitter must route the +// model's first generated tokens to reasoning_content until a natural +// `` is seen, even though no explicit `` opener appears +// in the stream. + +static void test_emitter_initial_mode_reasoning_routes_to_reasoning_content() { + SseEmitter em(ApiFormat::OPENAI_CHAT, "req-1", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + + // Model emits reasoning tokens directly with no leading `` + // (because the prompt suffix already opened the channel), then + // closes with `` and emits the answer. + em.emit_token("alpha "); + em.emit_token("beta "); + em.emit_token("\n\nAnswer: 4"); + em.emit_finish(4); + + TEST_ASSERT(em.reasoning_text().find("alpha") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("beta") != std::string::npos); + // No spurious tag leaked into reasoning or content. + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("Answer: 4") != std::string::npos); +} + +static void test_emitter_initial_mode_reasoning_unclosed_stays_reasoning() { + // No close — everything stays in reasoning_content, content + // stays empty. Matches parse_reasoning(started_in_thinking=true) + // behavior for the non-streaming path. + SseEmitter em(ApiFormat::OPENAI_CHAT, "req-2", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + em.emit_token("still thinking"); + em.emit_token(" more thinking"); + em.emit_finish(3); + + TEST_ASSERT(em.reasoning_text().find("still thinking") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("more thinking") != std::string::npos); + TEST_ASSERT(em.accumulated_text().empty()); +} + +static void test_emitter_initial_mode_reasoning_strips_redundant_think_opener() { + // Edge case: prompt pre-opened , but the model also emits a + // leading anyway (template/model-card mismatch). The + // emitter's strip guard (checked_think_prefix_) must still trip + // because we deliberately leave it at its default (false) in the + // constructor — otherwise the duplicate opener would leak into + // reasoning_text. + SseEmitter em(ApiFormat::OPENAI_CHAT, "req-3", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + em.emit_token("actual reasoninganswer"); + em.emit_finish(3); + + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("actual reasoning") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("answer") != std::string::npos); +} + +static void test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking() { + // Anthropic format: when starting in REASONING mode, the very first + // content_block_start must be `thinking`, not `text`. Otherwise the + // emitter would open a text block, then have to stop+restart it as + // thinking on the first reasoning delta — wasteful and visible to + // SDK clients as a spurious empty text block. + SseEmitter em(ApiFormat::ANTHROPIC, "req-4", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + auto start = em.emit_start(); + std::string all; + for (const auto & c : start) all += c; + // First content block must be a thinking block. nlohmann::json sorts + // keys alphabetically on dump(), so the inner block serializes as + // `{"thinking":"","type":"thinking"}` (NOT type-first). Assert on + // the unique `"thinking":""` opener which only appears in the + // thinking-kind serialization. + TEST_ASSERT(all.find("\"thinking\":\"\",\"type\":\"thinking\"") + != std::string::npos); + // And the initial text-block opener must NOT appear (regression: if + // active_kind_ defaulted to "text", emit_start would have emitted + // `{"text":"","type":"text"}` here instead). + TEST_ASSERT(all.find("\"text\":\"\",\"type\":\"text\"") + == std::string::npos); +} + static void test_normalize_responses_tool_followup_messages() { ToolMemory tool_memory; const std::string call_id = "call_exec_001"; @@ -2538,6 +2745,10 @@ int main() { RUN_TEST(test_emitter_streaming_openai_has_done); RUN_TEST(test_emitter_nonstreaming_accumulates); RUN_TEST(test_emitter_anthropic_thinking_blocks); + RUN_TEST(test_emitter_initial_mode_reasoning_routes_to_reasoning_content); + RUN_TEST(test_emitter_initial_mode_reasoning_unclosed_stays_reasoning); + RUN_TEST(test_emitter_initial_mode_reasoning_strips_redundant_think_opener); + RUN_TEST(test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking); std::fprintf(stderr, "\n── Stop sequences ──\n"); RUN_TEST(test_stop_sequence_basic); @@ -2579,6 +2790,14 @@ int main() { RUN_TEST(test_jinja_render_bos_eos_threaded); RUN_TEST(test_jinja_render_empty_template_throws); RUN_TEST(test_jinja_render_bad_tools_json_throws); + RUN_TEST(test_chat_template_qwen3_enable_thinking_pre_opens); + RUN_TEST(test_chat_template_qwen3_disable_thinking_does_not_pre_open); + RUN_TEST(test_chat_template_qwen3_no_gen_prompt_does_not_pre_open); + RUN_TEST(test_chat_template_laguna_enable_thinking_pre_opens); + RUN_TEST(test_chat_template_laguna_disable_thinking_does_not_pre_open); + RUN_TEST(test_chat_template_gemma4_does_not_pre_open); + RUN_TEST(test_jinja_render_suffix_sniff_sets_started_in_thinking); + RUN_TEST(test_jinja_render_suffix_sniff_negative); RUN_TEST(test_normalize_responses_tool_followup_messages); std::fprintf(stderr, "\n── Placement config ──\n"); From f62b1ebbce8c005d689aa392daf476bfa38f9325 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Fri, 29 May 2026 15:36:33 -0400 Subject: [PATCH 2/4] =?UTF-8?q?test(server):=20add=20render=E2=86=92emit?= =?UTF-8?q?=20integration=20tests=20for=20reasoning=20channel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three C++ tests that chain render_chat_template + SseEmitter so the wiring between the renderer's started_in_thinking flag and the emitter's initial_mode is exercised end-to-end, not just at each end. The per-unit tests above each verify their half of the contract, but the original bug was a missing call-site wire — both halves were correct in isolation. Also tighten the Python integration test assertions for enable_thinking and reasoning.effort: require non-empty reasoning_content and no raw / in either channel. The prior 'doesn't crash' assertion would have passed on the broken code. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/scripts/test_server_integration.py | 38 ++++++-- server/test/test_server_unit.cpp | 101 ++++++++++++++++++++++ 2 files changed, 133 insertions(+), 6 deletions(-) diff --git a/server/scripts/test_server_integration.py b/server/scripts/test_server_integration.py index 08da6cedf..43535203b 100644 --- a/server/scripts/test_server_integration.py +++ b/server/scripts/test_server_integration.py @@ -490,7 +490,12 @@ def test_thinking_disabled_by_default(self): @pytest.mark.slow def test_thinking_enabled_via_chat_template_kwargs(self): - """Enabling thinking should produce reasoning_content.""" + """Enabling thinking must route reasoning into reasoning_content, + not leak it into content. Regression guard for the Qwen3.6/Laguna + pre-opened- bug: the chat template appends `` to the + prompt suffix, so the model emits reasoning directly with no + opening tag. If the renderer→emitter wiring drops, reasoning_content + stays empty and the raw reasoning text appears in content.""" r = post_json("/v1/chat/completions", { "model": MODEL_NAME, "messages": [{"role": "user", "content": "What is 15 * 17?"}], @@ -500,13 +505,26 @@ def test_thinking_enabled_via_chat_template_kwargs(self): }) assert r.status_code == 200 msg = r.json()["choices"][0]["message"] - assert msg["content"] - # With thinking enabled, model may produce reasoning_content - # (not guaranteed for short prompts, so we just check it doesn't crash) + reasoning = msg.get("reasoning_content") or "" + content = msg.get("content") or "" + assert reasoning, ( + f"reasoning_content empty with enable_thinking=True — " + f"renderer→emitter wiring likely broken. content={content[:200]!r}" + ) + assert "" not in reasoning and "" not in reasoning, ( + f"raw think tags leaked into reasoning_content: {reasoning[:200]!r}" + ) + assert "" not in content and "" not in content, ( + f"think tags leaked into content channel: {content[:200]!r}" + ) + assert content, "content channel empty — model never closed " @pytest.mark.slow def test_thinking_enabled_via_reasoning_effort(self): - """OpenAI Responses-style reasoning.effort field.""" + """OpenAI Responses-style reasoning.effort=high must also route + reasoning to reasoning_content. Same regression class as above + but reached through a different request shape (effort→template + kwargs translation in http_server.cpp).""" r = post_json("/v1/chat/completions", { "model": MODEL_NAME, "messages": [{"role": "user", "content": "What is 15 * 17?"}], @@ -516,7 +534,15 @@ def test_thinking_enabled_via_reasoning_effort(self): }) assert r.status_code == 200 msg = r.json()["choices"][0]["message"] - assert msg["content"] + reasoning = msg.get("reasoning_content") or "" + content = msg.get("content") or "" + assert reasoning, ( + f"reasoning_content empty with reasoning.effort=high — " + f"renderer→emitter wiring likely broken. content={content[:200]!r}" + ) + assert "" not in reasoning and "" not in reasoning + assert "" not in content and "" not in content + assert content # ═══════════════════════════════════════════════════════════════════ diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index d16c4b43e..fc983c2d3 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -1340,6 +1340,104 @@ static void test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinkin == std::string::npos); } +// ─── Integration: render_chat_template → SseEmitter wiring ────────────── +// +// The original bug was an integration gap: render_chat_template correctly +// reported started_in_thinking=true, but no caller routed it into the +// SseEmitter's initial_mode, so reasoning text leaked into content and +// reasoning_content stayed empty. Each end of the wire has its own unit +// tests above; these chain the two ends so a future refactor that drops +// the propagation cannot pass without an assertion failure here. +// +// The body mirrors the production wiring in +// server/src/server/http_server.cpp (the `started_in_thinking → +// initial_mode → SseEmitter` chain). Keep these in sync if that wiring +// moves. + +static void test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning() { + std::vector msgs = {{"user", "What is 2+2?", ""}}; + auto render = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT_MSG(render.started_in_thinking, + "renderer end of wire: QWEN3 enable_thinking must pre-open "); + + const StreamMode initial_mode = render.started_in_thinking + ? StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-q", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Let me compute. "); + em.emit_token("2+2 equals 4."); + em.emit_token("\n\nThe answer is 4."); + em.emit_finish(5); + + TEST_ASSERT_MSG(!em.reasoning_text().empty(), + "wiring broken: reasoning_content empty despite started_in_thinking=true"); + TEST_ASSERT(em.reasoning_text().find("Let me compute") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT_MSG(em.accumulated_text().find("Let me compute") == std::string::npos, + "wiring broken: reasoning text leaked into content channel"); + TEST_ASSERT(em.accumulated_text().find("The answer is 4") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); +} + +static void test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning() { + std::vector msgs = {{"user", "Solve 7*8.", ""}}; + auto render = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT_MSG(render.started_in_thinking, + "renderer end of wire: LAGUNA enable_thinking must pre-open "); + + const StreamMode initial_mode = render.started_in_thinking + ? StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-l", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Working through it: "); + em.emit_token("7*8 = 56."); + em.emit_token("\n\n56."); + em.emit_finish(4); + + TEST_ASSERT_MSG(!em.reasoning_text().empty(), + "wiring broken: reasoning_content empty despite started_in_thinking=true"); + TEST_ASSERT(em.reasoning_text().find("Working through it") != std::string::npos); + TEST_ASSERT_MSG(em.accumulated_text().find("Working through it") == std::string::npos, + "wiring broken: reasoning text leaked into content channel"); + TEST_ASSERT(em.accumulated_text().find("56.") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); +} + +static void test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content() { + // Inverse direction: when enable_thinking=false the renderer must not + // pre-open and the emitter must start in CONTENT, so the model's + // tokens land in content from the first byte. Guards against the + // opposite regression of unconditionally starting in REASONING. + std::vector msgs = {{"user", "Hi.", ""}}; + auto render = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!render.started_in_thinking); + + const StreamMode initial_mode = render.started_in_thinking + ? StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-n", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Hello there."); + em.emit_finish(2); + + TEST_ASSERT(em.reasoning_text().empty()); + TEST_ASSERT(em.accumulated_text().find("Hello there") != std::string::npos); +} + static void test_normalize_responses_tool_followup_messages() { ToolMemory tool_memory; const std::string call_id = "call_exec_001"; @@ -2798,6 +2896,9 @@ int main() { RUN_TEST(test_chat_template_gemma4_does_not_pre_open); RUN_TEST(test_jinja_render_suffix_sniff_sets_started_in_thinking); RUN_TEST(test_jinja_render_suffix_sniff_negative); + RUN_TEST(test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning); + RUN_TEST(test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning); + RUN_TEST(test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content); RUN_TEST(test_normalize_responses_tool_followup_messages); std::fprintf(stderr, "\n── Placement config ──\n"); From 40d158f7fb72685715f2c0f350fe3288a12edab8 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Fri, 29 May 2026 18:40:35 -0400 Subject: [PATCH 3/4] test(server): CPU-only HTTP server driver + stub backend + CI integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a deterministic, scenario-driven integration test that exercises the full HttpServer request path on a CPU-only CI runner — no GPU, no model weights, no GGUF download. Designed to catch the regression class from this PR (renderer→emitter wiring) end-to-end pre-merge. Components: - StubModelBackend (server/test/stub_model_backend.*) — a ModelBackend whose generate() decodes the prompt to text via the real tokenizer, matches it against a JSON scenario (longest prompt_suffix wins), then replays scripted tokens through the production req.on_token/io.on_token callbacks. Streaming behavior comes from the production code path; the stub just feeds it tokens. - ScenarioStore (server/test/scenario_store.*) — loads server/test/ scenarios/*.json. Schema: { "match": {"prompt_suffix": "..."}, "response": {"tokens": [...], "finish_reason": "stop"} } Tokens are either plain strings (BPE-encoded by the real tokenizer) or {text, special:true} objects (looked up via token_to_id, so Qwen3.6's single-token arrives as the right ID). - spike_no_gpu_http_server (server/test/) — driver that wires Tokenizer + ScenarioStore + StubModelBackend + HttpServer together. Links dflash_common (CUDA TUs included) but never instantiates a real model; ggml_cuda_init() is never called, so CUDA_VISIBLE_DEVICES="" is the supported configuration. - Tokenizer fixture (server/test/fixtures/qwen3.6-tokenizer.gguf, 11MB via LFS) — full Qwen3.6 vocab/merges/special tokens stripped from the 27B GGUF. Real BPE round-tripping, deterministic token concordance. Build script at server/test/scripts/strip_gguf_to_tokenizer.py. - test_stub_integration.py — pytest module that spawns the driver and exercises OpenAI + Anthropic, streaming + non-streaming. Asserts on reasoning_content routing, content channel cleanliness, no leakage, Anthropic first-content-block-is-thinking. 4 tests, 0.43s. - CI (.github/workflows/ci.yml) — new "Run CPU integration tests" step after the venv populate, before the megakernel build. Builds the spike target alongside the existing ones. Enables LFS in checkout so the tokenizer fixture lands. Why this matters: the 1656-assertion test_server_unit suite catches emitter/renderer issues in isolation but cannot fail on a missed http_server.cpp wire (the original bug). This new step exercises the exact wire — render → ParsedRequest → SseEmitter → SSE socket — on every PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitattributes | 1 + .github/workflows/ci.yml | 16 +- server/CMakeLists.txt | 32 +++ server/test/fixtures/qwen3.6-tokenizer.gguf | 3 + server/test/scenario_store.cpp | 167 ++++++++++++ server/test/scenario_store.h | 84 ++++++ .../qwen3_enable_thinking_basic.json | 19 ++ .../test/scripts/strip_gguf_to_tokenizer.py | 91 +++++++ server/test/spike_no_gpu_http_server.cpp | 120 +++++++++ server/test/stub_model_backend.cpp | 73 ++++++ server/test/stub_model_backend.h | 63 +++++ server/test/test_stub_integration.py | 241 ++++++++++++++++++ 12 files changed, 909 insertions(+), 1 deletion(-) create mode 100644 server/test/fixtures/qwen3.6-tokenizer.gguf create mode 100644 server/test/scenario_store.cpp create mode 100644 server/test/scenario_store.h create mode 100644 server/test/scenarios/qwen3_enable_thinking_basic.json create mode 100644 server/test/scripts/strip_gguf_to_tokenizer.py create mode 100644 server/test/spike_no_gpu_http_server.cpp create mode 100644 server/test/stub_model_backend.cpp create mode 100644 server/test/stub_model_backend.h create mode 100644 server/test/test_stub_integration.py diff --git a/.gitattributes b/.gitattributes index 592097d13..45e8d6ec3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,3 +5,4 @@ assets/banner.png -filter -diff -merge -text *.jpeg filter=lfs diff=lfs merge=lfs -text *.mp4 filter=lfs diff=lfs merge=lfs -text *.webm filter=lfs diff=lfs merge=lfs -text +*.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2a87f88ad..d6f605365 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive + lfs: true token: ${{ secrets.SUBMODULE_PAT || secrets.GITHUB_TOKEN }} - uses: Jimver/cuda-toolkit@v0.2.35 @@ -55,7 +56,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release cmake --build build --target \ test_dflash test_generate test_flash_attn_sparse \ - dflash_server test_server_unit \ + dflash_server test_server_unit spike_no_gpu_http_server \ -j$(nproc) - name: Run C++ server unit tests @@ -70,6 +71,19 @@ jobs: # in the optional `megakernel` extra so its build does NOT run yet. run: uv sync --frozen + - name: Run CPU integration tests (stub backend, no GPU) + # End-to-end exercise of HttpServer + render_chat_template + + # SseEmitter with a deterministic stub model backend. No GPU + # required: the spike driver runs under CUDA_VISIBLE_DEVICES="" + # and the tokenizer fixture is a stripped Qwen3.6 GGUF (metadata + # only). Covers the regression class from PR #308 end-to-end — + # streaming and non-streaming, OpenAI and Anthropic formats. + env: + CUDA_VISIBLE_DEVICES: "" + run: | + uv run --frozen --with pytest --with requests \ + pytest -v server/test/test_stub_integration.py + - name: Build megakernel via uv sync (sm_75) env: CUDA_HOME: ${{ env.CUDA_PATH }} diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 71298ff6a..ad09b526e 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -769,6 +769,38 @@ if(DFLASH27B_TESTS) endif() endif() + # ─── Spike: HttpServer with stub backend, no GPU bound ────────────── + # Validates that the server constructs and serves /health without a + # GPU — the foundation for a CPU-only integration test driver. Links + # dflash_common (which includes CUDA-compiled TUs) but never calls + # into any real ModelBackend; running under CUDA_VISIBLE_DEVICES="" + # is the supported configuration. + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/spike_no_gpu_http_server.cpp") + add_executable(spike_no_gpu_http_server + test/spike_no_gpu_http_server.cpp + test/scenario_store.cpp + test/stub_model_backend.cpp + src/server/http_server.cpp + src/server/model_card.cpp) + target_include_directories(spike_no_gpu_http_server PRIVATE + ${DFLASH27B_SRC_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/test) + if(DFLASH27B_GPU_BACKEND STREQUAL "hip") + target_compile_definitions(spike_no_gpu_http_server PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP) + else() + target_compile_definitions(spike_no_gpu_http_server PRIVATE + DFLASH27B_BACKEND_CUDA=1 + DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm}) + endif() + target_link_libraries(spike_no_gpu_http_server PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread) + if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") + find_package(CUDAToolkit REQUIRED) + target_link_libraries(spike_no_gpu_http_server PRIVATE CUDA::cudart) + else() + target_link_libraries(spike_no_gpu_http_server PRIVATE hip::host) + endif() + endif() + # ─── Unit tests (no GPU, no model files) ──────────────────────────── enable_testing() diff --git a/server/test/fixtures/qwen3.6-tokenizer.gguf b/server/test/fixtures/qwen3.6-tokenizer.gguf new file mode 100644 index 000000000..967827d3b --- /dev/null +++ b/server/test/fixtures/qwen3.6-tokenizer.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b420704f204792f14b919b3d52effa839cc7a0739844c2fd2887f6400b045aa +size 10941620 diff --git a/server/test/scenario_store.cpp b/server/test/scenario_store.cpp new file mode 100644 index 000000000..1badd7d14 --- /dev/null +++ b/server/test/scenario_store.cpp @@ -0,0 +1,167 @@ +#include "scenario_store.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace dflash::common::test { + +using json = nlohmann::json; + +namespace { + +bool parse_token(const json & j, ScenarioToken & out, const std::string & file) { + if (j.is_string()) { + out.text = j.get(); + out.special = false; + return true; + } + if (j.is_object()) { + if (!j.contains("text") || !j["text"].is_string()) { + std::fprintf(stderr, + "[scenario] %s: token object missing 'text' string\n", + file.c_str()); + return false; + } + out.text = j["text"].get(); + out.special = j.value("special", false); + return true; + } + std::fprintf(stderr, + "[scenario] %s: token must be a string or {text,special} object\n", + file.c_str()); + return false; +} + +bool parse_scenario(const json & j, Scenario & out, const std::string & file) { + if (!j.is_object()) { + std::fprintf(stderr, "[scenario] %s: top level must be object\n", file.c_str()); + return false; + } + out.name = j.value("name", file); + out.description = j.value("description", ""); + + if (!j.contains("match") || !j["match"].is_object()) { + std::fprintf(stderr, "[scenario] %s: missing 'match' object\n", file.c_str()); + return false; + } + const auto & m = j["match"]; + if (!m.contains("prompt_suffix") || !m["prompt_suffix"].is_string()) { + std::fprintf(stderr, + "[scenario] %s: match.prompt_suffix must be a string\n", file.c_str()); + return false; + } + out.match_prompt_suffix = m["prompt_suffix"].get(); + if (out.match_prompt_suffix.empty()) { + std::fprintf(stderr, + "[scenario] %s: match.prompt_suffix is empty — would match every prompt\n", + file.c_str()); + return false; + } + + if (!j.contains("response") || !j["response"].is_object()) { + std::fprintf(stderr, "[scenario] %s: missing 'response' object\n", file.c_str()); + return false; + } + const auto & r = j["response"]; + out.response.ok = r.value("ok", true); + out.response.error = r.value("error", ""); + out.response.finish_reason = r.value("finish_reason", "stop"); + out.response.decode_us = r.value("decode_us", 0); + + if (out.response.ok) { + if (!r.contains("tokens") || !r["tokens"].is_array()) { + std::fprintf(stderr, + "[scenario] %s: response.tokens must be an array when ok=true\n", + file.c_str()); + return false; + } + out.response.tokens.reserve(r["tokens"].size()); + for (const auto & jt : r["tokens"]) { + ScenarioToken tok; + if (!parse_token(jt, tok, file)) return false; + out.response.tokens.push_back(std::move(tok)); + } + } + return true; +} + +bool ends_with(const std::string & s, const std::string & suffix) { + if (suffix.size() > s.size()) return false; + return std::equal(suffix.rbegin(), suffix.rend(), s.rbegin()); +} + +} // namespace + +bool ScenarioStore::load_file(const std::string & path) { + std::ifstream f(path); + if (!f.is_open()) { + std::fprintf(stderr, "[scenario] cannot open: %s\n", path.c_str()); + return false; + } + std::stringstream buf; + buf << f.rdbuf(); + json j; + try { + j = json::parse(buf.str()); + } catch (const std::exception & e) { + std::fprintf(stderr, "[scenario] %s: JSON parse error: %s\n", + path.c_str(), e.what()); + return false; + } + Scenario sc; + if (!parse_scenario(j, sc, path)) return false; + scenarios_.push_back(std::move(sc)); + std::fprintf(stderr, "[scenario] loaded %s (suffix=%zub, tokens=%zu)\n", + scenarios_.back().name.c_str(), + scenarios_.back().match_prompt_suffix.size(), + scenarios_.back().response.tokens.size()); + return true; +} + +bool ScenarioStore::load_directory(const std::string & dir) { + DIR * d = opendir(dir.c_str()); + if (!d) { + std::fprintf(stderr, "[scenario] cannot open dir: %s\n", dir.c_str()); + return false; + } + std::vector files; + while (struct dirent * de = readdir(d)) { + std::string name = de->d_name; + if (name.size() < 6) continue; + if (name.compare(name.size() - 5, 5, ".json") != 0) continue; + files.push_back(dir + "/" + name); + } + closedir(d); + std::sort(files.begin(), files.end()); // deterministic load order + bool all_ok = true; + for (const auto & p : files) { + if (!load_file(p)) all_ok = false; + } + return all_ok; +} + +const Scenario * ScenarioStore::match(const std::string & rendered_prompt) const { + const Scenario * best = nullptr; + for (const auto & sc : scenarios_) { + if (!ends_with(rendered_prompt, sc.match_prompt_suffix)) continue; + if (!best || + sc.match_prompt_suffix.size() > best->match_prompt_suffix.size()) { + best = ≻ + } else if (sc.match_prompt_suffix.size() == best->match_prompt_suffix.size()) { + std::fprintf(stderr, + "[scenario] tie on suffix length %zu between '%s' and '%s' — " + "load-order earlier wins ('%s'); add a longer suffix to disambiguate\n", + sc.match_prompt_suffix.size(), + best->name.c_str(), sc.name.c_str(), best->name.c_str()); + } + } + return best; +} + +} // namespace dflash::common::test diff --git a/server/test/scenario_store.h b/server/test/scenario_store.h new file mode 100644 index 000000000..9cc2fc108 --- /dev/null +++ b/server/test/scenario_store.h @@ -0,0 +1,84 @@ +// Scenario store — loads JSON scenario files and matches them against +// rendered prompts for the StubModelBackend. +// +// A scenario file is a JSON object describing one (prompt → token stream) +// pairing: +// +// { +// "name": "qwen3_enable_thinking_basic", +// "description": "Qwen3.6 enable_thinking emits reasoning before ", +// "match": { +// "prompt_suffix": "What is 2+2?<|im_end|>\n<|im_start|>assistant\n\n" +// }, +// "response": { +// "finish_reason": "stop", +// "decode_us": 0, +// "tokens": [ +// "Let me think. ", +// "2", +// "+", +// "2", +// " = 4.", +// { "text": "", "special": true }, +// "\n\nThe answer is 4." +// ] +// } +// } +// +// Match semantics: longest matching `prompt_suffix` wins. File load order +// breaks ties (with a stderr warning). + +#pragma once + +#include +#include +#include + +namespace dflash::common::test { + +struct ScenarioToken { + std::string text; + // When true, the stub backend emits this as a single special token + // looked up via Tokenizer::token_to_id(). When false, the text is + // BPE-encoded and emitted as the resulting sequence of IDs. + bool special = false; +}; + +struct ScenarioResponse { + bool ok = true; + std::string error; // only used when ok=false + std::string finish_reason = "stop"; // "stop", "length", "tool_calls" + int decode_us = 0; // optional inter-token delay + std::vector tokens; +}; + +struct Scenario { + std::string name; + std::string description; + std::string match_prompt_suffix; + ScenarioResponse response; +}; + +class ScenarioStore { +public: + // Load every *.json file under `dir`. Returns false if directory does + // not exist or any file fails to parse (errors logged to stderr). + bool load_directory(const std::string & dir); + + // Load a single scenario file. Returns false on parse error. + bool load_file(const std::string & path); + + // Find the scenario whose `match_prompt_suffix` is the longest suffix + // of `rendered_prompt`. Returns nullptr if none match. + const Scenario * match(const std::string & rendered_prompt) const; + + std::size_t size() const { return scenarios_.size(); } + + // For diagnostic logging. + const std::vector & scenarios() const { return scenarios_; } + +private: + std::vector scenarios_; +}; + +} // namespace dflash::common::test diff --git a/server/test/scenarios/qwen3_enable_thinking_basic.json b/server/test/scenarios/qwen3_enable_thinking_basic.json new file mode 100644 index 000000000..7f5de2c32 --- /dev/null +++ b/server/test/scenarios/qwen3_enable_thinking_basic.json @@ -0,0 +1,19 @@ +{ + "name": "qwen3_enable_thinking_basic", + "description": "Qwen3.6 chat template pre-opens when enable_thinking=true. Model emits reasoning directly, closes with , then writes the answer.", + "match": { + "prompt_suffix": "What is 2+2?<|im_end|>\n<|im_start|>assistant\n\n" + }, + "response": { + "finish_reason": "stop", + "tokens": [ + "Let me compute. ", + "2", + "+", + "2", + " equals 4.", + { "text": "", "special": true }, + "\n\nThe answer is 4." + ] + } +} diff --git a/server/test/scripts/strip_gguf_to_tokenizer.py b/server/test/scripts/strip_gguf_to_tokenizer.py new file mode 100644 index 000000000..cabc36383 --- /dev/null +++ b/server/test/scripts/strip_gguf_to_tokenizer.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Strip a GGUF to tokenizer metadata only — no tensor data. + +The dflash Tokenizer (server/src/server/tokenizer.cpp) only reads KV metadata: + tokenizer.ggml.tokens, .merges, .token_type, .model, .pre, .bos_token_id, + .eos_token_id, .eot_token_id (best-effort), .chat_template + +Everything else — including all 851+ tensor weights — can be dropped. The +result is small enough to commit as a CI test fixture for the CPU-only +HttpServer driver. + +Usage: + python strip_gguf_to_tokenizer.py +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import gguf + + +KEEP_PREFIXES = ( + "tokenizer.", + # general.architecture is read by backend_factory's detect_arch() to pick + # the chat-format enum. Cheap to keep and useful for any caller that + # peeks at the file. + "general.architecture", + "general.name", +) + + +def main() -> int: + if len(sys.argv) != 3: + print(__doc__) + return 1 + src = Path(sys.argv[1]) + dst = Path(sys.argv[2]) + + reader = gguf.GGUFReader(str(src)) + arch = reader.fields["general.architecture"].contents() + print(f"[strip] reading {src} (arch={arch}, {len(reader.tensors)} tensors)") + + writer = gguf.GGUFWriter(str(dst), arch) + + kept = 0 + for key, field in reader.fields.items(): + if not key.startswith(KEEP_PREFIXES): + continue + # GGUFReader does not expose a "copy field to writer" helper, so we + # have to translate the field's value type back through GGUFWriter's + # typed API. For our tokenizer-only subset that's just strings, + # uint32 scalars, bool scalars, and arrays of strings / int32. + val = field.contents() + ftype = field.types[0] if field.types else None + if ftype == gguf.GGUFValueType.STRING: + writer.add_string(key, val) + elif ftype == gguf.GGUFValueType.UINT32: + writer.add_uint32(key, int(val)) + elif ftype == gguf.GGUFValueType.INT32: + writer.add_int32(key, int(val)) + elif ftype == gguf.GGUFValueType.BOOL: + writer.add_bool(key, bool(val)) + elif ftype == gguf.GGUFValueType.ARRAY: + # Array element type is types[1]. + elem = field.types[1] + if elem == gguf.GGUFValueType.STRING: + writer.add_array(key, list(val)) + elif elem in (gguf.GGUFValueType.UINT32, gguf.GGUFValueType.INT32): + writer.add_array(key, [int(x) for x in val]) + else: + print(f"[strip] skipping array key {key} (elem type {elem})") + continue + else: + print(f"[strip] skipping key {key} (unsupported type {ftype})") + continue + kept += 1 + + writer.write_header_to_file() + writer.write_kv_data_to_file() + # write_tensors_to_file() is omitted — we have no tensors. + writer.close() + + out_size = dst.stat().st_size + print(f"[strip] wrote {dst} (kept {kept} KV pairs, {out_size:,} bytes)") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/server/test/spike_no_gpu_http_server.cpp b/server/test/spike_no_gpu_http_server.cpp new file mode 100644 index 000000000..af73c9b8c --- /dev/null +++ b/server/test/spike_no_gpu_http_server.cpp @@ -0,0 +1,120 @@ +// CPU-only test driver for dflash HttpServer. +// +// Boots the server with the real Qwen3.6 tokenizer (loaded from a GGUF — +// vocab/metadata only, never any GPU weights) and a deterministic +// StubModelBackend whose generate() replays scripted token streams from +// JSON scenario files. The chain +// real chat template renderer → real ParsedRequest plumbing → +// real SseEmitter wiring → real socket writes +// is fully exercised; only the per-token sample comes from the stub. +// +// Runs without a GPU: link against dflash_common (CUDA TUs included), but +// because no real ModelBackend is instantiated, ggml_cuda_init() is never +// called. CUDA_VISIBLE_DEVICES="" is the supported test configuration. +// +// Usage: +// CUDA_VISIBLE_DEVICES="" ./spike_no_gpu_http_server \ +// --scenarios [--port 9999] +// +// See server/test/scenarios/*.json for the scenario file schema. + +#include "server/http_server.h" +#include "server/tokenizer.h" +#include "server/chat_template.h" +#include "common/model_backend.h" +#include "scenario_store.h" +#include "stub_model_backend.h" + +#include +#include +#include +#include +#include + +using namespace dflash::common; + +namespace { + +HttpServer * g_server = nullptr; + +void signal_handler(int) { + if (g_server) g_server->request_stop(); +} + +} // namespace + +int main(int argc, char ** argv) { + const char * gguf_path = nullptr; + const char * scenarios_dir = nullptr; + int port = 9999; + int max_ctx = 4096; + + for (int i = 1; i < argc; ++i) { + if (std::strcmp(argv[i], "--scenarios") == 0 && i + 1 < argc) { + scenarios_dir = argv[++i]; + } else if (std::strcmp(argv[i], "--port") == 0 && i + 1 < argc) { + port = std::atoi(argv[++i]); + } else if (std::strcmp(argv[i], "--max-ctx") == 0 && i + 1 < argc) { + max_ctx = std::atoi(argv[++i]); + } else if (argv[i][0] != '-' && !gguf_path) { + gguf_path = argv[i]; + } else { + std::fprintf(stderr, + "usage: %s --scenarios [--port N] [--max-ctx N]\n", + argv[0]); + return 1; + } + } + if (!gguf_path) { + std::fprintf(stderr, "[driver] missing positional \n"); + return 1; + } + + const char * cvd = std::getenv("CUDA_VISIBLE_DEVICES"); + std::fprintf(stderr, "[driver] CUDA_VISIBLE_DEVICES=%s\n", + cvd ? cvd : "(unset)"); + + Tokenizer tokenizer; + if (!tokenizer.load_from_gguf(gguf_path)) { + std::fprintf(stderr, "[driver] tokenizer load failed: %s\n", gguf_path); + return 2; + } + std::fprintf(stderr, "[driver] tokenizer loaded: vocab=%d\n", + tokenizer.vocab_size()); + + test::ScenarioStore store; + if (scenarios_dir) { + if (!store.load_directory(scenarios_dir)) { + std::fprintf(stderr, + "[driver] one or more scenarios failed to load — aborting\n"); + return 3; + } + std::fprintf(stderr, "[driver] loaded %zu scenarios from %s\n", + store.size(), scenarios_dir); + } else { + std::fprintf(stderr, + "[driver] no --scenarios dir given; every request will 500\n"); + } + + test::StubModelBackend backend(store, tokenizer); + + ServerConfig cfg; + cfg.host = "127.0.0.1"; + cfg.port = port; + cfg.model_name = "dflash"; // matches existing test-suite default + cfg.max_ctx = max_ctx; + + HttpServer server(backend, tokenizer, cfg); + server.set_chat_format(ChatFormat::QWEN3); + + g_server = &server; + std::signal(SIGTERM, signal_handler); + std::signal(SIGINT, signal_handler); + + std::fprintf(stderr, + "[driver] HttpServer listening on http://%s:%d (SIGINT/SIGTERM to stop)\n", + cfg.host.c_str(), cfg.port); + int rc = server.run(); + std::fprintf(stderr, "[driver] exit rc=%d\n", rc); + return rc; +} diff --git a/server/test/stub_model_backend.cpp b/server/test/stub_model_backend.cpp new file mode 100644 index 000000000..c5796bdc9 --- /dev/null +++ b/server/test/stub_model_backend.cpp @@ -0,0 +1,73 @@ +#include "stub_model_backend.h" + +#include "server/tokenizer.h" + +#include +#include +#include + +namespace dflash::common::test { + +GenerateResult StubModelBackend::generate(const GenerateRequest & req, + const DaemonIO & io) { + GenerateResult r; + + const std::string rendered = tokenizer_.decode(req.prompt); + const Scenario * sc = store_.match(rendered); + if (!sc) { + r.ok = false; + r.error = "stub: no scenario matches prompt (size=" + + std::to_string(rendered.size()) + ")"; + std::fprintf(stderr, + "[stub] no scenario match. last 120b of prompt: %s\n", + rendered.substr(rendered.size() > 120 ? rendered.size() - 120 : 0).c_str()); + return r; + } + + if (!sc->response.ok) { + r.ok = false; + r.error = sc->response.error.empty() + ? std::string("stub: scenario '") + sc->name + "' declared failure" + : sc->response.error; + return r; + } + + std::fprintf(stderr, "[stub] match=%s emitting %zu scripted tokens\n", + sc->name.c_str(), sc->response.tokens.size()); + + r.ok = true; + bool aborted = false; + + auto emit = [&](int32_t id) -> bool { + if (req.on_token && !req.on_token(id)) return false; + if (io.on_token && !io.on_token(id)) return false; + r.tokens.push_back(id); + return true; + }; + + for (const auto & t : sc->response.tokens) { + if (t.special) { + int32_t id = tokenizer_.token_to_id(t.text); + if (id < 0) { + r.ok = false; + r.error = "stub: special token not in vocab: " + t.text; + return r; + } + if (!emit(id)) { aborted = true; break; } + } else { + auto ids = tokenizer_.encode(t.text); + for (int32_t id : ids) { + if (!emit(id)) { aborted = true; break; } + } + if (aborted) break; + } + if (sc->response.decode_us > 0) { + std::this_thread::sleep_for( + std::chrono::microseconds(sc->response.decode_us)); + } + } + + return r; +} + +} // namespace dflash::common::test diff --git a/server/test/stub_model_backend.h b/server/test/stub_model_backend.h new file mode 100644 index 000000000..83525b559 --- /dev/null +++ b/server/test/stub_model_backend.h @@ -0,0 +1,63 @@ +// StubModelBackend — deterministic ModelBackend driven by a ScenarioStore. +// +// generate() decodes the request prompt back to text using the real +// tokenizer, looks up the matching scenario (longest prompt_suffix wins), +// then streams the scenario's tokens through the production callbacks +// (req.on_token / io.on_token). Streaming behavior is inherited from the +// production code path — no separate streaming machine. +// +// Token-stream construction: +// - Plain text tokens are BPE-encoded by the real tokenizer, so a +// scripted "Let me think. " expands to the same Qwen3.6 token IDs +// the real model would have emitted. +// - Special tokens (e.g. ) are looked up via token_to_id() +// and emitted as a single ID. This matters: Qwen3.6's +// is one added token (id 248069); for SseEmitter to recognize it +// as a channel-close, it must arrive as the single special ID, +// not as the BPE-decomposed sequence. + +#pragma once + +#include "common/model_backend.h" +#include "scenario_store.h" + +namespace dflash::common { +class Tokenizer; +} + +namespace dflash::common::test { + +class StubModelBackend : public ModelBackend { +public: + StubModelBackend(const ScenarioStore & store, const Tokenizer & tokenizer) + : store_(store), tokenizer_(tokenizer) {} + + void print_ready_banner() const override {} + bool park(const std::string &) override { return true; } + bool unpark(const std::string &) override { return true; } + bool is_target_parked() const override { return false; } + + GenerateResult generate(const GenerateRequest & req, + const DaemonIO & io) override; + + bool snapshot_save(int) override { return false; } + void snapshot_free(int) override {} + bool snapshot_used(int) const override { return false; } + int snapshot_cur_pos(int) const override { return 0; } + GenerateResult restore_and_generate(int, const GenerateRequest & req, + const DaemonIO & io) override { + return generate(req, io); + } + + bool handle_compress(const std::string &, const DaemonIO &) override { + return false; + } + void free_drafter() override {} + void shutdown() override {} + +private: + const ScenarioStore & store_; + const Tokenizer & tokenizer_; +}; + +} // namespace dflash::common::test diff --git a/server/test/test_stub_integration.py b/server/test/test_stub_integration.py new file mode 100644 index 000000000..73fcc46b1 --- /dev/null +++ b/server/test/test_stub_integration.py @@ -0,0 +1,241 @@ +"""End-to-end integration test for the dflash HttpServer, driven by a +deterministic stub backend (no GPU, no model weights). + +Runs the spike_no_gpu_http_server binary with: + - the tokenizer-only Qwen3.6 GGUF fixture under server/test/fixtures/ + - the JSON scenario files under server/test/scenarios/ + +Then sends real HTTP requests at it via the same `requests` calls that +test_server_integration.py uses against a live GPU server. Because the +chat-template renderer, request parser, SseEmitter, and SSE socket +writes are the production code path, this test exercises the exact wire +that broke in the original Qwen3.6 think-channel bug — but on a CPU-only +runner with no model file. + +Run locally: + ./server/build/spike_no_gpu_http_server (built by cmake) + uv run pytest server/test/test_stub_integration.py -v + +The test starts and stops the driver itself; no separate server required. +""" + +from __future__ import annotations + +import json +import os +import socket +import subprocess +import time +from pathlib import Path + +import pytest +import requests + + +REPO_ROOT = Path(__file__).resolve().parents[2] +BUILD_DIR = REPO_ROOT / "server" / "build" +DRIVER_BIN = BUILD_DIR / "spike_no_gpu_http_server" +TOKENIZER_GGUF = REPO_ROOT / "server" / "test" / "fixtures" / "qwen3.6-tokenizer.gguf" +SCENARIOS_DIR = REPO_ROOT / "server" / "test" / "scenarios" + + +def _free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +@pytest.fixture(scope="module") +def stub_server(): + """Spawn the stub-driven HttpServer for the duration of this module.""" + assert DRIVER_BIN.is_file(), ( + f"driver binary missing: {DRIVER_BIN} — " + "build target spike_no_gpu_http_server first") + assert TOKENIZER_GGUF.is_file(), ( + f"tokenizer fixture missing: {TOKENIZER_GGUF} — " + "is git-lfs configured for *.gguf?") + + port = _free_port() + env = {**os.environ, "CUDA_VISIBLE_DEVICES": ""} + proc = subprocess.Popen( + [str(DRIVER_BIN), + str(TOKENIZER_GGUF), + "--scenarios", str(SCENARIOS_DIR), + "--port", str(port)], + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + base = f"http://127.0.0.1:{port}" + + # Wait up to 10s for /health to come up. + deadline = time.time() + 10 + while time.time() < deadline: + try: + r = requests.get(f"{base}/health", timeout=1) + if r.status_code == 200: + break + except requests.RequestException: + pass + if proc.poll() is not None: + out = proc.stdout.read().decode() if proc.stdout else "" + raise RuntimeError(f"driver exited early. output:\n{out}") + time.sleep(0.1) + else: + proc.terminate() + out = proc.stdout.read().decode() if proc.stdout else "" + raise RuntimeError(f"driver did not become healthy in 10s. output:\n{out}") + + yield base + + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait(timeout=2) + + +# ─── Tests ───────────────────────────────────────────────────────────── + + +class TestQwen3EnableThinkingNonStreaming: + """Regression guard for the original PR #308 bug: Qwen3.6 enable_thinking + must route reasoning into reasoning_content, not leak it into content.""" + + def test_openai_chat_completions(self, stub_server): + r = requests.post(f"{stub_server}/v1/chat/completions", + json={ + "model": "dflash", + "messages": [{"role": "user", "content": "What is 2+2?"}], + "chat_template_kwargs": {"enable_thinking": True}, + "stream": False, + "max_tokens": 256, + }, + timeout=10) + assert r.status_code == 200, r.text + msg = r.json()["choices"][0]["message"] + reasoning = msg.get("reasoning_content") or "" + content = msg.get("content") or "" + assert reasoning, ( + f"reasoning_content empty — render→emit wiring broken. " + f"content={content!r}") + assert "Let me compute" in reasoning + assert "" not in reasoning + assert "" not in reasoning + assert "" not in content + assert "" not in content + assert "The answer is 4." in content + assert "Let me compute" not in content, ( + f"reasoning text leaked into content: {content!r}") + + def test_anthropic_messages(self, stub_server): + r = requests.post(f"{stub_server}/v1/messages", + json={ + "model": "dflash", + "messages": [{"role": "user", "content": "What is 2+2?"}], + "thinking": {"type": "enabled"}, + "stream": False, + "max_tokens": 256, + }, + timeout=10) + assert r.status_code == 200, r.text + blocks = r.json().get("content", []) + types = [b.get("type") for b in blocks] + assert "thinking" in types, f"no thinking block; types={types}" + assert "text" in types, f"no text block; types={types}" + thinking = next(b["thinking"] for b in blocks if b["type"] == "thinking") + text = next(b["text"] for b in blocks if b["type"] == "text") + assert "Let me compute" in thinking + assert "The answer is 4" in text + assert "" not in thinking + assert "" not in thinking + + +class TestQwen3EnableThinkingStreaming: + """Same bug class but through the streaming SSE code path.""" + + def test_openai_chat_completions_streaming(self, stub_server): + r = requests.post(f"{stub_server}/v1/chat/completions", + json={ + "model": "dflash", + "messages": [{"role": "user", "content": "What is 2+2?"}], + "chat_template_kwargs": {"enable_thinking": True}, + "stream": True, + "max_tokens": 256, + }, + stream=True, timeout=10) + assert r.status_code == 200 + + reasoning_text = "" + content_text = "" + saw_reasoning_delta = False + saw_content_delta = False + saw_done = False + for line in r.iter_lines(decode_unicode=True): + if not line: + continue + if line == "data: [DONE]": + saw_done = True + break + assert line.startswith("data: "), line + chunk = json.loads(line[len("data: "):]) + choices = chunk.get("choices") or [] + if not choices: + continue # usage-only tail chunk; no delta + delta = choices[0].get("delta", {}) + if "reasoning_content" in delta: + saw_reasoning_delta = True + reasoning_text += delta["reasoning_content"] + if "content" in delta: + saw_content_delta = True + content_text += delta["content"] + assert saw_done + assert saw_reasoning_delta, "no reasoning_content deltas emitted" + assert saw_content_delta, "no content deltas emitted" + assert "Let me compute" in reasoning_text + assert "The answer is 4" in content_text + assert "" not in reasoning_text + assert "" not in reasoning_text + assert "" not in content_text + assert "" not in content_text + + def test_anthropic_messages_streaming(self, stub_server): + """First content_block_start must be `thinking`, not `text` — the + Anthropic-side half of the PR #308 fix.""" + r = requests.post(f"{stub_server}/v1/messages", + json={ + "model": "dflash", + "messages": [{"role": "user", "content": "What is 2+2?"}], + "thinking": {"type": "enabled"}, + "stream": True, + "max_tokens": 256, + }, + stream=True, timeout=10) + assert r.status_code == 200 + + events = [] + event_type = None + for line in r.iter_lines(decode_unicode=True): + if line is None: + continue + if line.startswith("event: "): + event_type = line[len("event: "):] + elif line.startswith("data: ") and event_type: + events.append((event_type, json.loads(line[len("data: "):]))) + event_type = None + types = [t for t, _ in events] + assert "message_start" in types + first_start = next(d for t, d in events if t == "content_block_start") + assert first_start["content_block"]["type"] == "thinking", first_start + # At least one thinking delta and one text delta. + thinking_deltas = [d for t, d in events + if t == "content_block_delta" + and d["delta"]["type"] == "thinking_delta"] + text_deltas = [d for t, d in events + if t == "content_block_delta" + and d["delta"]["type"] == "text_delta"] + assert thinking_deltas, f"no thinking_delta events; types={types}" + assert text_deltas, f"no text_delta events; types={types}" + assert "Let me compute" in "".join(d["delta"]["thinking"] for d in thinking_deltas) + assert "The answer is 4" in "".join(d["delta"]["text"] for d in text_deltas) From 9d4defe1ef3f71c5a1aee935073e09584ed943c7 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Fri, 29 May 2026 18:48:08 -0400 Subject: [PATCH 4/4] =?UTF-8?q?rename:=20spike=5Fno=5Fgpu=5Fhttp=5Fserver?= =?UTF-8?q?=20=E2=86=92=20replay=5Fhttp=5Fserver?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The binary is the permanent stub-driven HTTP server test driver, not throwaway exploration. Rename it to match. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 2 +- server/CMakeLists.txt | 31 ++++++++++--------- ...http_server.cpp => replay_http_server.cpp} | 2 +- server/test/test_stub_integration.py | 8 ++--- 4 files changed, 22 insertions(+), 21 deletions(-) rename server/test/{spike_no_gpu_http_server.cpp => replay_http_server.cpp} (98%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d6f605365..72eb9cce8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,7 +56,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release cmake --build build --target \ test_dflash test_generate test_flash_attn_sparse \ - dflash_server test_server_unit spike_no_gpu_http_server \ + dflash_server test_server_unit replay_http_server \ -j$(nproc) - name: Run C++ server unit tests diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index ad09b526e..25c7063d8 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -769,35 +769,36 @@ if(DFLASH27B_TESTS) endif() endif() - # ─── Spike: HttpServer with stub backend, no GPU bound ────────────── - # Validates that the server constructs and serves /health without a - # GPU — the foundation for a CPU-only integration test driver. Links - # dflash_common (which includes CUDA-compiled TUs) but never calls - # into any real ModelBackend; running under CUDA_VISIBLE_DEVICES="" - # is the supported configuration. - if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/spike_no_gpu_http_server.cpp") - add_executable(spike_no_gpu_http_server - test/spike_no_gpu_http_server.cpp + # ─── replay_http_server: CPU-only HttpServer test driver ──────────── + # Wires Tokenizer + ScenarioStore + StubModelBackend + HttpServer to + # serve real HTTP requests on the wire, replaying scripted token + # streams from JSON scenario files. Links dflash_common (which + # includes CUDA-compiled TUs) but never instantiates a real + # ModelBackend, so CUDA_VISIBLE_DEVICES="" is supported. Driven by + # test_stub_integration.py. + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/replay_http_server.cpp") + add_executable(replay_http_server + test/replay_http_server.cpp test/scenario_store.cpp test/stub_model_backend.cpp src/server/http_server.cpp src/server/model_card.cpp) - target_include_directories(spike_no_gpu_http_server PRIVATE + target_include_directories(replay_http_server PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/test) if(DFLASH27B_GPU_BACKEND STREQUAL "hip") - target_compile_definitions(spike_no_gpu_http_server PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP) + target_compile_definitions(replay_http_server PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP) else() - target_compile_definitions(spike_no_gpu_http_server PRIVATE + target_compile_definitions(replay_http_server PRIVATE DFLASH27B_BACKEND_CUDA=1 DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm}) endif() - target_link_libraries(spike_no_gpu_http_server PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread) + target_link_libraries(replay_http_server PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread) if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") find_package(CUDAToolkit REQUIRED) - target_link_libraries(spike_no_gpu_http_server PRIVATE CUDA::cudart) + target_link_libraries(replay_http_server PRIVATE CUDA::cudart) else() - target_link_libraries(spike_no_gpu_http_server PRIVATE hip::host) + target_link_libraries(replay_http_server PRIVATE hip::host) endif() endif() diff --git a/server/test/spike_no_gpu_http_server.cpp b/server/test/replay_http_server.cpp similarity index 98% rename from server/test/spike_no_gpu_http_server.cpp rename to server/test/replay_http_server.cpp index af73c9b8c..174bba870 100644 --- a/server/test/spike_no_gpu_http_server.cpp +++ b/server/test/replay_http_server.cpp @@ -13,7 +13,7 @@ // called. CUDA_VISIBLE_DEVICES="" is the supported test configuration. // // Usage: -// CUDA_VISIBLE_DEVICES="" ./spike_no_gpu_http_server \ +// CUDA_VISIBLE_DEVICES="" ./replay_http_server \ // --scenarios [--port 9999] // // See server/test/scenarios/*.json for the scenario file schema. diff --git a/server/test/test_stub_integration.py b/server/test/test_stub_integration.py index 73fcc46b1..496d877f5 100644 --- a/server/test/test_stub_integration.py +++ b/server/test/test_stub_integration.py @@ -1,7 +1,7 @@ """End-to-end integration test for the dflash HttpServer, driven by a deterministic stub backend (no GPU, no model weights). -Runs the spike_no_gpu_http_server binary with: +Runs the replay_http_server binary with: - the tokenizer-only Qwen3.6 GGUF fixture under server/test/fixtures/ - the JSON scenario files under server/test/scenarios/ @@ -13,7 +13,7 @@ runner with no model file. Run locally: - ./server/build/spike_no_gpu_http_server (built by cmake) + ./server/build/replay_http_server (built by cmake) uv run pytest server/test/test_stub_integration.py -v The test starts and stops the driver itself; no separate server required. @@ -34,7 +34,7 @@ REPO_ROOT = Path(__file__).resolve().parents[2] BUILD_DIR = REPO_ROOT / "server" / "build" -DRIVER_BIN = BUILD_DIR / "spike_no_gpu_http_server" +DRIVER_BIN = BUILD_DIR / "replay_http_server" TOKENIZER_GGUF = REPO_ROOT / "server" / "test" / "fixtures" / "qwen3.6-tokenizer.gguf" SCENARIOS_DIR = REPO_ROOT / "server" / "test" / "scenarios" @@ -50,7 +50,7 @@ def stub_server(): """Spawn the stub-driven HttpServer for the duration of this module.""" assert DRIVER_BIN.is_file(), ( f"driver binary missing: {DRIVER_BIN} — " - "build target spike_no_gpu_http_server first") + "build target replay_http_server first") assert TOKENIZER_GGUF.is_file(), ( f"tokenizer fixture missing: {TOKENIZER_GGUF} — " "is git-lfs configured for *.gguf?")