diff --git a/.gitattributes b/.gitattributes index 592097d1..45e8d6ec 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,3 +5,4 @@ assets/banner.png -filter -diff -merge -text *.jpeg filter=lfs diff=lfs merge=lfs -text *.mp4 filter=lfs diff=lfs merge=lfs -text *.webm filter=lfs diff=lfs merge=lfs -text +*.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2a87f88a..72eb9cce 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive + lfs: true token: ${{ secrets.SUBMODULE_PAT || secrets.GITHUB_TOKEN }} - uses: Jimver/cuda-toolkit@v0.2.35 @@ -55,7 +56,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release cmake --build build --target \ test_dflash test_generate test_flash_attn_sparse \ - dflash_server test_server_unit \ + dflash_server test_server_unit replay_http_server \ -j$(nproc) - name: Run C++ server unit tests @@ -70,6 +71,19 @@ jobs: # in the optional `megakernel` extra so its build does NOT run yet. run: uv sync --frozen + - name: Run CPU integration tests (stub backend, no GPU) + # End-to-end exercise of HttpServer + render_chat_template + + # SseEmitter with a deterministic stub model backend. No GPU + # required: the spike driver runs under CUDA_VISIBLE_DEVICES="" + # and the tokenizer fixture is a stripped Qwen3.6 GGUF (metadata + # only). Covers the regression class from PR #308 end-to-end — + # streaming and non-streaming, OpenAI and Anthropic formats. + env: + CUDA_VISIBLE_DEVICES: "" + run: | + uv run --frozen --with pytest --with requests \ + pytest -v server/test/test_stub_integration.py + - name: Build megakernel via uv sync (sm_75) env: CUDA_HOME: ${{ env.CUDA_PATH }} diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 71298ff6..25c7063d 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -769,6 +769,39 @@ if(DFLASH27B_TESTS) endif() endif() + # ─── replay_http_server: CPU-only HttpServer test driver ──────────── + # Wires Tokenizer + ScenarioStore + StubModelBackend + HttpServer to + # serve real HTTP requests on the wire, replaying scripted token + # streams from JSON scenario files. Links dflash_common (which + # includes CUDA-compiled TUs) but never instantiates a real + # ModelBackend, so CUDA_VISIBLE_DEVICES="" is supported. Driven by + # test_stub_integration.py. + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/replay_http_server.cpp") + add_executable(replay_http_server + test/replay_http_server.cpp + test/scenario_store.cpp + test/stub_model_backend.cpp + src/server/http_server.cpp + src/server/model_card.cpp) + target_include_directories(replay_http_server PRIVATE + ${DFLASH27B_SRC_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/test) + if(DFLASH27B_GPU_BACKEND STREQUAL "hip") + target_compile_definitions(replay_http_server PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP) + else() + target_compile_definitions(replay_http_server PRIVATE + DFLASH27B_BACKEND_CUDA=1 + DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm}) + endif() + target_link_libraries(replay_http_server PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread) + if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") + find_package(CUDAToolkit REQUIRED) + target_link_libraries(replay_http_server PRIVATE CUDA::cudart) + else() + target_link_libraries(replay_http_server PRIVATE hip::host) + endif() + endif() + # ─── Unit tests (no GPU, no model files) ──────────────────────────── enable_testing() diff --git a/server/scripts/test_server_integration.py b/server/scripts/test_server_integration.py index 08da6ced..43535203 100644 --- a/server/scripts/test_server_integration.py +++ b/server/scripts/test_server_integration.py @@ -490,7 +490,12 @@ def test_thinking_disabled_by_default(self): @pytest.mark.slow def test_thinking_enabled_via_chat_template_kwargs(self): - """Enabling thinking should produce reasoning_content.""" + """Enabling thinking must route reasoning into reasoning_content, + not leak it into content. Regression guard for the Qwen3.6/Laguna + pre-opened- bug: the chat template appends `` to the + prompt suffix, so the model emits reasoning directly with no + opening tag. If the renderer→emitter wiring drops, reasoning_content + stays empty and the raw reasoning text appears in content.""" r = post_json("/v1/chat/completions", { "model": MODEL_NAME, "messages": [{"role": "user", "content": "What is 15 * 17?"}], @@ -500,13 +505,26 @@ def test_thinking_enabled_via_chat_template_kwargs(self): }) assert r.status_code == 200 msg = r.json()["choices"][0]["message"] - assert msg["content"] - # With thinking enabled, model may produce reasoning_content - # (not guaranteed for short prompts, so we just check it doesn't crash) + reasoning = msg.get("reasoning_content") or "" + content = msg.get("content") or "" + assert reasoning, ( + f"reasoning_content empty with enable_thinking=True — " + f"renderer→emitter wiring likely broken. content={content[:200]!r}" + ) + assert "" not in reasoning and "" not in reasoning, ( + f"raw think tags leaked into reasoning_content: {reasoning[:200]!r}" + ) + assert "" not in content and "" not in content, ( + f"think tags leaked into content channel: {content[:200]!r}" + ) + assert content, "content channel empty — model never closed " @pytest.mark.slow def test_thinking_enabled_via_reasoning_effort(self): - """OpenAI Responses-style reasoning.effort field.""" + """OpenAI Responses-style reasoning.effort=high must also route + reasoning to reasoning_content. Same regression class as above + but reached through a different request shape (effort→template + kwargs translation in http_server.cpp).""" r = post_json("/v1/chat/completions", { "model": MODEL_NAME, "messages": [{"role": "user", "content": "What is 15 * 17?"}], @@ -516,7 +534,15 @@ def test_thinking_enabled_via_reasoning_effort(self): }) assert r.status_code == 200 msg = r.json()["choices"][0]["message"] - assert msg["content"] + reasoning = msg.get("reasoning_content") or "" + content = msg.get("content") or "" + assert reasoning, ( + f"reasoning_content empty with reasoning.effort=high — " + f"renderer→emitter wiring likely broken. content={content[:200]!r}" + ) + assert "" not in reasoning and "" not in reasoning + assert "" not in content and "" not in content + assert content # ═══════════════════════════════════════════════════════════════════ diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp index 1349109a..d4bf3e03 100644 --- a/server/src/server/chat_template.cpp +++ b/server/src/server/chat_template.cpp @@ -51,7 +51,7 @@ ChatFormat chat_format_for_arch(const std::string & arch) { return ChatFormat::QWEN3; } -std::string render_chat_template( +PromptRenderResult render_chat_template( const std::vector & messages, ChatFormat format, bool add_generation_prompt, @@ -59,6 +59,10 @@ std::string render_chat_template( const std::string & tools_json) { std::string result; + // `started_in_thinking` is derived deterministically from the template + // branch + render flags below. Set per format inside the switch so a + // future format addition can't silently miss the wiring. + bool started_in_thinking = false; bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null"; switch (format) { @@ -141,6 +145,14 @@ std::string render_chat_template( // even when the client opts in, defeating the thinking-budget // mechanism entirely. result += "\n"; + // The prompt suffix pre-opens `` — the model's very + // first generated token is reasoning, never preceded by an + // explicit `` opener in the stream. Callers must + // start the SSE state machine in REASONING mode and pass + // `started_in_thinking=true` to parse_reasoning() so that + // reasoning text routes to reasoning_content instead of + // leaking into content. + started_in_thinking = true; } } break; @@ -224,6 +236,11 @@ std::string render_chat_template( result += "\n"; if (enable_thinking) { result += ""; + // Same situation as Qwen3.6: Laguna XS.2's enable_thinking + // generation prompt ends with `` so the model starts + // emitting reasoning tokens with no explicit opener in the + // stream. Route subsequent tokens to the reasoning channel. + started_in_thinking = true; } else { // Empty think block — model jumps straight to answer. result += ""; @@ -311,11 +328,17 @@ std::string render_chat_template( result += "<|channel>thought\n"; } } + // Gemma4 does NOT pre-open `` from the prompt; its + // reasoning channel is opened by the model emitting `<|channel>` + // which http_server forwards into the SseEmitter as the text + // `` — so the emitter's existing CONTENT→REASONING + // transition fires on that synthesized opener. started_in_thinking + // stays false (initial CONTENT mode is correct). break; } } - return result; + return PromptRenderResult{std::move(result), started_in_thinking}; } // ─── Jinja path ───────────────────────────────────────────────────────── @@ -353,7 +376,29 @@ static std::shared_ptr get_or_parse(const std::string & template } // namespace -std::string render_chat_template_jinja( +// Sniff a rendered prompt for a trailing `` opener so the caller +// can route subsequent stream tokens to the reasoning channel. Accepts +// optional whitespace after the opener (Qwen3.6 emits `\n`). +// True positive ⇒ caller should treat the prompt as having pre-opened +// the reasoning channel (and the renderer warns loudly so a model-card +// mismatch is visible at runtime). +static bool prompt_ends_with_think_open(const std::string & s) { + static const std::string OPEN = ""; + // Walk back over trailing ASCII whitespace. + size_t end = s.size(); + while (end > 0) { + char c = s[end - 1]; + if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { + end--; + } else { + break; + } + } + if (end < OPEN.size()) return false; + return s.compare(end - OPEN.size(), OPEN.size(), OPEN) == 0; +} + +PromptRenderResult render_chat_template_jinja( const std::string & template_src, const std::vector & messages, const std::string & bos_token, @@ -407,14 +452,37 @@ std::string render_chat_template_jinja( throw std::runtime_error(std::string("jinja global_from_json: ") + e.what()); } + std::string rendered; try { jinja::runtime rt(ctx); jinja::value results = rt.execute(*prog); auto parts = jinja::runtime::gather_string_parts(results); - return parts->as_string().str(); + rendered = parts->as_string().str(); } catch (const std::exception & e) { throw std::runtime_error(std::string("jinja runtime: ") + e.what()); } + + // Jinja path: we don't know which template family the caller passed + // in, so derive `started_in_thinking` by sniffing the rendered tail + // for a `` opener. This catches the common Qwen3.6 / Laguna + // chat templates that end with `\n` when enable_thinking is + // honored, plus any custom template that follows the same convention. + // + // Warn loudly when sniffing decides true so a template/model-card + // mismatch (e.g. enable_thinking=false but template hard-codes + // `` anyway) surfaces in server logs. + bool started_in_thinking = + enable_thinking && add_generation_prompt && + prompt_ends_with_think_open(rendered); + if (started_in_thinking) { + std::fprintf(stderr, + "[WARN] render_chat_template_jinja: rendered prompt ends with " + "`` opener — treating as started_in_thinking=true. If " + "this is unexpected, check the template's enable_thinking " + "branch or the model card's reasoning configuration.\n"); + } + + return PromptRenderResult{std::move(rendered), started_in_thinking}; } } // namespace dflash::common diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h index ca7ef9db..770e65a4 100644 --- a/server/src/server/chat_template.h +++ b/server/src/server/chat_template.h @@ -27,6 +27,23 @@ enum class ChatFormat { GEMMA4, // <|turn>role\n...\n }; +// Provenance for a rendered prompt. `text` is the byte string that gets +// tokenized; `started_in_thinking` records whether the prompt suffix +// pre-opens a `` block (or equivalent reasoning-channel marker) +// that the model is expected to continue into. +// +// Callers route this into the SseEmitter's initial mode and into +// parse_reasoning()'s `started_in_thinking` argument so reasoning text +// emitted before any explicit `` opener is still attributed to +// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna +// enable_thinking prompts (which pre-open `\n` in the assistant +// turn) cause the model to emit reasoning straight into the content +// channel, leaving `reasoning_content` empty. +struct PromptRenderResult { + std::string text; // rendered prompt text, ready to tokenize + bool started_in_thinking; // prompt suffix opens reasoning channel +}; + // Render chat messages into the model-specific prompt string. // The result is plain text ready to be tokenized. // @@ -40,7 +57,7 @@ enum class ChatFormat { // `tools_json` is an optional JSON string containing the tool definitions // array. When non-empty, the Qwen3/3.5 template injects a tool preamble // into the system message instructing the model how to emit tags. -std::string render_chat_template( +PromptRenderResult render_chat_template( const std::vector & messages, ChatFormat format, bool add_generation_prompt = true, @@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch); // Internally caches the most recently parsed program per thread (avoids // re-parsing the template on every request). Throws std::runtime_error on // lexer/parser/runtime failure (caller should surface a 500 response). -std::string render_chat_template_jinja( +PromptRenderResult render_chat_template_jinja( const std::string & template_src, const std::vector & messages, const std::string & bos_token, diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index a89309dd..311a5102 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -1009,7 +1009,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { tools_json = req.tools.dump(); } - std::string rendered; + PromptRenderResult render_result; if (!config_.chat_template_src.empty()) { // Jinja path: caller supplied a chat template file via // --chat-template-file. Override the hardcoded QWEN3/LAGUNA @@ -1026,7 +1026,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { ? tokenizer_.raw_token(tokenizer_.eos_id()) : std::string(); try { - rendered = render_chat_template_jinja( + render_result = render_chat_template_jinja( config_.chat_template_src, chat_msgs, bos_str, @@ -1040,11 +1040,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { return true; } } else { - rendered = render_chat_template(chat_msgs, chat_format_, - true, enable_thinking, - tools_json); + render_result = render_chat_template(chat_msgs, chat_format_, + true, enable_thinking, + tools_json); } - req.prompt_tokens = tokenizer_.encode(rendered); + // Propagate prompt provenance so the SseEmitter's initial mode + // matches the template's pre-opened reasoning channel (Qwen3.6 / + // Laguna enable_thinking case). Without this, reasoning text + // leaks into the content channel and `reasoning_content` stays + // empty — see fix(server): route Qwen3.6/Laguna think-mode + // reasoning to reasoning_content channel. + req.started_in_thinking = render_result.started_in_thinking; + req.prompt_tokens = tokenizer_.encode(render_result.text); // count_tokens: short-circuit after tokenization. Skip generation // entirely — Anthropic's contract is just `{"input_tokens": N}`. @@ -1149,11 +1156,20 @@ void HttpServer::worker_loop() { } } - // Create SSE emitter for streaming state machine. + // Create SSE emitter for streaming state machine. `initial_mode` + // tracks whether the chat-template prompt pre-opened a `` + // block (Qwen3.6 / Laguna enable_thinking path). When true, the + // emitter starts in REASONING so the model's first generated + // token routes to reasoning_content even though no explicit + // `` opener appears in the token stream. + const StreamMode initial_mode = req.started_in_thinking + ? StreamMode::REASONING + : StreamMode::CONTENT; SseEmitter emitter(req.format, req.response_id, req.model, (int)req.prompt_tokens.size(), req.tools, &tool_memory_, - req.stop_sequences); + req.stop_sequences, + initial_mode); // Emit initial SSE events. if (req.stream) { diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 999eb5d9..bf5477f6 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -200,6 +200,12 @@ struct ParsedRequest { std::vector stop_sequences; // Bandit: per-session adaptive keep_ratio opt-in std::string session_id; + // Set by the chat-template renderer when the rendered prompt suffix + // pre-opens a `` block (Qwen3.6 / Laguna enable_thinking path). + // Drives the SseEmitter's initial mode so reasoning tokens emitted + // before any explicit `` opener route to reasoning_content + // instead of leaking into content. + bool started_in_thinking = false; }; // Build the /props response body. Exposed (non-static) so unit tests diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp index 604f11a7..beae530b 100644 --- a/server/src/server/sse_emitter.cpp +++ b/server/src/server/sse_emitter.cpp @@ -76,15 +76,16 @@ SseEmitter::SseEmitter(ApiFormat format, int prompt_tokens, const json & tools, ToolMemory * tool_memory, - const std::vector & stop_sequences) + const std::vector & stop_sequences, + StreamMode initial_mode) : format_(format) , request_id_(request_id) , model_name_(model_name) , prompt_tokens_(prompt_tokens) , tools_(tools) , tool_memory_(tool_memory) - , mode_(StreamMode::CONTENT) - , active_kind_("text") + , mode_(initial_mode) + , active_kind_(initial_mode == StreamMode::REASONING ? "thinking" : "text") , stop_sequences_(stop_sequences) , created_at_(unix_timestamp()) , msg_item_id_(gen_item_id()) @@ -93,6 +94,12 @@ SseEmitter::SseEmitter(ApiFormat format, for (const auto & s : stop_sequences_) { if (s.size() > stop_holdback_) stop_holdback_ = s.size(); } + // NOTE on `checked_think_prefix_`: we deliberately leave the default + // (false) here even when initial_mode == REASONING. The emitter has a + // one-time guard in emit_token() that strips a redundantly-emitted + // leading `` if the model emits one anyway (model-card / + // template-mismatch edge case). Pre-setting the flag to true would + // skip that strip and leak the duplicate opener into reasoning_text. } // ─── SSE formatting helpers ───────────────────────────────────────────── diff --git a/server/src/server/sse_emitter.h b/server/src/server/sse_emitter.h index 4710b8d4..fcc40d9c 100644 --- a/server/src/server/sse_emitter.h +++ b/server/src/server/sse_emitter.h @@ -54,13 +54,27 @@ nlohmann::json build_timings_json(const GenTimings & t, int completion_tokens); // Manages SSE streaming for a single request. class SseEmitter { public: + // `initial_mode` defaults to CONTENT for backward compatibility. Pass + // StreamMode::REASONING when the chat-template prompt suffix pre-opens + // a `` block (Qwen3.6 / Laguna enable_thinking path): the + // model's first generated token is reasoning, never preceded by an + // explicit `` opener in the stream. Without this hint the + // emitter would route reasoning text to the content channel and + // reasoning_content would stay empty. + // + // Note: the leading-`` strip guard (`checked_think_prefix_`) + // remains active when we start in REASONING mode — if the model + // *does* emit a redundant `` opener anyway, the guard still + // strips it. Pre-setting checked_think_prefix_=true here would let a + // duplicate `` leak into reasoning_text in that edge case. SseEmitter(ApiFormat format, const std::string & request_id, const std::string & model_name, int prompt_tokens, const json & tools, ToolMemory * tool_memory, - const std::vector & stop_sequences = {}); + const std::vector & stop_sequences = {}, + StreamMode initial_mode = StreamMode::CONTENT); // Emit the initial SSE events (role delta, message_start, etc.) // Returns the formatted SSE strings to send. diff --git a/server/test/fixtures/qwen3.6-tokenizer.gguf b/server/test/fixtures/qwen3.6-tokenizer.gguf new file mode 100644 index 00000000..967827d3 --- /dev/null +++ b/server/test/fixtures/qwen3.6-tokenizer.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b420704f204792f14b919b3d52effa839cc7a0739844c2fd2887f6400b045aa +size 10941620 diff --git a/server/test/replay_http_server.cpp b/server/test/replay_http_server.cpp new file mode 100644 index 00000000..174bba87 --- /dev/null +++ b/server/test/replay_http_server.cpp @@ -0,0 +1,120 @@ +// CPU-only test driver for dflash HttpServer. +// +// Boots the server with the real Qwen3.6 tokenizer (loaded from a GGUF — +// vocab/metadata only, never any GPU weights) and a deterministic +// StubModelBackend whose generate() replays scripted token streams from +// JSON scenario files. The chain +// real chat template renderer → real ParsedRequest plumbing → +// real SseEmitter wiring → real socket writes +// is fully exercised; only the per-token sample comes from the stub. +// +// Runs without a GPU: link against dflash_common (CUDA TUs included), but +// because no real ModelBackend is instantiated, ggml_cuda_init() is never +// called. CUDA_VISIBLE_DEVICES="" is the supported test configuration. +// +// Usage: +// CUDA_VISIBLE_DEVICES="" ./replay_http_server \ +// --scenarios [--port 9999] +// +// See server/test/scenarios/*.json for the scenario file schema. + +#include "server/http_server.h" +#include "server/tokenizer.h" +#include "server/chat_template.h" +#include "common/model_backend.h" +#include "scenario_store.h" +#include "stub_model_backend.h" + +#include +#include +#include +#include +#include + +using namespace dflash::common; + +namespace { + +HttpServer * g_server = nullptr; + +void signal_handler(int) { + if (g_server) g_server->request_stop(); +} + +} // namespace + +int main(int argc, char ** argv) { + const char * gguf_path = nullptr; + const char * scenarios_dir = nullptr; + int port = 9999; + int max_ctx = 4096; + + for (int i = 1; i < argc; ++i) { + if (std::strcmp(argv[i], "--scenarios") == 0 && i + 1 < argc) { + scenarios_dir = argv[++i]; + } else if (std::strcmp(argv[i], "--port") == 0 && i + 1 < argc) { + port = std::atoi(argv[++i]); + } else if (std::strcmp(argv[i], "--max-ctx") == 0 && i + 1 < argc) { + max_ctx = std::atoi(argv[++i]); + } else if (argv[i][0] != '-' && !gguf_path) { + gguf_path = argv[i]; + } else { + std::fprintf(stderr, + "usage: %s --scenarios [--port N] [--max-ctx N]\n", + argv[0]); + return 1; + } + } + if (!gguf_path) { + std::fprintf(stderr, "[driver] missing positional \n"); + return 1; + } + + const char * cvd = std::getenv("CUDA_VISIBLE_DEVICES"); + std::fprintf(stderr, "[driver] CUDA_VISIBLE_DEVICES=%s\n", + cvd ? cvd : "(unset)"); + + Tokenizer tokenizer; + if (!tokenizer.load_from_gguf(gguf_path)) { + std::fprintf(stderr, "[driver] tokenizer load failed: %s\n", gguf_path); + return 2; + } + std::fprintf(stderr, "[driver] tokenizer loaded: vocab=%d\n", + tokenizer.vocab_size()); + + test::ScenarioStore store; + if (scenarios_dir) { + if (!store.load_directory(scenarios_dir)) { + std::fprintf(stderr, + "[driver] one or more scenarios failed to load — aborting\n"); + return 3; + } + std::fprintf(stderr, "[driver] loaded %zu scenarios from %s\n", + store.size(), scenarios_dir); + } else { + std::fprintf(stderr, + "[driver] no --scenarios dir given; every request will 500\n"); + } + + test::StubModelBackend backend(store, tokenizer); + + ServerConfig cfg; + cfg.host = "127.0.0.1"; + cfg.port = port; + cfg.model_name = "dflash"; // matches existing test-suite default + cfg.max_ctx = max_ctx; + + HttpServer server(backend, tokenizer, cfg); + server.set_chat_format(ChatFormat::QWEN3); + + g_server = &server; + std::signal(SIGTERM, signal_handler); + std::signal(SIGINT, signal_handler); + + std::fprintf(stderr, + "[driver] HttpServer listening on http://%s:%d (SIGINT/SIGTERM to stop)\n", + cfg.host.c_str(), cfg.port); + int rc = server.run(); + std::fprintf(stderr, "[driver] exit rc=%d\n", rc); + return rc; +} diff --git a/server/test/scenario_store.cpp b/server/test/scenario_store.cpp new file mode 100644 index 00000000..1badd7d1 --- /dev/null +++ b/server/test/scenario_store.cpp @@ -0,0 +1,167 @@ +#include "scenario_store.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace dflash::common::test { + +using json = nlohmann::json; + +namespace { + +bool parse_token(const json & j, ScenarioToken & out, const std::string & file) { + if (j.is_string()) { + out.text = j.get(); + out.special = false; + return true; + } + if (j.is_object()) { + if (!j.contains("text") || !j["text"].is_string()) { + std::fprintf(stderr, + "[scenario] %s: token object missing 'text' string\n", + file.c_str()); + return false; + } + out.text = j["text"].get(); + out.special = j.value("special", false); + return true; + } + std::fprintf(stderr, + "[scenario] %s: token must be a string or {text,special} object\n", + file.c_str()); + return false; +} + +bool parse_scenario(const json & j, Scenario & out, const std::string & file) { + if (!j.is_object()) { + std::fprintf(stderr, "[scenario] %s: top level must be object\n", file.c_str()); + return false; + } + out.name = j.value("name", file); + out.description = j.value("description", ""); + + if (!j.contains("match") || !j["match"].is_object()) { + std::fprintf(stderr, "[scenario] %s: missing 'match' object\n", file.c_str()); + return false; + } + const auto & m = j["match"]; + if (!m.contains("prompt_suffix") || !m["prompt_suffix"].is_string()) { + std::fprintf(stderr, + "[scenario] %s: match.prompt_suffix must be a string\n", file.c_str()); + return false; + } + out.match_prompt_suffix = m["prompt_suffix"].get(); + if (out.match_prompt_suffix.empty()) { + std::fprintf(stderr, + "[scenario] %s: match.prompt_suffix is empty — would match every prompt\n", + file.c_str()); + return false; + } + + if (!j.contains("response") || !j["response"].is_object()) { + std::fprintf(stderr, "[scenario] %s: missing 'response' object\n", file.c_str()); + return false; + } + const auto & r = j["response"]; + out.response.ok = r.value("ok", true); + out.response.error = r.value("error", ""); + out.response.finish_reason = r.value("finish_reason", "stop"); + out.response.decode_us = r.value("decode_us", 0); + + if (out.response.ok) { + if (!r.contains("tokens") || !r["tokens"].is_array()) { + std::fprintf(stderr, + "[scenario] %s: response.tokens must be an array when ok=true\n", + file.c_str()); + return false; + } + out.response.tokens.reserve(r["tokens"].size()); + for (const auto & jt : r["tokens"]) { + ScenarioToken tok; + if (!parse_token(jt, tok, file)) return false; + out.response.tokens.push_back(std::move(tok)); + } + } + return true; +} + +bool ends_with(const std::string & s, const std::string & suffix) { + if (suffix.size() > s.size()) return false; + return std::equal(suffix.rbegin(), suffix.rend(), s.rbegin()); +} + +} // namespace + +bool ScenarioStore::load_file(const std::string & path) { + std::ifstream f(path); + if (!f.is_open()) { + std::fprintf(stderr, "[scenario] cannot open: %s\n", path.c_str()); + return false; + } + std::stringstream buf; + buf << f.rdbuf(); + json j; + try { + j = json::parse(buf.str()); + } catch (const std::exception & e) { + std::fprintf(stderr, "[scenario] %s: JSON parse error: %s\n", + path.c_str(), e.what()); + return false; + } + Scenario sc; + if (!parse_scenario(j, sc, path)) return false; + scenarios_.push_back(std::move(sc)); + std::fprintf(stderr, "[scenario] loaded %s (suffix=%zub, tokens=%zu)\n", + scenarios_.back().name.c_str(), + scenarios_.back().match_prompt_suffix.size(), + scenarios_.back().response.tokens.size()); + return true; +} + +bool ScenarioStore::load_directory(const std::string & dir) { + DIR * d = opendir(dir.c_str()); + if (!d) { + std::fprintf(stderr, "[scenario] cannot open dir: %s\n", dir.c_str()); + return false; + } + std::vector files; + while (struct dirent * de = readdir(d)) { + std::string name = de->d_name; + if (name.size() < 6) continue; + if (name.compare(name.size() - 5, 5, ".json") != 0) continue; + files.push_back(dir + "/" + name); + } + closedir(d); + std::sort(files.begin(), files.end()); // deterministic load order + bool all_ok = true; + for (const auto & p : files) { + if (!load_file(p)) all_ok = false; + } + return all_ok; +} + +const Scenario * ScenarioStore::match(const std::string & rendered_prompt) const { + const Scenario * best = nullptr; + for (const auto & sc : scenarios_) { + if (!ends_with(rendered_prompt, sc.match_prompt_suffix)) continue; + if (!best || + sc.match_prompt_suffix.size() > best->match_prompt_suffix.size()) { + best = ≻ + } else if (sc.match_prompt_suffix.size() == best->match_prompt_suffix.size()) { + std::fprintf(stderr, + "[scenario] tie on suffix length %zu between '%s' and '%s' — " + "load-order earlier wins ('%s'); add a longer suffix to disambiguate\n", + sc.match_prompt_suffix.size(), + best->name.c_str(), sc.name.c_str(), best->name.c_str()); + } + } + return best; +} + +} // namespace dflash::common::test diff --git a/server/test/scenario_store.h b/server/test/scenario_store.h new file mode 100644 index 00000000..9cc2fc10 --- /dev/null +++ b/server/test/scenario_store.h @@ -0,0 +1,84 @@ +// Scenario store — loads JSON scenario files and matches them against +// rendered prompts for the StubModelBackend. +// +// A scenario file is a JSON object describing one (prompt → token stream) +// pairing: +// +// { +// "name": "qwen3_enable_thinking_basic", +// "description": "Qwen3.6 enable_thinking emits reasoning before ", +// "match": { +// "prompt_suffix": "What is 2+2?<|im_end|>\n<|im_start|>assistant\n\n" +// }, +// "response": { +// "finish_reason": "stop", +// "decode_us": 0, +// "tokens": [ +// "Let me think. ", +// "2", +// "+", +// "2", +// " = 4.", +// { "text": "", "special": true }, +// "\n\nThe answer is 4." +// ] +// } +// } +// +// Match semantics: longest matching `prompt_suffix` wins. File load order +// breaks ties (with a stderr warning). + +#pragma once + +#include +#include +#include + +namespace dflash::common::test { + +struct ScenarioToken { + std::string text; + // When true, the stub backend emits this as a single special token + // looked up via Tokenizer::token_to_id(). When false, the text is + // BPE-encoded and emitted as the resulting sequence of IDs. + bool special = false; +}; + +struct ScenarioResponse { + bool ok = true; + std::string error; // only used when ok=false + std::string finish_reason = "stop"; // "stop", "length", "tool_calls" + int decode_us = 0; // optional inter-token delay + std::vector tokens; +}; + +struct Scenario { + std::string name; + std::string description; + std::string match_prompt_suffix; + ScenarioResponse response; +}; + +class ScenarioStore { +public: + // Load every *.json file under `dir`. Returns false if directory does + // not exist or any file fails to parse (errors logged to stderr). + bool load_directory(const std::string & dir); + + // Load a single scenario file. Returns false on parse error. + bool load_file(const std::string & path); + + // Find the scenario whose `match_prompt_suffix` is the longest suffix + // of `rendered_prompt`. Returns nullptr if none match. + const Scenario * match(const std::string & rendered_prompt) const; + + std::size_t size() const { return scenarios_.size(); } + + // For diagnostic logging. + const std::vector & scenarios() const { return scenarios_; } + +private: + std::vector scenarios_; +}; + +} // namespace dflash::common::test diff --git a/server/test/scenarios/qwen3_enable_thinking_basic.json b/server/test/scenarios/qwen3_enable_thinking_basic.json new file mode 100644 index 00000000..7f5de2c3 --- /dev/null +++ b/server/test/scenarios/qwen3_enable_thinking_basic.json @@ -0,0 +1,19 @@ +{ + "name": "qwen3_enable_thinking_basic", + "description": "Qwen3.6 chat template pre-opens when enable_thinking=true. Model emits reasoning directly, closes with , then writes the answer.", + "match": { + "prompt_suffix": "What is 2+2?<|im_end|>\n<|im_start|>assistant\n\n" + }, + "response": { + "finish_reason": "stop", + "tokens": [ + "Let me compute. ", + "2", + "+", + "2", + " equals 4.", + { "text": "", "special": true }, + "\n\nThe answer is 4." + ] + } +} diff --git a/server/test/scripts/strip_gguf_to_tokenizer.py b/server/test/scripts/strip_gguf_to_tokenizer.py new file mode 100644 index 00000000..cabc3638 --- /dev/null +++ b/server/test/scripts/strip_gguf_to_tokenizer.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Strip a GGUF to tokenizer metadata only — no tensor data. + +The dflash Tokenizer (server/src/server/tokenizer.cpp) only reads KV metadata: + tokenizer.ggml.tokens, .merges, .token_type, .model, .pre, .bos_token_id, + .eos_token_id, .eot_token_id (best-effort), .chat_template + +Everything else — including all 851+ tensor weights — can be dropped. The +result is small enough to commit as a CI test fixture for the CPU-only +HttpServer driver. + +Usage: + python strip_gguf_to_tokenizer.py +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import gguf + + +KEEP_PREFIXES = ( + "tokenizer.", + # general.architecture is read by backend_factory's detect_arch() to pick + # the chat-format enum. Cheap to keep and useful for any caller that + # peeks at the file. + "general.architecture", + "general.name", +) + + +def main() -> int: + if len(sys.argv) != 3: + print(__doc__) + return 1 + src = Path(sys.argv[1]) + dst = Path(sys.argv[2]) + + reader = gguf.GGUFReader(str(src)) + arch = reader.fields["general.architecture"].contents() + print(f"[strip] reading {src} (arch={arch}, {len(reader.tensors)} tensors)") + + writer = gguf.GGUFWriter(str(dst), arch) + + kept = 0 + for key, field in reader.fields.items(): + if not key.startswith(KEEP_PREFIXES): + continue + # GGUFReader does not expose a "copy field to writer" helper, so we + # have to translate the field's value type back through GGUFWriter's + # typed API. For our tokenizer-only subset that's just strings, + # uint32 scalars, bool scalars, and arrays of strings / int32. + val = field.contents() + ftype = field.types[0] if field.types else None + if ftype == gguf.GGUFValueType.STRING: + writer.add_string(key, val) + elif ftype == gguf.GGUFValueType.UINT32: + writer.add_uint32(key, int(val)) + elif ftype == gguf.GGUFValueType.INT32: + writer.add_int32(key, int(val)) + elif ftype == gguf.GGUFValueType.BOOL: + writer.add_bool(key, bool(val)) + elif ftype == gguf.GGUFValueType.ARRAY: + # Array element type is types[1]. + elem = field.types[1] + if elem == gguf.GGUFValueType.STRING: + writer.add_array(key, list(val)) + elif elem in (gguf.GGUFValueType.UINT32, gguf.GGUFValueType.INT32): + writer.add_array(key, [int(x) for x in val]) + else: + print(f"[strip] skipping array key {key} (elem type {elem})") + continue + else: + print(f"[strip] skipping key {key} (unsupported type {ftype})") + continue + kept += 1 + + writer.write_header_to_file() + writer.write_kv_data_to_file() + # write_tensors_to_file() is omitted — we have no tensors. + writer.close() + + out_size = dst.stat().st_size + print(f"[strip] wrote {dst} (kept {kept} KV pairs, {out_size:,} bytes)") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/server/test/stub_model_backend.cpp b/server/test/stub_model_backend.cpp new file mode 100644 index 00000000..c5796bdc --- /dev/null +++ b/server/test/stub_model_backend.cpp @@ -0,0 +1,73 @@ +#include "stub_model_backend.h" + +#include "server/tokenizer.h" + +#include +#include +#include + +namespace dflash::common::test { + +GenerateResult StubModelBackend::generate(const GenerateRequest & req, + const DaemonIO & io) { + GenerateResult r; + + const std::string rendered = tokenizer_.decode(req.prompt); + const Scenario * sc = store_.match(rendered); + if (!sc) { + r.ok = false; + r.error = "stub: no scenario matches prompt (size=" + + std::to_string(rendered.size()) + ")"; + std::fprintf(stderr, + "[stub] no scenario match. last 120b of prompt: %s\n", + rendered.substr(rendered.size() > 120 ? rendered.size() - 120 : 0).c_str()); + return r; + } + + if (!sc->response.ok) { + r.ok = false; + r.error = sc->response.error.empty() + ? std::string("stub: scenario '") + sc->name + "' declared failure" + : sc->response.error; + return r; + } + + std::fprintf(stderr, "[stub] match=%s emitting %zu scripted tokens\n", + sc->name.c_str(), sc->response.tokens.size()); + + r.ok = true; + bool aborted = false; + + auto emit = [&](int32_t id) -> bool { + if (req.on_token && !req.on_token(id)) return false; + if (io.on_token && !io.on_token(id)) return false; + r.tokens.push_back(id); + return true; + }; + + for (const auto & t : sc->response.tokens) { + if (t.special) { + int32_t id = tokenizer_.token_to_id(t.text); + if (id < 0) { + r.ok = false; + r.error = "stub: special token not in vocab: " + t.text; + return r; + } + if (!emit(id)) { aborted = true; break; } + } else { + auto ids = tokenizer_.encode(t.text); + for (int32_t id : ids) { + if (!emit(id)) { aborted = true; break; } + } + if (aborted) break; + } + if (sc->response.decode_us > 0) { + std::this_thread::sleep_for( + std::chrono::microseconds(sc->response.decode_us)); + } + } + + return r; +} + +} // namespace dflash::common::test diff --git a/server/test/stub_model_backend.h b/server/test/stub_model_backend.h new file mode 100644 index 00000000..83525b55 --- /dev/null +++ b/server/test/stub_model_backend.h @@ -0,0 +1,63 @@ +// StubModelBackend — deterministic ModelBackend driven by a ScenarioStore. +// +// generate() decodes the request prompt back to text using the real +// tokenizer, looks up the matching scenario (longest prompt_suffix wins), +// then streams the scenario's tokens through the production callbacks +// (req.on_token / io.on_token). Streaming behavior is inherited from the +// production code path — no separate streaming machine. +// +// Token-stream construction: +// - Plain text tokens are BPE-encoded by the real tokenizer, so a +// scripted "Let me think. " expands to the same Qwen3.6 token IDs +// the real model would have emitted. +// - Special tokens (e.g. ) are looked up via token_to_id() +// and emitted as a single ID. This matters: Qwen3.6's +// is one added token (id 248069); for SseEmitter to recognize it +// as a channel-close, it must arrive as the single special ID, +// not as the BPE-decomposed sequence. + +#pragma once + +#include "common/model_backend.h" +#include "scenario_store.h" + +namespace dflash::common { +class Tokenizer; +} + +namespace dflash::common::test { + +class StubModelBackend : public ModelBackend { +public: + StubModelBackend(const ScenarioStore & store, const Tokenizer & tokenizer) + : store_(store), tokenizer_(tokenizer) {} + + void print_ready_banner() const override {} + bool park(const std::string &) override { return true; } + bool unpark(const std::string &) override { return true; } + bool is_target_parked() const override { return false; } + + GenerateResult generate(const GenerateRequest & req, + const DaemonIO & io) override; + + bool snapshot_save(int) override { return false; } + void snapshot_free(int) override {} + bool snapshot_used(int) const override { return false; } + int snapshot_cur_pos(int) const override { return 0; } + GenerateResult restore_and_generate(int, const GenerateRequest & req, + const DaemonIO & io) override { + return generate(req, io); + } + + bool handle_compress(const std::string &, const DaemonIO &) override { + return false; + } + void free_drafter() override {} + void shutdown() override {} + +private: + const ScenarioStore & store_; + const Tokenizer & tokenizer_; +}; + +} // namespace dflash::common::test diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index 147ddbce..fc983c2d 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -1058,11 +1058,11 @@ static void test_jinja_render_basic() { {"system", "you are helpful", ""}, {"user", "hi", ""}, }; - std::string out = render_chat_template_jinja( + auto out = render_chat_template_jinja( MINI_JINJA_TEMPLATE, msgs, /*bos=*/"", /*eos=*/"", /*add_gen=*/true, /*think=*/false, - /*tools=*/""); + /*tools=*/"").text; TEST_ASSERT(out.find("<|system|>you are helpful") != std::string::npos); TEST_ASSERT(out.find("<|user|>hi") != std::string::npos); TEST_ASSERT(out.find("<|assistant|>") != std::string::npos); @@ -1070,9 +1070,9 @@ static void test_jinja_render_basic() { static void test_jinja_render_no_gen_prompt() { std::vector msgs = {{"user", "ping", ""}}; - std::string out = render_chat_template_jinja( + auto out = render_chat_template_jinja( MINI_JINJA_TEMPLATE, msgs, "", "", - /*add_gen=*/false, /*think=*/false, ""); + /*add_gen=*/false, /*think=*/false, "").text; TEST_ASSERT(out.find("<|user|>ping") != std::string::npos); TEST_ASSERT(out.find("<|assistant|>") == std::string::npos); } @@ -1084,8 +1084,8 @@ static void test_jinja_render_tools_injected() { "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"; std::vector msgs = {{"user", "?", ""}}; std::string tools = R"([{"name":"my_tool","description":"test"}])"; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, tools); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, tools).text; TEST_ASSERT(out.find("TOOLS_PRESENT:my_tool") != std::string::npos); } @@ -1094,8 +1094,8 @@ static void test_jinja_render_empty_tools_skipped() { static const char TPL[] = "{%- if tools -%}TOOLS_PRESENT{%- else -%}NO_TOOLS{%- endif -%}"; std::vector msgs = {{"user", "?", ""}}; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, "[]"); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, "[]").text; TEST_ASSERT(out.find("NO_TOOLS") != std::string::npos); TEST_ASSERT(out.find("TOOLS_PRESENT") == std::string::npos); } @@ -1104,8 +1104,8 @@ static void test_jinja_render_bos_eos_threaded() { // {{ bos_token }} and {{ eos_token }} must reach the template. static const char TPL[] = "{{ bos_token }}HI{{ eos_token }}"; std::vector msgs; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, ""); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, "").text; TEST_ASSERT(out == "HI"); } @@ -1133,6 +1133,311 @@ static void test_jinja_render_bad_tools_json_throws() { TEST_ASSERT(threw); } +// ─── started_in_thinking provenance ───────────────────────────────────── +// +// Regression suite for the Qwen3.6 / Laguna think-mode channel-routing +// bug: the rendered prompt suffix pre-opens `` so the model +// starts emitting reasoning tokens with no explicit opener. Callers +// route PromptRenderResult.started_in_thinking → SseEmitter initial +// mode so reasoning text lands in reasoning_content, not content. + +static void test_chat_template_qwen3_enable_thinking_pre_opens() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(result.started_in_thinking); + // Sanity: rendered suffix ends with `\n` per the Qwen3.6 + // chat_template.jinja's enable_thinking branch. + TEST_ASSERT(result.text.size() >= 8); + TEST_ASSERT(result.text.compare(result.text.size() - 8, 8, "\n") == 0); +} + +static void test_chat_template_qwen3_disable_thinking_does_not_pre_open() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); + // The disabled branch emits `\n\n\n\n` — closes + // immediately, so the reasoning channel is NOT left open. + TEST_ASSERT(result.text.find("") != std::string::npos); +} + +static void test_chat_template_qwen3_no_gen_prompt_does_not_pre_open() { + // Without add_generation_prompt the assistant turn isn't appended + // and there's nothing to pre-open. + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/false, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); +} + +static void test_chat_template_laguna_enable_thinking_pre_opens() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(result.started_in_thinking); + TEST_ASSERT(result.text.size() >= 7); + TEST_ASSERT(result.text.compare(result.text.size() - 7, 7, "") == 0); +} + +static void test_chat_template_laguna_disable_thinking_does_not_pre_open() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); +} + +static void test_chat_template_gemma4_does_not_pre_open() { + // Gemma4's reasoning channel is opened by the model's `<|channel>` + // token (which http_server forwards into the emitter as ``). + // The prompt itself never pre-opens `` regardless of + // enable_thinking, so started_in_thinking must stay false. + std::vector msgs = {{"user", "hi", ""}}; + auto enabled = render_chat_template(msgs, ChatFormat::GEMMA4, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(!enabled.started_in_thinking); + auto disabled = render_chat_template(msgs, ChatFormat::GEMMA4, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!disabled.started_in_thinking); +} + +// Jinja path: suffix-sniff detection. The renderer should set +// started_in_thinking=true when the rendered prompt ends with `` +// (optionally followed by whitespace) AND enable_thinking is honored. +static void test_jinja_render_suffix_sniff_sets_started_in_thinking() { + static const char TPL[] = + "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|assistant|>{%- if enable_thinking -%}\n{%- endif -%}" + "{%- endif -%}"; + std::vector msgs = {{"user", "?", ""}}; + auto r = render_chat_template_jinja( + TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, ""); + TEST_ASSERT(r.started_in_thinking); +} + +static void test_jinja_render_suffix_sniff_negative() { + // Template doesn't end with `` → started_in_thinking=false + // even with enable_thinking=true. + static const char TPL[] = + "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}" + "{%- if add_generation_prompt -%}<|assistant|>{%- endif -%}"; + std::vector msgs = {{"user", "?", ""}}; + auto r = render_chat_template_jinja( + TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, ""); + TEST_ASSERT(!r.started_in_thinking); +} + +// ─── SseEmitter initial_mode=REASONING ────────────────────────────────── +// +// Regression: when constructed with initial_mode=REASONING (the +// Qwen3.6/Laguna enable_thinking path), the emitter must route the +// model's first generated tokens to reasoning_content until a natural +// `` is seen, even though no explicit `` opener appears +// in the stream. + +static void test_emitter_initial_mode_reasoning_routes_to_reasoning_content() { + SseEmitter em(ApiFormat::OPENAI_CHAT, "req-1", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + + // Model emits reasoning tokens directly with no leading `` + // (because the prompt suffix already opened the channel), then + // closes with `` and emits the answer. + em.emit_token("alpha "); + em.emit_token("beta "); + em.emit_token("\n\nAnswer: 4"); + em.emit_finish(4); + + TEST_ASSERT(em.reasoning_text().find("alpha") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("beta") != std::string::npos); + // No spurious tag leaked into reasoning or content. + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("Answer: 4") != std::string::npos); +} + +static void test_emitter_initial_mode_reasoning_unclosed_stays_reasoning() { + // No close — everything stays in reasoning_content, content + // stays empty. Matches parse_reasoning(started_in_thinking=true) + // behavior for the non-streaming path. + SseEmitter em(ApiFormat::OPENAI_CHAT, "req-2", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + em.emit_token("still thinking"); + em.emit_token(" more thinking"); + em.emit_finish(3); + + TEST_ASSERT(em.reasoning_text().find("still thinking") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("more thinking") != std::string::npos); + TEST_ASSERT(em.accumulated_text().empty()); +} + +static void test_emitter_initial_mode_reasoning_strips_redundant_think_opener() { + // Edge case: prompt pre-opened , but the model also emits a + // leading anyway (template/model-card mismatch). The + // emitter's strip guard (checked_think_prefix_) must still trip + // because we deliberately leave it at its default (false) in the + // constructor — otherwise the duplicate opener would leak into + // reasoning_text. + SseEmitter em(ApiFormat::OPENAI_CHAT, "req-3", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + em.emit_token("actual reasoninganswer"); + em.emit_finish(3); + + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("actual reasoning") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("answer") != std::string::npos); +} + +static void test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking() { + // Anthropic format: when starting in REASONING mode, the very first + // content_block_start must be `thinking`, not `text`. Otherwise the + // emitter would open a text block, then have to stop+restart it as + // thinking on the first reasoning delta — wasteful and visible to + // SDK clients as a spurious empty text block. + SseEmitter em(ApiFormat::ANTHROPIC, "req-4", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + auto start = em.emit_start(); + std::string all; + for (const auto & c : start) all += c; + // First content block must be a thinking block. nlohmann::json sorts + // keys alphabetically on dump(), so the inner block serializes as + // `{"thinking":"","type":"thinking"}` (NOT type-first). Assert on + // the unique `"thinking":""` opener which only appears in the + // thinking-kind serialization. + TEST_ASSERT(all.find("\"thinking\":\"\",\"type\":\"thinking\"") + != std::string::npos); + // And the initial text-block opener must NOT appear (regression: if + // active_kind_ defaulted to "text", emit_start would have emitted + // `{"text":"","type":"text"}` here instead). + TEST_ASSERT(all.find("\"text\":\"\",\"type\":\"text\"") + == std::string::npos); +} + +// ─── Integration: render_chat_template → SseEmitter wiring ────────────── +// +// The original bug was an integration gap: render_chat_template correctly +// reported started_in_thinking=true, but no caller routed it into the +// SseEmitter's initial_mode, so reasoning text leaked into content and +// reasoning_content stayed empty. Each end of the wire has its own unit +// tests above; these chain the two ends so a future refactor that drops +// the propagation cannot pass without an assertion failure here. +// +// The body mirrors the production wiring in +// server/src/server/http_server.cpp (the `started_in_thinking → +// initial_mode → SseEmitter` chain). Keep these in sync if that wiring +// moves. + +static void test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning() { + std::vector msgs = {{"user", "What is 2+2?", ""}}; + auto render = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT_MSG(render.started_in_thinking, + "renderer end of wire: QWEN3 enable_thinking must pre-open "); + + const StreamMode initial_mode = render.started_in_thinking + ? StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-q", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Let me compute. "); + em.emit_token("2+2 equals 4."); + em.emit_token("\n\nThe answer is 4."); + em.emit_finish(5); + + TEST_ASSERT_MSG(!em.reasoning_text().empty(), + "wiring broken: reasoning_content empty despite started_in_thinking=true"); + TEST_ASSERT(em.reasoning_text().find("Let me compute") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT_MSG(em.accumulated_text().find("Let me compute") == std::string::npos, + "wiring broken: reasoning text leaked into content channel"); + TEST_ASSERT(em.accumulated_text().find("The answer is 4") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); +} + +static void test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning() { + std::vector msgs = {{"user", "Solve 7*8.", ""}}; + auto render = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT_MSG(render.started_in_thinking, + "renderer end of wire: LAGUNA enable_thinking must pre-open "); + + const StreamMode initial_mode = render.started_in_thinking + ? StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-l", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Working through it: "); + em.emit_token("7*8 = 56."); + em.emit_token("\n\n56."); + em.emit_finish(4); + + TEST_ASSERT_MSG(!em.reasoning_text().empty(), + "wiring broken: reasoning_content empty despite started_in_thinking=true"); + TEST_ASSERT(em.reasoning_text().find("Working through it") != std::string::npos); + TEST_ASSERT_MSG(em.accumulated_text().find("Working through it") == std::string::npos, + "wiring broken: reasoning text leaked into content channel"); + TEST_ASSERT(em.accumulated_text().find("56.") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); +} + +static void test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content() { + // Inverse direction: when enable_thinking=false the renderer must not + // pre-open and the emitter must start in CONTENT, so the model's + // tokens land in content from the first byte. Guards against the + // opposite regression of unconditionally starting in REASONING. + std::vector msgs = {{"user", "Hi.", ""}}; + auto render = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!render.started_in_thinking); + + const StreamMode initial_mode = render.started_in_thinking + ? StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-n", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Hello there."); + em.emit_finish(2); + + TEST_ASSERT(em.reasoning_text().empty()); + TEST_ASSERT(em.accumulated_text().find("Hello there") != std::string::npos); +} + static void test_normalize_responses_tool_followup_messages() { ToolMemory tool_memory; const std::string call_id = "call_exec_001"; @@ -2538,6 +2843,10 @@ int main() { RUN_TEST(test_emitter_streaming_openai_has_done); RUN_TEST(test_emitter_nonstreaming_accumulates); RUN_TEST(test_emitter_anthropic_thinking_blocks); + RUN_TEST(test_emitter_initial_mode_reasoning_routes_to_reasoning_content); + RUN_TEST(test_emitter_initial_mode_reasoning_unclosed_stays_reasoning); + RUN_TEST(test_emitter_initial_mode_reasoning_strips_redundant_think_opener); + RUN_TEST(test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking); std::fprintf(stderr, "\n── Stop sequences ──\n"); RUN_TEST(test_stop_sequence_basic); @@ -2579,6 +2888,17 @@ int main() { RUN_TEST(test_jinja_render_bos_eos_threaded); RUN_TEST(test_jinja_render_empty_template_throws); RUN_TEST(test_jinja_render_bad_tools_json_throws); + RUN_TEST(test_chat_template_qwen3_enable_thinking_pre_opens); + RUN_TEST(test_chat_template_qwen3_disable_thinking_does_not_pre_open); + RUN_TEST(test_chat_template_qwen3_no_gen_prompt_does_not_pre_open); + RUN_TEST(test_chat_template_laguna_enable_thinking_pre_opens); + RUN_TEST(test_chat_template_laguna_disable_thinking_does_not_pre_open); + RUN_TEST(test_chat_template_gemma4_does_not_pre_open); + RUN_TEST(test_jinja_render_suffix_sniff_sets_started_in_thinking); + RUN_TEST(test_jinja_render_suffix_sniff_negative); + RUN_TEST(test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning); + RUN_TEST(test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning); + RUN_TEST(test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content); RUN_TEST(test_normalize_responses_tool_followup_messages); std::fprintf(stderr, "\n── Placement config ──\n"); diff --git a/server/test/test_stub_integration.py b/server/test/test_stub_integration.py new file mode 100644 index 00000000..496d877f --- /dev/null +++ b/server/test/test_stub_integration.py @@ -0,0 +1,241 @@ +"""End-to-end integration test for the dflash HttpServer, driven by a +deterministic stub backend (no GPU, no model weights). + +Runs the replay_http_server binary with: + - the tokenizer-only Qwen3.6 GGUF fixture under server/test/fixtures/ + - the JSON scenario files under server/test/scenarios/ + +Then sends real HTTP requests at it via the same `requests` calls that +test_server_integration.py uses against a live GPU server. Because the +chat-template renderer, request parser, SseEmitter, and SSE socket +writes are the production code path, this test exercises the exact wire +that broke in the original Qwen3.6 think-channel bug — but on a CPU-only +runner with no model file. + +Run locally: + ./server/build/replay_http_server (built by cmake) + uv run pytest server/test/test_stub_integration.py -v + +The test starts and stops the driver itself; no separate server required. +""" + +from __future__ import annotations + +import json +import os +import socket +import subprocess +import time +from pathlib import Path + +import pytest +import requests + + +REPO_ROOT = Path(__file__).resolve().parents[2] +BUILD_DIR = REPO_ROOT / "server" / "build" +DRIVER_BIN = BUILD_DIR / "replay_http_server" +TOKENIZER_GGUF = REPO_ROOT / "server" / "test" / "fixtures" / "qwen3.6-tokenizer.gguf" +SCENARIOS_DIR = REPO_ROOT / "server" / "test" / "scenarios" + + +def _free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +@pytest.fixture(scope="module") +def stub_server(): + """Spawn the stub-driven HttpServer for the duration of this module.""" + assert DRIVER_BIN.is_file(), ( + f"driver binary missing: {DRIVER_BIN} — " + "build target replay_http_server first") + assert TOKENIZER_GGUF.is_file(), ( + f"tokenizer fixture missing: {TOKENIZER_GGUF} — " + "is git-lfs configured for *.gguf?") + + port = _free_port() + env = {**os.environ, "CUDA_VISIBLE_DEVICES": ""} + proc = subprocess.Popen( + [str(DRIVER_BIN), + str(TOKENIZER_GGUF), + "--scenarios", str(SCENARIOS_DIR), + "--port", str(port)], + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + base = f"http://127.0.0.1:{port}" + + # Wait up to 10s for /health to come up. + deadline = time.time() + 10 + while time.time() < deadline: + try: + r = requests.get(f"{base}/health", timeout=1) + if r.status_code == 200: + break + except requests.RequestException: + pass + if proc.poll() is not None: + out = proc.stdout.read().decode() if proc.stdout else "" + raise RuntimeError(f"driver exited early. output:\n{out}") + time.sleep(0.1) + else: + proc.terminate() + out = proc.stdout.read().decode() if proc.stdout else "" + raise RuntimeError(f"driver did not become healthy in 10s. output:\n{out}") + + yield base + + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait(timeout=2) + + +# ─── Tests ───────────────────────────────────────────────────────────── + + +class TestQwen3EnableThinkingNonStreaming: + """Regression guard for the original PR #308 bug: Qwen3.6 enable_thinking + must route reasoning into reasoning_content, not leak it into content.""" + + def test_openai_chat_completions(self, stub_server): + r = requests.post(f"{stub_server}/v1/chat/completions", + json={ + "model": "dflash", + "messages": [{"role": "user", "content": "What is 2+2?"}], + "chat_template_kwargs": {"enable_thinking": True}, + "stream": False, + "max_tokens": 256, + }, + timeout=10) + assert r.status_code == 200, r.text + msg = r.json()["choices"][0]["message"] + reasoning = msg.get("reasoning_content") or "" + content = msg.get("content") or "" + assert reasoning, ( + f"reasoning_content empty — render→emit wiring broken. " + f"content={content!r}") + assert "Let me compute" in reasoning + assert "" not in reasoning + assert "" not in reasoning + assert "" not in content + assert "" not in content + assert "The answer is 4." in content + assert "Let me compute" not in content, ( + f"reasoning text leaked into content: {content!r}") + + def test_anthropic_messages(self, stub_server): + r = requests.post(f"{stub_server}/v1/messages", + json={ + "model": "dflash", + "messages": [{"role": "user", "content": "What is 2+2?"}], + "thinking": {"type": "enabled"}, + "stream": False, + "max_tokens": 256, + }, + timeout=10) + assert r.status_code == 200, r.text + blocks = r.json().get("content", []) + types = [b.get("type") for b in blocks] + assert "thinking" in types, f"no thinking block; types={types}" + assert "text" in types, f"no text block; types={types}" + thinking = next(b["thinking"] for b in blocks if b["type"] == "thinking") + text = next(b["text"] for b in blocks if b["type"] == "text") + assert "Let me compute" in thinking + assert "The answer is 4" in text + assert "" not in thinking + assert "" not in thinking + + +class TestQwen3EnableThinkingStreaming: + """Same bug class but through the streaming SSE code path.""" + + def test_openai_chat_completions_streaming(self, stub_server): + r = requests.post(f"{stub_server}/v1/chat/completions", + json={ + "model": "dflash", + "messages": [{"role": "user", "content": "What is 2+2?"}], + "chat_template_kwargs": {"enable_thinking": True}, + "stream": True, + "max_tokens": 256, + }, + stream=True, timeout=10) + assert r.status_code == 200 + + reasoning_text = "" + content_text = "" + saw_reasoning_delta = False + saw_content_delta = False + saw_done = False + for line in r.iter_lines(decode_unicode=True): + if not line: + continue + if line == "data: [DONE]": + saw_done = True + break + assert line.startswith("data: "), line + chunk = json.loads(line[len("data: "):]) + choices = chunk.get("choices") or [] + if not choices: + continue # usage-only tail chunk; no delta + delta = choices[0].get("delta", {}) + if "reasoning_content" in delta: + saw_reasoning_delta = True + reasoning_text += delta["reasoning_content"] + if "content" in delta: + saw_content_delta = True + content_text += delta["content"] + assert saw_done + assert saw_reasoning_delta, "no reasoning_content deltas emitted" + assert saw_content_delta, "no content deltas emitted" + assert "Let me compute" in reasoning_text + assert "The answer is 4" in content_text + assert "" not in reasoning_text + assert "" not in reasoning_text + assert "" not in content_text + assert "" not in content_text + + def test_anthropic_messages_streaming(self, stub_server): + """First content_block_start must be `thinking`, not `text` — the + Anthropic-side half of the PR #308 fix.""" + r = requests.post(f"{stub_server}/v1/messages", + json={ + "model": "dflash", + "messages": [{"role": "user", "content": "What is 2+2?"}], + "thinking": {"type": "enabled"}, + "stream": True, + "max_tokens": 256, + }, + stream=True, timeout=10) + assert r.status_code == 200 + + events = [] + event_type = None + for line in r.iter_lines(decode_unicode=True): + if line is None: + continue + if line.startswith("event: "): + event_type = line[len("event: "):] + elif line.startswith("data: ") and event_type: + events.append((event_type, json.loads(line[len("data: "):]))) + event_type = None + types = [t for t, _ in events] + assert "message_start" in types + first_start = next(d for t, d in events if t == "content_block_start") + assert first_start["content_block"]["type"] == "thinking", first_start + # At least one thinking delta and one text delta. + thinking_deltas = [d for t, d in events + if t == "content_block_delta" + and d["delta"]["type"] == "thinking_delta"] + text_deltas = [d for t, d in events + if t == "content_block_delta" + and d["delta"]["type"] == "text_delta"] + assert thinking_deltas, f"no thinking_delta events; types={types}" + assert text_deltas, f"no text_delta events; types={types}" + assert "Let me compute" in "".join(d["delta"]["thinking"] for d in thinking_deltas) + assert "The answer is 4" in "".join(d["delta"]["text"] for d in text_deltas)