Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ assets/banner.png -filter -diff -merge -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.webm filter=lfs diff=lfs merge=lfs -text
*.gguf filter=lfs diff=lfs merge=lfs -text
16 changes: 15 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: recursive
lfs: true
token: ${{ secrets.SUBMODULE_PAT || secrets.GITHUB_TOKEN }}

- uses: Jimver/cuda-toolkit@v0.2.35
Expand Down Expand Up @@ -55,7 +56,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
cmake --build build --target \
test_dflash test_generate test_flash_attn_sparse \
dflash_server test_server_unit \
dflash_server test_server_unit replay_http_server \
-j$(nproc)

- name: Run C++ server unit tests
Expand All @@ -70,6 +71,19 @@ jobs:
# in the optional `megakernel` extra so its build does NOT run yet.
run: uv sync --frozen

- name: Run CPU integration tests (stub backend, no GPU)
# End-to-end exercise of HttpServer + render_chat_template +
# SseEmitter with a deterministic stub model backend. No GPU
# required: the spike driver runs under CUDA_VISIBLE_DEVICES=""
# and the tokenizer fixture is a stripped Qwen3.6 GGUF (metadata
# only). Covers the regression class from PR #308 end-to-end —
# streaming and non-streaming, OpenAI and Anthropic formats.
env:
CUDA_VISIBLE_DEVICES: ""
run: |
uv run --frozen --with pytest --with requests \
pytest -v server/test/test_stub_integration.py

- name: Build megakernel via uv sync (sm_75)
env:
CUDA_HOME: ${{ env.CUDA_PATH }}
Expand Down
33 changes: 33 additions & 0 deletions server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,39 @@ if(DFLASH27B_TESTS)
endif()
endif()

# ─── replay_http_server: CPU-only HttpServer test driver ────────────
# Wires Tokenizer + ScenarioStore + StubModelBackend + HttpServer to
# serve real HTTP requests on the wire, replaying scripted token
# streams from JSON scenario files. Links dflash_common (which
# includes CUDA-compiled TUs) but never instantiates a real
# ModelBackend, so CUDA_VISIBLE_DEVICES="" is supported. Driven by
# test_stub_integration.py.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/replay_http_server.cpp")
add_executable(replay_http_server
test/replay_http_server.cpp
test/scenario_store.cpp
test/stub_model_backend.cpp
src/server/http_server.cpp
src/server/model_card.cpp)
target_include_directories(replay_http_server PRIVATE
${DFLASH27B_SRC_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/test)
if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
target_compile_definitions(replay_http_server PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
else()
target_compile_definitions(replay_http_server PRIVATE
DFLASH27B_BACKEND_CUDA=1
DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
endif()
target_link_libraries(replay_http_server PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread)
if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
find_package(CUDAToolkit REQUIRED)
target_link_libraries(replay_http_server PRIVATE CUDA::cudart)
else()
target_link_libraries(replay_http_server PRIVATE hip::host)
endif()
endif()

# ─── Unit tests (no GPU, no model files) ────────────────────────────
enable_testing()

Expand Down
38 changes: 32 additions & 6 deletions server/scripts/test_server_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,12 @@ def test_thinking_disabled_by_default(self):

@pytest.mark.slow
def test_thinking_enabled_via_chat_template_kwargs(self):
"""Enabling thinking should produce reasoning_content."""
"""Enabling thinking must route reasoning into reasoning_content,
not leak it into content. Regression guard for the Qwen3.6/Laguna
pre-opened-<think> bug: the chat template appends `<think>` to the
prompt suffix, so the model emits reasoning directly with no
opening tag. If the renderer→emitter wiring drops, reasoning_content
stays empty and the raw reasoning text appears in content."""
r = post_json("/v1/chat/completions", {
"model": MODEL_NAME,
"messages": [{"role": "user", "content": "What is 15 * 17?"}],
Expand All @@ -500,13 +505,26 @@ def test_thinking_enabled_via_chat_template_kwargs(self):
})
assert r.status_code == 200
msg = r.json()["choices"][0]["message"]
assert msg["content"]
# With thinking enabled, model may produce reasoning_content
# (not guaranteed for short prompts, so we just check it doesn't crash)
reasoning = msg.get("reasoning_content") or ""
content = msg.get("content") or ""
assert reasoning, (
f"reasoning_content empty with enable_thinking=True — "
f"renderer→emitter wiring likely broken. content={content[:200]!r}"
)
assert "<think>" not in reasoning and "</think>" not in reasoning, (
f"raw think tags leaked into reasoning_content: {reasoning[:200]!r}"
)
assert "<think>" not in content and "</think>" not in content, (
f"think tags leaked into content channel: {content[:200]!r}"
)
assert content, "content channel empty — model never closed </think>"

@pytest.mark.slow
def test_thinking_enabled_via_reasoning_effort(self):
"""OpenAI Responses-style reasoning.effort field."""
"""OpenAI Responses-style reasoning.effort=high must also route
reasoning to reasoning_content. Same regression class as above
but reached through a different request shape (effort→template
kwargs translation in http_server.cpp)."""
r = post_json("/v1/chat/completions", {
"model": MODEL_NAME,
"messages": [{"role": "user", "content": "What is 15 * 17?"}],
Expand All @@ -516,7 +534,15 @@ def test_thinking_enabled_via_reasoning_effort(self):
})
assert r.status_code == 200
msg = r.json()["choices"][0]["message"]
assert msg["content"]
reasoning = msg.get("reasoning_content") or ""
content = msg.get("content") or ""
assert reasoning, (
f"reasoning_content empty with reasoning.effort=high — "
f"renderer→emitter wiring likely broken. content={content[:200]!r}"
)
assert "<think>" not in reasoning and "</think>" not in reasoning
assert "<think>" not in content and "</think>" not in content
assert content


# ═══════════════════════════════════════════════════════════════════
Expand Down
76 changes: 72 additions & 4 deletions server/src/server/chat_template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,18 @@ ChatFormat chat_format_for_arch(const std::string & arch) {
return ChatFormat::QWEN3;
}

std::string render_chat_template(
PromptRenderResult render_chat_template(
const std::vector<ChatMessage> & messages,
ChatFormat format,
bool add_generation_prompt,
bool enable_thinking,
const std::string & tools_json)
{
std::string result;
// `started_in_thinking` is derived deterministically from the template
// branch + render flags below. Set per format inside the switch so a
// future format addition can't silently miss the wiring.
bool started_in_thinking = false;
bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null";

switch (format) {
Expand Down Expand Up @@ -141,6 +145,14 @@ std::string render_chat_template(
// even when the client opts in, defeating the thinking-budget
// mechanism entirely.
result += "<think>\n";
// The prompt suffix pre-opens `<think>` — the model's very
// first generated token is reasoning, never preceded by an
// explicit `<think>` opener in the stream. Callers must
// start the SSE state machine in REASONING mode and pass
// `started_in_thinking=true` to parse_reasoning() so that
// reasoning text routes to reasoning_content instead of
// leaking into content.
started_in_thinking = true;
}
}
break;
Expand Down Expand Up @@ -224,6 +236,11 @@ std::string render_chat_template(
result += "<assistant>\n";
if (enable_thinking) {
result += "<think>";
// Same situation as Qwen3.6: Laguna XS.2's enable_thinking
// generation prompt ends with `<think>` so the model starts
// emitting reasoning tokens with no explicit opener in the
// stream. Route subsequent tokens to the reasoning channel.
started_in_thinking = true;
} else {
// Empty think block — model jumps straight to answer.
result += "</think>";
Expand Down Expand Up @@ -311,11 +328,17 @@ std::string render_chat_template(
result += "<|channel>thought\n<channel|>";
}
}
// Gemma4 does NOT pre-open `<think>` from the prompt; its
// reasoning channel is opened by the model emitting `<|channel>`
// which http_server forwards into the SseEmitter as the text
// `<think>` — so the emitter's existing CONTENT→REASONING
// transition fires on that synthesized opener. started_in_thinking
// stays false (initial CONTENT mode is correct).
break;
}
}

return result;
return PromptRenderResult{std::move(result), started_in_thinking};
}

// ─── Jinja path ─────────────────────────────────────────────────────────
Expand Down Expand Up @@ -353,7 +376,29 @@ static std::shared_ptr<jinja::program> get_or_parse(const std::string & template

} // namespace

std::string render_chat_template_jinja(
// Sniff a rendered prompt for a trailing `<think>` opener so the caller
// can route subsequent stream tokens to the reasoning channel. Accepts
// optional whitespace after the opener (Qwen3.6 emits `<think>\n`).
// True positive ⇒ caller should treat the prompt as having pre-opened
// the reasoning channel (and the renderer warns loudly so a model-card
// mismatch is visible at runtime).
static bool prompt_ends_with_think_open(const std::string & s) {
static const std::string OPEN = "<think>";
// Walk back over trailing ASCII whitespace.
size_t end = s.size();
while (end > 0) {
char c = s[end - 1];
if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
end--;
} else {
break;
}
}
if (end < OPEN.size()) return false;
return s.compare(end - OPEN.size(), OPEN.size(), OPEN) == 0;
}

PromptRenderResult render_chat_template_jinja(
const std::string & template_src,
const std::vector<ChatMessage> & messages,
const std::string & bos_token,
Expand Down Expand Up @@ -407,14 +452,37 @@ std::string render_chat_template_jinja(
throw std::runtime_error(std::string("jinja global_from_json: ") + e.what());
}

std::string rendered;
try {
jinja::runtime rt(ctx);
jinja::value results = rt.execute(*prog);
auto parts = jinja::runtime::gather_string_parts(results);
return parts->as_string().str();
rendered = parts->as_string().str();
} catch (const std::exception & e) {
throw std::runtime_error(std::string("jinja runtime: ") + e.what());
}

// Jinja path: we don't know which template family the caller passed
// in, so derive `started_in_thinking` by sniffing the rendered tail
// for a `<think>` opener. This catches the common Qwen3.6 / Laguna
// chat templates that end with `<think>\n` when enable_thinking is
// honored, plus any custom template that follows the same convention.
//
// Warn loudly when sniffing decides true so a template/model-card
// mismatch (e.g. enable_thinking=false but template hard-codes
// `<think>` anyway) surfaces in server logs.
bool started_in_thinking =
enable_thinking && add_generation_prompt &&
prompt_ends_with_think_open(rendered);
if (started_in_thinking) {
std::fprintf(stderr,
"[WARN] render_chat_template_jinja: rendered prompt ends with "
"`<think>` opener — treating as started_in_thinking=true. If "
"this is unexpected, check the template's enable_thinking "
"branch or the model card's reasoning configuration.\n");
}

return PromptRenderResult{std::move(rendered), started_in_thinking};
}

} // namespace dflash::common
21 changes: 19 additions & 2 deletions server/src/server/chat_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,23 @@ enum class ChatFormat {
GEMMA4, // <bos><|turn>role\n...<turn|>\n
};

// Provenance for a rendered prompt. `text` is the byte string that gets
// tokenized; `started_in_thinking` records whether the prompt suffix
// pre-opens a `<think>` block (or equivalent reasoning-channel marker)
// that the model is expected to continue into.
//
// Callers route this into the SseEmitter's initial mode and into
// parse_reasoning()'s `started_in_thinking` argument so reasoning text
// emitted before any explicit `<think>` opener is still attributed to
// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna
// enable_thinking prompts (which pre-open `<think>\n` in the assistant
// turn) cause the model to emit reasoning straight into the content
// channel, leaving `reasoning_content` empty.
struct PromptRenderResult {
std::string text; // rendered prompt text, ready to tokenize
bool started_in_thinking; // prompt suffix opens reasoning channel
};

// Render chat messages into the model-specific prompt string.
// The result is plain text ready to be tokenized.
//
Expand All @@ -40,7 +57,7 @@ enum class ChatFormat {
// `tools_json` is an optional JSON string containing the tool definitions
// array. When non-empty, the Qwen3/3.5 template injects a tool preamble
// into the system message instructing the model how to emit <tool_call> tags.
std::string render_chat_template(
PromptRenderResult render_chat_template(
const std::vector<ChatMessage> & messages,
ChatFormat format,
bool add_generation_prompt = true,
Expand All @@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch);
// Internally caches the most recently parsed program per thread (avoids
// re-parsing the template on every request). Throws std::runtime_error on
// lexer/parser/runtime failure (caller should surface a 500 response).
std::string render_chat_template_jinja(
PromptRenderResult render_chat_template_jinja(
const std::string & template_src,
const std::vector<ChatMessage> & messages,
const std::string & bos_token,
Expand Down
32 changes: 24 additions & 8 deletions server/src/server/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1009,7 +1009,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
tools_json = req.tools.dump();
}

std::string rendered;
PromptRenderResult render_result;
if (!config_.chat_template_src.empty()) {
// Jinja path: caller supplied a chat template file via
// --chat-template-file. Override the hardcoded QWEN3/LAGUNA
Expand All @@ -1026,7 +1026,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
? tokenizer_.raw_token(tokenizer_.eos_id())
: std::string();
try {
rendered = render_chat_template_jinja(
render_result = render_chat_template_jinja(
config_.chat_template_src,
chat_msgs,
bos_str,
Expand All @@ -1040,11 +1040,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
return true;
}
} else {
rendered = render_chat_template(chat_msgs, chat_format_,
true, enable_thinking,
tools_json);
render_result = render_chat_template(chat_msgs, chat_format_,
true, enable_thinking,
tools_json);
}
req.prompt_tokens = tokenizer_.encode(rendered);
// Propagate prompt provenance so the SseEmitter's initial mode
// matches the template's pre-opened reasoning channel (Qwen3.6 /
// Laguna enable_thinking case). Without this, reasoning text
// leaks into the content channel and `reasoning_content` stays
// empty — see fix(server): route Qwen3.6/Laguna think-mode
// reasoning to reasoning_content channel.
req.started_in_thinking = render_result.started_in_thinking;
req.prompt_tokens = tokenizer_.encode(render_result.text);

// count_tokens: short-circuit after tokenization. Skip generation
// entirely — Anthropic's contract is just `{"input_tokens": N}`.
Expand Down Expand Up @@ -1149,11 +1156,20 @@ void HttpServer::worker_loop() {
}
}

// Create SSE emitter for streaming state machine.
// Create SSE emitter for streaming state machine. `initial_mode`
// tracks whether the chat-template prompt pre-opened a `<think>`
// block (Qwen3.6 / Laguna enable_thinking path). When true, the
// emitter starts in REASONING so the model's first generated
// token routes to reasoning_content even though no explicit
// `<think>` opener appears in the token stream.
const StreamMode initial_mode = req.started_in_thinking
? StreamMode::REASONING
: StreamMode::CONTENT;
SseEmitter emitter(req.format, req.response_id, req.model,
(int)req.prompt_tokens.size(), req.tools,
&tool_memory_,
req.stop_sequences);
req.stop_sequences,
initial_mode);

// Emit initial SSE events.
if (req.stream) {
Expand Down
6 changes: 6 additions & 0 deletions server/src/server/http_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,12 @@ struct ParsedRequest {
std::vector<std::string> stop_sequences;
// Bandit: per-session adaptive keep_ratio opt-in
std::string session_id;
// Set by the chat-template renderer when the rendered prompt suffix
// pre-opens a `<think>` block (Qwen3.6 / Laguna enable_thinking path).
// Drives the SseEmitter's initial mode so reasoning tokens emitted
// before any explicit `<think>` opener route to reasoning_content
// instead of leaking into content.
bool started_in_thinking = false;
};

// Build the /props response body. Exposed (non-static) so unit tests
Expand Down
Loading
Loading