Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 68 additions & 16 deletions server/src/server/sse_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,21 @@ static bool looks_like_plain_text_call(const std::string & text) {
return false;
}

static bool find_tool_start(const std::string & text, size_t & pos) {
// `is_plain_text` (out) reports whether the matched opener was Pattern B
// (plain-text `call:<verb>{`) vs Pattern A (XML envelope: `<tool_call>`,
// `<function=`, `<tool_code>`). Callers use this to drive divergent
// downstream behavior at emit_finish:
// - Pattern A: malformed parse → suppress buffer (XML envelopes are not
// user-facing text); .done events expose only the pre-call accumulated
// content.
// - Pattern B: malformed parse → flush buffer back to accumulated_content_
// so the literal `call:foo{...` span stays caller-visible; on success,
// the raw call text must also appear in the Responses-format
// finalization events (see emit_finish for the responses_streamed_text
// handling).
static bool find_tool_start(const std::string & text, size_t & pos,
bool & is_plain_text) {
is_plain_text = false;
// Pattern A: XML-like openers (<tool_call>, <function=, <tool_code>).
size_t idx = text.find('<');
while (idx != std::string::npos) {
Expand Down Expand Up @@ -107,6 +121,7 @@ static bool find_tool_start(const std::string & text, size_t & pos) {
size_t verb_start = found + CALL_PREFIX_LEN;
if (verb_start < text.size() && std::isalpha((unsigned char)text[verb_start])) {
pos = found;
is_plain_text = true;
return true;
}
}
Expand Down Expand Up @@ -466,8 +481,9 @@ std::vector<std::string> SseEmitter::emit_token(const std::string & raw_piece) {
size_t think_idx = window_.find(THINK_OPEN);
size_t think_close_idx = window_.find(THINK_CLOSE);
size_t tool_idx = std::string::npos;
bool tool_is_plain_text = false;
bool tool_hit = has_request_tools(tools_) &&
find_tool_start(window_, tool_idx);
find_tool_start(window_, tool_idx, tool_is_plain_text);

struct Hit { size_t pos; int type; }; // type: 0=think, 1=think_close, 2=tool-ish
std::vector<Hit> hits;
Expand Down Expand Up @@ -496,6 +512,7 @@ std::vector<std::string> SseEmitter::emit_token(const std::string & raw_piece) {
// Tool-call syntax. Keep the full tag/function text buffered
// until finish so the parser can validate it.
tool_buffer_ = window_.substr(h.pos);
tool_open_is_plain_text_ = tool_is_plain_text;
window_.clear();
mode_ = StreamMode::TOOL_BUFFER;
}
Expand Down Expand Up @@ -585,19 +602,36 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
}
window_.clear();

// Snapshot of what the Responses stream actually emitted as text
// deltas. The CONTENT-mode plain-text tool-call branch below
// mutates accumulated_content_ (strips matched call spans so the
// non-streaming response shape doesn't duplicate them as both text
// AND tool_use), but the Responses-format finalization events
// Snapshot of pre-strip text for the Responses finalization events.
//
// The Responses-format finalization events
// (response.output_text.done / content_part.done / completed) must
// reflect what was actually streamed in earlier
// response.output_text.delta events — otherwise a streaming client
// sees its accumulated buffer disagree with the .done payload.
// Other formats (OpenAI Chat, Anthropic) don't echo final
// aggregated text in the stream, so they can continue to read the
// (possibly stripped) accumulated_content_ directly.
const std::string responses_streamed_text = accumulated_content_;
// reflect the full assistant text — including any plain-text
// `call:<verb>{...}` span — so a streaming client sees its accumulated
// buffer agree with the server's .done payload, and non-streaming
// builders that consume .completed get the raw assistant emission.
// Meanwhile, accumulated_text() (used by OpenAI Chat / Anthropic final
// shapes and non-streaming Responses builders that DO want stripped
// text to avoid text+tool_use duplication) continues to return the
// post-hoist stripped form.
//
// Cases:
// - Pattern A (XML envelope, mode==TOOL_BUFFER): tool_buffer_ holds
// protocol artifact text (`<tool_call>...`) that was never streamed
// as a delta. Excluded from responses_streamed_text — the .done
// events expose only the pre-call accumulated_content_ (current
// behavior).
// - Pattern B (plain-text `call:`, mode==TOOL_BUFFER): tool_buffer_
// holds the raw `call:<verb>{...}` span plus any post-call trailing
// text. Both belong in the visible text snapshot per the PR #329
// review (tests #1126 et al).
// - mode==CONTENT plain-text hoist branch below: accumulated_content_
// already contains the full pre-strip text; the snapshot taken
// here freezes it before the strip mutates it.
std::string responses_streamed_text = accumulated_content_;
if (mode_ == StreamMode::TOOL_BUFFER && tool_open_is_plain_text_) {
responses_streamed_text += tool_buffer_;
Comment on lines +632 to +633

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid adding unstreamed tool buffer to Responses text

When a plain-text `call:` opener is detected before it has been flushed past the holdback (for example `Looking up:` + `call:get_weather{...}`), the emitter switches to `TOOL_BUFFER`, so the raw call span is never sent as `response.output_text.delta`; on successful parse, only `parsed.cleaned_text` is emitted as content before the final events. Appending `tool_buffer_` here therefore makes `response.output_text.done` / `response.completed.output_text` include text that streaming clients never received in deltas, recreating the mismatch this snapshot is meant to avoid for the `TOOL_BUFFER` path.

Useful? React with 👍 / 👎.

}

// Parse tool calls from buffer
std::string fr = "stop";
Expand Down Expand Up @@ -699,9 +733,27 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
break;
default: break;
}
} else if (tool_open_is_plain_text_) {
// Pattern B (plain-text `call:<verb>{...`) failed to parse —
// most commonly an unbalanced `{` (the model's args were
// truncated, or the verb name is real but the JSON body
// never closed). Unlike Pattern A's XML envelopes, the
// buffered span here is plain user-facing text. Flushing
// it back to accumulated_content_ (and re-emitting as a
// content delta) preserves the malformed span as
// caller-visible signal that the model produced garbage —
// dropping it silently would hide the failure mode.
// accumulated_text() then reports the original `call:`
// text exactly as the model emitted it.
accumulated_content_ += tool_buffer_;
emit_content_delta(out, tool_buffer_);
tool_buffer_.clear();
} else {
// Tool syntax was detected but no valid call parsed. Do not leak
// malformed/incomplete XML back to the user as assistant text.
// Pattern A (XML envelope) parse failure. Do not leak
// malformed/incomplete `<tool_call>` / `<function=` /
// `<tool_code>` markup back to the user as assistant text
// — XML envelopes are protocol artifacts, not prose. See
// test_emitter_does_not_leak_malformed_tool_xml.
std::fprintf(stderr,
"[server] tool_call parse failed; suppressing buffered tool text "
"request_id=%s format=%d bytes=%zu\n",
Expand Down
15 changes: 15 additions & 0 deletions server/src/server/sse_emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,21 @@ class SseEmitter {
StreamMode mode_;
std::string window_; // holdback buffer
std::string tool_buffer_; // accumulated tool text
// True when TOOL_BUFFER was entered via Pattern B (plain-text
// `call:<verb>{` opener) rather than Pattern A (XML envelope:
// `<tool_call>` / `<function=` / `<tool_code>`). Set at the
// CONTENT→TOOL_BUFFER transition in emit_token(). Drives two
// divergent behaviors at emit_finish():
// 1. malformed-parse branch: Pattern A drops the buffer
// (XML envelopes are not user-facing prose); Pattern B
// flushes the buffer back to accumulated_content_ so the
// literal `call:foo{...` span stays caller-visible.
// 2. Responses-format finalization events (.output_text.done /
// .content_part.done / .completed): Pattern B includes the
// raw call span in the streamed-text snapshot used for
// these events, while accumulated_text() continues to
// return the stripped (post-hoist) text.
bool tool_open_is_plain_text_ = false;
std::string accumulated_content_;
std::string accumulated_raw_; // all raw text for tool memory
std::string reasoning_text_;
Expand Down
Loading