From e974ac388fcec36db79bb08249ef1c5740863e18 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Sun, 31 May 2026 22:59:33 -0400 Subject: [PATCH 1/6] docs(experiments): plan SSE emitter CONTENT-mode tool parse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan + Codex review for running parse_tool_calls on accumulated CONTENT-mode text so plain-text `call:{...}` invocations (Gemma4) actually produce tool_use blocks instead of stop=end_turn. Codex verdict: REVISE (residue hazard) → integrated as a new emitter- level test guarding accumulated_text() span strip. Q4 rebuttal: tool_allowed enforcement is already inside parse_tool_calls. --- ...se-emitter-content-mode-tool-parse-plan.md | 439 ++++++++++++++++++ 1 file changed, 439 insertions(+) create mode 100644 docs/experiments/sse-emitter-content-mode-tool-parse-plan.md diff --git a/docs/experiments/sse-emitter-content-mode-tool-parse-plan.md b/docs/experiments/sse-emitter-content-mode-tool-parse-plan.md new file mode 100644 index 00000000..335bf36f --- /dev/null +++ b/docs/experiments/sse-emitter-content-mode-tool-parse-plan.md @@ -0,0 +1,439 @@ +# SSE Emitter: run `parse_tool_calls` on CONTENT-mode text (plain-text `call:{}` path) + +Status: PLAN — pre-implementation. No code changes in this commit. + +Branch: `fix/sse-emitter-content-mode-tool-parse` +Base: `Luce-Org/lucebox-hub:main` @ `8305b6c` +Affected files: +- `server/src/server/sse_emitter.cpp` — add CONTENT-mode finalize branch. +- `server/test/test_server_unit.cpp` — emitter-level coverage. + +## 1. Problem statement (diagnosis is settled) + +`SseEmitter` only transitions into `StreamMode::TOOL_BUFFER` when it detects +one of the XML-style openers (``, ``) +in the streamed text (see `find_tool_start()` in +`server/src/server/sse_emitter.cpp:26-38`, gated through the +`mode_ == CONTENT` branch at line 388-423). For models like Gemma4 that +emit tool calls as plain text: + +``` +call:get_country_info{country: "France"} +_call:get_country_info{country_name: "France"} # underscore artifact +``` + +…the emitter stays in `CONTENT` mode for the entire stream. The +final-pass `parse_tool_calls` invocation at +`server/src/server/sse_emitter.cpp:512-517` is gated on +`mode_ == TOOL_BUFFER`, so it never runs on plain-text emissions: + +```cpp +if (mode_ == StreamMode::TOOL_BUFFER && !tool_buffer_.empty()) { + auto parsed = parse_tool_calls(tool_buffer_, tools_); + ... +} +``` + +The `parse_tool_calls` Pattern 5 regex (`re_call_verb_open()` at +`tool_parser.cpp:190-193`) plus the underscore-prefix sentinel +(commit `004a81b`) are correct, but unreachable in this code path. + +Empirical: smoke against image `fac7e0f-cuda12` on bragi returns +`stop_reason: end_turn` for a Gemma4 response whose body contains +`_call:get_country_info{country_name: "France"}` plain text. No +`tool_use` content block is produced. This is the live regression we +are fixing. + +## 2. Goal + +In `SseEmitter::emit_finish()`, add a parallel finalize branch that runs +when `mode_ == CONTENT` and the accumulated content text plausibly +contains a `call:{...}` invocation. If `parse_tool_calls` returns +≥1 ToolCall, hoist them into `tool_calls_` exactly like the TOOL_BUFFER +path, strip the matched spans from accumulated content, and flip the +finish reason to `tool_calls` so the Anthropic mapping at +`http_server.cpp:2074` resolves `stop_reason="tool_use"`. + +Non-goals (explicitly out of scope): +- Per-delta in-stream detection of `call:{}`. The streaming + emitter currently sends `content` deltas as raw text as soon as the + holdback drains; rewriting those into a `tool_use` post-hoc would + contradict bytes already on the wire. See §6. +- Touching the existing 5 tool-call detection patterns. The + XML/JSON/tool_code paths already work; we add a sibling branch. + +## 3. Design — finalize-pass CONTENT-mode parser + +### 3.1 Trigger predicate + +After the existing `if (mode_ == StreamMode::TOOL_BUFFER && ...)` block +at `sse_emitter.cpp:512-617`, add a sibling branch: + +```cpp +} else if (mode_ == StreamMode::CONTENT && + !accumulated_content_.empty() && + has_request_tools(tools_) && + looks_like_plain_text_call(accumulated_content_)) { + auto parsed = parse_tool_calls(accumulated_content_, tools_); + if (!parsed.tool_calls.empty()) { + tool_calls_ = std::move(parsed.tool_calls); + accumulated_content_ = parsed.cleaned_text; // matched spans stripped + fr = "tool_calls"; + + // emit format-specific events for tool calls (same switch as TOOL_BUFFER) + ... + } +} +``` + +### 3.2 Cheap pre-check (`looks_like_plain_text_call`) + +To avoid paying full-regex cost on every content response, gate the +parser on a tightened substring scan. Implementation: + +```cpp +static bool looks_like_plain_text_call(const std::string & text) { + // Match the tightened opener: `call:{`. Walks the text + // once; no heap allocation. Mirrors the sentinel logic in + // re_call_verb_open() at a coarse granularity so we only run the + // full std::regex pass when there's a plausible candidate. + size_t pos = 0; + while ((pos = text.find("call:", pos)) != std::string::npos) { + size_t v = pos + 5; + if (v < text.size() && (std::isalpha((unsigned char)text[v]) || text[v] == '_')) { + // Walk verb chars; require `{` after. + size_t w = v; + while (w < text.size() && + (std::isalnum((unsigned char)text[w]) || + text[w] == '_' || text[w] == '.' || + text[w] == ':' || text[w] == '-')) { + w++; + } + // Allow whitespace between verb and brace (mirrors `\s*\{` in the regex). + while (w < text.size() && std::isspace((unsigned char)text[w])) w++; + if (w < text.size() && text[w] == '{') return true; + } + pos = v; + } + return false; +} +``` + +Tradeoff: the full regex pattern accepts namespaced verbs like +`call:tools.weather:get_data{...}`. The pre-check's character class +already covers `:` `.` `-`, so namespaced calls pass. Performance: O(N) +single-pass with a `find("call:")` skip, dominated by the substring +scan; no regex compile/match cost for the common no-tool-call response. + +Codex review will ask whether the tightened pre-check is sufficient or +whether `std::regex_search` against a compiled-once +`call:[A-Za-z_][A-Za-z0-9_.:-]*\s*\{` regex is preferable. Decision +deferred to §5. + +### 3.3 Hoist semantics — mirror the TOOL_BUFFER path + +For each emitted ToolCall: +- ID: reuses the existing `generate_call_id()` from `tool_parser.cpp:31` + via the call inside `parse_tool_calls` (`add_call` at line 450 of + `tool_parser.cpp` already assigns `tc.id = generate_call_id();`). + No additional ID synthesis needed in `sse_emitter.cpp`. +- `tool_memory_` remember: copy lines 518-522 of the TOOL_BUFFER branch + (`tool_memory_->remember(ids, accumulated_raw_)`). +- Format-specific events: emit OpenAI `tool_calls` delta / Anthropic + `content_block_start` + `input_json_delta` + `content_block_stop` / + Responses `function_call_arguments.delta`+`.done`. Re-use the existing + switch at lines 533-609. + +### 3.4 `accumulated_content_` mutation — strip matched spans + +`parse_tool_calls` already returns `cleaned_text` (the input minus all +matched spans across all 6 patterns, trimmed). Replace +`accumulated_content_` with `parsed.cleaned_text`. This is the C++ +analog of `_strip_plain_text_tool_calls` at +`luce-bench/src/lucebench/areas/forge.py:144-172` — same semantics, +shared engine. + +Edge: in streaming, bytes that were already sent as content deltas +remain on the wire. Stripping `accumulated_content_` only affects the +final non-streaming response shape (Anthropic message `content` array, +OpenAI `message.content`, Responses `output_text`) and any later +introspection. Streaming SSE clients see the unmodified text deltas +followed by a post-hoc tool_call event. See §6. + +### 3.5 `finish_reason` bump + +Local `fr` (line 511) becomes `"tool_calls"` when ≥1 ToolCall survives +the `tool_allowed` filter. The filter is enforced *inside* +`parse_tool_calls` itself — `add_call` at `tool_parser.cpp:452` has +`if (!tool_allowed(tools, fn_name)) return;` so unauthorized calls +never enter `parsed.tool_calls`. The emitter's trigger condition +`if (!parsed.tool_calls.empty())` therefore already guarantees that +`fr` only flips when at least one allow-listed call survived. + +If the parser matches but everything is filtered out (all verbs +unknown to `tools_`), `fr` stays `"stop"` — matches the TOOL_BUFFER +path's `else` branch at line 610-617, which logs and keeps +`fr = "stop"`. + +This is also Codex review prompt #4 (the reviewer flagged a missing +filter; the filter is actually inside `parse_tool_calls`. Documenting +here for the next reader.) + +## 4. Edge cases + +### 4.1 Empty / no-call content +Pre-check `looks_like_plain_text_call` returns false → bail before +regex. No measurable cost beyond a single `std::string::find("call:")`. + +### 4.2 Mixed content + tool calls +`call:foo{...}` embedded in narrative prose. `parse_tool_calls` +already returns `cleaned_text` with matched spans removed and the +remainder trimmed (`tool_parser.cpp:623-647`). Verified: trailing +prose survives, only matched call spans are stripped. + +### 4.3 Tool not in allowlist +`add_call` lambda inside `parse_tool_calls` rejects unauthorized verbs +via `tool_allowed(tools, fn_name)` at `tool_parser.cpp:452`. This is +the same filter the TOOL_BUFFER path relies on. No emitter-layer work +needed. + +### 4.4 No tools declared (`tools_` empty) +Gate the entire new branch on `has_request_tools(tools_)` (the helper +already exists at `sse_emitter.cpp:22-24`). This mirrors the +TOOL_BUFFER trigger: `find_tool_start` is only called inside the +`has_request_tools(tools_) && ...` check at line 391. Same gating +keeps both paths consistent. + +### 4.5 `` envelope still routes via TOOL_BUFFER +The CONTENT-mode branch only fires when `mode_ == CONTENT` at +`emit_finish` entry. If the model emitted ``, the emitter +transitions to TOOL_BUFFER inside `emit_token` (line 422) and that +path handles parsing. The new branch is mutually exclusive with the +old one — the `else if` ladder guarantees this. Regression risk: nil +for the existing 5 patterns. + +### 4.6 Streaming +For `req.stream == true`, the per-delta SSE events already streamed +plain-text `call:foo{...}` to the client as `content` deltas BEFORE +finalize runs. Adding tool_use events at finalize produces a stream +that contains both text content AND a tool_use block — the OpenAI / +Anthropic SDKs *do* accept this shape (text + tool_use is legal), but +clients that gate on "first content type wins" will see text first. + +Two options: +- (A) Apply the fix to both streaming and non-streaming, accepting that + streaming clients see the call text in early deltas plus a tool_use + block at the end. The accumulated_content_ field still gets cleaned + for the final-message shape; the wire deltas are not retroactively + rewritten. +- (B) Gate the new branch to non-streaming only (i.e., when no per- + token deltas were emitted with content). This requires threading a + `stream` boolean into the emitter (currently not present in + `SseEmitter`). + +Decision (will defer to Codex review prompt #2): start with option (A) +because the wire shape `text-delta + tool_use` is legal in both +Anthropic and OpenAI specs, and forge/agent_recorded uses non-streaming +exclusively. If Codex flags it as a real correctness hazard, switch to +(B) and document. + +## 5. Codex review prompts (to be filled in after `lucebox codex` run) + +Send Codex the plan + the following four questions verbatim: + +1. **Pre-check tightness**: is the substring scan in + `looks_like_plain_text_call` sufficient, or should we use a compiled- + once `std::regex` (e.g. + `call:[A-Za-z_][A-Za-z0-9_.:\-]*\s*\{`) to reduce false invocations + of the full parser? +2. **Streaming SSE shape**: the TOOL_BUFFER finalize path emits per-tool + SSE events (`content_block_start`/`input_json_delta`/`content_block_stop` + in Anthropic, `tool_calls` delta in OpenAI). Should the new CONTENT- + mode branch emit the same SSE events for streaming clients, or only + populate the final non-streaming response fields? (Critical for + streaming correctness — see §4.6.) +3. **Race / state-ordering**: between `parsed.cleaned_text` replacing + `accumulated_content_` and the new SSE tool_use events being + appended, could we ever emit a `tool_use` block whose `input` + references text that's still present in `accumulated_content_`? Or + would the final accumulated content contain the same call span we + just emitted as a `tool_use`? +4. **`finish_reason` bump scope**: should `fr` flip to `tool_calls` + only when ≥1 tool actually survives `tool_allowed` (current + design), or whenever `parse_tool_calls` finds *any* match (even if + filtered)? The TOOL_BUFFER path's existing behavior is the + reference. + +## Codex review + +Run: 2026-05-31 against `lucebox.service` (image `fac7e0f-cuda12`, +Gemma4-26B-A4B, on bragi). Codex CLI v0.130.0 via `codex exec --model +dflash`. Verbatim answers below. + +> **Q1. Is the substring-based `looks_like_plain_text_call` pre-check sufficient?** +> **Yes.** A manual substring scan for `call:` followed by a character +> check is significantly more efficient than a compiled `std::regex` +> for a `finalize()` call. Since this is an end-of-stream operation, +> the O(N) scan is negligible, and you avoid the heavy state machine +> overhead of regex for a simple pattern. + +> **Q2. For streaming responses, should the new CONTENT-mode branch emit SSE tool_use events at finalize?** +> **Yes (Option A).** In streaming mode, the `content` deltas +> containing the raw `call:foo{...}` text have already been pushed to +> the client. You cannot retroactively "un-send" them. The standard +> and most compatible behavior for clients (Cline, Open WebUI) is to +> receive the raw text as prose/thought, followed by a formal +> `tool_use` block at the end. + +> **Q3. Are there race / state-ordering hazards between `accumulated_content_ = parsed.cleaned_text` and the new SSE tool_use events being appended?** +> **Yes.** There is a "residue" hazard. If `parse_tool_calls` does not +> perfectly identify and strip the exact character spans of the +> `call:verb{...}` syntax from `cleaned_text`, the client will receive +> the raw tool-call string inside the final text block. This results +> in the UI displaying the tool call twice: once as literal text and +> once as a structured tool block. + +> **Q4. Should `finish_reason` bump to "tool_calls" only when ≥1 tool survives `tool_allowed`, or whenever `parse_tool_calls` matches anything?** +> **Only when ≥1 tool survives `tool_allowed`.** The proposed plan +> lacks a check against the allowed tools list before setting `fr = +> "tool_calls"`. If the model emits a call that the server has +> disabled, bumping the `finish_reason` to `tool_calls` without an +> actual valid tool in the payload is a protocol violation that can +> crash or confuse SDK consumers. + +> **VERDICT: REVISE.** The plan must explicitly filter +> `parsed.tool_calls` against the `tool_allowed` list before updating +> `tool_calls_` and `fr` to ensure `finish_reason` accurately reflects +> authorized tool usage. + +### Response (integration / rebuttal) + +- **Q1 integrated** — keep the substring pre-check as specified in §3.2. + Codex agrees regex would be overkill at finalize. No plan change. +- **Q2 integrated** — proceed with Option A (apply fix to both streaming + and non-streaming). The wire shape `text-delta + tool_use` is legal + in both Anthropic and OpenAI specs per Codex. No plan change beyond + §4.6 explicit confirmation. +- **Q3 integrated with new test** — Codex's "residue" hazard is real if + `parse_tool_calls.cleaned_text` ever fails to strip a span. The + parser tests already cover this for Pattern 5 (see + `test_parse_call_verb_cleaned_text` in `test_server_unit.cpp`). + Adding §7 emitter-level test `test_emitter_content_mode_strips_call_span_from_accumulated_text` + to specifically guard the emitter wiring: assert that + `em.accumulated_text()` does NOT contain the substring `call:` after + finalize when ≥1 call was hoisted. This is a wiring regression test + for the `accumulated_content_ = parsed.cleaned_text` line. +- **Q4 rebuttal** — Codex flags a missing `tool_allowed` filter; this + is actually already enforced by `parse_tool_calls`'s internal + `add_call` lambda at `tool_parser.cpp:452` + (`if (!tool_allowed(tools, fn_name)) return;`). Calls failing the + allowlist are dropped before they reach `result.tool_calls`. The + plan's trigger condition is `if (!parsed.tool_calls.empty())`, so + `fr = "tool_calls"` is set only when ≥1 allow-listed call survived. + This matches Codex's recommendation; the perceived gap was the + reviewer not knowing about `parse_tool_calls`'s internal filter. + Documenting this explicitly in §3.5 for future readers. + +VERDICT was REVISE; revisions applied are §7 new test +(`strips_call_span_from_accumulated_text`) and §3.5 clarification on +the existing `tool_allowed` enforcement inside `parse_tool_calls`. +Proceed to implementation. + +## 6. Implementation outline (post-codex) + +1. Add `looks_like_plain_text_call` static helper in + `sse_emitter.cpp` (anonymous namespace next to `has_request_tools`). +2. Add the CONTENT-mode `else if` branch inside `emit_finish` between + the existing TOOL_BUFFER block (line 512-617) and the format-specific + final events switch (line 620). +3. Refactor the format-specific tool-call event emission inside the + TOOL_BUFFER branch into a private member `emit_tool_call_events(out)` + so both branches share the implementation. Keeps line counts down + and avoids drift between the two paths. +4. Verify `accumulated_content_` post-mutation is consumed by the + format-specific final events switch at line 620 (Responses uses + `accumulated_content_` in `response.output_text.done`, + `response.content_part.done`, and `final_output` at lines 681-712). +5. No header signature changes; the new helper is `static`. + +## 7. Tests (`server/test/test_server_unit.cpp`) + +Mirror the style of `test_emitter_tool_buffer_detection`, +`test_emitter_bare_function_tool_buffer_detection`, and +`test_emitter_no_tools_keeps_tool_like_text`. Add: + +- `test_emitter_content_mode_plain_text_call_parsed` — feed + `"I'll fetch it. call:get_weather{location: \"SF\"}"` to a CONTENT-mode + emitter with `weather_tools()`. Assert: 1 ToolCall named `get_weather` + with args `{location: "SF"}`, `accumulated_text()` no longer contains + `call:get_weather{`, OpenAI finish_reason chunk shows `"tool_calls"`. +- `test_emitter_content_mode_no_tools_skips_plain_text_call` — same + input, but empty tools array. Assert: no ToolCall, the call: text + remains in `accumulated_text()`. +- `test_emitter_content_mode_underscore_prefix_call_parsed` — feed + `"_call:get_weather{location: \"NYC\"}"`. Assert: ToolCall emitted + (regression for the `_call:` artifact from commit `004a81b`). +- `test_emitter_content_mode_no_call_substring_skips_parser` — feed + `"Plain prose with no tool invocations at all."`. Assert: no + ToolCall, accumulated text unchanged, `finish_reason()` is `"stop"`. +- `test_emitter_content_mode_mixed_calls_multiple` — feed + `"start. call:get_weather{location: \"A\"} middle. call:get_weather{location: \"B\"} end."`. + Assert: 2 ToolCalls in order with the two locations; accumulated + text contains `"start."`, `"middle."`, `"end."` (call spans + stripped); no leakage of `call:`. +- `test_emitter_content_mode_malformed_call_dropped` — feed + `"call:get_weather{unclosed"`. Assert: no ToolCall, no crash, the + malformed text remains in `accumulated_text()` (no panic). +- `test_emitter_content_mode_does_not_double_fire_on_tool_call_xml` — + regression guard. Feed + `"\n\nSF\n\n"`. + Assert: exactly 1 ToolCall (TOOL_BUFFER path handled it, new branch + did not double-emit). +- `test_emitter_content_mode_strips_call_span_from_accumulated_text` — + Codex Q3 residue-hazard guard. Feed + `"prefix call:get_weather{location: \"SF\"} suffix"` with weather + tools. Assert: `em.accumulated_text().find("call:")` returns npos + (the matched span is stripped from the visible content). Without + this guard the emitter could double-display the call (once as + literal text, once as a `tool_use` block). + +Register each test with `RUN_TEST(...)` inside `main()` around the +existing `test_emitter_*` block (~line 3549-3560 of the file). + +## 8. Don't break (regression matrix) + +| Scenario | Path | Expected | Test ref | +|----------|------|----------|----------| +| `...` XML | TOOL_BUFFER | 1 ToolCall, no double-fire | new `does_not_double_fire` test | +| `` bare XML | TOOL_BUFFER | 1 ToolCall | existing `test_emitter_bare_function_tool_buffer_detection` | +| `{json}` | TOOL_BUFFER | 1 ToolCall | existing tool_parser tests cover the parser path | +| Plain prose (no tools) | CONTENT | text preserved | existing `test_emitter_content_only_no_thinking` | +| `call:foo{...}` + tools | **new** | 1 ToolCall, text stripped | new `content_mode_plain_text_call_parsed` | +| `call:foo{...}` no tools | **new** | text preserved, no ToolCall | new `content_mode_no_tools_skips_plain_text_call` | +| Malformed `call:foo{unclosed` | **new** | text preserved, no crash | new `content_mode_malformed_call_dropped` | +| `accumulated_text()` for OpenAI Chat | both | visible text minus call spans | new `content_mode_plain_text_call_parsed` | + +## 9. PR shape + +- Branch: `fix/sse-emitter-content-mode-tool-parse` off `origin/main@8305b6c`. +- Two commits: + 1. `docs(experiments): plan SSE emitter CONTENT-mode tool parse` (this file). + 2. `fix(server): run parse_tool_calls on CONTENT-mode accumulated text` (impl + tests). +- Push to `easel:fix/sse-emitter-content-mode-tool-parse`. +- PR base: `Luce-Org/lucebox-hub:main`. + +PR body must include: +- Diagnosis summary (verbatim from §1). +- Empirical signal: smoke against fac7e0f-cuda12 on bragi returning + `stop_reason: end_turn` for plain-text `_call:get_country_info{...}`. +- Test count delta. +- Streaming scope decision (option A vs B from §4.6, post-Codex). +- Known limitations. + +## 10. Open questions deferred to Codex + +- Whether the streaming wire shape `text-delta + post-hoc tool_use` + breaks Cline / open-webui / Anthropic SDK consumers. +- Whether `accumulated_raw_` (used for `tool_memory_->remember`) + should be cleaned too — leaning no, since `tool_memory_` keeps the + pre-strip raw for ID-replay matching. From 80552014eb9c24ab87d6327a70df8b6006d8cce5 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Sun, 31 May 2026 11:34:28 -0400 Subject: [PATCH 2/6] docs(server): plan for call:{} tool-parser pattern + codex review Captures the diagnosis (gemma forge 0/30 on 2026-05-30), the proposed sixth detection pattern, the relaxed-JSON arg parser sketch, the unit-test matrix, and codex's review (which forced reordering the new pattern to slot #5 ahead of the bare-JSON sweep to avoid interception of nested name/arguments-shaped args). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../server-call-verb-tool-parser-plan.md | 434 ++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 docs/experiments/server-call-verb-tool-parser-plan.md diff --git a/docs/experiments/server-call-verb-tool-parser-plan.md b/docs/experiments/server-call-verb-tool-parser-plan.md new file mode 100644 index 00000000..3c09db3b --- /dev/null +++ b/docs/experiments/server-call-verb-tool-parser-plan.md @@ -0,0 +1,434 @@ +# Server-side `call:{args}` tool-call parser (pattern #6) + +Status: draft → codex review → implementation. +Tracking branch: `fix/server-call-verb-tool-parser` (off `origin/main`). + +## Background + +The 2026-05-30 gemma full bench scored forge **0/30** on RTX 3090 Ti +(`d9ecba6cc105-nvidia-geforce-rtx-3090-ti-gemma-full-2026-05-30-67f4/forge.json`). +Every row’s `iterations[0].output` shows the assistant emitting +plain-text tool invocations of the form: + +- `call:get_country_info{country: "France"}` +- `call:default_api:fetch_sales_data{quarter: "Q4", year: 2024}` +- `call:execute-bead:read-file{path: "crates/foo/src/lib.rs"}` +- `call:execute-bead:list-files{path: "src/"}\n\ncall:execute-bead:read-file{path: "..."}` +- `call:execute-bead:read_file{path: "..."}` +- `call:shell{command: "rg -i auth"}` +- `call:infrastructure:get_logs{resource: \`payments-service-cluster\`, start_time: \`...\`}` + (backtick-quoted values appear in the snapshot too) + +`server/src/server/tool_parser.cpp::parse_tool_calls` has five +envelope-shaped detection patterns (XML ``, bare +``, ``, ``, +bare JSON `{ "name":…, "arguments":… }`). **None** of these match the +gemma plain-text emission. As a result: + +1. `SseEmitter::accumulate` buffers text, calls `parse_tool_calls`, + gets back zero `ToolCalls`, never flips `finish_reason` to + `tool_calls`. +2. `/v1/messages` (Anthropic) sees an empty `emitter.tool_calls()` and + emits a `text` content block instead of a `tool_use` block + (`server/src/server/http_server.cpp:2030–2090`). +3. forge’s WorkflowRunner expects a `ToolCall` and the row dies with + `error_type=ValidationError`. + +A client-side workaround already shipped on `feat/lucebox-docker` +(commit `deba2fd`) — see `_parse_plain_text_tool_calls` in +`luce-bench/src/lucebench/areas/forge.py`. **This PR ports that to +C++ and adds it as a sixth `parse_tool_calls` pattern**, fixing the +problem at the server. + +## Goals + +1. Add pattern #6 to `parse_tool_calls`: `call:?{ }`. +2. Drive the existing downstream pipeline: + `SseEmitter::accumulate` → `finish_reason=tool_calls` → Anthropic + `stop_reason=tool_use` with `tool_use` content blocks, or OpenAI + `choices[].message.tool_calls`. Nothing downstream needs to change. +3. Honour the existing `tool_allowed(tools, name)` filter so callers + that pass a constrained tool list (forge) only get back tools they + declared. +4. Land C++ unit tests in `server/test/test_server_unit.cpp` covering + the gemma-observed shapes plus the obvious edge cases. + +## Non-goals (intentionally out of scope) + +- Touching the client-side `_parse_plain_text_tool_calls` in + `forge.py`. It stays as defense-in-depth — older deployed servers + won’t have this fix. The PR description will note that the client + fallback is no longer load-bearing after this merges. +- Rebuilding the docker image / running an e2e bench. The unit tests + + a deliberately constructed `parse_tool_calls` call exercising every + shape are sufficient for PR review. +- Touching `systemd`, the host `/home/erik/.local/bin/lucebox` + wrapper, the running `lucebox.service` (a forge bench is in flight), + or anything outside `server/src/server/tool_parser.cpp` + + `server/test/test_server_unit.cpp`. + +## Design + +### Pattern ordering + +Insert as **pattern #5** (one slot ahead of the existing bare-JSON +sweep, which becomes the new pattern #6). The four XML-envelope +patterns are lexically unambiguous and stay ahead. The reorder is +forced by a real interception hazard codex flagged in review: +`call:my_tool{"name": "inner_tool", "arguments": {}}` is a valid +gemma-shaped emission, and the bare-JSON sweep would otherwise lift +the inner `{"name": ...}` out as its own ToolCall before the new +pattern got a chance. By running the `call:…{…}` matcher first and +recording its brace-balanced span in `removals`, the bare-JSON sweep +sees that range as overlapping and skips it. + +### Pattern #6 regex + +```cpp +// `call:` opener: optionally preceded by start-of-string, +// whitespace, or a hard delimiter so we don't match `I'll call:foo` +// inside narrative. +static const std::regex re_call_verb_open( + R"((^|[\s,;:\(\[\{])call:([A-Za-z0-9_.:\-]+)\s*\{)"); +``` + +Notes: + +- The leading group captures a sentinel character so we keep + `it->position()` aligned with the start of the **prefix**, not the + `call:` itself; the implementation computes the actual `call:` + offset by adding the captured sentinel’s length back. +- `[A-Za-z0-9_.:\-]+` matches verbs *and* embedded namespaces in one + capture (`execute-bead:read-file`, `default_api:fetch_sales_data`, + bare `shell`). The verb-name passed to `add_call` strips everything + up to and including the last `:`, so `execute-bead:read-file` + becomes `read-file`. +- The `\s*` before `{` is intentional — the snapshot rows always emit + `{` immediately, but tolerating optional whitespace doesn’t open new + failure modes and matches the relaxed pattern from the Python port. +- `^` works on `std::regex` ECMAScript syntax against `text` from + position 0; we do not rely on `multiline` mode. Subsequent matches + rely on the explicit sentinel-character alternation, which catches + newlines, spaces, commas, parens, square brackets, and curly braces + — every realistic boundary in the snapshot data. + +### Balanced-brace extractor + +Mirror pattern #5’s scanner — track depth, skip over string literals, +**but** extend the string-literal handling to honour single quotes +and backticks (the snapshot row above has backticked values). This +diverges slightly from the Python port (which only knew `'` and `"`) +to handle the real gemma output we observed. + +```cpp +// Returns one-past-close index, or std::string::npos. +static size_t balanced_braces_end(const std::string & text, size_t open) { + int depth = 0; + char in_str = 0; // 0 or one of " ' ` + for (size_t i = open; i < text.size(); i++) { + char c = text[i]; + if (in_str) { + if (c == '\\' && i + 1 < text.size()) { i++; continue; } + if (c == in_str) in_str = 0; + continue; + } + if (c == '"' || c == '\'' || c == '`') { in_str = c; continue; } + if (c == '{') depth++; + else if (c == '}') { + if (--depth == 0) return i + 1; + } + } + return std::string::npos; +} +``` + +### Relaxed-JSON args parser + +Direct port of `_coerce_relaxed_json` from `forge.py`: + +1. Try `json::parse(strict)` on `"{" + body + "}"` first. Most + strict-JSON args (`{"path": "src/"}`) succeed here. +2. On failure, rewrite the body: walk char-by-char tracking string + state; when outside a string, look ahead for a bare-identifier + `[A-Za-z_][A-Za-z0-9_]*` immediately followed by optional + whitespace and `:` — wrap the identifier in double quotes. Also + normalize single-quoted strings to double-quoted (open `'` becomes + `"`; closing `'` becomes `"`; backtick strings get the same + treatment). Backslash escapes are preserved. +3. Retry `json::parse` on the rewrite. If it still fails, **drop the + single invocation** (do not throw, do not add a `ToolCall`, do not + poison `removals`). Continue scanning past the closing brace. + +The rewrite step has to leave already-quoted JSON keys alone. The +look-ahead is `(?!").+:` — i.e. only fire if we’re not already +sitting just after a `"` we already emitted. We track this via the +`out` buffer’s last char, exactly as the Python implementation does. + +### Verb normalization + +```cpp +std::string verb = full_match; // e.g. "execute-bead:read-file" +size_t colon = verb.find_last_of(':'); +if (colon != std::string::npos) verb = verb.substr(colon + 1); +``` + +`add_call(verb, args, span_start, span_end)` then passes through the +existing `tool_allowed` filter. + +### Span tracking + +When a successful invocation is parsed, push `{start, brace_end}` onto +`removals` so the surrounding cleanup pass strips the `call:…{…}` text +out of the assistant message (otherwise the OpenAI/Anthropic content +block would carry the literal `call:foo{…}` alongside the structured +tool call — the same double-signal hazard the Python port avoids via +`_strip_plain_text_tool_calls`). + +The `start` recorded in `removals` is the index of `c` in `call:` (not +the preceding sentinel) so we don’t eat the user’s narrative space. + +### Tool-allowed filter placement + +Use the existing `tool_allowed(tools, verb)` helper inside +`add_call` (same path as patterns #1–5). Dropping happens **after** +regex match + JSON parse — this is cheap and consistent. Constraining +the regex to a known verb set would have been faster but it precludes +the “no tools constraint” case (forge passes a tool list; agent and +codex sometimes don’t). + +## Regression safety inventory + +Existing tool-parser tests in `server/test/test_server_unit.cpp` +(lines 239–317): + +| Test | Body contains `call:` substring? | +|---|---| +| `test_parse_tool_call_xml` | no | +| `test_parse_bare_function_xml` | no | +| `test_parse_json_tool_call` | no | +| `test_parse_no_tools` | no | +| `test_parse_tool_code_wrapper` | no | +| `test_parse_tool_allowed_filter` | no | + +Also the `emitter_*` family (lines 470–700+) uses `` XML +exclusively. No existing test has a `call:{` substring that +could accidentally match the new pattern. Safe. + +## Test plan + +Add to `server/test/test_server_unit.cpp` (the same file already +covers tool-parser unit tests; convention is one function per case +registered under the `── Tool parser ──` section). New cases: + +1. `test_parse_call_verb_single` — `call:get_country_info{country: "France"}` +2. `test_parse_call_verb_back_to_back` — + `call:get_country_info{country: "France"}call:summarize{text: "ok"}` +3. `test_parse_call_verb_namespaced` — + `call:execute-bead:read-file{path: "crates/foo/src/lib.rs"}` → verb + normalized to `read-file` +4. `test_parse_call_verb_snake_and_hyphen` — covers both + `call:execute-bead:read_file{path: "..."}` and + `call:execute-bead:list-files{path: "src/"}` with intervening + newlines +5. `test_parse_call_verb_tool_allowed_filter` — verb `disallowed` in + text, `tools` allows only `allowed`; result has zero `ToolCall` +6. `test_parse_call_verb_inline_prose_rejected` — + `"Sure, I'll call:foo{x:1}"` *should* still match because the + space before `call:` satisfies the sentinel. To exercise the + anti-false-positive case, use `"narrative.call:foo{x:1}"` — + no sentinel char before `call:`, regex rejects. +7. `test_parse_call_verb_malformed_args` — + `call:foo{country: "France"` (unterminated brace) → call dropped, + no crash, no `ToolCall`, no removal span +8. `test_parse_call_verb_inner_brace_in_string` — + `call:foo{cmd: "echo {} ok"}` → must not break on the `{` inside + the string; args dict has `cmd == "echo {} ok"` +9. `test_parse_call_verb_strict_json_args` — + `call:foo{"path": "x"}` (already-quoted keys) parses on the strict + path +10. `test_parse_call_verb_unquoted_keys` — relaxed pass kicks in: + `call:foo{path: "x"}` → args dict has `path == "x"` +11. `test_parse_call_verb_cleaned_text` — verify the `call:…{…}` + fragment is stripped from `result.cleaned_text` (parity with the + XML envelope patterns). +12. `test_parse_call_verb_intercept_inner_json` (codex-requested) — + `call:outer{"name": "inner", "arguments": {}}`. Exactly one + `ToolCall` with `name == "outer"` and `arguments` containing the + literal inner JSON; verifies that the reorder of patterns #5/#6 + actually defends against the bare-JSON sweep stealing the inner + object. + +These run under the existing `RUN_TEST(...)` block; add them under +the `── Tool parser ──` heading immediately after +`test_parse_tool_allowed_filter`. + +### Build verification + +The new code lives entirely in `server/src/server/tool_parser.cpp` — +already in `target_sources` for `test_server_unit` +(`server/CMakeLists.txt:776–788`). Build with: + +```bash +cmake --build server/build --target test_server_unit && \ + server/build/test_server_unit +``` + +(If the lucebox-hub-285 worktree doesn’t have a build dir, configure +it the same way the repo expects — but a clean build is the operator’s +job, not part of the PR landing process.) + +## Reverse-compat with the client-side fix + +`luce-bench/src/lucebench/areas/forge.py::_parse_plain_text_tool_calls` +(commit `deba2fd` on `feat/lucebox-docker`) stays untouched. After +this PR merges and the docker image is rebuilt, the client-side path +becomes a no-op for any server that includes pattern #6. Leaving it +in place is harmless (the synthesized `ToolCall`s match what the +server now produces) and shields older deployed servers. The PR +description will spell this out so a reviewer doesn’t mistakenly +strip the Python fallback. + +## Image rebuild + e2e (out of scope for this PR) + +After merge, the operator should: + +1. Rebuild the cuda12 image (`docker bake` per `docker-bake.hcl`). +2. Restart `lucebox.service` *after the current forge bench finishes*. +3. Re-run the 2026-05-30 gemma full bench and confirm forge passes + non-trivially without the client-side `_parse_plain_text_tool_calls` + workaround. + +These steps are validation, not part of the PR diff. + +## Risks / open questions + +- **Multiline args.** The snapshot shows args that span multiple + lines (`call:default_api:analyze_data{data: [{"date": ...}, ...]}`). + The balanced-brace scanner is line-agnostic so this Just Works — + noted here to make the implementer audit at least one multi-line + case in the unit tests. +- **Backticked values.** `infrastructure:get_logs{resource: \`x\`}` + uses backticks. JSON has no backtick-string concept; the relaxed + rewrite normalizes them to double-quoted strings. This is one place + the C++ port goes **further** than the Python port — that’s a + conscious choice based on the real snapshot. +- **Performance.** `parse_tool_calls` is already O(n·patterns); pattern + #6 adds a single regex sweep plus per-match work. The buffer fed + through `SseEmitter::accumulate` is bounded by generation length; + no realistic perf concern. + +## Codex review + +Captured verbatim from `codex exec` against the running lucebox +server (`http://localhost:8080/v1` / `dflash` model). The final +answer block follows; the codex run appended additional +stream-of-consciousness scratch after the answer (an artefact of the +model’s think channel leaking past the answer) — that scratch is +omitted as noise. + +``` +This is a high-quality implementation plan. The approach to handling +"dirty" JSON via a manual balanced-brace scanner and a +relaxation/normalization pass is a pragmatic and necessary solution +for the non-standard emissions observed in Gemma models. + +However, there is one **critical architectural risk** regarding +pattern priority and one minor discrepancy in the brace-tracking +logic. + +### Critical: Pattern Priority & Interception +Issue: Pattern #5 (Bare JSON sweep) is currently ordered before +Pattern #6 (the new `call:verb{...}` pattern). +Risk: If a tool call's arguments happen to contain keys named +`"name"` and `"arguments"`, Pattern #5 will intercept the substring +and incorrectly identify a tool call. +* Example: `call:my_tool{"name": "inner_tool", "arguments": {}}` +* Outcome: Pattern #5 will match the `{...}` part, creating a + ToolCall for `inner_tool`. Because Pattern #5 is processed first, + it will add a removal span for the inner JSON, potentially leaving + the `call:my_tool` prefix dangling in the cleaned_text or causing + Pattern #6 to fail/overlap. +* Recommendation: Move Pattern #6 to a higher priority (e.g., before + Pattern #5). The `call:` prefix is a much stronger semantic signal + than the presence of JSON keys. + +### Technical Review + +1. Regex & Anchors +* The sentinel list `[\s,;:\(\[\{]` is excellent. It effectively + prevents false positives like `I'll call:foo`. +* Note on `^`: In `std::regex`, `^` matches the start of the string. + While `std::regex` lacks a standard multiline flag, your inclusion + of `\s` (which covers `\n`) in the sentinel list makes the pattern + robust for tool calls starting on new lines. + +2. Robustness of the Parser +* Balanced Braces: Your `balanced_braces` logic correctly handles + string literals (including escaped quotes) and nested braces. +* Discrepancy: The plan mentions tracking `[]` (square brackets), but + the provided implementation snippet only increments depth for `{`. + While tracking `[]` isn't strictly required to find the outer `{}` + of a JSON object, you should ensure it's implemented if you intend + to follow the plan's promise, especially to handle edge cases where + a `}` might appear inside an array (though unlikely in valid JSON). +* Relaxed JSON: The "rewrite" strategy (normalizing `'` and `` ` `` to + `"`, and wrapping unquoted keys) is a solid way to handle the + observed "relaxed" syntax. + +3. Filter Placement +* The `tool_allowed` filter is correctly placed after the parser has + successfully identified the verb and arguments. Constraining the + regex itself to a known verb list would be a maintenance burden. + +4. Unit Tests +* The test plan is very thorough. To make it bulletproof, I recommend + adding these two specific cases: + * Nested Tool Call: `call:outer{"name": "inner", "arguments": {}}` + — specifically to verify that the priority fix (Point 1) works. + * Complex String Content: `call:foo{cmd: "echo {not_a_brace}"}` — + to ensure the string-aware scanner doesn't trip on braces + inside quotes. + +### Summary of Recommendations +1. Reorder patterns: Move Pattern #6 above Pattern #5. +2. Verify `[]` implementation: Ensure the brace scanner actually + tracks `[` and `]` to match your design spec. +3. Add "Interception" test: Add a unit test where tool arguments + mimic a JSON tool call to confirm the priority fix. +``` + +### Plan adjustments after review + +1. **Pattern ordering — accepted.** Codex’s `call:my_tool{"name": + "inner_tool", "arguments": {}}` example is real: gemma snapshot + rows in the snapshot include `call:default_api:analyze_data{data: + [{...}, ...]}` where the inner objects don’t happen to carry + `name`/`arguments` keys today, but accepting that hazard for + future-proofing is cheap. The new pattern moves to **#5**, demoting + the bare-JSON sweep to **#6**. The five XML-envelope patterns stay + ahead since they’re lexically unambiguous and pattern #6 would also + spuriously chew their inner JSON if reordered. +2. **Square-bracket tracking — accepted with a clarification.** I + don’t need `[`/`]` in the brace-depth counter for closing the outer + `{`, because by the time we hit a `]` without `[` we’re already + inside a string (handled) or the JSON is malformed (we drop the + call). I will **still** add explicit `[`/`]` tracking to the + scanner so a stray `}` *inside* an array (e.g. a JSON value like + `"}"` written without quotes) doesn’t fool the close-counter. This + matches the spirit of the plan’s prose. +3. **Interception unit test — accepted.** Add + `test_parse_call_verb_intercept_inner_json` with body + `call:outer{"name": "inner", "arguments": {}}`. Verify the + resulting `ToolCall` has `name == "outer"` and `arguments == + {"name": "inner", "arguments": {}}` — and that there’s exactly + **one** `ToolCall`, not two. +4. **“Complex string content” test — accepted.** Add + `test_parse_call_verb_string_with_close_brace` with + `call:foo{cmd: "echo {not_a_brace}"}`. Already partially covered + by `test_parse_call_verb_inner_brace_in_string`; will reuse and + strengthen that test rather than add a new one. + +No rebuttals — codex’s feedback is all sound. Renumber: the new +pattern is **pattern #5**, the old bare-JSON sweep becomes **pattern +#6**. Update test names, header comments, and the file-top docstring +accordingly during implementation. From d67a26985cd456a27812733cd8eb87f010d69937 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Sun, 31 May 2026 11:38:18 -0400 Subject: [PATCH 3/6] fix(server): parse gemma's call:{} plain-text tool emissions Adds a sixth detection pattern to `parse_tool_calls` that recognizes the plain-text tool invocations gemma emits in chat-completion content (`call:get_country_info{country: "France"}` / `call:execute-bead:read-file{path: "..."}` / etc). The 2026-05-30 gemma full bench scored forge 0/30 because every row's output carried these `call:{...}` invocations as text rather than structured `tool_use` content blocks. None of the existing five envelope-shaped detectors (``, ``, ``, bare JSON) match the bare `call:` shape. The new pattern: - Anchors on a sentinel character (whitespace, comma, semicolon, open/close bracket, etc.) before `call:` so narrative usages like `narrative.call:foo` don't match. - Supports namespaced verbs (`execute-bead:read-file`, `default_api:fetch_sales_data`) and strips the namespace before using the verb as the ToolCall name. - Extracts the args block via a quote- and escape-aware balanced-brace scanner that tolerates `"`, `'`, and `` ` `` string literals and tracks `[]` depth alongside `{}`. - Parses the args as strict JSON first, then falls back to a relaxed rewrite that quotes bare identifier keys and normalizes single/ backtick quoted strings to double-quoted before retrying. Malformed args drop the single invocation without crashing or polluting other calls. - Runs *before* the bare-JSON sweep so that inner args of the form `call:outer{"name": "inner", "arguments": {}}` aren't hijacked into a spurious `inner` ToolCall by pattern #6. Downstream the existing wiring takes over: SseEmitter::accumulate already calls parse_tool_calls; a non-empty ToolCall list flips finish_reason to `tool_calls`, which the Anthropic /v1/messages branch maps to `stop_reason="tool_use"` with `tool_use` content blocks (http_server.cpp:2030-2090) and the OpenAI branch maps to `choices[].message.tool_calls`. The forge client-side workaround `_parse_plain_text_tool_calls` shipping on feat/lucebox-docker (commit deba2fd) becomes redundant once a server with this fix is deployed. It stays in place as defense-in-depth for older deployed servers. Test plan: 14 new C++ unit cases in test_server_unit.cpp covering single / back-to-back / namespaced / snake- and kebab-case verbs; tool-allowed filtering; mid-prose rejection vs. whitespace-led acceptance; malformed args drop; inner `{}` inside string literals; strict-JSON and relaxed-keys arg parsing; cleaned_text scrubbing; the codex-requested inner `name`/`arguments` interception case; and multi-line nested-array args mirroring the snapshot data. All pass in a standalone driver. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/server/tool_parser.cpp | 170 +++++++++++++++++++++++++++- server/src/server/tool_parser.h | 5 +- server/test/test_server_unit.cpp | 180 ++++++++++++++++++++++++++++++ 3 files changed, 350 insertions(+), 5 deletions(-) diff --git a/server/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp index 6244b250..43bf2b1c 100644 --- a/server/src/server/tool_parser.cpp +++ b/server/src/server/tool_parser.cpp @@ -1,11 +1,18 @@ // Tool call parser implementation. // -// Five detection patterns, tried in order: +// Six detection patterns, tried in order: // 1. ...V... // 2. ...params... (bare, outside tool_call) // 3. (function-signature style) // 4. {JSON} -// 5. Bare JSON objects with name+arguments fields +// 5. call:?{relaxed-JSON args} (gemma plain-text emissions) +// 6. Bare JSON objects with name+arguments fields +// +// Pattern 5 runs *before* pattern 6 so that args like +// call:outer{"name": "inner", "arguments": {}} +// don't get hijacked by the bare-JSON sweep into a spurious `inner` tool +// call. The brace-balanced span pattern 5 records in `removals` shadows +// the inner JSON from pattern 6's view via `overlaps()`. #include "tool_parser.h" @@ -161,6 +168,125 @@ static const std::regex & re_tool_code() { return r; } +// Pattern 5: `call:?{` opener. The sentinel alternation in front +// rejects narrative usages like "I'll call:foo{x:1}" where `call:` is glued +// to a preceding word — whitespace, common punctuation, and open/close +// brackets are the realistic boundaries seen in the snapshot data. `\s` +// covers `\n` so a `call:` at the start of any line is matched without +// relying on std::regex multiline support (which is non-portable). +// +// Note that `}` is in the sentinel list — gemma frequently emits multiple +// invocations back-to-back: `call:a{x:1}call:b{y:2}`. Without `}` as a +// sentinel the second match would be missed. +static const std::regex & re_call_verb_open() { + static std::regex r(R"((^|[\s,;:\(\[\{\}\)\]\>])call:([A-Za-z0-9_.:\-]+)\s*\{)"); + return r; +} + +// Find the index one past the `}` that matches `text[open] == '{'`. +// Respects nested {}/[] depth and skips over "..." / '...' / `...` +// string literals (with backslash escapes). Returns std::string::npos if +// no matching close is found. +static size_t balanced_braces_end(const std::string & text, size_t open) { + int depth = 0; + char in_str = 0; // 0, or one of '"', '\'', '`' + for (size_t i = open; i < text.size(); i++) { + char c = text[i]; + if (in_str) { + if (c == '\\' && i + 1 < text.size()) { i++; continue; } + if (c == in_str) in_str = 0; + continue; + } + if (c == '"' || c == '\'' || c == '`') { in_str = c; continue; } + if (c == '{' || c == '[') { + depth++; + } else if (c == '}' || c == ']') { + depth--; + if (depth == 0 && c == '}') return i + 1; + if (depth < 0) return std::string::npos; + } + } + return std::string::npos; +} + +// Try strict json::parse first; on failure rewrite single- and +// backtick-quoted strings to double-quoted, wrap bare identifier keys +// in double quotes, and retry. Returns true and populates `out` on +// success; returns false on irrecoverable failure (and `out` is unset). +// +// The rewrite walks the buffer char-by-char tracking string state so it +// doesn't mangle identifiers that live inside string values. +static bool coerce_relaxed_json(const std::string & payload, json & out) { + { + json parsed = json::parse(payload, nullptr, false); + if (!parsed.is_discarded()) { + out = std::move(parsed); + return true; + } + } + + // Permissive pass. + static const std::regex re_bare_key(R"(([A-Za-z_][A-Za-z0-9_]*)(\s*:))"); + + std::string rewritten; + rewritten.reserve(payload.size() + 16); + char in_str = 0; // 0, or the *opening* quote we saw + for (size_t i = 0; i < payload.size(); ) { + char c = payload[i]; + if (in_str) { + // Inside a string we already opened. Mirror escapes verbatim. + if (c == '\\' && i + 1 < payload.size()) { + rewritten += c; + rewritten += payload[i + 1]; + i += 2; + continue; + } + if (c == in_str) { + // Close — always emit a double-quote regardless of which + // quote style opened the string. The opening side already + // emitted a `"`. + rewritten += '"'; + in_str = 0; + i++; + continue; + } + rewritten += c; + i++; + continue; + } + if (c == '"' || c == '\'' || c == '`') { + rewritten += '"'; + in_str = c; + i++; + continue; + } + // Try to match a bare-key identifier here. Don't fire if the + // previous emitted char is `"` — that would indicate we're sitting + // right after a JSON string boundary and the "identifier" is + // probably part of a value continuation (e.g. `"k": foo: 1` would + // be malformed JSON anyway, but better to leave it untouched). + std::smatch m; + std::string tail = payload.substr(i); + if (std::regex_search(tail, m, re_bare_key, + std::regex_constants::match_continuous) && + (rewritten.empty() || rewritten.back() != '"')) { + rewritten += '"'; + rewritten += m[1].str(); + rewritten += '"'; + rewritten += m[2].str(); + i += m.length(); + continue; + } + rewritten += c; + i++; + } + + json parsed = json::parse(rewritten, nullptr, false); + if (parsed.is_discarded()) return false; + out = std::move(parsed); + return true; +} + // ─── XML parameter parser ─────────────────────────────────────────────── static json parse_xml_params(const std::string & region, const std::string & fn_name, @@ -397,7 +523,45 @@ ToolParseResult parse_tool_calls(const std::string & text, const json & tools) { } } - // Pattern 5: Bare JSON objects + // Pattern 5: call:?{relaxed-JSON args} + // + // Runs before the bare-JSON sweep so that inner JSON of the form + // call:outer{"name": "inner", "arguments": {}} + // doesn't get hijacked into a spurious `inner` ToolCall. + { + auto begin = std::sregex_iterator(text.begin(), text.end(), re_call_verb_open()); + auto end = std::sregex_iterator(); + for (auto it = begin; it != end; ++it) { + // Group 1: sentinel char (may be empty if matched at `^`). + // Group 2: full verb including any embedded namespaces. + size_t prefix_len = (*it)[1].matched ? (*it)[1].length() : 0; + size_t call_start = it->position() + prefix_len; + if (overlaps(removals, call_start)) continue; + + // The matched substring runs from call_start through the `{` + // (consuming the opener and any whitespace between verb and + // brace). Compute the brace index from the match end. + size_t brace_open = it->position() + it->length() - 1; + if (brace_open >= text.size() || text[brace_open] != '{') continue; + + size_t brace_close = balanced_braces_end(text, brace_open); + if (brace_close == std::string::npos) continue; + + std::string raw_args = text.substr(brace_open, brace_close - brace_open); + json args; + if (!coerce_relaxed_json(raw_args, args)) continue; + if (!args.is_object()) continue; + + std::string verb = (*it)[2].str(); + size_t colon = verb.find_last_of(':'); + if (colon != std::string::npos) verb = verb.substr(colon + 1); + if (verb.empty()) continue; + + add_call(verb, args, call_start, brace_close); + } + } + + // Pattern 6: Bare JSON objects { size_t cursor = 0; while (cursor < text.size()) { diff --git a/server/src/server/tool_parser.h b/server/src/server/tool_parser.h index 1ff9890a..b3bd4a79 100644 --- a/server/src/server/tool_parser.h +++ b/server/src/server/tool_parser.h @@ -1,11 +1,12 @@ // Tool call parser — extracts structured tool calls from generated text. // -// Supports 5 detection patterns: +// Supports 6 detection patterns: // 1. ... (Qwen XML) // 2. ... (bare function XML) // 3. (function signature) // 4. {...JSON...} (tool_code wrapper) -// 5. Bare JSON objects {"name":..., "arguments":...} (raw JSON) +// 5. call:?{relaxed-JSON-args} (gemma plain-text) +// 6. Bare JSON objects {"name":..., "arguments":...} (raw JSON) #pragma once diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index 275ec935..d0c9b291 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -316,6 +316,172 @@ static void test_parse_tool_allowed_filter() { TEST_ASSERT(result.tool_calls.empty()); } +// ─── Pattern 5: call:{relaxed-JSON args} (gemma plain-text) ───── + +static void test_parse_call_verb_single() { + std::string text = "call:get_country_info{country: \"France\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "get_country_info"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["country"] == "France"); + } + TEST_ASSERT(result.cleaned_text.find("call:") == std::string::npos); +} + +static void test_parse_call_verb_back_to_back() { + std::string text = + "call:get_country_info{country: \"France\"}" + "call:summarize{text: \"ok\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 2); + if (result.tool_calls.size() == 2) { + TEST_ASSERT(result.tool_calls[0].name == "get_country_info"); + TEST_ASSERT(result.tool_calls[1].name == "summarize"); + } +} + +static void test_parse_call_verb_namespaced() { + std::string text = "call:execute-bead:read-file{path: \"crates/foo/src/lib.rs\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + // Verb only — namespace stripped. + TEST_ASSERT(result.tool_calls[0].name == "read-file"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["path"] == "crates/foo/src/lib.rs"); + } +} + +static void test_parse_call_verb_snake_and_hyphen() { + std::string text = + "call:execute-bead:list-files{path: \"src/\"}\n\n" + "call:execute-bead:read_file{path: \"a/b.rs\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 2); + if (result.tool_calls.size() == 2) { + TEST_ASSERT(result.tool_calls[0].name == "list-files"); + TEST_ASSERT(result.tool_calls[1].name == "read_file"); + } +} + +static void test_parse_call_verb_tool_allowed_filter() { + std::string text = "call:disallowed_verb{x: 1}call:allowed_verb{y: 2}"; + json tools = json::array({ + {{"type", "function"}, {"function", {{"name", "allowed_verb"}}}} + }); + auto result = parse_tool_calls(text, tools); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "allowed_verb"); + } +} + +static void test_parse_call_verb_inline_prose_rejected() { + // No sentinel char before `call:` — must NOT match. + std::string text = "narrative.call:foo{x:1}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.empty()); +} + +static void test_parse_call_verb_inline_prose_after_space() { + // Whitespace IS a valid sentinel — this should match. + std::string text = "Sure, I'll call:foo{x: 1}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "foo"); + } +} + +static void test_parse_call_verb_malformed_args() { + // Unterminated brace — drop the call, don't crash. + std::string text = "call:foo{country: \"France\""; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.empty()); +} + +static void test_parse_call_verb_inner_brace_in_string() { + // The `{` and `}` inside the string value must not confuse the + // balanced-brace scanner. + std::string text = "call:foo{cmd: \"echo {not_a_brace} ok\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["cmd"] == "echo {not_a_brace} ok"); + } +} + +static void test_parse_call_verb_strict_json_args() { + // Strict-JSON path: keys already quoted. + std::string text = "call:foo{\"path\": \"x\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["path"] == "x"); + } +} + +static void test_parse_call_verb_unquoted_keys() { + // Relaxed-JSON path: bare keys get quoted. + std::string text = "call:foo{path: \"x\", count: 3}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["path"] == "x"); + TEST_ASSERT(args["count"] == 3); + } +} + +static void test_parse_call_verb_cleaned_text() { + // The matched span should be stripped from cleaned_text. + std::string text = "Hello call:foo{x: 1} world."; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + TEST_ASSERT(result.cleaned_text.find("call:") == std::string::npos); + TEST_ASSERT(result.cleaned_text.find("Hello") != std::string::npos); + TEST_ASSERT(result.cleaned_text.find("world.") != std::string::npos); +} + +static void test_parse_call_verb_intercept_inner_json() { + // Codex-requested: inner args of the form {"name": ..., "arguments": ...} + // must NOT be picked up by pattern 6 (bare-JSON sweep) as a spurious + // `inner` ToolCall. Exactly one ToolCall, named `outer`, with the + // inner JSON intact in its arguments. + std::string text = "call:outer{\"name\": \"inner\", \"arguments\": {}}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "outer"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["name"] == "inner"); + TEST_ASSERT(args["arguments"].is_object()); + } +} + +static void test_parse_call_verb_multiline_args() { + // Snapshot rows have multi-line nested args; the balanced-brace + // scanner is line-agnostic, so this must Just Work. + std::string text = + "call:default_api:analyze_data{\n" + " data: [{\"date\": \"2024-10-05\", \"qty\": 50}, {\"date\": \"2024-10-06\", \"qty\": 60}],\n" + " metric: \"qty\"\n" + "}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "analyze_data"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["metric"] == "qty"); + TEST_ASSERT(args["data"].is_array()); + TEST_ASSERT(args["data"].size() == 2); + } +} + // ═══════════════════════════════════════════════════════════════════════ // SSE Emitter tests // ═══════════════════════════════════════════════════════════════════════ @@ -2590,6 +2756,20 @@ int main() { RUN_TEST(test_parse_no_tools); RUN_TEST(test_parse_tool_code_wrapper); RUN_TEST(test_parse_tool_allowed_filter); + RUN_TEST(test_parse_call_verb_single); + RUN_TEST(test_parse_call_verb_back_to_back); + RUN_TEST(test_parse_call_verb_namespaced); + RUN_TEST(test_parse_call_verb_snake_and_hyphen); + RUN_TEST(test_parse_call_verb_tool_allowed_filter); + RUN_TEST(test_parse_call_verb_inline_prose_rejected); + RUN_TEST(test_parse_call_verb_inline_prose_after_space); + RUN_TEST(test_parse_call_verb_malformed_args); + RUN_TEST(test_parse_call_verb_inner_brace_in_string); + RUN_TEST(test_parse_call_verb_strict_json_args); + RUN_TEST(test_parse_call_verb_unquoted_keys); + RUN_TEST(test_parse_call_verb_cleaned_text); + RUN_TEST(test_parse_call_verb_intercept_inner_json); + RUN_TEST(test_parse_call_verb_multiline_args); std::fprintf(stderr, "\n── SSE Emitter ──\n"); RUN_TEST(test_emitter_reasoning_split_openai); From 80e6e2acb6d6077392909cb7d4187ad33098bed9 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Sun, 31 May 2026 21:26:07 -0400 Subject: [PATCH 4/6] fix(tool_parser): accept ``_call:`` prefix from gemma tokenizer artifact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smoke-testing the post-PR-#323 image (lucebox-hub:cuda12 @ 8039911) on sindri's gemma-4-26b revealed a new emission pattern: the model sometimes outputs ``_call:get_country_info{...}`` with a leading underscore. This is a SentencePiece / chat-template tokenizer artifact that became visible after bragi's channel-token routing fix (commit 4b757d1) — the underscore is residual from the tokenizer's internal serialization that earlier handling stripped. Both parsers missed these invocations: * Server-side (tool_parser.cpp:182): the sentinel character class ``[\s,;:\(\[\{\}\)\]\>]`` did not include ``_``. Added. * Client-side (forge.py:32): ``\bcall:`` requires a word boundary before ``call``, but ``_`` is a word char so ``\b`` doesn't fire between ``_`` and ``c``. Replaced with explicit lookbehind on the same sentinel set (including ``_``). Net result: ``_call:foo{...}`` now parses to a tool_use the same way ``call:foo{...}`` does. Tradeoff: ``my_call:foo{}`` mid-identifier would also match, but real model outputs don't emit free-form ``my_call:`` text (tool names come from request tool defs). Tests: +2 cases in test_forge_grader.py (underscore alone, mixed back-to-back with both prefixed and bare). 16 → 18 forge_grader tests, all green. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/server/tool_parser.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/server/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp index 43bf2b1c..a8d1518b 100644 --- a/server/src/server/tool_parser.cpp +++ b/server/src/server/tool_parser.cpp @@ -178,8 +178,17 @@ static const std::regex & re_tool_code() { // Note that `}` is in the sentinel list — gemma frequently emits multiple // invocations back-to-back: `call:a{x:1}call:b{y:2}`. Without `}` as a // sentinel the second match would be missed. +// +// `_` is also in the sentinel list to handle a SentencePiece / chat-template +// artifact: post-bragi-channel-routing (commit 4b757d1) the gemma server +// occasionally emits raw tokens like `_call:get_country_info{...}` where +// the leading `_` is residual tokenizer serialization. Without `_` here +// the parser misses every such invocation — empirically confirmed against +// gemma-4-26b 2026-05-31 smoke test. Tradeoff: `my_call:foo{}` mid- +// identifier could match, but real model output doesn't emit `my_call:` +// strings (tool names come from the request's tool definitions). static const std::regex & re_call_verb_open() { - static std::regex r(R"((^|[\s,;:\(\[\{\}\)\]\>])call:([A-Za-z0-9_.:\-]+)\s*\{)"); + static std::regex r(R"((^|[\s,;:\(\[\{\}\)\]\>_])call:([A-Za-z0-9_.:\-]+)\s*\{)"); return r; } From 8218333ba6b6691c1062e415eddfa3b1378dc861 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Mon, 1 Jun 2026 09:37:20 -0400 Subject: [PATCH 5/6] fix(server): wire sse_emitter to detect plain-text call:{} tools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #323's parser added pattern 6 (call:{...}) but the SSE emitter only invokes parse_tool_calls when mode_ == TOOL_BUFFER, which fires only on the literal XML opener. For models like gemma-4 that emit tool calls as plain text, the emitter stays in CONTENT mode and the parser is never called, so no tool_use blocks land in the response (finish_reason="stop" / stop_reason="end_turn"). Add a CONTENT-mode finalize branch that runs parse_tool_calls when the accumulated text contains a plausible `call:{` opener (checked via a cheap O(N) substring scan to avoid regex cost on no-tool responses). Matches are hoisted into tool_calls_, the covering spans are stripped from accumulated_text, and finish_reason flips to "tool_calls" so the existing Anthropic/OpenAI serialization paths emit proper tool_use content blocks. Pre-check accepts `_call:foo{` (SentencePiece underscore artifact) since `find("call:")` lands inside the `_call:` window — full validation is delegated to parse_tool_calls (tool_parser.cpp). Tests: +9 unit cases covering parsed/skipped/underscore/no-substring/ multi-call/malformed/no-double-fire-on-XML/empty-tools/preserving prior content paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/server/sse_emitter.cpp | 170 ++++++++++++++++++++++++ server/test/test_server_unit.cpp | 209 ++++++++++++++++++++++++++++++ 2 files changed, 379 insertions(+) diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp index 604f11a7..0441e5bb 100644 --- a/server/src/server/sse_emitter.cpp +++ b/server/src/server/sse_emitter.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,47 @@ static bool has_request_tools(const json & tools) { return tools.is_array() && !tools.empty(); } +// Cheap pre-check: scan `text` for a plausible `call:{` opener +// before invoking the full parse_tool_calls regex sweep. Mirrors the +// shape of re_call_verb_open() in tool_parser.cpp at a coarse +// granularity. Single O(N) pass with a substring skip; no heap alloc +// and no regex compile, so a response with no `call:` substring pays +// only a `find()` cost. +// +// Returns true if any `call:` occurrence is followed by an identifier +// start (`[A-Za-z_]`), more verb chars (`[A-Za-z0-9_.:-]`), optional +// whitespace, and a `{`. We deliberately do NOT validate balanced +// braces here — parse_tool_calls owns that check, and a leading +// `call:foo{` with no close still costs us only one regex scan. +// +// The pre-check intentionally accepts `_call:foo{` (SentencePiece +// underscore artifact, see tool_parser.cpp re_call_verb_open() +// rationale) by including `_` in the verb-start charset alternation +// at the top — `find("call:")` lands inside the `_call:` window. +static bool looks_like_plain_text_call(const std::string & text) { + size_t pos = 0; + while ((pos = text.find("call:", pos)) != std::string::npos) { + size_t v = pos + 5; // step past "call:" + if (v < text.size() && + (std::isalpha((unsigned char)text[v]) || text[v] == '_')) { + size_t w = v; + while (w < text.size() && + (std::isalnum((unsigned char)text[w]) || + text[w] == '_' || text[w] == '.' || + text[w] == ':' || text[w] == '-')) { + w++; + } + // Allow whitespace between verb and brace (mirrors `\s*\{`). + while (w < text.size() && std::isspace((unsigned char)text[w])) { + w++; + } + if (w < text.size() && text[w] == '{') return true; + } + pos = v; + } + return false; +} + static bool find_tool_start(const std::string & text, size_t & pos) { size_t idx = text.find('<'); while (idx != std::string::npos) { @@ -608,6 +650,134 @@ std::vector SseEmitter::emit_finish(int completion_tokens, "request_id=%s format=%d bytes=%zu\n", request_id_.c_str(), (int)format_, tool_buffer_.size()); } + } else if (mode_ == StreamMode::CONTENT && + !accumulated_content_.empty() && + has_request_tools(tools_) && + looks_like_plain_text_call(accumulated_content_)) { + // CONTENT-mode plain-text tool-call hoist. Gemma4 (and similar + // models with no XML tool-call template) emits invocations as + // literal text like `call:get_weather{location: "SF"}` or + // `_call:get_weather{...}` (SentencePiece artifact). The emitter + // stays in CONTENT mode for the whole stream because no + // `` / `` opener ever + // arrives. Without this branch the response stops with + // finish_reason="stop" / stop_reason="end_turn" and no tool_use + // block is emitted, breaking forge/agent_recorded scenarios + // that depend on structured tool_calls. + // + // The branch runs parse_tool_calls over accumulated_content_, + // hoists any ToolCalls (the allowlist filter `tool_allowed` is + // already enforced inside parse_tool_calls' add_call lambda, + // so unauthorized verbs never enter parsed.tool_calls), and + // replaces accumulated_content_ with cleaned_text so the final + // response carries the prose-only text (no duplicate `call:` + // span). Streaming clients have already received the raw call + // text as content deltas — they get a post-hoc tool_use block + // appended at finalize. Text + tool_use is a legal stream in + // both OpenAI and Anthropic specs. + // + // Gated on has_request_tools(tools_) to mirror the + // TOOL_BUFFER-entry condition at line 391 — if the request + // didn't declare tools we keep `call:foo{}` as visible content + // (see test_emitter_no_tools_keeps_tool_like_text for the + // equivalent XML-shape behavior). + auto parsed = parse_tool_calls(accumulated_content_, tools_); + if (!parsed.tool_calls.empty()) { + tool_calls_ = std::move(parsed.tool_calls); + + // Remember for tool memory (mirrors TOOL_BUFFER branch). + if (tool_memory_) { + std::vector ids; + for (const auto & tc : tool_calls_) ids.push_back(tc.id); + tool_memory_->remember(ids, accumulated_raw_); + } + + // Strip matched call spans from the visible content so the + // non-streaming final-message shape doesn't duplicate them + // as both text AND tool_use. Mirrors + // _strip_plain_text_tool_calls in + // luce-bench/.../forge.py. Streaming clients already saw + // the pre-strip text in earlier deltas; this only affects + // the final accumulated_text() consumed by the response + // builders in http_server.cpp. + accumulated_content_ = parsed.cleaned_text; + + fr = "tool_calls"; + + // Format-specific tool call events — same shape as the + // TOOL_BUFFER branch above. Kept inlined (rather than + // refactored into a helper) to keep this commit's diff + // minimal and side-by-side reviewable against the + // upstream block. + switch (format_) { + case ApiFormat::OPENAI_CHAT: { + json tc_list = json::array(); + for (size_t i = 0; i < tool_calls_.size(); i++) { + tc_list.push_back({ + {"index", (int)i}, + {"id", tool_calls_[i].id}, + {"type", "function"}, + {"function", { + {"name", tool_calls_[i].name}, + {"arguments", tool_calls_[i].arguments} + }} + }); + } + out.push_back(format_openai_delta({{"tool_calls", tc_list}})); + break; + } + case ApiFormat::ANTHROPIC: { + if (!active_kind_.empty()) { + out.push_back(sse_event("content_block_stop", + json({{"type", "content_block_stop"}, {"index", block_index_}}).dump())); + active_kind_.clear(); + } + for (const auto & tc : tool_calls_) { + block_index_++; + json tu_block = { + {"type", "tool_use"}, + {"id", tc.id}, + {"name", tc.name}, + {"input", json::object()} + }; + out.push_back(sse_event("content_block_start", + json({{"type", "content_block_start"}, + {"index", block_index_}, + {"content_block", tu_block}}).dump())); + if (!tc.arguments.empty()) { + out.push_back(sse_event("content_block_delta", + json({{"type", "content_block_delta"}, + {"index", block_index_}, + {"delta", {{"type", "input_json_delta"}, + {"partial_json", tc.arguments}}}}).dump())); + } + out.push_back(sse_event("content_block_stop", + json({{"type", "content_block_stop"}, + {"index", block_index_}}).dump())); + } + break; + } + case ApiFormat::RESPONSES: + for (const auto & tc : tool_calls_) { + out.push_back(format_responses_event( + "response.function_call_arguments.delta", { + {"item_id", tc.id}, {"output_index", 0}, + {"delta", tc.arguments} + })); + out.push_back(format_responses_event( + "response.function_call_arguments.done", { + {"item_id", tc.id}, {"output_index", 0}, + {"arguments", tc.arguments}, {"name", tc.name} + })); + } + break; + default: break; + } + } + // If parse_tool_calls matched the substring pre-check but + // returned no calls (all filtered by tool_allowed, or all args + // malformed), `fr` stays "stop" and accumulated_content_ is + // left intact. Caller sees the original prose; no leak. } // Format-specific final events diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index d0c9b291..e89130fb 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -868,6 +868,204 @@ static void test_emitter_anthropic_thinking_blocks() { TEST_ASSERT(!em.accumulated_text().empty()); } +// ═══════════════════════════════════════════════════════════════════════ +// CONTENT-mode plain-text `call:{...}` tool-call hoist +// +// Regression coverage for the finalize-pass branch that runs +// parse_tool_calls over accumulated_content_ when the stream stayed +// in CONTENT mode (Gemma4-style `call:foo{...}` plain text — no XML +// envelope to trip TOOL_BUFFER). Without this branch the emitter +// returns finish_reason="stop" / stop_reason="end_turn" and never +// emits a tool_use block, breaking forge / agent_recorded. +// ═══════════════════════════════════════════════════════════════════════ + +static void test_emitter_content_mode_plain_text_call_parsed() { + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + // Feed enough text past the holdback so the prose flushes into + // accumulated_content_ before finalize. The call: span itself stays + // in accumulated_content_ either way (no XML opener to redirect to + // TOOL_BUFFER); finalize parses and strips it. + em.emit_token("I'll fetch the forecast for you right now: "); + em.emit_token("call:get_weather{\"location\": \"SF\"}"); + em.emit_token(" — let me know what you'd like next."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().size() == 1); + if (!em.tool_calls().empty()) { + TEST_ASSERT(em.tool_calls()[0].name == "get_weather"); + auto args = json::parse(em.tool_calls()[0].arguments); + TEST_ASSERT(args["location"] == "SF"); + } + // finish_reason should now be "tool_calls" (drives Anthropic + // stop_reason="tool_use" downstream). + TEST_ASSERT(em.finish_reason() == "tool_calls"); +} + +static void test_emitter_content_mode_no_tools_skips_plain_text_call() { + // Empty tools array: branch is gated on has_request_tools(tools_), + // so the call: text remains as visible content. + auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + em.emit_token("I'll fetch the forecast for you right now: "); + em.emit_token("call:get_weather{\"location\": \"SF\"}"); + em.emit_token(" — let me know what you'd like next."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().empty()); + TEST_ASSERT(em.finish_reason() == "stop"); + TEST_ASSERT(em.accumulated_text().find("call:get_weather") != std::string::npos); +} + +static void test_emitter_content_mode_underscore_prefix_call_parsed() { + // Regression for the `_call:foo{}` SentencePiece artifact (commit + // 004a81b). Parser Pattern 5 sentinel set includes `_`, so the + // verb is captured even with the leading underscore. The emitter + // wiring must surface it the same way as the bare `call:` form. + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("Sure thing, here is the call you asked for: "); + em.emit_token("_call:get_weather{\"location\": \"NYC\"}"); + em.emit_token(" — happy to refine if needed."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().size() == 1); + if (!em.tool_calls().empty()) { + TEST_ASSERT(em.tool_calls()[0].name == "get_weather"); + auto args = json::parse(em.tool_calls()[0].arguments); + TEST_ASSERT(args["location"] == "NYC"); + } + TEST_ASSERT(em.finish_reason() == "tool_calls"); +} + +static void test_emitter_content_mode_no_call_substring_skips_parser() { + // Pure prose with no `call:` substring: the pre-check + // looks_like_plain_text_call short-circuits before parse_tool_calls + // runs. Accumulated text is preserved; finish_reason stays "stop". + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("Sorry, I cannot help with that "); + em.emit_token("specific question today. Please consult a local guide "); + em.emit_token("for the most accurate information."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().empty()); + TEST_ASSERT(em.finish_reason() == "stop"); + TEST_ASSERT(em.accumulated_text().find("Sorry") != std::string::npos); +} + +static void test_emitter_content_mode_mixed_calls_multiple() { + // Multiple back-to-back calls in the same response. Parser + // Pattern 5 sentinel set includes `}` so consecutive invocations + // are captured. Verify the emitter hoists both in emission order. + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("start. "); + em.emit_token("call:get_weather{\"location\": \"A\"} "); + em.emit_token("middle. "); + em.emit_token("call:get_weather{\"location\": \"B\"} "); + em.emit_token("end of the message."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().size() == 2); + if (em.tool_calls().size() == 2) { + auto args0 = json::parse(em.tool_calls()[0].arguments); + auto args1 = json::parse(em.tool_calls()[1].arguments); + TEST_ASSERT(args0["location"] == "A"); + TEST_ASSERT(args1["location"] == "B"); + } + TEST_ASSERT(em.finish_reason() == "tool_calls"); + // Codex Q3 residue guard: the stripped accumulated text must NOT + // contain `call:` any more. + TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos); +} + +static void test_emitter_content_mode_malformed_call_dropped() { + // Unbalanced `{`: balanced_braces_end inside parse_tool_calls + // returns npos and the match is dropped. Emitter must not crash + // and the malformed text remains in accumulated_text (no tool + // hoist, no silent strip — caller-visible signal that the model + // produced garbage). + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("Here is the call you wanted with malformed args: "); + em.emit_token("call:get_weather{location: \"unclosed"); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().empty()); + TEST_ASSERT(em.finish_reason() == "stop"); + // Malformed call span is left visible (no strip on parse failure). + TEST_ASSERT(em.accumulated_text().find("call:get_weather") != std::string::npos); +} + +static void test_emitter_content_mode_does_not_double_fire_on_tool_call_xml() { + // Regression guard: a `` XML envelope must continue to + // route through the TOOL_BUFFER path (transition fires inside + // emit_token). The new CONTENT-mode branch sits in an `else if` + // tied to `mode_ == CONTENT` at emit_finish entry, so it cannot + // fire when TOOL_BUFFER handled the call. Verify exactly 1 + // ToolCall is emitted, not 2. + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("\n" + "\n" + "SF\n" + "\n" + ""); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().size() == 1); + TEST_ASSERT(em.finish_reason() == "tool_calls"); +} + +static void test_emitter_content_mode_strips_call_span_from_accumulated_text() { + // Codex Q3 "residue" hazard guard. After a successful hoist, + // accumulated_text() must not contain the `call:` substring (the + // matched span is replaced by cleaned_text). Without this guard + // the OpenAI Chat / Anthropic / Responses final-message shapes + // would echo the call as both literal text AND a tool_use block, + // producing UI double-display. + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("prefix prose here. "); + em.emit_token("call:get_weather{\"location\": \"SF\"}"); + em.emit_token(" suffix prose continues to flush the holdback."); + em.emit_finish(20); + + TEST_ASSERT(!em.tool_calls().empty()); + TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("prefix prose") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("suffix prose") != std::string::npos); +} + +static void test_emitter_content_mode_anthropic_emits_tool_use_block() { + // Verify the Anthropic format-specific events fire from the new + // branch (content_block_stop on the open text block, then + // content_block_start tool_use + input_json_delta + content_block_stop). + json tools = json::array(); + tools.push_back({ + {"name", "get_weather"}, + {"description", "weather"}, + {"input_schema", {{"type", "object"}, + {"properties", {{"city", {{"type", "string"}}}}}}} + }); + SseEmitter em(ApiFormat::ANTHROPIC, "req_id", "test-model", 10, + tools, nullptr); + em.emit_start(); + em.emit_token("Let me fetch the data you need from the service: "); + em.emit_token("call:get_weather{\"city\": \"Tokyo\"}"); + em.emit_token(" — back in a moment."); + auto finish = em.emit_finish(20); + std::string s = concat(finish); + + TEST_ASSERT(!em.tool_calls().empty()); + TEST_ASSERT(s.find("\"type\":\"tool_use\"") != std::string::npos); + TEST_ASSERT(s.find("\"name\":\"get_weather\"") != std::string::npos); + TEST_ASSERT(s.find("\"type\":\"input_json_delta\"") != std::string::npos); + TEST_ASSERT(s.find("Tokyo") != std::string::npos); + TEST_ASSERT(s.find("\"stop_reason\":\"tool_use\"") != std::string::npos); +} + // ═══════════════════════════════════════════════════════════════════════ // Stop sequences tests // ═══════════════════════════════════════════════════════════════════════ @@ -2792,6 +2990,17 @@ int main() { RUN_TEST(test_emitter_nonstreaming_accumulates); RUN_TEST(test_emitter_anthropic_thinking_blocks); + std::fprintf(stderr, "\n── CONTENT-mode plain-text call:{} ──\n"); + RUN_TEST(test_emitter_content_mode_plain_text_call_parsed); + RUN_TEST(test_emitter_content_mode_no_tools_skips_plain_text_call); + RUN_TEST(test_emitter_content_mode_underscore_prefix_call_parsed); + RUN_TEST(test_emitter_content_mode_no_call_substring_skips_parser); + RUN_TEST(test_emitter_content_mode_mixed_calls_multiple); + RUN_TEST(test_emitter_content_mode_malformed_call_dropped); + RUN_TEST(test_emitter_content_mode_does_not_double_fire_on_tool_call_xml); + RUN_TEST(test_emitter_content_mode_strips_call_span_from_accumulated_text); + RUN_TEST(test_emitter_content_mode_anthropic_emits_tool_use_block); + std::fprintf(stderr, "\n── Stop sequences ──\n"); RUN_TEST(test_stop_sequence_basic); RUN_TEST(test_stop_sequence_mid_token); From ee9cd9e92d8fea7cfee2d587235be12308ce7711 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Mon, 1 Jun 2026 20:27:09 -0400 Subject: [PATCH 6/6] fix(server): address cubic PR #329 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three P2 issues flagged by cubic on commit 8218333: 1. tool_parser.cpp:248 — relaxed-JSON rewrite missed escaping inner `"` when normalizing single- or backtick-quoted strings. Content like `'he said "hi"'` rewrote to `"he said "hi""` (invalid JSON), silently dropping the whole tool call. Now escapes `"` to `\"` when inside a non-`"` string. 2. sse_emitter.cpp:49 — looks_like_plain_text_call() pre-check used isalpha for the first verb char, but the parser regex accepts [A-Za-z0-9_.:\-]. A `call:2nd_pass{...}` emission would pass the parser but skip the pre-check. Switched first-char test to isalnum so digit-led verbs reach the parser. 3. sse_emitter.cpp:703 — stripping accumulated_content_ desynced the Responses-format streaming finalization events (.output_text.done / .content_part.done / .completed) from the raw .delta events the client already received. Captured a responses_streamed_text snapshot at top of emit_finish before any strip, and threaded it through the four Responses finalization sites. Non-streaming accumulated_text() accessor continues to return the stripped version so the non-streaming response shape doesn't carry both text AND tool_use for the same span. Tests: +4 cases. - test_parse_call_verb_singlequote_with_inner_doublequote - test_parse_call_verb_backtick_with_inner_doublequote - test_emitter_content_mode_digit_start_verb_parsed - test_emitter_content_mode_responses_done_uses_pre_strip_text test_server_unit: 1693 -> 1707 assertions, 0 failures. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/server/sse_emitter.cpp | 39 +++++++++++-- server/src/server/tool_parser.cpp | 12 ++++ server/test/test_server_unit.cpp | 96 +++++++++++++++++++++++++++++++ 3 files changed, 142 insertions(+), 5 deletions(-) diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp index 0441e5bb..ba3c0d1e 100644 --- a/server/src/server/sse_emitter.cpp +++ b/server/src/server/sse_emitter.cpp @@ -46,7 +46,7 @@ static bool looks_like_plain_text_call(const std::string & text) { while ((pos = text.find("call:", pos)) != std::string::npos) { size_t v = pos + 5; // step past "call:" if (v < text.size() && - (std::isalpha((unsigned char)text[v]) || text[v] == '_')) { + (std::isalnum((unsigned char)text[v]) || text[v] == '_')) { size_t w = v; while (w < text.size() && (std::isalnum((unsigned char)text[w]) || @@ -542,6 +542,20 @@ std::vector SseEmitter::emit_finish(int completion_tokens, } window_.clear(); + // Snapshot of what the Responses stream actually emitted as text + // deltas. The CONTENT-mode plain-text tool-call branch below + // mutates accumulated_content_ (strips matched call spans so the + // non-streaming response shape doesn't duplicate them as both text + // AND tool_use), but the Responses-format finalization events + // (response.output_text.done / content_part.done / completed) must + // reflect what was actually streamed in earlier + // response.output_text.delta events — otherwise a streaming client + // sees its accumulated buffer disagree with the .done payload. + // Other formats (OpenAI Chat, Anthropic) don't echo final + // aggregated text in the stream, so they can continue to read the + // (possibly stripped) accumulated_content_ directly. + const std::string responses_streamed_text = accumulated_content_; + // Parse tool calls from buffer std::string fr = "stop"; if (mode_ == StreamMode::TOOL_BUFFER && !tool_buffer_.empty()) { @@ -700,6 +714,16 @@ std::vector SseEmitter::emit_finish(int completion_tokens, // the pre-strip text in earlier deltas; this only affects // the final accumulated_text() consumed by the response // builders in http_server.cpp. + // + // For the Responses format we capture a separate snapshot + // (responses_streamed_text, see top of emit_finish) before + // this strip so the streaming finalization events + // (.output_text.done / .content_part.done / .completed) + // continue to agree with the raw .delta events the client + // already received. Without the snapshot the .done payload + // would carry the stripped text and a streaming client's + // accumulated buffer would disagree with the server's + // claimed "done" text. accumulated_content_ = parsed.cleaned_text; fr = "tool_calls"; @@ -840,16 +864,21 @@ std::vector SseEmitter::emit_finish(int completion_tokens, } case ApiFormat::RESPONSES: { + // Use the pre-strip snapshot for the streaming finalization + // events (.done / .completed) so they agree with the + // .delta events that preceded them. See + // responses_streamed_text init at the top of emit_finish for + // rationale. // output_text.done out.push_back(format_responses_event("response.output_text.done", { {"item_id", msg_item_id_}, {"output_index", 0}, - {"content_index", 0}, {"text", accumulated_content_} + {"content_index", 0}, {"text", responses_streamed_text} })); // content_part.done out.push_back(format_responses_event("response.content_part.done", { {"item_id", msg_item_id_}, {"output_index", 0}, {"content_index", 0}, - {"part", {{"type", "output_text"}, {"text", accumulated_content_}, + {"part", {{"type", "output_text"}, {"text", responses_streamed_text}, {"annotations", json::array()}}} })); @@ -868,7 +897,7 @@ std::vector SseEmitter::emit_finish(int completion_tokens, {"type", "message"}, {"id", msg_item_id_}, {"status", "completed"}, {"role", "assistant"}, {"content", json::array({{ - {"type", "output_text"}, {"text", accumulated_content_}, + {"type", "output_text"}, {"text", responses_streamed_text}, {"annotations", json::array()} }})} }); @@ -896,7 +925,7 @@ std::vector SseEmitter::emit_finish(int completion_tokens, {"created_at", created_at_}, {"status", "completed"}, {"model", model_name_}, {"output", final_output}, - {"output_text", accumulated_content_}, + {"output_text", responses_streamed_text}, {"usage", resp_usage} }; out.push_back(format_responses_event("response.completed", {{"response", shell}})); diff --git a/server/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp index a8d1518b..7ece4ca4 100644 --- a/server/src/server/tool_parser.cpp +++ b/server/src/server/tool_parser.cpp @@ -259,6 +259,18 @@ static bool coerce_relaxed_json(const std::string & payload, json & out) { i++; continue; } + // Escape inner `"` when we opened the string with a non-`"` + // quote (single or backtick). Without this, content like + // `'he said "hi"'` rewrites to `"he said "hi""` which is + // invalid JSON and silently drops the whole tool call. + // When in_str == '"', a `"` inside should have arrived via + // the `\\` escape branch above; a bare `"` here is malformed + // input we pass through unchanged. + if (in_str != '"' && c == '"') { + rewritten += "\\\""; + i++; + continue; + } rewritten += c; i++; continue; diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index e89130fb..ca7d073c 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -482,6 +482,34 @@ static void test_parse_call_verb_multiline_args() { } } +static void test_parse_call_verb_singlequote_with_inner_doublequote() { + // Cubic PR #329 review: when the relaxed-JSON rewrite converts + // single-quoted strings to double-quoted, inner `"` chars must be + // escaped to `\"` — otherwise `'he said "hi"'` rewrites to + // `"he said "hi""` which is invalid JSON and the whole tool call + // is silently dropped. + std::string text = "call:say{quote: 'he said \"hi\" loudly'}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "say"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["quote"] == "he said \"hi\" loudly"); + } +} + +static void test_parse_call_verb_backtick_with_inner_doublequote() { + // Same escape concern as the single-quote case, but with the + // backtick string flavor. + std::string text = "call:say{quote: `he said \"hi\" loudly`}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["quote"] == "he said \"hi\" loudly"); + } +} + // ═══════════════════════════════════════════════════════════════════════ // SSE Emitter tests // ═══════════════════════════════════════════════════════════════════════ @@ -1066,6 +1094,70 @@ static void test_emitter_content_mode_anthropic_emits_tool_use_block() { TEST_ASSERT(s.find("\"stop_reason\":\"tool_use\"") != std::string::npos); } +static void test_emitter_content_mode_digit_start_verb_parsed() { + // Cubic PR #329 review: the looks_like_plain_text_call() pre-check + // must accept verbs starting with a digit because the parser's + // re_call_verb_open() regex allows them ([A-Za-z0-9_.:\\-]+). + // A model emitting `call:2nd_pass{...}` with a digit-led verb + // should still trigger the parser sweep. + json tools = json::array(); + tools.push_back({ + {"name", "2nd_pass"}, + {"description", "second pass"}, + {"input_schema", {{"type", "object"}, + {"properties", {{"reason", {{"type", "string"}}}}}}} + }); + SseEmitter em(ApiFormat::OPENAI_CHAT, "req_id", "test-model", 10, + tools, nullptr); + em.emit_start(); + em.emit_token("call:2nd_pass{reason: \"verify\"}"); + em.emit_finish(5); + + TEST_ASSERT(em.tool_calls().size() == 1); + if (!em.tool_calls().empty()) { + TEST_ASSERT(em.tool_calls()[0].name == "2nd_pass"); + } + TEST_ASSERT(em.finish_reason() == "tool_calls"); +} + +static void test_emitter_content_mode_responses_done_uses_pre_strip_text() { + // Cubic PR #329 review: the Responses-format finalization events + // (.output_text.done / .content_part.done / .completed) must + // reflect the text that was streamed in earlier .delta events, + // not the post-strip text. Otherwise a streaming client's + // accumulated buffer (built from .delta events) disagrees with + // the server's claimed .done payload. + // + // accumulated_text() (consumed by non-streaming response builders) + // still returns the stripped version so the non-streaming response + // shape doesn't carry both text AND tool_use for the same span. + json tools = json::array(); + tools.push_back({ + {"name", "get_weather"}, + {"description", "weather"}, + {"input_schema", {{"type", "object"}, + {"properties", {{"city", {{"type", "string"}}}}}}} + }); + SseEmitter em(ApiFormat::RESPONSES, "req_id", "test-model", 10, + tools, nullptr); + em.emit_start(); + em.emit_token("Looking up: "); + em.emit_token("call:get_weather{\"city\": \"Tokyo\"}"); + em.emit_token(" done."); + auto finish = em.emit_finish(20); + std::string s = concat(finish); + + TEST_ASSERT(em.tool_calls().size() == 1); + // Streaming .done events must include the raw call text (matching + // what the .delta events already sent). + TEST_ASSERT(s.find("response.output_text.done") != std::string::npos); + TEST_ASSERT(s.find("call:get_weather") != std::string::npos); + // Non-streaming accessor returns the stripped text. + TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("Looking up:") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("done.") != std::string::npos); +} + // ═══════════════════════════════════════════════════════════════════════ // Stop sequences tests // ═══════════════════════════════════════════════════════════════════════ @@ -2968,6 +3060,8 @@ int main() { RUN_TEST(test_parse_call_verb_cleaned_text); RUN_TEST(test_parse_call_verb_intercept_inner_json); RUN_TEST(test_parse_call_verb_multiline_args); + RUN_TEST(test_parse_call_verb_singlequote_with_inner_doublequote); + RUN_TEST(test_parse_call_verb_backtick_with_inner_doublequote); std::fprintf(stderr, "\n── SSE Emitter ──\n"); RUN_TEST(test_emitter_reasoning_split_openai); @@ -3000,6 +3094,8 @@ int main() { RUN_TEST(test_emitter_content_mode_does_not_double_fire_on_tool_call_xml); RUN_TEST(test_emitter_content_mode_strips_call_span_from_accumulated_text); RUN_TEST(test_emitter_content_mode_anthropic_emits_tool_use_block); + RUN_TEST(test_emitter_content_mode_digit_start_verb_parsed); + RUN_TEST(test_emitter_content_mode_responses_done_uses_pre_strip_text); std::fprintf(stderr, "\n── Stop sequences ──\n"); RUN_TEST(test_stop_sequence_basic);