From e974ac388fcec36db79bb08249ef1c5740863e18 Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Sun, 31 May 2026 22:59:33 -0400
Subject: [PATCH 1/6] docs(experiments): plan SSE emitter CONTENT-mode tool
 parse
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Plan + Codex review for running parse_tool_calls on accumulated
CONTENT-mode text so plain-text `call:<verb>{...}` invocations
(Gemma4) actually produce tool_use blocks instead of stop=end_turn.

Codex verdict: REVISE (residue hazard) → integrated as a new emitter-
level test guarding accumulated_text() span strip. Q4 rebuttal:
tool_allowed enforcement is already inside parse_tool_calls.
---
 ...se-emitter-content-mode-tool-parse-plan.md | 439 ++++++++++++++++++
 1 file changed, 439 insertions(+)
 create mode 100644 docs/experiments/sse-emitter-content-mode-tool-parse-plan.md
diff --git a/docs/experiments/sse-emitter-content-mode-tool-parse-plan.md b/docs/experiments/sse-emitter-content-mode-tool-parse-plan.md
new file mode 100644
index 00000000..335bf36f
--- /dev/null
+++ b/docs/experiments/sse-emitter-content-mode-tool-parse-plan.md
@@ -0,0 +1,439 @@
+# SSE Emitter: run `parse_tool_calls` on CONTENT-mode text (plain-text `call:<verb>{}` path)
+
+Status: PLAN — pre-implementation. No code changes in this commit.
+
+Branch: `fix/sse-emitter-content-mode-tool-parse`
+Base: `Luce-Org/lucebox-hub:main` @ `8305b6c`
+Affected files:
+- `server/src/server/sse_emitter.cpp` — add CONTENT-mode finalize branch.
+- `server/test/test_server_unit.cpp` — emitter-level coverage.
+
+## 1. Problem statement (diagnosis is settled)
+
+`SseEmitter` only transitions into `StreamMode::TOOL_BUFFER` when it detects
+one of the XML-style openers (`<tool_call>`, `<function=`, `<tool_code>`)
+in the streamed text (see `find_tool_start()` in
+`server/src/server/sse_emitter.cpp:26-38`, gated through the
+`mode_ == CONTENT` branch at line 388-423). For models like Gemma4 that
+emit tool calls as plain text:
+
+```
+call:get_country_info{country: "France"}
+_call:get_country_info{country_name: "France"}    # underscore artifact
+```
+
+…the emitter stays in `CONTENT` mode for the entire stream. The
+final-pass `parse_tool_calls` invocation at
+`server/src/server/sse_emitter.cpp:512-517` is gated on
+`mode_ == TOOL_BUFFER`, so it never runs on plain-text emissions:
+
+```cpp
+if (mode_ == StreamMode::TOOL_BUFFER && !tool_buffer_.empty()) {
+    auto parsed = parse_tool_calls(tool_buffer_, tools_);
+    ...
+}
+```
+
+The `parse_tool_calls` Pattern 5 regex (`re_call_verb_open()` at
+`tool_parser.cpp:190-193`) plus the underscore-prefix sentinel
+(commit `004a81b`) are correct, but unreachable in this code path.
+
+Empirical: smoke against image `fac7e0f-cuda12` on bragi returns
+`stop_reason: end_turn` for a Gemma4 response whose body contains
+`_call:get_country_info{country_name: "France"}` plain text. No
+`tool_use` content block is produced. This is the live regression we
+are fixing.
+
+## 2. Goal
+
+In `SseEmitter::emit_finish()`, add a parallel finalize branch that runs
+when `mode_ == CONTENT` and the accumulated content text plausibly
+contains a `call:<verb>{...}` invocation. If `parse_tool_calls` returns
+≥1 ToolCall, hoist them into `tool_calls_` exactly like the TOOL_BUFFER
+path, strip the matched spans from accumulated content, and flip the
+finish reason to `tool_calls` so the Anthropic mapping at
+`http_server.cpp:2074` resolves `stop_reason="tool_use"`.
+
+Non-goals (explicitly out of scope):
+- Per-delta in-stream detection of `call:<verb>{}`. The streaming
+  emitter currently sends `content` deltas as raw text as soon as the
+  holdback drains; rewriting those into a `tool_use` post-hoc would
+  contradict bytes already on the wire. See §6.
+- Touching the existing 5 tool-call detection patterns. The
+  XML/JSON/tool_code paths already work; we add a sibling branch.
+
+## 3. Design — finalize-pass CONTENT-mode parser
+
+### 3.1 Trigger predicate
+
+After the existing `if (mode_ == StreamMode::TOOL_BUFFER && ...)` block
+at `sse_emitter.cpp:512-617`, add a sibling branch:
+
+```cpp
+} else if (mode_ == StreamMode::CONTENT &&
+           !accumulated_content_.empty() &&
+           has_request_tools(tools_) &&
+           looks_like_plain_text_call(accumulated_content_)) {
+    auto parsed = parse_tool_calls(accumulated_content_, tools_);
+    if (!parsed.tool_calls.empty()) {
+        tool_calls_ = std::move(parsed.tool_calls);
+        accumulated_content_ = parsed.cleaned_text;  // matched spans stripped
+        fr = "tool_calls";
+
+        // emit format-specific events for tool calls (same switch as TOOL_BUFFER)
+        ...
+    }
+}
+```
+
+### 3.2 Cheap pre-check (`looks_like_plain_text_call`)
+
+To avoid paying full-regex cost on every content response, gate the
+parser on a tightened substring scan. Implementation:
+
+```cpp
+static bool looks_like_plain_text_call(const std::string & text) {
+    // Match the tightened opener: `call:<alpha>{`. Walks the text
+    // once; no heap allocation. Mirrors the sentinel logic in
+    // re_call_verb_open() at a coarse granularity so we only run the
+    // full std::regex pass when there's a plausible candidate.
+    size_t pos = 0;
+    while ((pos = text.find("call:", pos)) != std::string::npos) {
+        size_t v = pos + 5;
+        if (v < text.size() && (std::isalpha((unsigned char)text[v]) || text[v] == '_')) {
+            // Walk verb chars; require `{` after.
+            size_t w = v;
+            while (w < text.size() &&
+                   (std::isalnum((unsigned char)text[w]) ||
+                    text[w] == '_' || text[w] == '.' ||
+                    text[w] == ':' || text[w] == '-')) {
+                w++;
+            }
+            // Allow whitespace between verb and brace (mirrors `\s*\{` in the regex).
+            while (w < text.size() && std::isspace((unsigned char)text[w])) w++;
+            if (w < text.size() && text[w] == '{') return true;
+        }
+        pos = v;
+    }
+    return false;
+}
+```
+
+Tradeoff: the full regex pattern accepts namespaced verbs like
+`call:tools.weather:get_data{...}`. The pre-check's character class
+already covers `:` `.` `-`, so namespaced calls pass. Performance: O(N)
+single-pass with a `find("call:")` skip, dominated by the substring
+scan; no regex compile/match cost for the common no-tool-call response.
+
+Codex review will ask whether the tightened pre-check is sufficient or
+whether `std::regex_search` against a compiled-once
+`call:[A-Za-z_][A-Za-z0-9_.:-]*\s*\{` regex is preferable. Decision
+deferred to §5.
+
+### 3.3 Hoist semantics — mirror the TOOL_BUFFER path
+
+For each emitted ToolCall:
+- ID: reuses the existing `generate_call_id()` from `tool_parser.cpp:31`
+  via the call inside `parse_tool_calls` (`add_call` at line 450 of
+  `tool_parser.cpp` already assigns `tc.id = generate_call_id();`).
+  No additional ID synthesis needed in `sse_emitter.cpp`.
+- `tool_memory_` remember: copy lines 518-522 of the TOOL_BUFFER branch
+  (`tool_memory_->remember(ids, accumulated_raw_)`).
+- Format-specific events: emit OpenAI `tool_calls` delta / Anthropic
+  `content_block_start` + `input_json_delta` + `content_block_stop` /
+  Responses `function_call_arguments.delta`+`.done`. Re-use the existing
+  switch at lines 533-609.
+
+### 3.4 `accumulated_content_` mutation — strip matched spans
+
+`parse_tool_calls` already returns `cleaned_text` (the input minus all
+matched spans across all 6 patterns, trimmed). Replace
+`accumulated_content_` with `parsed.cleaned_text`. This is the C++
+analog of `_strip_plain_text_tool_calls` at
+`luce-bench/src/lucebench/areas/forge.py:144-172` — same semantics,
+shared engine.
+
+Edge: in streaming, bytes that were already sent as content deltas
+remain on the wire. Stripping `accumulated_content_` only affects the
+final non-streaming response shape (Anthropic message `content` array,
+OpenAI `message.content`, Responses `output_text`) and any later
+introspection. Streaming SSE clients see the unmodified text deltas
+followed by a post-hoc tool_call event. See §6.
+
+### 3.5 `finish_reason` bump
+
+Local `fr` (line 511) becomes `"tool_calls"` when ≥1 ToolCall survives
+the `tool_allowed` filter. The filter is enforced *inside*
+`parse_tool_calls` itself — `add_call` at `tool_parser.cpp:452` has
+`if (!tool_allowed(tools, fn_name)) return;` so unauthorized calls
+never enter `parsed.tool_calls`. The emitter's trigger condition
+`if (!parsed.tool_calls.empty())` therefore already guarantees that
+`fr` only flips when at least one allow-listed call survived.
+
+If the parser matches but everything is filtered out (all verbs
+unknown to `tools_`), `fr` stays `"stop"` — matches the TOOL_BUFFER
+path's `else` branch at line 610-617, which logs and keeps
+`fr = "stop"`.
+
+This is also Codex review prompt #4 (the reviewer flagged a missing
+filter; the filter is actually inside `parse_tool_calls`. Documenting
+here for the next reader.)
+
+## 4. Edge cases
+
+### 4.1 Empty / no-call content
+Pre-check `looks_like_plain_text_call` returns false → bail before
+regex. No measurable cost beyond a single `std::string::find("call:")`.
+
+### 4.2 Mixed content + tool calls
+`call:foo{...}` embedded in narrative prose. `parse_tool_calls`
+already returns `cleaned_text` with matched spans removed and the
+remainder trimmed (`tool_parser.cpp:623-647`). Verified: trailing
+prose survives, only matched call spans are stripped.
+
+### 4.3 Tool not in allowlist
+`add_call` lambda inside `parse_tool_calls` rejects unauthorized verbs
+via `tool_allowed(tools, fn_name)` at `tool_parser.cpp:452`. This is
+the same filter the TOOL_BUFFER path relies on. No emitter-layer work
+needed.
+
+### 4.4 No tools declared (`tools_` empty)
+Gate the entire new branch on `has_request_tools(tools_)` (the helper
+already exists at `sse_emitter.cpp:22-24`). This mirrors the
+TOOL_BUFFER trigger: `find_tool_start` is only called inside the
+`has_request_tools(tools_) && ...` check at line 391. Same gating
+keeps both paths consistent.
+
+### 4.5 `<tool_call>` envelope still routes via TOOL_BUFFER
+The CONTENT-mode branch only fires when `mode_ == CONTENT` at
+`emit_finish` entry. If the model emitted `<tool_call>`, the emitter
+transitions to TOOL_BUFFER inside `emit_token` (line 422) and that
+path handles parsing. The new branch is mutually exclusive with the
+old one — the `else if` ladder guarantees this. Regression risk: nil
+for the existing 5 patterns.
+
+### 4.6 Streaming
+For `req.stream == true`, the per-delta SSE events already streamed
+plain-text `call:foo{...}` to the client as `content` deltas BEFORE
+finalize runs. Adding tool_use events at finalize produces a stream
+that contains both text content AND a tool_use block — the OpenAI /
+Anthropic SDKs *do* accept this shape (text + tool_use is legal), but
+clients that gate on "first content type wins" will see text first.
+
+Two options:
+- (A) Apply the fix to both streaming and non-streaming, accepting that
+  streaming clients see the call text in early deltas plus a tool_use
+  block at the end. The accumulated_content_ field still gets cleaned
+  for the final-message shape; the wire deltas are not retroactively
+  rewritten.
+- (B) Gate the new branch to non-streaming only (i.e., when no per-
+  token deltas were emitted with content). This requires threading a
+  `stream` boolean into the emitter (currently not present in
+  `SseEmitter`).
+
+Decision (will defer to Codex review prompt #2): start with option (A)
+because the wire shape `text-delta + tool_use` is legal in both
+Anthropic and OpenAI specs, and forge/agent_recorded uses non-streaming
+exclusively. If Codex flags it as a real correctness hazard, switch to
+(B) and document.
+
+## 5. Codex review prompts (to be filled in after `lucebox codex` run)
+
+Send Codex the plan + the following four questions verbatim:
+
+1. **Pre-check tightness**: is the substring scan in
+   `looks_like_plain_text_call` sufficient, or should we use a compiled-
+   once `std::regex` (e.g.
+   `call:[A-Za-z_][A-Za-z0-9_.:\-]*\s*\{`) to reduce false invocations
+   of the full parser?
+2. **Streaming SSE shape**: the TOOL_BUFFER finalize path emits per-tool
+   SSE events (`content_block_start`/`input_json_delta`/`content_block_stop`
+   in Anthropic, `tool_calls` delta in OpenAI). Should the new CONTENT-
+   mode branch emit the same SSE events for streaming clients, or only
+   populate the final non-streaming response fields? (Critical for
+   streaming correctness — see §4.6.)
+3. **Race / state-ordering**: between `parsed.cleaned_text` replacing
+   `accumulated_content_` and the new SSE tool_use events being
+   appended, could we ever emit a `tool_use` block whose `input`
+   references text that's still present in `accumulated_content_`? Or
+   would the final accumulated content contain the same call span we
+   just emitted as a `tool_use`?
+4. **`finish_reason` bump scope**: should `fr` flip to `tool_calls`
+   only when ≥1 tool actually survives `tool_allowed` (current
+   design), or whenever `parse_tool_calls` finds *any* match (even if
+   filtered)? The TOOL_BUFFER path's existing behavior is the
+   reference.
+
+## Codex review
+
+Run: 2026-05-31 against `lucebox.service` (image `fac7e0f-cuda12`,
+Gemma4-26B-A4B, on bragi). Codex CLI v0.130.0 via `codex exec --model
+dflash`. Verbatim answers below.
+
+> **Q1. Is the substring-based `looks_like_plain_text_call` pre-check sufficient?**
+> **Yes.** A manual substring scan for `call:` followed by a character
+> check is significantly more efficient than a compiled `std::regex`
+> for a `finalize()` call. Since this is an end-of-stream operation,
+> the O(N) scan is negligible, and you avoid the heavy state machine
+> overhead of regex for a simple pattern.
+
+> **Q2. For streaming responses, should the new CONTENT-mode branch emit SSE tool_use events at finalize?**
+> **Yes (Option A).** In streaming mode, the `content` deltas
+> containing the raw `call:foo{...}` text have already been pushed to
+> the client. You cannot retroactively "un-send" them. The standard
+> and most compatible behavior for clients (Cline, Open WebUI) is to
+> receive the raw text as prose/thought, followed by a formal
+> `tool_use` block at the end.
+
+> **Q3. Are there race / state-ordering hazards between `accumulated_content_ = parsed.cleaned_text` and the new SSE tool_use events being appended?**
+> **Yes.** There is a "residue" hazard. If `parse_tool_calls` does not
+> perfectly identify and strip the exact character spans of the
+> `call:verb{...}` syntax from `cleaned_text`, the client will receive
+> the raw tool-call string inside the final text block. This results
+> in the UI displaying the tool call twice: once as literal text and
+> once as a structured tool block.
+
+> **Q4. Should `finish_reason` bump to "tool_calls" only when ≥1 tool survives `tool_allowed`, or whenever `parse_tool_calls` matches anything?**
+> **Only when ≥1 tool survives `tool_allowed`.** The proposed plan
+> lacks a check against the allowed tools list before setting `fr =
+> "tool_calls"`. If the model emits a call that the server has
+> disabled, bumping the `finish_reason` to `tool_calls` without an
+> actual valid tool in the payload is a protocol violation that can
+> crash or confuse SDK consumers.
+
+> **VERDICT: REVISE.** The plan must explicitly filter
+> `parsed.tool_calls` against the `tool_allowed` list before updating
+> `tool_calls_` and `fr` to ensure `finish_reason` accurately reflects
+> authorized tool usage.
+
+### Response (integration / rebuttal)
+
+- **Q1 integrated** — keep the substring pre-check as specified in §3.2.
+  Codex agrees regex would be overkill at finalize. No plan change.
+- **Q2 integrated** — proceed with Option A (apply fix to both streaming
+  and non-streaming). The wire shape `text-delta + tool_use` is legal
+  in both Anthropic and OpenAI specs per Codex. No plan change beyond
+  §4.6 explicit confirmation.
+- **Q3 integrated with new test** — Codex's "residue" hazard is real if
+  `parse_tool_calls.cleaned_text` ever fails to strip a span. The
+  parser tests already cover this for Pattern 5 (see
+  `test_parse_call_verb_cleaned_text` in `test_server_unit.cpp`).
+  Adding §7 emitter-level test `test_emitter_content_mode_strips_call_span_from_accumulated_text`
+  to specifically guard the emitter wiring: assert that
+  `em.accumulated_text()` does NOT contain the substring `call:` after
+  finalize when ≥1 call was hoisted. This is a wiring regression test
+  for the `accumulated_content_ = parsed.cleaned_text` line.
+- **Q4 rebuttal** — Codex flags a missing `tool_allowed` filter; this
+  is actually already enforced by `parse_tool_calls`'s internal
+  `add_call` lambda at `tool_parser.cpp:452`
+  (`if (!tool_allowed(tools, fn_name)) return;`). Calls failing the
+  allowlist are dropped before they reach `result.tool_calls`. The
+  plan's trigger condition is `if (!parsed.tool_calls.empty())`, so
+  `fr = "tool_calls"` is set only when ≥1 allow-listed call survived.
+  This matches Codex's recommendation; the perceived gap was the
+  reviewer not knowing about `parse_tool_calls`'s internal filter.
+  Documenting this explicitly in §3.5 for future readers.
+
+VERDICT was REVISE; revisions applied are §7 new test
+(`strips_call_span_from_accumulated_text`) and §3.5 clarification on
+the existing `tool_allowed` enforcement inside `parse_tool_calls`.
+Proceed to implementation.
+
+## 6. Implementation outline (post-codex)
+
+1. Add `looks_like_plain_text_call` static helper in
+   `sse_emitter.cpp` (anonymous namespace next to `has_request_tools`).
+2. Add the CONTENT-mode `else if` branch inside `emit_finish` between
+   the existing TOOL_BUFFER block (line 512-617) and the format-specific
+   final events switch (line 620).
+3. Refactor the format-specific tool-call event emission inside the
+   TOOL_BUFFER branch into a private member `emit_tool_call_events(out)`
+   so both branches share the implementation. Keeps line counts down
+   and avoids drift between the two paths.
+4. Verify `accumulated_content_` post-mutation is consumed by the
+   format-specific final events switch at line 620 (Responses uses
+   `accumulated_content_` in `response.output_text.done`,
+   `response.content_part.done`, and `final_output` at lines 681-712).
+5. No header signature changes; the new helper is `static`.
+
+## 7. Tests (`server/test/test_server_unit.cpp`)
+
+Mirror the style of `test_emitter_tool_buffer_detection`,
+`test_emitter_bare_function_tool_buffer_detection`, and
+`test_emitter_no_tools_keeps_tool_like_text`. Add:
+
+- `test_emitter_content_mode_plain_text_call_parsed` — feed
+  `"I'll fetch it. call:get_weather{location: \"SF\"}"` to a CONTENT-mode
+  emitter with `weather_tools()`. Assert: 1 ToolCall named `get_weather`
+  with args `{location: "SF"}`, `accumulated_text()` no longer contains
+  `call:get_weather{`, OpenAI finish_reason chunk shows `"tool_calls"`.
+- `test_emitter_content_mode_no_tools_skips_plain_text_call` — same
+  input, but empty tools array. Assert: no ToolCall, the call: text
+  remains in `accumulated_text()`.
+- `test_emitter_content_mode_underscore_prefix_call_parsed` — feed
+  `"_call:get_weather{location: \"NYC\"}"`. Assert: ToolCall emitted
+  (regression for the `_call:` artifact from commit `004a81b`).
+- `test_emitter_content_mode_no_call_substring_skips_parser` — feed
+  `"Plain prose with no tool invocations at all."`. Assert: no
+  ToolCall, accumulated text unchanged, `finish_reason()` is `"stop"`.
+- `test_emitter_content_mode_mixed_calls_multiple` — feed
+  `"start. call:get_weather{location: \"A\"} middle. call:get_weather{location: \"B\"} end."`.
+  Assert: 2 ToolCalls in order with the two locations; accumulated
+  text contains `"start."`, `"middle."`, `"end."` (call spans
+  stripped); no leakage of `call:`.
+- `test_emitter_content_mode_malformed_call_dropped` — feed
+  `"call:get_weather{unclosed"`. Assert: no ToolCall, no crash, the
+  malformed text remains in `accumulated_text()` (no panic).
+- `test_emitter_content_mode_does_not_double_fire_on_tool_call_xml` —
+  regression guard. Feed
+  `"<tool_call>\n<function=get_weather>\n<parameter=location>SF</parameter>\n</function>\n</tool_call>"`.
+  Assert: exactly 1 ToolCall (TOOL_BUFFER path handled it, new branch
+  did not double-emit).
+- `test_emitter_content_mode_strips_call_span_from_accumulated_text` —
+  Codex Q3 residue-hazard guard. Feed
+  `"prefix call:get_weather{location: \"SF\"} suffix"` with weather
+  tools. Assert: `em.accumulated_text().find("call:")` returns npos
+  (the matched span is stripped from the visible content). Without
+  this guard the emitter could double-display the call (once as
+  literal text, once as a `tool_use` block).
+
+Register each test with `RUN_TEST(...)` inside `main()` around the
+existing `test_emitter_*` block (~line 3549-3560 of the file).
+
+## 8. Don't break (regression matrix)
+
+| Scenario | Path | Expected | Test ref |
+|----------|------|----------|----------|
+| `<tool_call>...` XML | TOOL_BUFFER | 1 ToolCall, no double-fire | new `does_not_double_fire` test |
+| `<function=...>` bare XML | TOOL_BUFFER | 1 ToolCall | existing `test_emitter_bare_function_tool_buffer_detection` |
+| `<tool_code>{json}` | TOOL_BUFFER | 1 ToolCall | existing tool_parser tests cover the parser path |
+| Plain prose (no tools) | CONTENT | text preserved | existing `test_emitter_content_only_no_thinking` |
+| `call:foo{...}` + tools | **new** | 1 ToolCall, text stripped | new `content_mode_plain_text_call_parsed` |
+| `call:foo{...}` no tools | **new** | text preserved, no ToolCall | new `content_mode_no_tools_skips_plain_text_call` |
+| Malformed `call:foo{unclosed` | **new** | text preserved, no crash | new `content_mode_malformed_call_dropped` |
+| `accumulated_text()` for OpenAI Chat | both | visible text minus call spans | new `content_mode_plain_text_call_parsed` |
+
+## 9. PR shape
+
+- Branch: `fix/sse-emitter-content-mode-tool-parse` off `origin/main@8305b6c`.
+- Two commits:
+  1. `docs(experiments): plan SSE emitter CONTENT-mode tool parse` (this file).
+  2. `fix(server): run parse_tool_calls on CONTENT-mode accumulated text` (impl + tests).
+- Push to `easel:fix/sse-emitter-content-mode-tool-parse`.
+- PR base: `Luce-Org/lucebox-hub:main`.
+
+PR body must include:
+- Diagnosis summary (verbatim from §1).
+- Empirical signal: smoke against fac7e0f-cuda12 on bragi returning
+  `stop_reason: end_turn` for plain-text `_call:get_country_info{...}`.
+- Test count delta.
+- Streaming scope decision (option A vs B from §4.6, post-Codex).
+- Known limitations.
+
+## 10. Open questions deferred to Codex
+
+- Whether the streaming wire shape `text-delta + post-hoc tool_use`
+  breaks Cline / open-webui / Anthropic SDK consumers.
+- Whether `accumulated_raw_` (used for `tool_memory_->remember`)
+  should be cleaned too — leaning no, since `tool_memory_` keeps the
+  pre-strip raw for ID-replay matching.

From 80552014eb9c24ab87d6327a70df8b6006d8cce5 Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Sun, 31 May 2026 11:34:28 -0400
Subject: [PATCH 2/6] docs(server): plan for call:<verb>{} tool-parser pattern
 + codex review

Captures the diagnosis (gemma forge 0/30 on 2026-05-30), the proposed
sixth detection pattern, the relaxed-JSON arg parser sketch, the
unit-test matrix, and codex's review (which forced reordering the new
pattern to slot #5 ahead of the bare-JSON sweep to avoid interception
of nested name/arguments-shaped args).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../server-call-verb-tool-parser-plan.md      | 434 ++++++++++++++++++
 1 file changed, 434 insertions(+)
 create mode 100644 docs/experiments/server-call-verb-tool-parser-plan.md

diff --git a/docs/experiments/server-call-verb-tool-parser-plan.md b/docs/experiments/server-call-verb-tool-parser-plan.md
new file mode 100644
index 00000000..3c09db3b
--- /dev/null
+++ b/docs/experiments/server-call-verb-tool-parser-plan.md
@@ -0,0 +1,434 @@
+# Server-side `call:<verb>{args}` tool-call parser (pattern #6)
+
+Status: draft → codex review → implementation.
+Tracking branch: `fix/server-call-verb-tool-parser` (off `origin/main`).
+
+## Background
+
+The 2026-05-30 gemma full bench scored forge **0/30** on RTX 3090 Ti
+(`d9ecba6cc105-nvidia-geforce-rtx-3090-ti-gemma-full-2026-05-30-67f4/forge.json`).
+Every row’s `iterations[0].output` shows the assistant emitting
+plain-text tool invocations of the form:
+
+- `call:get_country_info{country: "France"}`
+- `call:default_api:fetch_sales_data{quarter: "Q4", year: 2024}`
+- `call:execute-bead:read-file{path: "crates/foo/src/lib.rs"}`
+- `call:execute-bead:list-files{path: "src/"}\n\ncall:execute-bead:read-file{path: "..."}`
+- `call:execute-bead:read_file{path: "..."}`
+- `call:shell{command: "rg -i auth"}`
+- `call:infrastructure:get_logs{resource: \`payments-service-cluster\`, start_time: \`...\`}`
+  (backtick-quoted values appear in the snapshot too)
+
+`server/src/server/tool_parser.cpp::parse_tool_calls` has five
+envelope-shaped detection patterns (XML `<tool_call>`, bare
+`<function=…>`, `<function=NAME(args)>`, `<tool_code>…</tool_code>`,
+bare JSON `{ "name":…, "arguments":… }`). **None** of these match the
+gemma plain-text emission. As a result:
+
+1. `SseEmitter::accumulate` buffers text, calls `parse_tool_calls`,
+   gets back zero `ToolCalls`, never flips `finish_reason` to
+   `tool_calls`.
+2. `/v1/messages` (Anthropic) sees an empty `emitter.tool_calls()` and
+   emits a `text` content block instead of a `tool_use` block
+   (`server/src/server/http_server.cpp:2030–2090`).
+3. forge’s WorkflowRunner expects a `ToolCall` and the row dies with
+   `error_type=ValidationError`.
+
+A client-side workaround already shipped on `feat/lucebox-docker`
+(commit `deba2fd`) — see `_parse_plain_text_tool_calls` in
+`luce-bench/src/lucebench/areas/forge.py`. **This PR ports that to
+C++ and adds it as a sixth `parse_tool_calls` pattern**, fixing the
+problem at the server.
+
+## Goals
+
+1. Add pattern #6 to `parse_tool_calls`: `call:<NS>?<verb>{ <relaxed-JSON-args> }`.
+2. Drive the existing downstream pipeline:
+   `SseEmitter::accumulate` → `finish_reason=tool_calls` → Anthropic
+   `stop_reason=tool_use` with `tool_use` content blocks, or OpenAI
+   `choices[].message.tool_calls`. Nothing downstream needs to change.
+3. Honour the existing `tool_allowed(tools, name)` filter so callers
+   that pass a constrained tool list (forge) only get back tools they
+   declared.
+4. Land C++ unit tests in `server/test/test_server_unit.cpp` covering
+   the gemma-observed shapes plus the obvious edge cases.
+
+## Non-goals (intentionally out of scope)
+
+- Touching the client-side `_parse_plain_text_tool_calls` in
+  `forge.py`. It stays as defense-in-depth — older deployed servers
+  won’t have this fix. The PR description will note that the client
+  fallback is no longer load-bearing after this merges.
+- Rebuilding the docker image / running an e2e bench. The unit tests +
+  a deliberately constructed `parse_tool_calls` call exercising every
+  shape are sufficient for PR review.
+- Touching `systemd`, the host `/home/erik/.local/bin/lucebox`
+  wrapper, the running `lucebox.service` (a forge bench is in flight),
+  or anything outside `server/src/server/tool_parser.cpp` +
+  `server/test/test_server_unit.cpp`.
+
+## Design
+
+### Pattern ordering
+
+Insert as **pattern #5** (one slot ahead of the existing bare-JSON
+sweep, which becomes the new pattern #6). The four XML-envelope
+patterns are lexically unambiguous and stay ahead. The reorder is
+forced by a real interception hazard codex flagged in review:
+`call:my_tool{"name": "inner_tool", "arguments": {}}` is a valid
+gemma-shaped emission, and the bare-JSON sweep would otherwise lift
+the inner `{"name": ...}` out as its own ToolCall before the new
+pattern got a chance. By running the `call:…{…}` matcher first and
+recording its brace-balanced span in `removals`, the bare-JSON sweep
+sees that range as overlapping and skips it.
+
+### Pattern #6 regex
+
+```cpp
+// `call:` opener: optionally preceded by start-of-string,
+// whitespace, or a hard delimiter so we don't match `I'll call:foo`
+// inside narrative.
+static const std::regex re_call_verb_open(
+    R"((^|[\s,;:\(\[\{])call:([A-Za-z0-9_.:\-]+)\s*\{)");
+```
+
+Notes:
+
+- The leading group captures a sentinel character so we keep
+  `it->position()` aligned with the start of the **prefix**, not the
+  `call:` itself; the implementation computes the actual `call:`
+  offset by adding the captured sentinel’s length back.
+- `[A-Za-z0-9_.:\-]+` matches verbs *and* embedded namespaces in one
+  capture (`execute-bead:read-file`, `default_api:fetch_sales_data`,
+  bare `shell`). The verb-name passed to `add_call` strips everything
+  up to and including the last `:`, so `execute-bead:read-file`
+  becomes `read-file`.
+- The `\s*` before `{` is intentional — the snapshot rows always emit
+  `{` immediately, but tolerating optional whitespace doesn’t open new
+  failure modes and matches the relaxed pattern from the Python port.
+- `^` works on `std::regex` ECMAScript syntax against `text` from
+  position 0; we do not rely on `multiline` mode. Subsequent matches
+  rely on the explicit sentinel-character alternation, which catches
+  newlines, spaces, commas, parens, square brackets, and curly braces
+  — every realistic boundary in the snapshot data.
+
+### Balanced-brace extractor
+
+Mirror pattern #5’s scanner — track depth, skip over string literals,
+**but** extend the string-literal handling to honour single quotes
+and backticks (the snapshot row above has backticked values). This
+diverges slightly from the Python port (which only knew `'` and `"`)
+to handle the real gemma output we observed.
+
+```cpp
+// Returns one-past-close index, or std::string::npos.
+static size_t balanced_braces_end(const std::string & text, size_t open) {
+    int depth = 0;
+    char in_str = 0;  // 0 or one of " ' `
+    for (size_t i = open; i < text.size(); i++) {
+        char c = text[i];
+        if (in_str) {
+            if (c == '\\' && i + 1 < text.size()) { i++; continue; }
+            if (c == in_str) in_str = 0;
+            continue;
+        }
+        if (c == '"' || c == '\'' || c == '`') { in_str = c; continue; }
+        if (c == '{') depth++;
+        else if (c == '}') {
+            if (--depth == 0) return i + 1;
+        }
+    }
+    return std::string::npos;
+}
+```
+
+### Relaxed-JSON args parser
+
+Direct port of `_coerce_relaxed_json` from `forge.py`:
+
+1. Try `json::parse(strict)` on `"{" + body + "}"` first. Most
+   strict-JSON args (`{"path": "src/"}`) succeed here.
+2. On failure, rewrite the body: walk char-by-char tracking string
+   state; when outside a string, look ahead for a bare-identifier
+   `[A-Za-z_][A-Za-z0-9_]*` immediately followed by optional
+   whitespace and `:` — wrap the identifier in double quotes. Also
+   normalize single-quoted strings to double-quoted (open `'` becomes
+   `"`; closing `'` becomes `"`; backtick strings get the same
+   treatment). Backslash escapes are preserved.
+3. Retry `json::parse` on the rewrite. If it still fails, **drop the
+   single invocation** (do not throw, do not add a `ToolCall`, do not
+   poison `removals`). Continue scanning past the closing brace.
+
+The rewrite step has to leave already-quoted JSON keys alone. The
+look-ahead is `(?!").+:` — i.e. only fire if we’re not already
+sitting just after a `"` we already emitted. We track this via the
+`out` buffer’s last char, exactly as the Python implementation does.
+
+### Verb normalization
+
+```cpp
+std::string verb = full_match;          // e.g. "execute-bead:read-file"
+size_t colon = verb.find_last_of(':');
+if (colon != std::string::npos) verb = verb.substr(colon + 1);
+```
+
+`add_call(verb, args, span_start, span_end)` then passes through the
+existing `tool_allowed` filter.
+
+### Span tracking
+
+When a successful invocation is parsed, push `{start, brace_end}` onto
+`removals` so the surrounding cleanup pass strips the `call:…{…}` text
+out of the assistant message (otherwise the OpenAI/Anthropic content
+block would carry the literal `call:foo{…}` alongside the structured
+tool call — the same double-signal hazard the Python port avoids via
+`_strip_plain_text_tool_calls`).
+
+The `start` recorded in `removals` is the index of `c` in `call:` (not
+the preceding sentinel) so we don’t eat the user’s narrative space.
+
+### Tool-allowed filter placement
+
+Use the existing `tool_allowed(tools, verb)` helper inside
+`add_call` (same path as patterns #1–5). Dropping happens **after**
+regex match + JSON parse — this is cheap and consistent. Constraining
+the regex to a known verb set would have been faster but it precludes
+the “no tools constraint” case (forge passes a tool list; agent and
+codex sometimes don’t).
+
+## Regression safety inventory
+
+Existing tool-parser tests in `server/test/test_server_unit.cpp`
+(lines 239–317):
+
+| Test | Body contains `call:` substring? |
+|---|---|
+| `test_parse_tool_call_xml` | no |
+| `test_parse_bare_function_xml` | no |
+| `test_parse_json_tool_call` | no |
+| `test_parse_no_tools` | no |
+| `test_parse_tool_code_wrapper` | no |
+| `test_parse_tool_allowed_filter` | no |
+
+Also the `emitter_*` family (lines 470–700+) uses `<tool_call>` XML
+exclusively. No existing test has a `call:<verb>{` substring that
+could accidentally match the new pattern. Safe.
+
+## Test plan
+
+Add to `server/test/test_server_unit.cpp` (the same file already
+covers tool-parser unit tests; convention is one function per case
+registered under the `── Tool parser ──` section). New cases:
+
+1. `test_parse_call_verb_single` — `call:get_country_info{country: "France"}`
+2. `test_parse_call_verb_back_to_back` —
+   `call:get_country_info{country: "France"}call:summarize{text: "ok"}`
+3. `test_parse_call_verb_namespaced` —
+   `call:execute-bead:read-file{path: "crates/foo/src/lib.rs"}` → verb
+   normalized to `read-file`
+4. `test_parse_call_verb_snake_and_hyphen` — covers both
+   `call:execute-bead:read_file{path: "..."}` and
+   `call:execute-bead:list-files{path: "src/"}` with intervening
+   newlines
+5. `test_parse_call_verb_tool_allowed_filter` — verb `disallowed` in
+   text, `tools` allows only `allowed`; result has zero `ToolCall`
+6. `test_parse_call_verb_inline_prose_rejected` —
+   `"Sure, I'll call:foo{x:1}"` *should* still match because the
+   space before `call:` satisfies the sentinel. To exercise the
+   anti-false-positive case, use `"narrative.call:foo{x:1}"` —
+   no sentinel char before `call:`, regex rejects.
+7. `test_parse_call_verb_malformed_args` —
+   `call:foo{country: "France"` (unterminated brace) → call dropped,
+   no crash, no `ToolCall`, no removal span
+8. `test_parse_call_verb_inner_brace_in_string` —
+   `call:foo{cmd: "echo {} ok"}` → must not break on the `{` inside
+   the string; args dict has `cmd == "echo {} ok"`
+9. `test_parse_call_verb_strict_json_args` —
+   `call:foo{"path": "x"}` (already-quoted keys) parses on the strict
+   path
+10. `test_parse_call_verb_unquoted_keys` — relaxed pass kicks in:
+    `call:foo{path: "x"}` → args dict has `path == "x"`
+11. `test_parse_call_verb_cleaned_text` — verify the `call:…{…}`
+    fragment is stripped from `result.cleaned_text` (parity with the
+    XML envelope patterns).
+12. `test_parse_call_verb_intercept_inner_json` (codex-requested) —
+    `call:outer{"name": "inner", "arguments": {}}`. Exactly one
+    `ToolCall` with `name == "outer"` and `arguments` containing the
+    literal inner JSON; verifies that the reorder of patterns #5/#6
+    actually defends against the bare-JSON sweep stealing the inner
+    object.
+
+These run under the existing `RUN_TEST(...)` block; add them under
+the `── Tool parser ──` heading immediately after
+`test_parse_tool_allowed_filter`.
+
+### Build verification
+
+The new code lives entirely in `server/src/server/tool_parser.cpp` —
+already in `target_sources` for `test_server_unit`
+(`server/CMakeLists.txt:776–788`). Build with:
+
+```bash
+cmake --build server/build --target test_server_unit && \
+    server/build/test_server_unit
+```
+
+(If the lucebox-hub-285 worktree doesn’t have a build dir, configure
+it the same way the repo expects — but a clean build is the operator’s
+job, not part of the PR landing process.)
+
+## Reverse-compat with the client-side fix
+
+`luce-bench/src/lucebench/areas/forge.py::_parse_plain_text_tool_calls`
+(commit `deba2fd` on `feat/lucebox-docker`) stays untouched. After
+this PR merges and the docker image is rebuilt, the client-side path
+becomes a no-op for any server that includes pattern #6. Leaving it
+in place is harmless (the synthesized `ToolCall`s match what the
+server now produces) and shields older deployed servers. The PR
+description will spell this out so a reviewer doesn’t mistakenly
+strip the Python fallback.
+
+## Image rebuild + e2e (out of scope for this PR)
+
+After merge, the operator should:
+
+1. Rebuild the cuda12 image (`docker bake` per `docker-bake.hcl`).
+2. Restart `lucebox.service` *after the current forge bench finishes*.
+3. Re-run the 2026-05-30 gemma full bench and confirm forge passes
+   non-trivially without the client-side `_parse_plain_text_tool_calls`
+   workaround.
+
+These steps are validation, not part of the PR diff.
+
+## Risks / open questions
+
+- **Multiline args.** The snapshot shows args that span multiple
+  lines (`call:default_api:analyze_data{data: [{"date": ...}, ...]}`).
+  The balanced-brace scanner is line-agnostic so this Just Works —
+  noted here to make the implementer audit at least one multi-line
+  case in the unit tests.
+- **Backticked values.** `infrastructure:get_logs{resource: \`x\`}`
+  uses backticks. JSON has no backtick-string concept; the relaxed
+  rewrite normalizes them to double-quoted strings. This is one place
+  the C++ port goes **further** than the Python port — that’s a
+  conscious choice based on the real snapshot.
+- **Performance.** `parse_tool_calls` is already O(n·patterns); pattern
+  #6 adds a single regex sweep plus per-match work. The buffer fed
+  through `SseEmitter::accumulate` is bounded by generation length;
+  no realistic perf concern.
+
+## Codex review
+
+Captured verbatim from `codex exec` against the running lucebox
+server (`http://localhost:8080/v1` / `dflash` model). The final
+answer block follows; the codex run appended additional
+stream-of-consciousness scratch after the answer (an artefact of the
+model’s think channel leaking past the answer) — that scratch is
+omitted as noise.
+
+```
+This is a high-quality implementation plan. The approach to handling
+"dirty" JSON via a manual balanced-brace scanner and a
+relaxation/normalization pass is a pragmatic and necessary solution
+for the non-standard emissions observed in Gemma models.
+
+However, there is one **critical architectural risk** regarding
+pattern priority and one minor discrepancy in the brace-tracking
+logic.
+
+### Critical: Pattern Priority & Interception
+Issue: Pattern #5 (Bare JSON sweep) is currently ordered before
+Pattern #6 (the new `call:verb{...}` pattern).
+Risk: If a tool call's arguments happen to contain keys named
+`"name"` and `"arguments"`, Pattern #5 will intercept the substring
+and incorrectly identify a tool call.
+* Example: `call:my_tool{"name": "inner_tool", "arguments": {}}`
+* Outcome: Pattern #5 will match the `{...}` part, creating a
+  ToolCall for `inner_tool`. Because Pattern #5 is processed first,
+  it will add a removal span for the inner JSON, potentially leaving
+  the `call:my_tool` prefix dangling in the cleaned_text or causing
+  Pattern #6 to fail/overlap.
+* Recommendation: Move Pattern #6 to a higher priority (e.g., before
+  Pattern #5). The `call:` prefix is a much stronger semantic signal
+  than the presence of JSON keys.
+
+### Technical Review
+
+1. Regex & Anchors
+* The sentinel list `[\s,;:\(\[\{]` is excellent. It effectively
+  prevents false positives like `I'll call:foo`.
+* Note on `^`: In `std::regex`, `^` matches the start of the string.
+  While `std::regex` lacks a standard multiline flag, your inclusion
+  of `\s` (which covers `\n`) in the sentinel list makes the pattern
+  robust for tool calls starting on new lines.
+
+2. Robustness of the Parser
+* Balanced Braces: Your `balanced_braces` logic correctly handles
+  string literals (including escaped quotes) and nested braces.
+* Discrepancy: The plan mentions tracking `[]` (square brackets), but
+  the provided implementation snippet only increments depth for `{`.
+  While tracking `[]` isn't strictly required to find the outer `{}`
+  of a JSON object, you should ensure it's implemented if you intend
+  to follow the plan's promise, especially to handle edge cases where
+  a `}` might appear inside an array (though unlikely in valid JSON).
+* Relaxed JSON: The "rewrite" strategy (normalizing `'` and `` ` `` to
+  `"`, and wrapping unquoted keys) is a solid way to handle the
+  observed "relaxed" syntax.
+
+3. Filter Placement
+* The `tool_allowed` filter is correctly placed after the parser has
+  successfully identified the verb and arguments. Constraining the
+  regex itself to a known verb list would be a maintenance burden.
+
+4. Unit Tests
+* The test plan is very thorough. To make it bulletproof, I recommend
+  adding these two specific cases:
+    * Nested Tool Call: `call:outer{"name": "inner", "arguments": {}}`
+      — specifically to verify that the priority fix (Point 1) works.
+    * Complex String Content: `call:foo{cmd: "echo {not_a_brace}"}` —
+      to ensure the string-aware scanner doesn't trip on braces
+      inside quotes.
+
+### Summary of Recommendations
+1. Reorder patterns: Move Pattern #6 above Pattern #5.
+2. Verify `[]` implementation: Ensure the brace scanner actually
+   tracks `[` and `]` to match your design spec.
+3. Add "Interception" test: Add a unit test where tool arguments
+   mimic a JSON tool call to confirm the priority fix.
+```
+
+### Plan adjustments after review
+
+1. **Pattern ordering — accepted.** Codex’s `call:my_tool{"name":
+   "inner_tool", "arguments": {}}` example is real: gemma snapshot
+   rows in the snapshot include `call:default_api:analyze_data{data:
+   [{...}, ...]}` where the inner objects don’t happen to carry
+   `name`/`arguments` keys today, but accepting that hazard for
+   future-proofing is cheap. The new pattern moves to **#5**, demoting
+   the bare-JSON sweep to **#6**. The five XML-envelope patterns stay
+   ahead since they’re lexically unambiguous and pattern #6 would also
+   spuriously chew their inner JSON if reordered.
+2. **Square-bracket tracking — accepted with a clarification.** I
+   don’t need `[`/`]` in the brace-depth counter for closing the outer
+   `{`, because by the time we hit a `]` without `[` we’re already
+   inside a string (handled) or the JSON is malformed (we drop the
+   call). I will **still** add explicit `[`/`]` tracking to the
+   scanner so a stray `}` *inside* an array (e.g. a JSON value like
+   `"}"` written without quotes) doesn’t fool the close-counter. This
+   matches the spirit of the plan’s prose.
+3. **Interception unit test — accepted.** Add
+   `test_parse_call_verb_intercept_inner_json` with body
+   `call:outer{"name": "inner", "arguments": {}}`. Verify the
+   resulting `ToolCall` has `name == "outer"` and `arguments ==
+   {"name": "inner", "arguments": {}}` — and that there’s exactly
+   **one** `ToolCall`, not two.
+4. **“Complex string content” test — accepted.** Add
+   `test_parse_call_verb_string_with_close_brace` with
+   `call:foo{cmd: "echo {not_a_brace}"}`. Already partially covered
+   by `test_parse_call_verb_inner_brace_in_string`; will reuse and
+   strengthen that test rather than add a new one.
+
+No rebuttals — codex’s feedback is all sound. Renumber: the new
+pattern is **pattern #5**, the old bare-JSON sweep becomes **pattern
+#6**. Update test names, header comments, and the file-top docstring
+accordingly during implementation.

From d67a26985cd456a27812733cd8eb87f010d69937 Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Sun, 31 May 2026 11:38:18 -0400
Subject: [PATCH 3/6] fix(server): parse gemma's call:<verb>{} plain-text tool
 emissions

Adds a sixth detection pattern to `parse_tool_calls` that recognizes
the plain-text tool invocations gemma emits in chat-completion content
(`call:get_country_info{country: "France"}` /
`call:execute-bead:read-file{path: "..."}` / etc).

The 2026-05-30 gemma full bench scored forge 0/30 because every row's
output carried these `call:<verb>{...}` invocations as text rather
than structured `tool_use` content blocks. None of the existing five
envelope-shaped detectors (`<tool_call>`, `<function=...>`,
`<tool_code>`, bare JSON) match the bare `call:` shape.

The new pattern:
- Anchors on a sentinel character (whitespace, comma, semicolon,
  open/close bracket, etc.) before `call:` so narrative usages like
  `narrative.call:foo` don't match.
- Supports namespaced verbs (`execute-bead:read-file`,
  `default_api:fetch_sales_data`) and strips the namespace before
  using the verb as the ToolCall name.
- Extracts the args block via a quote- and escape-aware balanced-brace
  scanner that tolerates `"`, `'`, and `` ` `` string literals and
  tracks `[]` depth alongside `{}`.
- Parses the args as strict JSON first, then falls back to a relaxed
  rewrite that quotes bare identifier keys and normalizes single/
  backtick quoted strings to double-quoted before retrying. Malformed
  args drop the single invocation without crashing or polluting other
  calls.
- Runs *before* the bare-JSON sweep so that inner args of the form
  `call:outer{"name": "inner", "arguments": {}}` aren't hijacked into
  a spurious `inner` ToolCall by pattern #6.

Downstream the existing wiring takes over: SseEmitter::accumulate
already calls parse_tool_calls; a non-empty ToolCall list flips
finish_reason to `tool_calls`, which the Anthropic /v1/messages
branch maps to `stop_reason="tool_use"` with `tool_use` content
blocks (http_server.cpp:2030-2090) and the OpenAI branch maps to
`choices[].message.tool_calls`.

The forge client-side workaround `_parse_plain_text_tool_calls`
shipping on feat/lucebox-docker (commit deba2fd) becomes redundant
once a server with this fix is deployed. It stays in place as
defense-in-depth for older deployed servers.

Test plan: 14 new C++ unit cases in test_server_unit.cpp covering
single / back-to-back / namespaced / snake- and kebab-case verbs;
tool-allowed filtering; mid-prose rejection vs. whitespace-led
acceptance; malformed args drop; inner `{}` inside string literals;
strict-JSON and relaxed-keys arg parsing; cleaned_text scrubbing;
the codex-requested inner `name`/`arguments` interception case; and
multi-line nested-array args mirroring the snapshot data. All pass
in a standalone driver.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/src/server/tool_parser.cpp | 170 +++++++++++++++++++++++++++-
 server/src/server/tool_parser.h   |   5 +-
 server/test/test_server_unit.cpp  | 180 ++++++++++++++++++++++++++++++
 3 files changed, 350 insertions(+), 5 deletions(-)

diff --git a/server/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp
index 6244b250..43bf2b1c 100644
--- a/server/src/server/tool_parser.cpp
+++ b/server/src/server/tool_parser.cpp
@@ -1,11 +1,18 @@
 // Tool call parser implementation.
 //
-// Five detection patterns, tried in order:
+// Six detection patterns, tried in order:
 // 1. <tool_call><function=NAME>...<parameter=K>V</parameter>...</function></tool_call>
 // 2. <function=NAME>...params...</function>  (bare, outside tool_call)
 // 3. <function=NAME(k="v", ...)></function>  (function-signature style)
 // 4. <tool_code>{JSON}</tool_code>
-// 5. Bare JSON objects with name+arguments fields
+// 5. call:<ns>?<verb>{relaxed-JSON args}    (gemma plain-text emissions)
+// 6. Bare JSON objects with name+arguments fields
+//
+// Pattern 5 runs *before* pattern 6 so that args like
+//   call:outer{"name": "inner", "arguments": {}}
+// don't get hijacked by the bare-JSON sweep into a spurious `inner` tool
+// call. The brace-balanced span pattern 5 records in `removals` shadows
+// the inner JSON from pattern 6's view via `overlaps()`.
 
 #include "tool_parser.h"
 
@@ -161,6 +168,125 @@ static const std::regex & re_tool_code() {
     return r;
 }
 
+// Pattern 5: `call:<ns>?<verb>{` opener. The sentinel alternation in front
+// rejects narrative usages like "I'll call:foo{x:1}" where `call:` is glued
+// to a preceding word — whitespace, common punctuation, and open/close
+// brackets are the realistic boundaries seen in the snapshot data. `\s`
+// covers `\n` so a `call:` at the start of any line is matched without
+// relying on std::regex multiline support (which is non-portable).
+//
+// Note that `}` is in the sentinel list — gemma frequently emits multiple
+// invocations back-to-back: `call:a{x:1}call:b{y:2}`. Without `}` as a
+// sentinel the second match would be missed.
+static const std::regex & re_call_verb_open() {
+    static std::regex r(R"((^|[\s,;:\(\[\{\}\)\]\>])call:([A-Za-z0-9_.:\-]+)\s*\{)");
+    return r;
+}
+
+// Find the index one past the `}` that matches `text[open] == '{'`.
+// Respects nested {}/[] depth and skips over "..." / '...' / `...`
+// string literals (with backslash escapes). Returns std::string::npos if
+// no matching close is found.
+static size_t balanced_braces_end(const std::string & text, size_t open) {
+    int depth = 0;
+    char in_str = 0;  // 0, or one of '"', '\'', '`'
+    for (size_t i = open; i < text.size(); i++) {
+        char c = text[i];
+        if (in_str) {
+            if (c == '\\' && i + 1 < text.size()) { i++; continue; }
+            if (c == in_str) in_str = 0;
+            continue;
+        }
+        if (c == '"' || c == '\'' || c == '`') { in_str = c; continue; }
+        if (c == '{' || c == '[') {
+            depth++;
+        } else if (c == '}' || c == ']') {
+            depth--;
+            if (depth == 0 && c == '}') return i + 1;
+            if (depth < 0) return std::string::npos;
+        }
+    }
+    return std::string::npos;
+}
+
+// Try strict json::parse first; on failure rewrite single- and
+// backtick-quoted strings to double-quoted, wrap bare identifier keys
+// in double quotes, and retry. Returns true and populates `out` on
+// success; returns false on irrecoverable failure (and `out` is unset).
+//
+// The rewrite walks the buffer char-by-char tracking string state so it
+// doesn't mangle identifiers that live inside string values.
+static bool coerce_relaxed_json(const std::string & payload, json & out) {
+    {
+        json parsed = json::parse(payload, nullptr, false);
+        if (!parsed.is_discarded()) {
+            out = std::move(parsed);
+            return true;
+        }
+    }
+
+    // Permissive pass.
+    static const std::regex re_bare_key(R"(([A-Za-z_][A-Za-z0-9_]*)(\s*:))");
+
+    std::string rewritten;
+    rewritten.reserve(payload.size() + 16);
+    char in_str = 0;  // 0, or the *opening* quote we saw
+    for (size_t i = 0; i < payload.size(); ) {
+        char c = payload[i];
+        if (in_str) {
+            // Inside a string we already opened. Mirror escapes verbatim.
+            if (c == '\\' && i + 1 < payload.size()) {
+                rewritten += c;
+                rewritten += payload[i + 1];
+                i += 2;
+                continue;
+            }
+            if (c == in_str) {
+                // Close — always emit a double-quote regardless of which
+                // quote style opened the string. The opening side already
+                // emitted a `"`.
+                rewritten += '"';
+                in_str = 0;
+                i++;
+                continue;
+            }
+            rewritten += c;
+            i++;
+            continue;
+        }
+        if (c == '"' || c == '\'' || c == '`') {
+            rewritten += '"';
+            in_str = c;
+            i++;
+            continue;
+        }
+        // Try to match a bare-key identifier here. Don't fire if the
+        // previous emitted char is `"` — that would indicate we're sitting
+        // right after a JSON string boundary and the "identifier" is
+        // probably part of a value continuation (e.g. `"k": foo: 1` would
+        // be malformed JSON anyway, but better to leave it untouched).
+        std::smatch m;
+        std::string tail = payload.substr(i);
+        if (std::regex_search(tail, m, re_bare_key,
+                              std::regex_constants::match_continuous) &&
+            (rewritten.empty() || rewritten.back() != '"')) {
+            rewritten += '"';
+            rewritten += m[1].str();
+            rewritten += '"';
+            rewritten += m[2].str();
+            i += m.length();
+            continue;
+        }
+        rewritten += c;
+        i++;
+    }
+
+    json parsed = json::parse(rewritten, nullptr, false);
+    if (parsed.is_discarded()) return false;
+    out = std::move(parsed);
+    return true;
+}
+
 // ─── XML parameter parser ───────────────────────────────────────────────
 
 static json parse_xml_params(const std::string & region, const std::string & fn_name,
@@ -397,7 +523,45 @@ ToolParseResult parse_tool_calls(const std::string & text, const json & tools) {
         }
     }
 
-    // Pattern 5: Bare JSON objects
+    // Pattern 5: call:<ns>?<verb>{relaxed-JSON args}
+    //
+    // Runs before the bare-JSON sweep so that inner JSON of the form
+    //   call:outer{"name": "inner", "arguments": {}}
+    // doesn't get hijacked into a spurious `inner` ToolCall.
+    {
+        auto begin = std::sregex_iterator(text.begin(), text.end(), re_call_verb_open());
+        auto end = std::sregex_iterator();
+        for (auto it = begin; it != end; ++it) {
+            // Group 1: sentinel char (may be empty if matched at `^`).
+            // Group 2: full verb including any embedded namespaces.
+            size_t prefix_len = (*it)[1].matched ? (*it)[1].length() : 0;
+            size_t call_start = it->position() + prefix_len;
+            if (overlaps(removals, call_start)) continue;
+
+            // The matched substring runs from call_start through the `{`
+            // (consuming the opener and any whitespace between verb and
+            // brace). Compute the brace index from the match end.
+            size_t brace_open = it->position() + it->length() - 1;
+            if (brace_open >= text.size() || text[brace_open] != '{') continue;
+
+            size_t brace_close = balanced_braces_end(text, brace_open);
+            if (brace_close == std::string::npos) continue;
+
+            std::string raw_args = text.substr(brace_open, brace_close - brace_open);
+            json args;
+            if (!coerce_relaxed_json(raw_args, args)) continue;
+            if (!args.is_object()) continue;
+
+            std::string verb = (*it)[2].str();
+            size_t colon = verb.find_last_of(':');
+            if (colon != std::string::npos) verb = verb.substr(colon + 1);
+            if (verb.empty()) continue;
+
+            add_call(verb, args, call_start, brace_close);
+        }
+    }
+
+    // Pattern 6: Bare JSON objects
     {
         size_t cursor = 0;
         while (cursor < text.size()) {
diff --git a/server/src/server/tool_parser.h b/server/src/server/tool_parser.h
index 1ff9890a..b3bd4a79 100644
--- a/server/src/server/tool_parser.h
+++ b/server/src/server/tool_parser.h
@@ -1,11 +1,12 @@
 // Tool call parser — extracts structured tool calls from generated text.
 //
-// Supports 5 detection patterns:
+// Supports 6 detection patterns:
 //   1. <tool_call><function=name>...</function></tool_call>  (Qwen XML)
 //   2. <function=name>...</function>                          (bare function XML)
 //   3. <function=name(k="v")></function>                      (function signature)
 //   4. <tool_code>{...JSON...}</tool_code>                    (tool_code wrapper)
-//   5. Bare JSON objects  {"name":..., "arguments":...}       (raw JSON)
+//   5. call:<ns>?<verb>{relaxed-JSON-args}                    (gemma plain-text)
+//   6. Bare JSON objects  {"name":..., "arguments":...}       (raw JSON)
 
 #pragma once
 
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index 275ec935..d0c9b291 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -316,6 +316,172 @@ static void test_parse_tool_allowed_filter() {
     TEST_ASSERT(result.tool_calls.empty());
 }
 
+// ─── Pattern 5: call:<verb>{relaxed-JSON args} (gemma plain-text) ─────
+
+static void test_parse_call_verb_single() {
+    std::string text = "call:get_country_info{country: \"France\"}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        TEST_ASSERT(result.tool_calls[0].name == "get_country_info");
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["country"] == "France");
+    }
+    TEST_ASSERT(result.cleaned_text.find("call:") == std::string::npos);
+}
+
+static void test_parse_call_verb_back_to_back() {
+    std::string text =
+        "call:get_country_info{country: \"France\"}"
+        "call:summarize{text: \"ok\"}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 2);
+    if (result.tool_calls.size() == 2) {
+        TEST_ASSERT(result.tool_calls[0].name == "get_country_info");
+        TEST_ASSERT(result.tool_calls[1].name == "summarize");
+    }
+}
+
+static void test_parse_call_verb_namespaced() {
+    std::string text = "call:execute-bead:read-file{path: \"crates/foo/src/lib.rs\"}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        // Verb only — namespace stripped.
+        TEST_ASSERT(result.tool_calls[0].name == "read-file");
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["path"] == "crates/foo/src/lib.rs");
+    }
+}
+
+static void test_parse_call_verb_snake_and_hyphen() {
+    std::string text =
+        "call:execute-bead:list-files{path: \"src/\"}\n\n"
+        "call:execute-bead:read_file{path: \"a/b.rs\"}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 2);
+    if (result.tool_calls.size() == 2) {
+        TEST_ASSERT(result.tool_calls[0].name == "list-files");
+        TEST_ASSERT(result.tool_calls[1].name == "read_file");
+    }
+}
+
+static void test_parse_call_verb_tool_allowed_filter() {
+    std::string text = "call:disallowed_verb{x: 1}call:allowed_verb{y: 2}";
+    json tools = json::array({
+        {{"type", "function"}, {"function", {{"name", "allowed_verb"}}}}
+    });
+    auto result = parse_tool_calls(text, tools);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        TEST_ASSERT(result.tool_calls[0].name == "allowed_verb");
+    }
+}
+
+static void test_parse_call_verb_inline_prose_rejected() {
+    // No sentinel char before `call:` — must NOT match.
+    std::string text = "narrative.call:foo{x:1}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.empty());
+}
+
+static void test_parse_call_verb_inline_prose_after_space() {
+    // Whitespace IS a valid sentinel — this should match.
+    std::string text = "Sure, I'll call:foo{x: 1}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        TEST_ASSERT(result.tool_calls[0].name == "foo");
+    }
+}
+
+static void test_parse_call_verb_malformed_args() {
+    // Unterminated brace — drop the call, don't crash.
+    std::string text = "call:foo{country: \"France\"";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.empty());
+}
+
+static void test_parse_call_verb_inner_brace_in_string() {
+    // The `{` and `}` inside the string value must not confuse the
+    // balanced-brace scanner.
+    std::string text = "call:foo{cmd: \"echo {not_a_brace} ok\"}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["cmd"] == "echo {not_a_brace} ok");
+    }
+}
+
+static void test_parse_call_verb_strict_json_args() {
+    // Strict-JSON path: keys already quoted.
+    std::string text = "call:foo{\"path\": \"x\"}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["path"] == "x");
+    }
+}
+
+static void test_parse_call_verb_unquoted_keys() {
+    // Relaxed-JSON path: bare keys get quoted.
+    std::string text = "call:foo{path: \"x\", count: 3}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["path"] == "x");
+        TEST_ASSERT(args["count"] == 3);
+    }
+}
+
+static void test_parse_call_verb_cleaned_text() {
+    // The matched span should be stripped from cleaned_text.
+    std::string text = "Hello call:foo{x: 1} world.";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    TEST_ASSERT(result.cleaned_text.find("call:") == std::string::npos);
+    TEST_ASSERT(result.cleaned_text.find("Hello") != std::string::npos);
+    TEST_ASSERT(result.cleaned_text.find("world.") != std::string::npos);
+}
+
+static void test_parse_call_verb_intercept_inner_json() {
+    // Codex-requested: inner args of the form {"name": ..., "arguments": ...}
+    // must NOT be picked up by pattern 6 (bare-JSON sweep) as a spurious
+    // `inner` ToolCall. Exactly one ToolCall, named `outer`, with the
+    // inner JSON intact in its arguments.
+    std::string text = "call:outer{\"name\": \"inner\", \"arguments\": {}}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        TEST_ASSERT(result.tool_calls[0].name == "outer");
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["name"] == "inner");
+        TEST_ASSERT(args["arguments"].is_object());
+    }
+}
+
+static void test_parse_call_verb_multiline_args() {
+    // Snapshot rows have multi-line nested args; the balanced-brace
+    // scanner is line-agnostic, so this must Just Work.
+    std::string text =
+        "call:default_api:analyze_data{\n"
+        "  data: [{\"date\": \"2024-10-05\", \"qty\": 50}, {\"date\": \"2024-10-06\", \"qty\": 60}],\n"
+        "  metric: \"qty\"\n"
+        "}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        TEST_ASSERT(result.tool_calls[0].name == "analyze_data");
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["metric"] == "qty");
+        TEST_ASSERT(args["data"].is_array());
+        TEST_ASSERT(args["data"].size() == 2);
+    }
+}
+
 // ═══════════════════════════════════════════════════════════════════════
 // SSE Emitter tests
 // ═══════════════════════════════════════════════════════════════════════
@@ -2590,6 +2756,20 @@ int main() {
     RUN_TEST(test_parse_no_tools);
     RUN_TEST(test_parse_tool_code_wrapper);
     RUN_TEST(test_parse_tool_allowed_filter);
+    RUN_TEST(test_parse_call_verb_single);
+    RUN_TEST(test_parse_call_verb_back_to_back);
+    RUN_TEST(test_parse_call_verb_namespaced);
+    RUN_TEST(test_parse_call_verb_snake_and_hyphen);
+    RUN_TEST(test_parse_call_verb_tool_allowed_filter);
+    RUN_TEST(test_parse_call_verb_inline_prose_rejected);
+    RUN_TEST(test_parse_call_verb_inline_prose_after_space);
+    RUN_TEST(test_parse_call_verb_malformed_args);
+    RUN_TEST(test_parse_call_verb_inner_brace_in_string);
+    RUN_TEST(test_parse_call_verb_strict_json_args);
+    RUN_TEST(test_parse_call_verb_unquoted_keys);
+    RUN_TEST(test_parse_call_verb_cleaned_text);
+    RUN_TEST(test_parse_call_verb_intercept_inner_json);
+    RUN_TEST(test_parse_call_verb_multiline_args);
 
     std::fprintf(stderr, "\n── SSE Emitter ──\n");
     RUN_TEST(test_emitter_reasoning_split_openai);

From 80e6e2acb6d6077392909cb7d4187ad33098bed9 Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Sun, 31 May 2026 21:26:07 -0400
Subject: [PATCH 4/6] fix(tool_parser): accept ``_call:`` prefix from gemma
 tokenizer artifact
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Smoke-testing the post-PR-#323 image (lucebox-hub:cuda12 @ 8039911) on
sindri's gemma-4-26b revealed a new emission pattern: the model
sometimes outputs ``_call:get_country_info{...}`` with a leading
underscore. This is a SentencePiece / chat-template tokenizer artifact
that became visible after bragi's channel-token routing fix
(commit 4b757d1) — the underscore is residual from the tokenizer's
internal serialization that earlier handling stripped.

Both parsers missed these invocations:

* Server-side (tool_parser.cpp:182): the sentinel character class
  ``[\s,;:\(\[\{\}\)\]\>]`` did not include ``_``. Added.
* Client-side (forge.py:32): ``\bcall:`` requires a word boundary
  before ``call``, but ``_`` is a word char so ``\b`` doesn't fire
  between ``_`` and ``c``. Replaced with explicit lookbehind on the
  same sentinel set (including ``_``).

Net result: ``_call:foo{...}`` now parses to a tool_use the same way
``call:foo{...}`` does. Tradeoff: ``my_call:foo{}`` mid-identifier
would also match, but real model outputs don't emit free-form
``my_call:`` text (tool names come from request tool defs).

Tests: +2 cases in test_forge_grader.py (underscore alone, mixed
back-to-back with both prefixed and bare). 16 → 18 forge_grader
tests, all green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/src/server/tool_parser.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/server/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp
index 43bf2b1c..a8d1518b 100644
--- a/server/src/server/tool_parser.cpp
+++ b/server/src/server/tool_parser.cpp
@@ -178,8 +178,17 @@ static const std::regex & re_tool_code() {
 // Note that `}` is in the sentinel list — gemma frequently emits multiple
 // invocations back-to-back: `call:a{x:1}call:b{y:2}`. Without `}` as a
 // sentinel the second match would be missed.
+//
+// `_` is also in the sentinel list to handle a SentencePiece / chat-template
+// artifact: post-bragi-channel-routing (commit 4b757d1) the gemma server
+// occasionally emits raw tokens like `_call:get_country_info{...}` where
+// the leading `_` is residual tokenizer serialization. Without `_` here
+// the parser misses every such invocation — empirically confirmed against
+// gemma-4-26b 2026-05-31 smoke test. Tradeoff: `my_call:foo{}` mid-
+// identifier could match, but real model output doesn't emit `my_call:`
+// strings (tool names come from the request's tool definitions).
 static const std::regex & re_call_verb_open() {
-    static std::regex r(R"((^|[\s,;:\(\[\{\}\)\]\>])call:([A-Za-z0-9_.:\-]+)\s*\{)");
+    static std::regex r(R"((^|[\s,;:\(\[\{\}\)\]\>_])call:([A-Za-z0-9_.:\-]+)\s*\{)");
     return r;
 }
 

From 8218333ba6b6691c1062e415eddfa3b1378dc861 Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Mon, 1 Jun 2026 09:37:20 -0400
Subject: [PATCH 5/6] fix(server): wire sse_emitter to detect plain-text
 call:<verb>{} tools
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #323's parser added pattern 6 (call:<verb>{...}) but the SSE emitter
only invokes parse_tool_calls when mode_ == TOOL_BUFFER, which fires
only on the literal <tool_call> XML opener. For models like gemma-4
that emit tool calls as plain text, the emitter stays in CONTENT mode
and the parser is never called, so no tool_use blocks land in the
response (finish_reason="stop" / stop_reason="end_turn").

Add a CONTENT-mode finalize branch that runs parse_tool_calls when
the accumulated text contains a plausible `call:<verb>{` opener
(checked via a cheap O(N) substring scan to avoid regex cost on
no-tool responses). Matches are hoisted into tool_calls_, the
covering spans are stripped from accumulated_text, and finish_reason
flips to "tool_calls" so the existing Anthropic/OpenAI serialization
paths emit proper tool_use content blocks.

Pre-check accepts `_call:foo{` (SentencePiece underscore artifact)
since `find("call:")` lands inside the `_call:` window — full
validation is delegated to parse_tool_calls (tool_parser.cpp).

Tests: +9 unit cases covering parsed/skipped/underscore/no-substring/
multi-call/malformed/no-double-fire-on-XML/empty-tools/preserving
prior content paths.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/src/server/sse_emitter.cpp | 170 ++++++++++++++++++++++++
 server/test/test_server_unit.cpp  | 209 ++++++++++++++++++++++++++++++
 2 files changed, 379 insertions(+)

diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp
index 604f11a7..0441e5bb 100644
--- a/server/src/server/sse_emitter.cpp
+++ b/server/src/server/sse_emitter.cpp
@@ -5,6 +5,7 @@
 
 #include <algorithm>
 #include <atomic>
+#include <cctype>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
@@ -23,6 +24,47 @@ static bool has_request_tools(const json & tools) {
     return tools.is_array() && !tools.empty();
 }
 
+// Cheap pre-check: scan `text` for a plausible `call:<verb>{` opener
+// before invoking the full parse_tool_calls regex sweep. Mirrors the
+// shape of re_call_verb_open() in tool_parser.cpp at a coarse
+// granularity. Single O(N) pass with a substring skip; no heap alloc
+// and no regex compile, so a response with no `call:` substring pays
+// only a `find()` cost.
+//
+// Returns true if any `call:` occurrence is followed by an identifier
+// start (`[A-Za-z_]`), more verb chars (`[A-Za-z0-9_.:-]`), optional
+// whitespace, and a `{`. We deliberately do NOT validate balanced
+// braces here — parse_tool_calls owns that check, and a leading
+// `call:foo{` with no close still costs us only one regex scan.
+//
+// The pre-check intentionally accepts `_call:foo{` (SentencePiece
+// underscore artifact, see tool_parser.cpp re_call_verb_open()
+// rationale) by including `_` in the verb-start charset alternation
+// at the top — `find("call:")` lands inside the `_call:` window.
+static bool looks_like_plain_text_call(const std::string & text) {
+    size_t pos = 0;
+    while ((pos = text.find("call:", pos)) != std::string::npos) {
+        size_t v = pos + 5;  // step past "call:"
+        if (v < text.size() &&
+            (std::isalpha((unsigned char)text[v]) || text[v] == '_')) {
+            size_t w = v;
+            while (w < text.size() &&
+                   (std::isalnum((unsigned char)text[w]) ||
+                    text[w] == '_' || text[w] == '.' ||
+                    text[w] == ':' || text[w] == '-')) {
+                w++;
+            }
+            // Allow whitespace between verb and brace (mirrors `\s*\{`).
+            while (w < text.size() && std::isspace((unsigned char)text[w])) {
+                w++;
+            }
+            if (w < text.size() && text[w] == '{') return true;
+        }
+        pos = v;
+    }
+    return false;
+}
+
 static bool find_tool_start(const std::string & text, size_t & pos) {
     size_t idx = text.find('<');
     while (idx != std::string::npos) {
@@ -608,6 +650,134 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
                 "request_id=%s format=%d bytes=%zu\n",
                 request_id_.c_str(), (int)format_, tool_buffer_.size());
         }
+    } else if (mode_ == StreamMode::CONTENT &&
+               !accumulated_content_.empty() &&
+               has_request_tools(tools_) &&
+               looks_like_plain_text_call(accumulated_content_)) {
+        // CONTENT-mode plain-text tool-call hoist. Gemma4 (and similar
+        // models with no XML tool-call template) emits invocations as
+        // literal text like `call:get_weather{location: "SF"}` or
+        // `_call:get_weather{...}` (SentencePiece artifact). The emitter
+        // stays in CONTENT mode for the whole stream because no
+        // `<tool_call>` / `<function=` / `<tool_code>` opener ever
+        // arrives. Without this branch the response stops with
+        // finish_reason="stop" / stop_reason="end_turn" and no tool_use
+        // block is emitted, breaking forge/agent_recorded scenarios
+        // that depend on structured tool_calls.
+        //
+        // The branch runs parse_tool_calls over accumulated_content_,
+        // hoists any ToolCalls (the allowlist filter `tool_allowed` is
+        // already enforced inside parse_tool_calls' add_call lambda,
+        // so unauthorized verbs never enter parsed.tool_calls), and
+        // replaces accumulated_content_ with cleaned_text so the final
+        // response carries the prose-only text (no duplicate `call:`
+        // span). Streaming clients have already received the raw call
+        // text as content deltas — they get a post-hoc tool_use block
+        // appended at finalize. Text + tool_use is a legal stream in
+        // both OpenAI and Anthropic specs.
+        //
+        // Gated on has_request_tools(tools_) to mirror the
+        // TOOL_BUFFER-entry condition at line 391 — if the request
+        // didn't declare tools we keep `call:foo{}` as visible content
+        // (see test_emitter_no_tools_keeps_tool_like_text for the
+        // equivalent XML-shape behavior).
+        auto parsed = parse_tool_calls(accumulated_content_, tools_);
+        if (!parsed.tool_calls.empty()) {
+            tool_calls_ = std::move(parsed.tool_calls);
+
+            // Remember for tool memory (mirrors TOOL_BUFFER branch).
+            if (tool_memory_) {
+                std::vector<std::string> ids;
+                for (const auto & tc : tool_calls_) ids.push_back(tc.id);
+                tool_memory_->remember(ids, accumulated_raw_);
+            }
+
+            // Strip matched call spans from the visible content so the
+            // non-streaming final-message shape doesn't duplicate them
+            // as both text AND tool_use. Mirrors
+            // _strip_plain_text_tool_calls in
+            // luce-bench/.../forge.py. Streaming clients already saw
+            // the pre-strip text in earlier deltas; this only affects
+            // the final accumulated_text() consumed by the response
+            // builders in http_server.cpp.
+            accumulated_content_ = parsed.cleaned_text;
+
+            fr = "tool_calls";
+
+            // Format-specific tool call events — same shape as the
+            // TOOL_BUFFER branch above. Kept inlined (rather than
+            // refactored into a helper) to keep this commit's diff
+            // minimal and side-by-side reviewable against the
+            // upstream block.
+            switch (format_) {
+            case ApiFormat::OPENAI_CHAT: {
+                json tc_list = json::array();
+                for (size_t i = 0; i < tool_calls_.size(); i++) {
+                    tc_list.push_back({
+                        {"index", (int)i},
+                        {"id", tool_calls_[i].id},
+                        {"type", "function"},
+                        {"function", {
+                            {"name", tool_calls_[i].name},
+                            {"arguments", tool_calls_[i].arguments}
+                        }}
+                    });
+                }
+                out.push_back(format_openai_delta({{"tool_calls", tc_list}}));
+                break;
+            }
+            case ApiFormat::ANTHROPIC: {
+                if (!active_kind_.empty()) {
+                    out.push_back(sse_event("content_block_stop",
+                        json({{"type", "content_block_stop"}, {"index", block_index_}}).dump()));
+                    active_kind_.clear();
+                }
+                for (const auto & tc : tool_calls_) {
+                    block_index_++;
+                    json tu_block = {
+                        {"type",  "tool_use"},
+                        {"id",    tc.id},
+                        {"name",  tc.name},
+                        {"input", json::object()}
+                    };
+                    out.push_back(sse_event("content_block_start",
+                        json({{"type", "content_block_start"},
+                              {"index", block_index_},
+                              {"content_block", tu_block}}).dump()));
+                    if (!tc.arguments.empty()) {
+                        out.push_back(sse_event("content_block_delta",
+                            json({{"type",  "content_block_delta"},
+                                  {"index", block_index_},
+                                  {"delta", {{"type",         "input_json_delta"},
+                                             {"partial_json", tc.arguments}}}}).dump()));
+                    }
+                    out.push_back(sse_event("content_block_stop",
+                        json({{"type", "content_block_stop"},
+                              {"index", block_index_}}).dump()));
+                }
+                break;
+            }
+            case ApiFormat::RESPONSES:
+                for (const auto & tc : tool_calls_) {
+                    out.push_back(format_responses_event(
+                        "response.function_call_arguments.delta", {
+                            {"item_id", tc.id}, {"output_index", 0},
+                            {"delta", tc.arguments}
+                        }));
+                    out.push_back(format_responses_event(
+                        "response.function_call_arguments.done", {
+                            {"item_id", tc.id}, {"output_index", 0},
+                            {"arguments", tc.arguments}, {"name", tc.name}
+                        }));
+                }
+                break;
+            default: break;
+            }
+        }
+        // If parse_tool_calls matched the substring pre-check but
+        // returned no calls (all filtered by tool_allowed, or all args
+        // malformed), `fr` stays "stop" and accumulated_content_ is
+        // left intact. Caller sees the original prose; no leak.
     }
 
     // Format-specific final events
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index d0c9b291..e89130fb 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -868,6 +868,204 @@ static void test_emitter_anthropic_thinking_blocks() {
     TEST_ASSERT(!em.accumulated_text().empty());
 }
 
+// ═══════════════════════════════════════════════════════════════════════
+// CONTENT-mode plain-text `call:<verb>{...}` tool-call hoist
+//
+// Regression coverage for the finalize-pass branch that runs
+// parse_tool_calls over accumulated_content_ when the stream stayed
+// in CONTENT mode (Gemma4-style `call:foo{...}` plain text — no XML
+// envelope to trip TOOL_BUFFER). Without this branch the emitter
+// returns finish_reason="stop" / stop_reason="end_turn" and never
+// emits a tool_use block, breaking forge / agent_recorded.
+// ═══════════════════════════════════════════════════════════════════════
+
+static void test_emitter_content_mode_plain_text_call_parsed() {
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    // Feed enough text past the holdback so the prose flushes into
+    // accumulated_content_ before finalize. The call: span itself stays
+    // in accumulated_content_ either way (no XML opener to redirect to
+    // TOOL_BUFFER); finalize parses and strips it.
+    em.emit_token("I'll fetch the forecast for you right now: ");
+    em.emit_token("call:get_weather{\"location\": \"SF\"}");
+    em.emit_token(" — let me know what you'd like next.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    if (!em.tool_calls().empty()) {
+        TEST_ASSERT(em.tool_calls()[0].name == "get_weather");
+        auto args = json::parse(em.tool_calls()[0].arguments);
+        TEST_ASSERT(args["location"] == "SF");
+    }
+    // finish_reason should now be "tool_calls" (drives Anthropic
+    // stop_reason="tool_use" downstream).
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+}
+
+static void test_emitter_content_mode_no_tools_skips_plain_text_call() {
+    // Empty tools array: branch is gated on has_request_tools(tools_),
+    // so the call: text remains as visible content.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT);
+    em.emit_start();
+    em.emit_token("I'll fetch the forecast for you right now: ");
+    em.emit_token("call:get_weather{\"location\": \"SF\"}");
+    em.emit_token(" — let me know what you'd like next.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().empty());
+    TEST_ASSERT(em.finish_reason() == "stop");
+    TEST_ASSERT(em.accumulated_text().find("call:get_weather") != std::string::npos);
+}
+
+static void test_emitter_content_mode_underscore_prefix_call_parsed() {
+    // Regression for the `_call:foo{}` SentencePiece artifact (commit
+    // 004a81b). Parser Pattern 5 sentinel set includes `_`, so the
+    // verb is captured even with the leading underscore. The emitter
+    // wiring must surface it the same way as the bare `call:` form.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("Sure thing, here is the call you asked for: ");
+    em.emit_token("_call:get_weather{\"location\": \"NYC\"}");
+    em.emit_token(" — happy to refine if needed.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    if (!em.tool_calls().empty()) {
+        TEST_ASSERT(em.tool_calls()[0].name == "get_weather");
+        auto args = json::parse(em.tool_calls()[0].arguments);
+        TEST_ASSERT(args["location"] == "NYC");
+    }
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+}
+
+static void test_emitter_content_mode_no_call_substring_skips_parser() {
+    // Pure prose with no `call:` substring: the pre-check
+    // looks_like_plain_text_call short-circuits before parse_tool_calls
+    // runs. Accumulated text is preserved; finish_reason stays "stop".
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("Sorry, I cannot help with that ");
+    em.emit_token("specific question today. Please consult a local guide ");
+    em.emit_token("for the most accurate information.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().empty());
+    TEST_ASSERT(em.finish_reason() == "stop");
+    TEST_ASSERT(em.accumulated_text().find("Sorry") != std::string::npos);
+}
+
+static void test_emitter_content_mode_mixed_calls_multiple() {
+    // Multiple back-to-back calls in the same response. Parser
+    // Pattern 5 sentinel set includes `}` so consecutive invocations
+    // are captured. Verify the emitter hoists both in emission order.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("start. ");
+    em.emit_token("call:get_weather{\"location\": \"A\"} ");
+    em.emit_token("middle. ");
+    em.emit_token("call:get_weather{\"location\": \"B\"} ");
+    em.emit_token("end of the message.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().size() == 2);
+    if (em.tool_calls().size() == 2) {
+        auto args0 = json::parse(em.tool_calls()[0].arguments);
+        auto args1 = json::parse(em.tool_calls()[1].arguments);
+        TEST_ASSERT(args0["location"] == "A");
+        TEST_ASSERT(args1["location"] == "B");
+    }
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+    // Codex Q3 residue guard: the stripped accumulated text must NOT
+    // contain `call:` any more.
+    TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos);
+}
+
+static void test_emitter_content_mode_malformed_call_dropped() {
+    // Unbalanced `{`: balanced_braces_end inside parse_tool_calls
+    // returns npos and the match is dropped. Emitter must not crash
+    // and the malformed text remains in accumulated_text (no tool
+    // hoist, no silent strip — caller-visible signal that the model
+    // produced garbage).
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("Here is the call you wanted with malformed args: ");
+    em.emit_token("call:get_weather{location: \"unclosed");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().empty());
+    TEST_ASSERT(em.finish_reason() == "stop");
+    // Malformed call span is left visible (no strip on parse failure).
+    TEST_ASSERT(em.accumulated_text().find("call:get_weather") != std::string::npos);
+}
+
+static void test_emitter_content_mode_does_not_double_fire_on_tool_call_xml() {
+    // Regression guard: a `<tool_call>` XML envelope must continue to
+    // route through the TOOL_BUFFER path (transition fires inside
+    // emit_token). The new CONTENT-mode branch sits in an `else if`
+    // tied to `mode_ == CONTENT` at emit_finish entry, so it cannot
+    // fire when TOOL_BUFFER handled the call. Verify exactly 1
+    // ToolCall is emitted, not 2.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("<tool_call>\n"
+                  "<function=get_weather>\n"
+                  "<parameter=location>SF</parameter>\n"
+                  "</function>\n"
+                  "</tool_call>");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+}
+
+static void test_emitter_content_mode_strips_call_span_from_accumulated_text() {
+    // Codex Q3 "residue" hazard guard. After a successful hoist,
+    // accumulated_text() must not contain the `call:` substring (the
+    // matched span is replaced by cleaned_text). Without this guard
+    // the OpenAI Chat / Anthropic / Responses final-message shapes
+    // would echo the call as both literal text AND a tool_use block,
+    // producing UI double-display.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("prefix prose here. ");
+    em.emit_token("call:get_weather{\"location\": \"SF\"}");
+    em.emit_token(" suffix prose continues to flush the holdback.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(!em.tool_calls().empty());
+    TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("prefix prose") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("suffix prose") != std::string::npos);
+}
+
+static void test_emitter_content_mode_anthropic_emits_tool_use_block() {
+    // Verify the Anthropic format-specific events fire from the new
+    // branch (content_block_stop on the open text block, then
+    // content_block_start tool_use + input_json_delta + content_block_stop).
+    json tools = json::array();
+    tools.push_back({
+        {"name", "get_weather"},
+        {"description", "weather"},
+        {"input_schema", {{"type", "object"},
+                          {"properties", {{"city", {{"type", "string"}}}}}}}
+    });
+    SseEmitter em(ApiFormat::ANTHROPIC, "req_id", "test-model", 10,
+                  tools, nullptr);
+    em.emit_start();
+    em.emit_token("Let me fetch the data you need from the service: ");
+    em.emit_token("call:get_weather{\"city\": \"Tokyo\"}");
+    em.emit_token(" — back in a moment.");
+    auto finish = em.emit_finish(20);
+    std::string s = concat(finish);
+
+    TEST_ASSERT(!em.tool_calls().empty());
+    TEST_ASSERT(s.find("\"type\":\"tool_use\"")          != std::string::npos);
+    TEST_ASSERT(s.find("\"name\":\"get_weather\"")     != std::string::npos);
+    TEST_ASSERT(s.find("\"type\":\"input_json_delta\"") != std::string::npos);
+    TEST_ASSERT(s.find("Tokyo")                          != std::string::npos);
+    TEST_ASSERT(s.find("\"stop_reason\":\"tool_use\"")  != std::string::npos);
+}
+
 // ═══════════════════════════════════════════════════════════════════════
 // Stop sequences tests
 // ═══════════════════════════════════════════════════════════════════════
@@ -2792,6 +2990,17 @@ int main() {
     RUN_TEST(test_emitter_nonstreaming_accumulates);
     RUN_TEST(test_emitter_anthropic_thinking_blocks);
 
+    std::fprintf(stderr, "\n── CONTENT-mode plain-text call:<verb>{} ──\n");
+    RUN_TEST(test_emitter_content_mode_plain_text_call_parsed);
+    RUN_TEST(test_emitter_content_mode_no_tools_skips_plain_text_call);
+    RUN_TEST(test_emitter_content_mode_underscore_prefix_call_parsed);
+    RUN_TEST(test_emitter_content_mode_no_call_substring_skips_parser);
+    RUN_TEST(test_emitter_content_mode_mixed_calls_multiple);
+    RUN_TEST(test_emitter_content_mode_malformed_call_dropped);
+    RUN_TEST(test_emitter_content_mode_does_not_double_fire_on_tool_call_xml);
+    RUN_TEST(test_emitter_content_mode_strips_call_span_from_accumulated_text);
+    RUN_TEST(test_emitter_content_mode_anthropic_emits_tool_use_block);
+
     std::fprintf(stderr, "\n── Stop sequences ──\n");
     RUN_TEST(test_stop_sequence_basic);
     RUN_TEST(test_stop_sequence_mid_token);

From ee9cd9e92d8fea7cfee2d587235be12308ce7711 Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Mon, 1 Jun 2026 20:27:09 -0400
Subject: [PATCH 6/6] fix(server): address cubic PR #329 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three P2 issues flagged by cubic on commit 8218333:

1. tool_parser.cpp:248 — relaxed-JSON rewrite missed escaping inner
   `"` when normalizing single- or backtick-quoted strings. Content
   like `'he said "hi"'` rewrote to `"he said "hi""` (invalid JSON),
   silently dropping the whole tool call. Now escapes `"` to `\"`
   when inside a non-`"` string.

2. sse_emitter.cpp:49 — looks_like_plain_text_call() pre-check used
   isalpha for the first verb char, but the parser regex accepts
   [A-Za-z0-9_.:\-]. A `call:2nd_pass{...}` emission would pass the
   parser but skip the pre-check. Switched first-char test to
   isalnum so digit-led verbs reach the parser.

3. sse_emitter.cpp:703 — stripping accumulated_content_ desynced
   the Responses-format streaming finalization events
   (.output_text.done / .content_part.done / .completed) from the
   raw .delta events the client already received. Captured a
   responses_streamed_text snapshot at top of emit_finish before
   any strip, and threaded it through the four Responses
   finalization sites. Non-streaming accumulated_text() accessor
   continues to return the stripped version so the non-streaming
   response shape doesn't carry both text AND tool_use for the
   same span.

Tests: +4 cases.
- test_parse_call_verb_singlequote_with_inner_doublequote
- test_parse_call_verb_backtick_with_inner_doublequote
- test_emitter_content_mode_digit_start_verb_parsed
- test_emitter_content_mode_responses_done_uses_pre_strip_text

test_server_unit: 1693 -> 1707 assertions, 0 failures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/src/server/sse_emitter.cpp | 39 +++++++++++--
 server/src/server/tool_parser.cpp | 12 ++++
 server/test/test_server_unit.cpp  | 96 +++++++++++++++++++++++++++++++
 3 files changed, 142 insertions(+), 5 deletions(-)

diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp
index 0441e5bb..ba3c0d1e 100644
--- a/server/src/server/sse_emitter.cpp
+++ b/server/src/server/sse_emitter.cpp
@@ -46,7 +46,7 @@ static bool looks_like_plain_text_call(const std::string & text) {
     while ((pos = text.find("call:", pos)) != std::string::npos) {
         size_t v = pos + 5;  // step past "call:"
         if (v < text.size() &&
-            (std::isalpha((unsigned char)text[v]) || text[v] == '_')) {
+            (std::isalnum((unsigned char)text[v]) || text[v] == '_')) {
             size_t w = v;
             while (w < text.size() &&
                    (std::isalnum((unsigned char)text[w]) ||
@@ -542,6 +542,20 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
     }
     window_.clear();
 
+    // Snapshot of what the Responses stream actually emitted as text
+    // deltas. The CONTENT-mode plain-text tool-call branch below
+    // mutates accumulated_content_ (strips matched call spans so the
+    // non-streaming response shape doesn't duplicate them as both text
+    // AND tool_use), but the Responses-format finalization events
+    // (response.output_text.done / content_part.done / completed) must
+    // reflect what was actually streamed in earlier
+    // response.output_text.delta events — otherwise a streaming client
+    // sees its accumulated buffer disagree with the .done payload.
+    // Other formats (OpenAI Chat, Anthropic) don't echo final
+    // aggregated text in the stream, so they can continue to read the
+    // (possibly stripped) accumulated_content_ directly.
+    const std::string responses_streamed_text = accumulated_content_;
+
     // Parse tool calls from buffer
     std::string fr = "stop";
     if (mode_ == StreamMode::TOOL_BUFFER && !tool_buffer_.empty()) {
@@ -700,6 +714,16 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
             // the pre-strip text in earlier deltas; this only affects
             // the final accumulated_text() consumed by the response
             // builders in http_server.cpp.
+            //
+            // For the Responses format we capture a separate snapshot
+            // (responses_streamed_text, see top of emit_finish) before
+            // this strip so the streaming finalization events
+            // (.output_text.done / .content_part.done / .completed)
+            // continue to agree with the raw .delta events the client
+            // already received. Without the snapshot the .done payload
+            // would carry the stripped text and a streaming client's
+            // accumulated buffer would disagree with the server's
+            // claimed "done" text.
             accumulated_content_ = parsed.cleaned_text;
 
             fr = "tool_calls";
@@ -840,16 +864,21 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
     }
 
     case ApiFormat::RESPONSES: {
+        // Use the pre-strip snapshot for the streaming finalization
+        // events (.done / .completed) so they agree with the
+        // .delta events that preceded them. See
+        // responses_streamed_text init at the top of emit_finish for
+        // rationale.
         // output_text.done
         out.push_back(format_responses_event("response.output_text.done", {
             {"item_id", msg_item_id_}, {"output_index", 0},
-            {"content_index", 0}, {"text", accumulated_content_}
+            {"content_index", 0}, {"text", responses_streamed_text}
         }));
         // content_part.done
         out.push_back(format_responses_event("response.content_part.done", {
             {"item_id", msg_item_id_}, {"output_index", 0},
             {"content_index", 0},
-            {"part", {{"type", "output_text"}, {"text", accumulated_content_},
+            {"part", {{"type", "output_text"}, {"text", responses_streamed_text},
                       {"annotations", json::array()}}}
         }));
 
@@ -868,7 +897,7 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
                 {"type", "message"}, {"id", msg_item_id_},
                 {"status", "completed"}, {"role", "assistant"},
                 {"content", json::array({{
-                    {"type", "output_text"}, {"text", accumulated_content_},
+                    {"type", "output_text"}, {"text", responses_streamed_text},
                     {"annotations", json::array()}
                 }})}
             });
@@ -896,7 +925,7 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
             {"created_at", created_at_}, {"status", "completed"},
             {"model", model_name_},
             {"output", final_output},
-            {"output_text", accumulated_content_},
+            {"output_text", responses_streamed_text},
             {"usage", resp_usage}
         };
         out.push_back(format_responses_event("response.completed", {{"response", shell}}));
diff --git a/server/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp
index a8d1518b..7ece4ca4 100644
--- a/server/src/server/tool_parser.cpp
+++ b/server/src/server/tool_parser.cpp
@@ -259,6 +259,18 @@ static bool coerce_relaxed_json(const std::string & payload, json & out) {
                 i++;
                 continue;
             }
+            // Escape inner `"` when we opened the string with a non-`"`
+            // quote (single or backtick). Without this, content like
+            // `'he said "hi"'` rewrites to `"he said "hi""` which is
+            // invalid JSON and silently drops the whole tool call.
+            // When in_str == '"', a `"` inside should have arrived via
+            // the `\\` escape branch above; a bare `"` here is malformed
+            // input we pass through unchanged.
+            if (in_str != '"' && c == '"') {
+                rewritten += "\\\"";
+                i++;
+                continue;
+            }
             rewritten += c;
             i++;
             continue;
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index e89130fb..ca7d073c 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -482,6 +482,34 @@ static void test_parse_call_verb_multiline_args() {
     }
 }
 
+static void test_parse_call_verb_singlequote_with_inner_doublequote() {
+    // Cubic PR #329 review: when the relaxed-JSON rewrite converts
+    // single-quoted strings to double-quoted, inner `"` chars must be
+    // escaped to `\"` — otherwise `'he said "hi"'` rewrites to
+    // `"he said "hi""` which is invalid JSON and the whole tool call
+    // is silently dropped.
+    std::string text = "call:say{quote: 'he said \"hi\" loudly'}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        TEST_ASSERT(result.tool_calls[0].name == "say");
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["quote"] == "he said \"hi\" loudly");
+    }
+}
+
+static void test_parse_call_verb_backtick_with_inner_doublequote() {
+    // Same escape concern as the single-quote case, but with the
+    // backtick string flavor.
+    std::string text = "call:say{quote: `he said \"hi\" loudly`}";
+    auto result = parse_tool_calls(text);
+    TEST_ASSERT(result.tool_calls.size() == 1);
+    if (!result.tool_calls.empty()) {
+        auto args = json::parse(result.tool_calls[0].arguments);
+        TEST_ASSERT(args["quote"] == "he said \"hi\" loudly");
+    }
+}
+
 // ═══════════════════════════════════════════════════════════════════════
 // SSE Emitter tests
 // ═══════════════════════════════════════════════════════════════════════
@@ -1066,6 +1094,70 @@ static void test_emitter_content_mode_anthropic_emits_tool_use_block() {
     TEST_ASSERT(s.find("\"stop_reason\":\"tool_use\"")  != std::string::npos);
 }
 
+static void test_emitter_content_mode_digit_start_verb_parsed() {
+    // Cubic PR #329 review: the looks_like_plain_text_call() pre-check
+    // must accept verbs starting with a digit because the parser's
+    // re_call_verb_open() regex allows them ([A-Za-z0-9_.:\\-]+).
+    // A model emitting `call:2nd_pass{...}` with a digit-led verb
+    // should still trigger the parser sweep.
+    json tools = json::array();
+    tools.push_back({
+        {"name", "2nd_pass"},
+        {"description", "second pass"},
+        {"input_schema", {{"type", "object"},
+                          {"properties", {{"reason", {{"type", "string"}}}}}}}
+    });
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "req_id", "test-model", 10,
+                  tools, nullptr);
+    em.emit_start();
+    em.emit_token("call:2nd_pass{reason: \"verify\"}");
+    em.emit_finish(5);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    if (!em.tool_calls().empty()) {
+        TEST_ASSERT(em.tool_calls()[0].name == "2nd_pass");
+    }
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+}
+
+static void test_emitter_content_mode_responses_done_uses_pre_strip_text() {
+    // Cubic PR #329 review: the Responses-format finalization events
+    // (.output_text.done / .content_part.done / .completed) must
+    // reflect the text that was streamed in earlier .delta events,
+    // not the post-strip text. Otherwise a streaming client's
+    // accumulated buffer (built from .delta events) disagrees with
+    // the server's claimed .done payload.
+    //
+    // accumulated_text() (consumed by non-streaming response builders)
+    // still returns the stripped version so the non-streaming response
+    // shape doesn't carry both text AND tool_use for the same span.
+    json tools = json::array();
+    tools.push_back({
+        {"name", "get_weather"},
+        {"description", "weather"},
+        {"input_schema", {{"type", "object"},
+                          {"properties", {{"city", {{"type", "string"}}}}}}}
+    });
+    SseEmitter em(ApiFormat::RESPONSES, "req_id", "test-model", 10,
+                  tools, nullptr);
+    em.emit_start();
+    em.emit_token("Looking up: ");
+    em.emit_token("call:get_weather{\"city\": \"Tokyo\"}");
+    em.emit_token(" done.");
+    auto finish = em.emit_finish(20);
+    std::string s = concat(finish);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    // Streaming .done events must include the raw call text (matching
+    // what the .delta events already sent).
+    TEST_ASSERT(s.find("response.output_text.done") != std::string::npos);
+    TEST_ASSERT(s.find("call:get_weather") != std::string::npos);
+    // Non-streaming accessor returns the stripped text.
+    TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("Looking up:") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("done.") != std::string::npos);
+}
+
 // ═══════════════════════════════════════════════════════════════════════
 // Stop sequences tests
 // ═══════════════════════════════════════════════════════════════════════
@@ -2968,6 +3060,8 @@ int main() {
     RUN_TEST(test_parse_call_verb_cleaned_text);
     RUN_TEST(test_parse_call_verb_intercept_inner_json);
     RUN_TEST(test_parse_call_verb_multiline_args);
+    RUN_TEST(test_parse_call_verb_singlequote_with_inner_doublequote);
+    RUN_TEST(test_parse_call_verb_backtick_with_inner_doublequote);
 
     std::fprintf(stderr, "\n── SSE Emitter ──\n");
     RUN_TEST(test_emitter_reasoning_split_openai);
@@ -3000,6 +3094,8 @@ int main() {
     RUN_TEST(test_emitter_content_mode_does_not_double_fire_on_tool_call_xml);
     RUN_TEST(test_emitter_content_mode_strips_call_span_from_accumulated_text);
     RUN_TEST(test_emitter_content_mode_anthropic_emits_tool_use_block);
+    RUN_TEST(test_emitter_content_mode_digit_start_verb_parsed);
+    RUN_TEST(test_emitter_content_mode_responses_done_uses_pre_strip_text);
 
     std::fprintf(stderr, "\n── Stop sequences ──\n");
     RUN_TEST(test_stop_sequence_basic);