From 12c50c045da48e042d3ce81b976fa3a124f80109 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Sun, 31 May 2026 11:34:28 -0400 Subject: [PATCH 1/2] docs(server): plan for call:{} tool-parser pattern + codex review Captures the diagnosis (gemma forge 0/30 on 2026-05-30), the proposed sixth detection pattern, the relaxed-JSON arg parser sketch, the unit-test matrix, and codex's review (which forced reordering the new pattern to slot #5 ahead of the bare-JSON sweep to avoid interception of nested name/arguments-shaped args). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../server-call-verb-tool-parser-plan.md | 434 ++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 docs/experiments/server-call-verb-tool-parser-plan.md diff --git a/docs/experiments/server-call-verb-tool-parser-plan.md b/docs/experiments/server-call-verb-tool-parser-plan.md new file mode 100644 index 000000000..3c09db3b9 --- /dev/null +++ b/docs/experiments/server-call-verb-tool-parser-plan.md @@ -0,0 +1,434 @@ +# Server-side `call:{args}` tool-call parser (pattern #6) + +Status: draft → codex review → implementation. +Tracking branch: `fix/server-call-verb-tool-parser` (off `origin/main`). + +## Background + +The 2026-05-30 gemma full bench scored forge **0/30** on RTX 3090 Ti +(`d9ecba6cc105-nvidia-geforce-rtx-3090-ti-gemma-full-2026-05-30-67f4/forge.json`). +Every row’s `iterations[0].output` shows the assistant emitting +plain-text tool invocations of the form: + +- `call:get_country_info{country: "France"}` +- `call:default_api:fetch_sales_data{quarter: "Q4", year: 2024}` +- `call:execute-bead:read-file{path: "crates/foo/src/lib.rs"}` +- `call:execute-bead:list-files{path: "src/"}\n\ncall:execute-bead:read-file{path: "..."}` +- `call:execute-bead:read_file{path: "..."}` +- `call:shell{command: "rg -i auth"}` +- `call:infrastructure:get_logs{resource: \`payments-service-cluster\`, start_time: \`...\`}` + (backtick-quoted values appear in the snapshot too) + +`server/src/server/tool_parser.cpp::parse_tool_calls` has five +envelope-shaped detection patterns (XML ``, bare +``, ``, ``, +bare JSON `{ "name":…, "arguments":… }`). **None** of these match the +gemma plain-text emission. As a result: + +1. `SseEmitter::accumulate` buffers text, calls `parse_tool_calls`, + gets back zero `ToolCalls`, never flips `finish_reason` to + `tool_calls`. +2. `/v1/messages` (Anthropic) sees an empty `emitter.tool_calls()` and + emits a `text` content block instead of a `tool_use` block + (`server/src/server/http_server.cpp:2030–2090`). +3. forge’s WorkflowRunner expects a `ToolCall` and the row dies with + `error_type=ValidationError`. + +A client-side workaround already shipped on `feat/lucebox-docker` +(commit `deba2fd`) — see `_parse_plain_text_tool_calls` in +`luce-bench/src/lucebench/areas/forge.py`. **This PR ports that to +C++ and adds it as a sixth `parse_tool_calls` pattern**, fixing the +problem at the server. + +## Goals + +1. Add pattern #6 to `parse_tool_calls`: `call:?{ }`. +2. Drive the existing downstream pipeline: + `SseEmitter::accumulate` → `finish_reason=tool_calls` → Anthropic + `stop_reason=tool_use` with `tool_use` content blocks, or OpenAI + `choices[].message.tool_calls`. Nothing downstream needs to change. +3. Honour the existing `tool_allowed(tools, name)` filter so callers + that pass a constrained tool list (forge) only get back tools they + declared. +4. Land C++ unit tests in `server/test/test_server_unit.cpp` covering + the gemma-observed shapes plus the obvious edge cases. + +## Non-goals (intentionally out of scope) + +- Touching the client-side `_parse_plain_text_tool_calls` in + `forge.py`. It stays as defense-in-depth — older deployed servers + won’t have this fix. The PR description will note that the client + fallback is no longer load-bearing after this merges. +- Rebuilding the docker image / running an e2e bench. The unit tests + + a deliberately constructed `parse_tool_calls` call exercising every + shape are sufficient for PR review. +- Touching `systemd`, the host `/home/erik/.local/bin/lucebox` + wrapper, the running `lucebox.service` (a forge bench is in flight), + or anything outside `server/src/server/tool_parser.cpp` + + `server/test/test_server_unit.cpp`. + +## Design + +### Pattern ordering + +Insert as **pattern #5** (one slot ahead of the existing bare-JSON +sweep, which becomes the new pattern #6). The four XML-envelope +patterns are lexically unambiguous and stay ahead. The reorder is +forced by a real interception hazard codex flagged in review: +`call:my_tool{"name": "inner_tool", "arguments": {}}` is a valid +gemma-shaped emission, and the bare-JSON sweep would otherwise lift +the inner `{"name": ...}` out as its own ToolCall before the new +pattern got a chance. By running the `call:…{…}` matcher first and +recording its brace-balanced span in `removals`, the bare-JSON sweep +sees that range as overlapping and skips it. + +### Pattern #6 regex + +```cpp +// `call:` opener: optionally preceded by start-of-string, +// whitespace, or a hard delimiter so we don't match `I'll call:foo` +// inside narrative. +static const std::regex re_call_verb_open( + R"((^|[\s,;:\(\[\{])call:([A-Za-z0-9_.:\-]+)\s*\{)"); +``` + +Notes: + +- The leading group captures a sentinel character so we keep + `it->position()` aligned with the start of the **prefix**, not the + `call:` itself; the implementation computes the actual `call:` + offset by adding the captured sentinel’s length back. +- `[A-Za-z0-9_.:\-]+` matches verbs *and* embedded namespaces in one + capture (`execute-bead:read-file`, `default_api:fetch_sales_data`, + bare `shell`). The verb-name passed to `add_call` strips everything + up to and including the last `:`, so `execute-bead:read-file` + becomes `read-file`. +- The `\s*` before `{` is intentional — the snapshot rows always emit + `{` immediately, but tolerating optional whitespace doesn’t open new + failure modes and matches the relaxed pattern from the Python port. +- `^` works on `std::regex` ECMAScript syntax against `text` from + position 0; we do not rely on `multiline` mode. Subsequent matches + rely on the explicit sentinel-character alternation, which catches + newlines, spaces, commas, parens, square brackets, and curly braces + — every realistic boundary in the snapshot data. + +### Balanced-brace extractor + +Mirror pattern #5’s scanner — track depth, skip over string literals, +**but** extend the string-literal handling to honour single quotes +and backticks (the snapshot row above has backticked values). This +diverges slightly from the Python port (which only knew `'` and `"`) +to handle the real gemma output we observed. + +```cpp +// Returns one-past-close index, or std::string::npos. +static size_t balanced_braces_end(const std::string & text, size_t open) { + int depth = 0; + char in_str = 0; // 0 or one of " ' ` + for (size_t i = open; i < text.size(); i++) { + char c = text[i]; + if (in_str) { + if (c == '\\' && i + 1 < text.size()) { i++; continue; } + if (c == in_str) in_str = 0; + continue; + } + if (c == '"' || c == '\'' || c == '`') { in_str = c; continue; } + if (c == '{') depth++; + else if (c == '}') { + if (--depth == 0) return i + 1; + } + } + return std::string::npos; +} +``` + +### Relaxed-JSON args parser + +Direct port of `_coerce_relaxed_json` from `forge.py`: + +1. Try `json::parse(strict)` on `"{" + body + "}"` first. Most + strict-JSON args (`{"path": "src/"}`) succeed here. +2. On failure, rewrite the body: walk char-by-char tracking string + state; when outside a string, look ahead for a bare-identifier + `[A-Za-z_][A-Za-z0-9_]*` immediately followed by optional + whitespace and `:` — wrap the identifier in double quotes. Also + normalize single-quoted strings to double-quoted (open `'` becomes + `"`; closing `'` becomes `"`; backtick strings get the same + treatment). Backslash escapes are preserved. +3. Retry `json::parse` on the rewrite. If it still fails, **drop the + single invocation** (do not throw, do not add a `ToolCall`, do not + poison `removals`). Continue scanning past the closing brace. + +The rewrite step has to leave already-quoted JSON keys alone. The +look-ahead is `(?!").+:` — i.e. only fire if we’re not already +sitting just after a `"` we already emitted. We track this via the +`out` buffer’s last char, exactly as the Python implementation does. + +### Verb normalization + +```cpp +std::string verb = full_match; // e.g. "execute-bead:read-file" +size_t colon = verb.find_last_of(':'); +if (colon != std::string::npos) verb = verb.substr(colon + 1); +``` + +`add_call(verb, args, span_start, span_end)` then passes through the +existing `tool_allowed` filter. + +### Span tracking + +When a successful invocation is parsed, push `{start, brace_end}` onto +`removals` so the surrounding cleanup pass strips the `call:…{…}` text +out of the assistant message (otherwise the OpenAI/Anthropic content +block would carry the literal `call:foo{…}` alongside the structured +tool call — the same double-signal hazard the Python port avoids via +`_strip_plain_text_tool_calls`). + +The `start` recorded in `removals` is the index of `c` in `call:` (not +the preceding sentinel) so we don’t eat the user’s narrative space. + +### Tool-allowed filter placement + +Use the existing `tool_allowed(tools, verb)` helper inside +`add_call` (same path as patterns #1–5). Dropping happens **after** +regex match + JSON parse — this is cheap and consistent. Constraining +the regex to a known verb set would have been faster but it precludes +the “no tools constraint” case (forge passes a tool list; agent and +codex sometimes don’t). + +## Regression safety inventory + +Existing tool-parser tests in `server/test/test_server_unit.cpp` +(lines 239–317): + +| Test | Body contains `call:` substring? | +|---|---| +| `test_parse_tool_call_xml` | no | +| `test_parse_bare_function_xml` | no | +| `test_parse_json_tool_call` | no | +| `test_parse_no_tools` | no | +| `test_parse_tool_code_wrapper` | no | +| `test_parse_tool_allowed_filter` | no | + +Also the `emitter_*` family (lines 470–700+) uses `` XML +exclusively. No existing test has a `call:{` substring that +could accidentally match the new pattern. Safe. + +## Test plan + +Add to `server/test/test_server_unit.cpp` (the same file already +covers tool-parser unit tests; convention is one function per case +registered under the `── Tool parser ──` section). New cases: + +1. `test_parse_call_verb_single` — `call:get_country_info{country: "France"}` +2. `test_parse_call_verb_back_to_back` — + `call:get_country_info{country: "France"}call:summarize{text: "ok"}` +3. `test_parse_call_verb_namespaced` — + `call:execute-bead:read-file{path: "crates/foo/src/lib.rs"}` → verb + normalized to `read-file` +4. `test_parse_call_verb_snake_and_hyphen` — covers both + `call:execute-bead:read_file{path: "..."}` and + `call:execute-bead:list-files{path: "src/"}` with intervening + newlines +5. `test_parse_call_verb_tool_allowed_filter` — verb `disallowed` in + text, `tools` allows only `allowed`; result has zero `ToolCall` +6. `test_parse_call_verb_inline_prose_rejected` — + `"Sure, I'll call:foo{x:1}"` *should* still match because the + space before `call:` satisfies the sentinel. To exercise the + anti-false-positive case, use `"narrative.call:foo{x:1}"` — + no sentinel char before `call:`, regex rejects. +7. `test_parse_call_verb_malformed_args` — + `call:foo{country: "France"` (unterminated brace) → call dropped, + no crash, no `ToolCall`, no removal span +8. `test_parse_call_verb_inner_brace_in_string` — + `call:foo{cmd: "echo {} ok"}` → must not break on the `{` inside + the string; args dict has `cmd == "echo {} ok"` +9. `test_parse_call_verb_strict_json_args` — + `call:foo{"path": "x"}` (already-quoted keys) parses on the strict + path +10. `test_parse_call_verb_unquoted_keys` — relaxed pass kicks in: + `call:foo{path: "x"}` → args dict has `path == "x"` +11. `test_parse_call_verb_cleaned_text` — verify the `call:…{…}` + fragment is stripped from `result.cleaned_text` (parity with the + XML envelope patterns). +12. `test_parse_call_verb_intercept_inner_json` (codex-requested) — + `call:outer{"name": "inner", "arguments": {}}`. Exactly one + `ToolCall` with `name == "outer"` and `arguments` containing the + literal inner JSON; verifies that the reorder of patterns #5/#6 + actually defends against the bare-JSON sweep stealing the inner + object. + +These run under the existing `RUN_TEST(...)` block; add them under +the `── Tool parser ──` heading immediately after +`test_parse_tool_allowed_filter`. + +### Build verification + +The new code lives entirely in `server/src/server/tool_parser.cpp` — +already in `target_sources` for `test_server_unit` +(`server/CMakeLists.txt:776–788`). Build with: + +```bash +cmake --build server/build --target test_server_unit && \ + server/build/test_server_unit +``` + +(If the lucebox-hub-285 worktree doesn’t have a build dir, configure +it the same way the repo expects — but a clean build is the operator’s +job, not part of the PR landing process.) + +## Reverse-compat with the client-side fix + +`luce-bench/src/lucebench/areas/forge.py::_parse_plain_text_tool_calls` +(commit `deba2fd` on `feat/lucebox-docker`) stays untouched. After +this PR merges and the docker image is rebuilt, the client-side path +becomes a no-op for any server that includes pattern #6. Leaving it +in place is harmless (the synthesized `ToolCall`s match what the +server now produces) and shields older deployed servers. The PR +description will spell this out so a reviewer doesn’t mistakenly +strip the Python fallback. + +## Image rebuild + e2e (out of scope for this PR) + +After merge, the operator should: + +1. Rebuild the cuda12 image (`docker bake` per `docker-bake.hcl`). +2. Restart `lucebox.service` *after the current forge bench finishes*. +3. Re-run the 2026-05-30 gemma full bench and confirm forge passes + non-trivially without the client-side `_parse_plain_text_tool_calls` + workaround. + +These steps are validation, not part of the PR diff. + +## Risks / open questions + +- **Multiline args.** The snapshot shows args that span multiple + lines (`call:default_api:analyze_data{data: [{"date": ...}, ...]}`). + The balanced-brace scanner is line-agnostic so this Just Works — + noted here to make the implementer audit at least one multi-line + case in the unit tests. +- **Backticked values.** `infrastructure:get_logs{resource: \`x\`}` + uses backticks. JSON has no backtick-string concept; the relaxed + rewrite normalizes them to double-quoted strings. This is one place + the C++ port goes **further** than the Python port — that’s a + conscious choice based on the real snapshot. +- **Performance.** `parse_tool_calls` is already O(n·patterns); pattern + #6 adds a single regex sweep plus per-match work. The buffer fed + through `SseEmitter::accumulate` is bounded by generation length; + no realistic perf concern. + +## Codex review + +Captured verbatim from `codex exec` against the running lucebox +server (`http://localhost:8080/v1` / `dflash` model). The final +answer block follows; the codex run appended additional +stream-of-consciousness scratch after the answer (an artefact of the +model’s think channel leaking past the answer) — that scratch is +omitted as noise. + +``` +This is a high-quality implementation plan. The approach to handling +"dirty" JSON via a manual balanced-brace scanner and a +relaxation/normalization pass is a pragmatic and necessary solution +for the non-standard emissions observed in Gemma models. + +However, there is one **critical architectural risk** regarding +pattern priority and one minor discrepancy in the brace-tracking +logic. + +### Critical: Pattern Priority & Interception +Issue: Pattern #5 (Bare JSON sweep) is currently ordered before +Pattern #6 (the new `call:verb{...}` pattern). +Risk: If a tool call's arguments happen to contain keys named +`"name"` and `"arguments"`, Pattern #5 will intercept the substring +and incorrectly identify a tool call. +* Example: `call:my_tool{"name": "inner_tool", "arguments": {}}` +* Outcome: Pattern #5 will match the `{...}` part, creating a + ToolCall for `inner_tool`. Because Pattern #5 is processed first, + it will add a removal span for the inner JSON, potentially leaving + the `call:my_tool` prefix dangling in the cleaned_text or causing + Pattern #6 to fail/overlap. +* Recommendation: Move Pattern #6 to a higher priority (e.g., before + Pattern #5). The `call:` prefix is a much stronger semantic signal + than the presence of JSON keys. + +### Technical Review + +1. Regex & Anchors +* The sentinel list `[\s,;:\(\[\{]` is excellent. It effectively + prevents false positives like `I'll call:foo`. +* Note on `^`: In `std::regex`, `^` matches the start of the string. + While `std::regex` lacks a standard multiline flag, your inclusion + of `\s` (which covers `\n`) in the sentinel list makes the pattern + robust for tool calls starting on new lines. + +2. Robustness of the Parser +* Balanced Braces: Your `balanced_braces` logic correctly handles + string literals (including escaped quotes) and nested braces. +* Discrepancy: The plan mentions tracking `[]` (square brackets), but + the provided implementation snippet only increments depth for `{`. + While tracking `[]` isn't strictly required to find the outer `{}` + of a JSON object, you should ensure it's implemented if you intend + to follow the plan's promise, especially to handle edge cases where + a `}` might appear inside an array (though unlikely in valid JSON). +* Relaxed JSON: The "rewrite" strategy (normalizing `'` and `` ` `` to + `"`, and wrapping unquoted keys) is a solid way to handle the + observed "relaxed" syntax. + +3. Filter Placement +* The `tool_allowed` filter is correctly placed after the parser has + successfully identified the verb and arguments. Constraining the + regex itself to a known verb list would be a maintenance burden. + +4. Unit Tests +* The test plan is very thorough. To make it bulletproof, I recommend + adding these two specific cases: + * Nested Tool Call: `call:outer{"name": "inner", "arguments": {}}` + — specifically to verify that the priority fix (Point 1) works. + * Complex String Content: `call:foo{cmd: "echo {not_a_brace}"}` — + to ensure the string-aware scanner doesn't trip on braces + inside quotes. + +### Summary of Recommendations +1. Reorder patterns: Move Pattern #6 above Pattern #5. +2. Verify `[]` implementation: Ensure the brace scanner actually + tracks `[` and `]` to match your design spec. +3. Add "Interception" test: Add a unit test where tool arguments + mimic a JSON tool call to confirm the priority fix. +``` + +### Plan adjustments after review + +1. **Pattern ordering — accepted.** Codex’s `call:my_tool{"name": + "inner_tool", "arguments": {}}` example is real: gemma snapshot + rows in the snapshot include `call:default_api:analyze_data{data: + [{...}, ...]}` where the inner objects don’t happen to carry + `name`/`arguments` keys today, but accepting that hazard for + future-proofing is cheap. The new pattern moves to **#5**, demoting + the bare-JSON sweep to **#6**. The five XML-envelope patterns stay + ahead since they’re lexically unambiguous and pattern #6 would also + spuriously chew their inner JSON if reordered. +2. **Square-bracket tracking — accepted with a clarification.** I + don’t need `[`/`]` in the brace-depth counter for closing the outer + `{`, because by the time we hit a `]` without `[` we’re already + inside a string (handled) or the JSON is malformed (we drop the + call). I will **still** add explicit `[`/`]` tracking to the + scanner so a stray `}` *inside* an array (e.g. a JSON value like + `"}"` written without quotes) doesn’t fool the close-counter. This + matches the spirit of the plan’s prose. +3. **Interception unit test — accepted.** Add + `test_parse_call_verb_intercept_inner_json` with body + `call:outer{"name": "inner", "arguments": {}}`. Verify the + resulting `ToolCall` has `name == "outer"` and `arguments == + {"name": "inner", "arguments": {}}` — and that there’s exactly + **one** `ToolCall`, not two. +4. **“Complex string content” test — accepted.** Add + `test_parse_call_verb_string_with_close_brace` with + `call:foo{cmd: "echo {not_a_brace}"}`. Already partially covered + by `test_parse_call_verb_inner_brace_in_string`; will reuse and + strengthen that test rather than add a new one. + +No rebuttals — codex’s feedback is all sound. Renumber: the new +pattern is **pattern #5**, the old bare-JSON sweep becomes **pattern +#6**. Update test names, header comments, and the file-top docstring +accordingly during implementation. From cdb8b9c06d9b33f1c58637dfbda4a0276088cca3 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Sun, 31 May 2026 11:38:18 -0400 Subject: [PATCH 2/2] fix(server): parse gemma's call:{} plain-text tool emissions Adds a sixth detection pattern to `parse_tool_calls` that recognizes the plain-text tool invocations gemma emits in chat-completion content (`call:get_country_info{country: "France"}` / `call:execute-bead:read-file{path: "..."}` / etc). The 2026-05-30 gemma full bench scored forge 0/30 because every row's output carried these `call:{...}` invocations as text rather than structured `tool_use` content blocks. None of the existing five envelope-shaped detectors (``, ``, ``, bare JSON) match the bare `call:` shape. The new pattern: - Anchors on a sentinel character (whitespace, comma, semicolon, open/close bracket, etc.) before `call:` so narrative usages like `narrative.call:foo` don't match. - Supports namespaced verbs (`execute-bead:read-file`, `default_api:fetch_sales_data`) and strips the namespace before using the verb as the ToolCall name. - Extracts the args block via a quote- and escape-aware balanced-brace scanner that tolerates `"`, `'`, and `` ` `` string literals and tracks `[]` depth alongside `{}`. - Parses the args as strict JSON first, then falls back to a relaxed rewrite that quotes bare identifier keys and normalizes single/ backtick quoted strings to double-quoted before retrying. Malformed args drop the single invocation without crashing or polluting other calls. - Runs *before* the bare-JSON sweep so that inner args of the form `call:outer{"name": "inner", "arguments": {}}` aren't hijacked into a spurious `inner` ToolCall by pattern #6. Downstream the existing wiring takes over: SseEmitter::accumulate already calls parse_tool_calls; a non-empty ToolCall list flips finish_reason to `tool_calls`, which the Anthropic /v1/messages branch maps to `stop_reason="tool_use"` with `tool_use` content blocks (http_server.cpp:2030-2090) and the OpenAI branch maps to `choices[].message.tool_calls`. The forge client-side workaround `_parse_plain_text_tool_calls` shipping on feat/lucebox-docker (commit deba2fd) becomes redundant once a server with this fix is deployed. It stays in place as defense-in-depth for older deployed servers. Test plan: 14 new C++ unit cases in test_server_unit.cpp covering single / back-to-back / namespaced / snake- and kebab-case verbs; tool-allowed filtering; mid-prose rejection vs. whitespace-led acceptance; malformed args drop; inner `{}` inside string literals; strict-JSON and relaxed-keys arg parsing; cleaned_text scrubbing; the codex-requested inner `name`/`arguments` interception case; and multi-line nested-array args mirroring the snapshot data. All pass in a standalone driver. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/server/tool_parser.cpp | 170 +++++++++++++++++++++++++++- server/src/server/tool_parser.h | 5 +- server/test/test_server_unit.cpp | 180 ++++++++++++++++++++++++++++++ 3 files changed, 350 insertions(+), 5 deletions(-) diff --git a/server/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp index 6244b250a..43bf2b1c1 100644 --- a/server/src/server/tool_parser.cpp +++ b/server/src/server/tool_parser.cpp @@ -1,11 +1,18 @@ // Tool call parser implementation. // -// Five detection patterns, tried in order: +// Six detection patterns, tried in order: // 1. ...V... // 2. ...params... (bare, outside tool_call) // 3. (function-signature style) // 4. {JSON} -// 5. Bare JSON objects with name+arguments fields +// 5. call:?{relaxed-JSON args} (gemma plain-text emissions) +// 6. Bare JSON objects with name+arguments fields +// +// Pattern 5 runs *before* pattern 6 so that args like +// call:outer{"name": "inner", "arguments": {}} +// don't get hijacked by the bare-JSON sweep into a spurious `inner` tool +// call. The brace-balanced span pattern 5 records in `removals` shadows +// the inner JSON from pattern 6's view via `overlaps()`. #include "tool_parser.h" @@ -161,6 +168,125 @@ static const std::regex & re_tool_code() { return r; } +// Pattern 5: `call:?{` opener. The sentinel alternation in front +// rejects narrative usages like "I'll call:foo{x:1}" where `call:` is glued +// to a preceding word — whitespace, common punctuation, and open/close +// brackets are the realistic boundaries seen in the snapshot data. `\s` +// covers `\n` so a `call:` at the start of any line is matched without +// relying on std::regex multiline support (which is non-portable). +// +// Note that `}` is in the sentinel list — gemma frequently emits multiple +// invocations back-to-back: `call:a{x:1}call:b{y:2}`. Without `}` as a +// sentinel the second match would be missed. +static const std::regex & re_call_verb_open() { + static std::regex r(R"((^|[\s,;:\(\[\{\}\)\]\>])call:([A-Za-z0-9_.:\-]+)\s*\{)"); + return r; +} + +// Find the index one past the `}` that matches `text[open] == '{'`. +// Respects nested {}/[] depth and skips over "..." / '...' / `...` +// string literals (with backslash escapes). Returns std::string::npos if +// no matching close is found. +static size_t balanced_braces_end(const std::string & text, size_t open) { + int depth = 0; + char in_str = 0; // 0, or one of '"', '\'', '`' + for (size_t i = open; i < text.size(); i++) { + char c = text[i]; + if (in_str) { + if (c == '\\' && i + 1 < text.size()) { i++; continue; } + if (c == in_str) in_str = 0; + continue; + } + if (c == '"' || c == '\'' || c == '`') { in_str = c; continue; } + if (c == '{' || c == '[') { + depth++; + } else if (c == '}' || c == ']') { + depth--; + if (depth == 0 && c == '}') return i + 1; + if (depth < 0) return std::string::npos; + } + } + return std::string::npos; +} + +// Try strict json::parse first; on failure rewrite single- and +// backtick-quoted strings to double-quoted, wrap bare identifier keys +// in double quotes, and retry. Returns true and populates `out` on +// success; returns false on irrecoverable failure (and `out` is unset). +// +// The rewrite walks the buffer char-by-char tracking string state so it +// doesn't mangle identifiers that live inside string values. +static bool coerce_relaxed_json(const std::string & payload, json & out) { + { + json parsed = json::parse(payload, nullptr, false); + if (!parsed.is_discarded()) { + out = std::move(parsed); + return true; + } + } + + // Permissive pass. + static const std::regex re_bare_key(R"(([A-Za-z_][A-Za-z0-9_]*)(\s*:))"); + + std::string rewritten; + rewritten.reserve(payload.size() + 16); + char in_str = 0; // 0, or the *opening* quote we saw + for (size_t i = 0; i < payload.size(); ) { + char c = payload[i]; + if (in_str) { + // Inside a string we already opened. Mirror escapes verbatim. + if (c == '\\' && i + 1 < payload.size()) { + rewritten += c; + rewritten += payload[i + 1]; + i += 2; + continue; + } + if (c == in_str) { + // Close — always emit a double-quote regardless of which + // quote style opened the string. The opening side already + // emitted a `"`. + rewritten += '"'; + in_str = 0; + i++; + continue; + } + rewritten += c; + i++; + continue; + } + if (c == '"' || c == '\'' || c == '`') { + rewritten += '"'; + in_str = c; + i++; + continue; + } + // Try to match a bare-key identifier here. Don't fire if the + // previous emitted char is `"` — that would indicate we're sitting + // right after a JSON string boundary and the "identifier" is + // probably part of a value continuation (e.g. `"k": foo: 1` would + // be malformed JSON anyway, but better to leave it untouched). + std::smatch m; + std::string tail = payload.substr(i); + if (std::regex_search(tail, m, re_bare_key, + std::regex_constants::match_continuous) && + (rewritten.empty() || rewritten.back() != '"')) { + rewritten += '"'; + rewritten += m[1].str(); + rewritten += '"'; + rewritten += m[2].str(); + i += m.length(); + continue; + } + rewritten += c; + i++; + } + + json parsed = json::parse(rewritten, nullptr, false); + if (parsed.is_discarded()) return false; + out = std::move(parsed); + return true; +} + // ─── XML parameter parser ─────────────────────────────────────────────── static json parse_xml_params(const std::string & region, const std::string & fn_name, @@ -397,7 +523,45 @@ ToolParseResult parse_tool_calls(const std::string & text, const json & tools) { } } - // Pattern 5: Bare JSON objects + // Pattern 5: call:?{relaxed-JSON args} + // + // Runs before the bare-JSON sweep so that inner JSON of the form + // call:outer{"name": "inner", "arguments": {}} + // doesn't get hijacked into a spurious `inner` ToolCall. + { + auto begin = std::sregex_iterator(text.begin(), text.end(), re_call_verb_open()); + auto end = std::sregex_iterator(); + for (auto it = begin; it != end; ++it) { + // Group 1: sentinel char (may be empty if matched at `^`). + // Group 2: full verb including any embedded namespaces. + size_t prefix_len = (*it)[1].matched ? (*it)[1].length() : 0; + size_t call_start = it->position() + prefix_len; + if (overlaps(removals, call_start)) continue; + + // The matched substring runs from call_start through the `{` + // (consuming the opener and any whitespace between verb and + // brace). Compute the brace index from the match end. + size_t brace_open = it->position() + it->length() - 1; + if (brace_open >= text.size() || text[brace_open] != '{') continue; + + size_t brace_close = balanced_braces_end(text, brace_open); + if (brace_close == std::string::npos) continue; + + std::string raw_args = text.substr(brace_open, brace_close - brace_open); + json args; + if (!coerce_relaxed_json(raw_args, args)) continue; + if (!args.is_object()) continue; + + std::string verb = (*it)[2].str(); + size_t colon = verb.find_last_of(':'); + if (colon != std::string::npos) verb = verb.substr(colon + 1); + if (verb.empty()) continue; + + add_call(verb, args, call_start, brace_close); + } + } + + // Pattern 6: Bare JSON objects { size_t cursor = 0; while (cursor < text.size()) { diff --git a/server/src/server/tool_parser.h b/server/src/server/tool_parser.h index 1ff9890a6..b3bd4a796 100644 --- a/server/src/server/tool_parser.h +++ b/server/src/server/tool_parser.h @@ -1,11 +1,12 @@ // Tool call parser — extracts structured tool calls from generated text. // -// Supports 5 detection patterns: +// Supports 6 detection patterns: // 1. ... (Qwen XML) // 2. ... (bare function XML) // 3. (function signature) // 4. {...JSON...} (tool_code wrapper) -// 5. Bare JSON objects {"name":..., "arguments":...} (raw JSON) +// 5. call:?{relaxed-JSON-args} (gemma plain-text) +// 6. Bare JSON objects {"name":..., "arguments":...} (raw JSON) #pragma once diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index 275ec935b..d0c9b2915 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -316,6 +316,172 @@ static void test_parse_tool_allowed_filter() { TEST_ASSERT(result.tool_calls.empty()); } +// ─── Pattern 5: call:{relaxed-JSON args} (gemma plain-text) ───── + +static void test_parse_call_verb_single() { + std::string text = "call:get_country_info{country: \"France\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "get_country_info"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["country"] == "France"); + } + TEST_ASSERT(result.cleaned_text.find("call:") == std::string::npos); +} + +static void test_parse_call_verb_back_to_back() { + std::string text = + "call:get_country_info{country: \"France\"}" + "call:summarize{text: \"ok\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 2); + if (result.tool_calls.size() == 2) { + TEST_ASSERT(result.tool_calls[0].name == "get_country_info"); + TEST_ASSERT(result.tool_calls[1].name == "summarize"); + } +} + +static void test_parse_call_verb_namespaced() { + std::string text = "call:execute-bead:read-file{path: \"crates/foo/src/lib.rs\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + // Verb only — namespace stripped. + TEST_ASSERT(result.tool_calls[0].name == "read-file"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["path"] == "crates/foo/src/lib.rs"); + } +} + +static void test_parse_call_verb_snake_and_hyphen() { + std::string text = + "call:execute-bead:list-files{path: \"src/\"}\n\n" + "call:execute-bead:read_file{path: \"a/b.rs\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 2); + if (result.tool_calls.size() == 2) { + TEST_ASSERT(result.tool_calls[0].name == "list-files"); + TEST_ASSERT(result.tool_calls[1].name == "read_file"); + } +} + +static void test_parse_call_verb_tool_allowed_filter() { + std::string text = "call:disallowed_verb{x: 1}call:allowed_verb{y: 2}"; + json tools = json::array({ + {{"type", "function"}, {"function", {{"name", "allowed_verb"}}}} + }); + auto result = parse_tool_calls(text, tools); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "allowed_verb"); + } +} + +static void test_parse_call_verb_inline_prose_rejected() { + // No sentinel char before `call:` — must NOT match. + std::string text = "narrative.call:foo{x:1}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.empty()); +} + +static void test_parse_call_verb_inline_prose_after_space() { + // Whitespace IS a valid sentinel — this should match. + std::string text = "Sure, I'll call:foo{x: 1}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "foo"); + } +} + +static void test_parse_call_verb_malformed_args() { + // Unterminated brace — drop the call, don't crash. + std::string text = "call:foo{country: \"France\""; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.empty()); +} + +static void test_parse_call_verb_inner_brace_in_string() { + // The `{` and `}` inside the string value must not confuse the + // balanced-brace scanner. + std::string text = "call:foo{cmd: \"echo {not_a_brace} ok\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["cmd"] == "echo {not_a_brace} ok"); + } +} + +static void test_parse_call_verb_strict_json_args() { + // Strict-JSON path: keys already quoted. + std::string text = "call:foo{\"path\": \"x\"}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["path"] == "x"); + } +} + +static void test_parse_call_verb_unquoted_keys() { + // Relaxed-JSON path: bare keys get quoted. + std::string text = "call:foo{path: \"x\", count: 3}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["path"] == "x"); + TEST_ASSERT(args["count"] == 3); + } +} + +static void test_parse_call_verb_cleaned_text() { + // The matched span should be stripped from cleaned_text. + std::string text = "Hello call:foo{x: 1} world."; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + TEST_ASSERT(result.cleaned_text.find("call:") == std::string::npos); + TEST_ASSERT(result.cleaned_text.find("Hello") != std::string::npos); + TEST_ASSERT(result.cleaned_text.find("world.") != std::string::npos); +} + +static void test_parse_call_verb_intercept_inner_json() { + // Codex-requested: inner args of the form {"name": ..., "arguments": ...} + // must NOT be picked up by pattern 6 (bare-JSON sweep) as a spurious + // `inner` ToolCall. Exactly one ToolCall, named `outer`, with the + // inner JSON intact in its arguments. + std::string text = "call:outer{\"name\": \"inner\", \"arguments\": {}}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "outer"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["name"] == "inner"); + TEST_ASSERT(args["arguments"].is_object()); + } +} + +static void test_parse_call_verb_multiline_args() { + // Snapshot rows have multi-line nested args; the balanced-brace + // scanner is line-agnostic, so this must Just Work. + std::string text = + "call:default_api:analyze_data{\n" + " data: [{\"date\": \"2024-10-05\", \"qty\": 50}, {\"date\": \"2024-10-06\", \"qty\": 60}],\n" + " metric: \"qty\"\n" + "}"; + auto result = parse_tool_calls(text); + TEST_ASSERT(result.tool_calls.size() == 1); + if (!result.tool_calls.empty()) { + TEST_ASSERT(result.tool_calls[0].name == "analyze_data"); + auto args = json::parse(result.tool_calls[0].arguments); + TEST_ASSERT(args["metric"] == "qty"); + TEST_ASSERT(args["data"].is_array()); + TEST_ASSERT(args["data"].size() == 2); + } +} + // ═══════════════════════════════════════════════════════════════════════ // SSE Emitter tests // ═══════════════════════════════════════════════════════════════════════ @@ -2590,6 +2756,20 @@ int main() { RUN_TEST(test_parse_no_tools); RUN_TEST(test_parse_tool_code_wrapper); RUN_TEST(test_parse_tool_allowed_filter); + RUN_TEST(test_parse_call_verb_single); + RUN_TEST(test_parse_call_verb_back_to_back); + RUN_TEST(test_parse_call_verb_namespaced); + RUN_TEST(test_parse_call_verb_snake_and_hyphen); + RUN_TEST(test_parse_call_verb_tool_allowed_filter); + RUN_TEST(test_parse_call_verb_inline_prose_rejected); + RUN_TEST(test_parse_call_verb_inline_prose_after_space); + RUN_TEST(test_parse_call_verb_malformed_args); + RUN_TEST(test_parse_call_verb_inner_brace_in_string); + RUN_TEST(test_parse_call_verb_strict_json_args); + RUN_TEST(test_parse_call_verb_unquoted_keys); + RUN_TEST(test_parse_call_verb_cleaned_text); + RUN_TEST(test_parse_call_verb_intercept_inner_json); + RUN_TEST(test_parse_call_verb_multiline_args); std::fprintf(stderr, "\n── SSE Emitter ──\n"); RUN_TEST(test_emitter_reasoning_split_openai);