Luce-Org · easel · Jun 15, 2026
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
@@ -2826,7 +2826,11 @@ void HttpServer::worker_loop() {
         // `usage.timings` (OpenAI Chat usage chunk, Anthropic
         // message_delta usage, Responses response.completed usage).
         // See docs/specs/thinking-budget.md §6.3.
-        GenTimings gen_timings{ result.prefill_s, result.decode_s };
+        GenTimings gen_timings{
+            result.prefill_s,
+            result.decode_s,
+            using_restore ? prefix_len : 0,
+        };
 
         // Record performance for /status page.
         if (result.ok) {

diff --git a/server/src/server/prefix_cache.cpp b/server/src/server/prefix_cache.cpp
@@ -95,9 +95,16 @@ std::vector<int> find_all_boundaries(const std::vector<int32_t> & ids,
                                      const ChatMarkers & markers) {
     std::vector<int> out;
     int sys_idx = find_first_seq(ids, markers.sys_role_prefix);
-    if (sys_idx < 0) return out;
+    int start_idx = sys_idx;
+    int start_len = (int)markers.sys_role_prefix.size();
+    if (start_idx < 0) {
+        auto first_start = find_first_seq_any(ids, markers.next_role_starts);
+        start_idx = first_start.first;
+        start_len = first_start.second;
+    }
+    if (start_idx < 0 || start_len <= 0) return out;
 
-    int cursor = sys_idx + (int)markers.sys_role_prefix.size();
+    int cursor = start_idx + start_len;
     while (true) {
         auto [end_idx, end_len] = find_first_seq_any(ids, markers.end_msg_seqs, cursor);
         if (end_idx < 0) break;

diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp
@@ -64,6 +64,7 @@ json build_timings_json(const GenTimings & t, int completion_tokens) {
     return json{
         {"prefill_ms",            prefill_ms},
         {"decode_ms",             decode_ms},
+        {"prompt_n_cached",       std::max(0, t.prompt_n_cached)},
         {"decode_tokens_per_sec", tps}
     };
 }

diff --git a/server/src/server/sse_emitter.h b/server/src/server/sse_emitter.h
@@ -39,13 +39,15 @@ enum class StreamMode { REASONING, CONTENT, TOOL_BUFFER };
 // & client side compute `decode_tokens_per_sec = completion_tokens /
 // decode_s` (server emits it pre-computed to avoid drift).
 struct GenTimings {
-    double prefill_s = 0.0;
-    double decode_s  = 0.0;
+    double prefill_s       = 0.0;  // prefill duration in seconds (emitted as prefill_ms)
+    double decode_s        = 0.0;  // decode duration in seconds (emitted as decode_ms)
+    int    prompt_n_cached = 0;    // prompt tokens restored from cache; 0 when nothing was restored
 };
 
 // Build the `timings` sub-object emitted under `usage`.
 //   prefill_ms              = prefill_s * 1000.0  (1 decimal)
 //   decode_ms               = decode_s  * 1000.0  (1 decimal)
+//   prompt_n_cached         = prompt tokens restored from cache (negative values clamp to 0)
 //   decode_tokens_per_sec   = completion_tokens / decode_s (0.0 when
 //                              decode_s == 0 to avoid div-by-zero on
 //                              prefill-only / count_tokens responses)

diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
@@ -1224,6 +1224,43 @@ static void test_find_boundaries_empty() {
     TEST_ASSERT(bounds.empty());
 }
 
+static ChatMarkers make_qwen_boundary_markers_for_test() {  // marker fixture shared by the qwen boundary tests
+    ChatMarkers markers;
+    markers.family = "qwen";
+    markers.sys_role_prefix = {100, 200};  // <im_start> system
+    markers.end_msg_seqs = {{101}};        // <im_end>
+    markers.next_role_starts = {{100}};    // bare <im_start>, no role token
+    return markers;
+}
+
+static void test_find_boundaries_qwen_system_first() {  // prompt opens with the system role prefix {100, 200}
+    auto markers = make_qwen_boundary_markers_for_test();
+    // <im_start> system ... <im_end> <im_start> user ...
+    std::vector<int32_t> ids = {
+        100, 200, 10, 11, 101,  // idx 0-4:   system turn, <im_end> at idx 4
+        100, 201, 12, 13, 101,  // idx 5-9:   user turn, <im_end> at idx 9
+        100, 202, 14,           // idx 10-12: third turn (role id 202), no <im_end>
+    };
+    auto bounds = find_all_boundaries(ids, markers);
+    TEST_ASSERT(bounds.size() == 2);  // one boundary per closed turn; the open third turn adds none
+    TEST_ASSERT(bounds[0] == 6);      // 6 == one past the <im_start> at idx 5
+    TEST_ASSERT(bounds[1] == 11);     // 11 == one past the <im_start> at idx 10
+}
+
+static void test_find_boundaries_qwen_user_first() {  // no system turn: sys_role_prefix {100, 200} never occurs in ids
+    auto markers = make_qwen_boundary_markers_for_test();
+    // <im_start> user ... <im_end> <im_start> assistant ...
+    std::vector<int32_t> ids = {
+        100, 201, 10, 11, 101,  // idx 0-4:   user turn, <im_end> at idx 4
+        100, 202, 12, 13, 101,  // idx 5-9:   assistant turn, <im_end> at idx 9
+        100, 201, 14,           // idx 10-12: next user turn, no <im_end>
+    };
+    auto bounds = find_all_boundaries(ids, markers);
+    TEST_ASSERT(bounds.size() == 2);  // same boundaries as the system-first layout
+    TEST_ASSERT(bounds[0] == 6);      // 6 == one past the <im_start> at idx 5
+    TEST_ASSERT(bounds[1] == 11);     // 11 == one past the <im_start> at idx 10
+}
+
 // ═══════════════════════════════════════════════════════════════════════
 // PFlash config tests (model-free)
 // ═══════════════════════════════════════════════════════════════════════
@@ -3342,10 +3379,11 @@ static void test_usage_timings_responses_streaming() {
 static void test_usage_timings_zero_decode_no_div_by_zero() {
     // decode_s == 0 (prefill-only / no tokens generated path): emit
     // decode_tokens_per_sec = 0.0 without div-by-zero.
-    GenTimings t{0.123, 0.0};
+    GenTimings t{0.123, 0.0, 2048};
     json j = build_timings_json(t, /*completion_tokens*/ 42);
     TEST_ASSERT(j["prefill_ms"].get<double>() == 123.0);
     TEST_ASSERT(j["decode_ms"].get<double>() == 0.0);
+    TEST_ASSERT(j["prompt_n_cached"].get<int>() == 2048);
     TEST_ASSERT(j["decode_tokens_per_sec"].get<double>() == 0.0);
 
     // Also exercise via OpenAI streaming path — finite JSON output, no NaN/Inf.
@@ -3922,6 +3960,8 @@ int main() {
     RUN_TEST(test_hash_prefix_different_lengths);
     RUN_TEST(test_hash_prefix_empty);
     RUN_TEST(test_find_boundaries_empty);
+    RUN_TEST(test_find_boundaries_qwen_system_first);
+    RUN_TEST(test_find_boundaries_qwen_user_first);
 
     std::fprintf(stderr, "\n── PFlash config ──\n");
     RUN_TEST(test_pflash_config_defaults);