Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion server/src/server/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2826,7 +2826,11 @@ void HttpServer::worker_loop() {
// `usage.timings` (OpenAI Chat usage chunk, Anthropic
// message_delta usage, Responses response.completed usage).
// See docs/specs/thinking-budget.md §6.3.
GenTimings gen_timings{ result.prefill_s, result.decode_s };
GenTimings gen_timings{
result.prefill_s,
result.decode_s,
using_restore ? prefix_len : 0,
};

// Record performance for /status page.
if (result.ok) {
Expand Down
11 changes: 9 additions & 2 deletions server/src/server/prefix_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,16 @@ std::vector<int> find_all_boundaries(const std::vector<int32_t> & ids,
const ChatMarkers & markers) {
std::vector<int> out;
int sys_idx = find_first_seq(ids, markers.sys_role_prefix);
if (sys_idx < 0) return out;
int start_idx = sys_idx;
int start_len = (int)markers.sys_role_prefix.size();
if (start_idx < 0) {
auto first_start = find_first_seq_any(ids, markers.next_role_starts);
start_idx = first_start.first;
start_len = first_start.second;
}
if (start_idx < 0 || start_len <= 0) return out;

int cursor = sys_idx + (int)markers.sys_role_prefix.size();
int cursor = start_idx + start_len;
while (true) {
auto [end_idx, end_len] = find_first_seq_any(ids, markers.end_msg_seqs, cursor);
if (end_idx < 0) break;
Expand Down
1 change: 1 addition & 0 deletions server/src/server/sse_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ json build_timings_json(const GenTimings & t, int completion_tokens) {
return json{
{"prefill_ms", prefill_ms},
{"decode_ms", decode_ms},
{"prompt_n_cached", std::max(0, t.prompt_n_cached)},
{"decode_tokens_per_sec", tps}
};
}
Expand Down
6 changes: 4 additions & 2 deletions server/src/server/sse_emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,15 @@ enum class StreamMode { REASONING, CONTENT, TOOL_BUFFER };
// & client side compute `decode_tokens_per_sec = completion_tokens /
// decode_s` (server emits it pre-computed to avoid drift).
// Per-request generation timings plus prompt-cache reuse, carried to the
// `usage.timings` builders. This is a plain aggregate: member order is part
// of the interface because callers initialise it positionally as
// `GenTimings{prefill_s, decode_s, prompt_n_cached}`, and the default member
// initialisers let older two-argument call sites keep compiling.
struct GenTimings {
    // Prefill duration in seconds (reported as prefill_ms = prefill_s * 1000).
    double prefill_s = 0.0;
    // Decode duration in seconds (reported as decode_ms = decode_s * 1000).
    double decode_s = 0.0;
    // Number of prompt tokens restored from cache; stays 0 when nothing was
    // restored.
    int prompt_n_cached = 0;
};

// Build the `timings` sub-object emitted under `usage`.
// prefill_ms = prefill_s * 1000.0 (1 decimal)
// decode_ms = decode_s * 1000.0 (1 decimal)
// prompt_n_cached = number of prompt tokens restored from cache (negative inputs are reported as 0)
// decode_tokens_per_sec = completion_tokens / decode_s (0.0 when
// decode_s == 0 to avoid div-by-zero on
// prefill-only / count_tokens responses)
Expand Down
42 changes: 41 additions & 1 deletion server/test/test_server_unit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1224,6 +1224,43 @@ static void test_find_boundaries_empty() {
TEST_ASSERT(bounds.empty());
}

// Test fixture: a minimal Qwen-family ChatMarkers built from synthetic token
// ids. In the boundary tests below, 100 plays the role of <im_start>, 101 of
// <im_end>, and 200 of the "system" role token, so {100, 200} is the system
// role prefix and a bare {100} opens any following message.
static ChatMarkers make_qwen_boundary_markers_for_test() {
    ChatMarkers qwen;
    qwen.family = "qwen";
    // Message delimiters: any message ends with 101 and the next begins with 100.
    qwen.next_role_starts = {{100}};
    qwen.end_msg_seqs = {{101}};
    // The system message is recognised by its two-token opener.
    qwen.sys_role_prefix = {100, 200};
    return qwen;
}

static void test_find_boundaries_qwen_system_first() {
auto markers = make_qwen_boundary_markers_for_test();
// <im_start> system ... <im_end> <im_start> user ...
std::vector<int32_t> ids = {
100, 200, 10, 11, 101,
100, 201, 12, 13, 101,
100, 202, 14,
};
auto bounds = find_all_boundaries(ids, markers);
TEST_ASSERT(bounds.size() == 2);
TEST_ASSERT(bounds[0] == 6);
TEST_ASSERT(bounds[1] == 11);
}

static void test_find_boundaries_qwen_user_first() {
auto markers = make_qwen_boundary_markers_for_test();
// <im_start> user ... <im_end> <im_start> assistant ...
std::vector<int32_t> ids = {
100, 201, 10, 11, 101,
100, 202, 12, 13, 101,
100, 201, 14,
};
auto bounds = find_all_boundaries(ids, markers);
TEST_ASSERT(bounds.size() == 2);
TEST_ASSERT(bounds[0] == 6);
TEST_ASSERT(bounds[1] == 11);
}

// ═══════════════════════════════════════════════════════════════════════
// PFlash config tests (model-free)
// ═══════════════════════════════════════════════════════════════════════
Expand Down Expand Up @@ -3342,10 +3379,11 @@ static void test_usage_timings_responses_streaming() {
static void test_usage_timings_zero_decode_no_div_by_zero() {
// decode_s == 0 (prefill-only / no tokens generated path): emit
// decode_tokens_per_sec = 0.0 without div-by-zero.
GenTimings t{0.123, 0.0};
GenTimings t{0.123, 0.0, 2048};
json j = build_timings_json(t, /*completion_tokens*/ 42);
TEST_ASSERT(j["prefill_ms"].get<double>() == 123.0);
TEST_ASSERT(j["decode_ms"].get<double>() == 0.0);
TEST_ASSERT(j["prompt_n_cached"].get<int>() == 2048);
TEST_ASSERT(j["decode_tokens_per_sec"].get<double>() == 0.0);

// Also exercise via OpenAI streaming path — finite JSON output, no NaN/Inf.
Expand Down Expand Up @@ -3922,6 +3960,8 @@ int main() {
RUN_TEST(test_hash_prefix_different_lengths);
RUN_TEST(test_hash_prefix_empty);
RUN_TEST(test_find_boundaries_empty);
RUN_TEST(test_find_boundaries_qwen_system_first);
RUN_TEST(test_find_boundaries_qwen_user_first);

std::fprintf(stderr, "\n── PFlash config ──\n");
RUN_TEST(test_pflash_config_defaults);
Expand Down
Loading