From eb24fb45a39a660d26b570c24e22c52301fdd96c Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Sun, 31 May 2026 15:12:43 +0800
Subject: [PATCH 1/7] feat(server): add /status page with SSE push updates

Add a real-time server status dashboard accessible at GET /status:
- Serves standalone HTML from server/share/status.html (editable without recompile)
- GET /status/events SSE endpoint pushes live JSON updates per spec-decode step
- GET /status/json provides a snapshot for non-SSE clients

Status tracking (server_status.h):
- Current phase (idle/prefill/decode) with prompt excerpt and token counts
- Draft tokens being verified (updated each spec-decode step)
- Performance history (last 50 requests): prefill tok/s, decode tok/s, accept rate
- RAII StatusGuard ensures status resets to idle on all exit paths

Backend instrumentation (InferenceObserver on DaemonIO):
- Observer callback in model_backend.h, called at each draft/verify step
- Instrumented in qwen35_backend.cpp and generic dflash_spec_decode.cpp
- Backends skip the observer call when no observer is installed (empty std::function check)

Dashboard features (status.html):
- Dark-themed responsive UI with phase badges and live counters
- Draft token display updated per spec-decode step
- SVG-based performance charts (prefill tok/s, decode tok/s, accept rate)
- Auto-reconnecting EventSource with connection status indicator
- No external CDN dependencies

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 server/share/status.html                 | 211 +++++++++++++++++++++++
 server/src/common/dflash_spec_decode.cpp |  11 ++
 server/src/common/model_backend.h        |  12 ++
 server/src/qwen35/qwen35_backend.cpp     |  11 ++
 server/src/server/http_server.cpp        | 169 ++++++++++++++++++
 server/src/server/http_server.h          |  15 ++
 server/src/server/server_status.h        | 189 ++++++++++++++++++++
 7 files changed, 618 insertions(+)
 create mode 100644 server/share/status.html
 create mode 100644 server/src/server/server_status.h
diff --git a/server/share/status.html b/server/share/status.html
new file mode 100644
index 000000000..eaf277adc
--- /dev/null
+++ b/server/share/status.html
@@ -0,0 +1,211 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>DFlash Server Status</title>
+<style>
+* { box-sizing: border-box; margin: 0; padding: 0; }
+body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+       background: #0d1117; color: #c9d1d9; padding: 20px; }
+h1 { color: #58a6ff; margin-bottom: 16px; font-size: 1.5em; }
+.card { background: #161b22; border: 1px solid #30363d; border-radius: 8px;
+        padding: 16px; margin-bottom: 16px; }
+.card h2 { color: #8b949e; font-size: 0.85em; text-transform: uppercase;
+            letter-spacing: 0.05em; margin-bottom: 8px; }
+.badge { display: inline-block; padding: 3px 10px; border-radius: 12px;
+         font-size: 0.8em; font-weight: 600; }
+.badge-idle { background: #1f6feb33; color: #58a6ff; }
+.badge-prefill { background: #f0883e33; color: #f0883e; }
+.badge-decode { background: #3fb95033; color: #3fb950; }
+.stat { display: inline-block; margin-right: 24px; margin-bottom: 8px; }
+.stat-value { font-size: 1.4em; font-weight: 700; color: #f0f6fc; }
+.stat-label { font-size: 0.75em; color: #8b949e; }
+.prompt-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px;
+              padding: 8px 12px; font-family: monospace; font-size: 0.85em;
+              max-height: 80px; overflow: hidden; word-break: break-all;
+              color: #8b949e; margin-top: 8px; }
+.tokens-box { margin-top: 8px; }
+.token { display: inline-block; background: #1f6feb22; border: 1px solid #1f6feb;
+         border-radius: 4px; padding: 2px 6px; margin: 2px; font-family: monospace;
+         font-size: 0.8em; color: #79c0ff; }
+.chart-container { width: 100%; height: 200px; position: relative; }
+.chart-container svg { width: 100%; height: 100%; }
+.legend { display: flex; gap: 16px; margin-top: 8px; font-size: 0.75em; }
+.legend-item { display: flex; align-items: center; gap: 4px; }
+.legend-dot { width: 10px; height: 10px; border-radius: 50%; }
+.connection-status { float: right; font-size: 0.75em; padding: 3px 8px;
+                     border-radius: 4px; }
+.connected { background: #3fb95033; color: #3fb950; }
+.disconnected { background: #f8514933; color: #f85149; }
+#no-data { color: #484f58; text-align: center; padding: 40px; }
+</style>
+</head>
+<body>
+<h1>&#x26A1; DFlash Server Status
+  <span class="connection-status disconnected" id="conn-status">disconnected</span>
+</h1>
+
+<div class="card">
+  <h2>Current State</h2>
+  <div id="phase-section">
+    <span id="phase-badge" class="badge badge-idle">idle</span>
+    <span class="stat" style="margin-left:16px">
+      <span class="stat-value" id="total-req">0</span>
+      <span class="stat-label">total requests</span>
+    </span>
+  </div>
+  <div id="current-info" style="display:none; margin-top:12px;">
+    <div>
+      <span class="stat">
+        <span class="stat-value" id="cur-prompt-tokens">0</span>
+        <span class="stat-label">prompt tokens</span>
+      </span>
+      <span class="stat">
+        <span class="stat-value" id="cur-completion-tokens">0</span>
+        <span class="stat-label">completion tokens</span>
+      </span>
+      <span class="stat">
+        <span class="stat-value" id="cur-elapsed">0.0s</span>
+        <span class="stat-label">elapsed</span>
+      </span>
+    </div>
+    <div class="prompt-box" id="cur-prompt"></div>
+    <div class="tokens-box" id="cur-draft-tokens"></div>
+  </div>
+</div>
+
+<div class="card">
+  <h2>Prefill Performance</h2>
+  <div class="chart-container" id="chart-prefill"></div>
+  <div class="legend">
+    <div class="legend-item"><div class="legend-dot" style="background:#f0883e"></div>Prefill tok/s</div>
+  </div>
+</div>
+
+<div class="card">
+  <h2>Decode Performance</h2>
+  <div class="chart-container" id="chart-decode"></div>
+  <div class="legend">
+    <div class="legend-item"><div class="legend-dot" style="background:#3fb950"></div>Decode tok/s</div>
+    <div class="legend-item"><div class="legend-dot" style="background:#58a6ff"></div>Accept Rate %</div>
+  </div>
+</div>
+
+<script>
+function drawChart(containerId, datasets, yMax) {  // Render datasets ([{data, color}, ...]) as an inline SVG line chart inside the element with id containerId.
+  const container = document.getElementById(containerId);
+  if (!datasets[0].data.length) {  // only the first dataset decides whether there is anything to plot
+    container.innerHTML = '<div id="no-data">No data yet</div>';
+    return;
+  }
+  const W = 800, H = 180, PAD = 40;  // viewBox size and padding, in viewBox units
+  const n = datasets[0].data.length;  // point count is taken from the first dataset
+  const maxVal = yMax || Math.max(...datasets.flatMap(d => d.data), 1);  // a truthy yMax wins; otherwise auto-scale with a floor of 1
+
+  let svg = '<svg viewBox="0 0 ' + W + ' ' + H + '" preserveAspectRatio="none">';
+  // Five horizontal grid lines at 0, 1/4, ... 4/4 of maxVal, each labelled with its rounded value
+  for (let i = 0; i <= 4; i++) {
+    const y = PAD + (H - PAD * 2) * (1 - i / 4);
+    const val = (maxVal * i / 4).toFixed(0);
+    svg += '<line x1="' + PAD + '" y1="' + y + '" x2="' + (W - 10) + '" y2="' + y + '" stroke="#30363d" stroke-width="0.5"/>';
+    svg += '<text x="' + (PAD - 4) + '" y="' + (y + 4) + '" fill="#8b949e" font-size="10" text-anchor="end">' + val + '</text>';
+  }
+  // One path per dataset; x spreads the n points from PAD to W - 10, y clamps values above maxVal to the top line
+  for (const ds of datasets) {
+    let path = '';
+    for (let i = 0; i < n; i++) {
+      const x = PAD + (W - PAD - 10) * i / Math.max(n - 1, 1);  // Math.max avoids dividing by zero when n === 1
+      const y = PAD + (H - PAD * 2) * (1 - Math.min(ds.data[i] / maxVal, 1));
+      path += (i === 0 ? 'M' : 'L') + x.toFixed(1) + ',' + y.toFixed(1);
+    }
+    svg += '<path d="' + path + '" fill="none" stroke="' + ds.color + '" stroke-width="2"/>';
+    // Dots for last 10 points (same x/y mapping as the path above)
+    const dotStart = Math.max(0, n - 10);
+    for (let i = dotStart; i < n; i++) {
+      const x = PAD + (W - PAD - 10) * i / Math.max(n - 1, 1);
+      const y = PAD + (H - PAD * 2) * (1 - Math.min(ds.data[i] / maxVal, 1));
+      svg += '<circle cx="' + x.toFixed(1) + '" cy="' + y.toFixed(1) + '" r="3" fill="' + ds.color + '"/>';
+    }
+  }
+  svg += '</svg>';
+  container.innerHTML = svg;  // replaces whatever chart (or placeholder) was rendered before
+}
+
+function update(data) {  // Apply one parsed status payload (phase, total_requests, current, perf_history) to the page.
+  // Phase badge: the phase string is both the label and the CSS class suffix (badge-idle / badge-prefill / badge-decode)
+  const badge = document.getElementById('phase-badge');
+  badge.textContent = data.phase;
+  badge.className = 'badge badge-' + data.phase;
+
+  document.getElementById('total-req').textContent = data.total_requests;
+
+  // Current request panel: shown only while the payload carries a non-null "current" object
+  const info = document.getElementById('current-info');
+  if (data.current) {
+    info.style.display = 'block';
+    document.getElementById('cur-prompt-tokens').textContent = data.current.prompt_tokens;
+    document.getElementById('cur-completion-tokens').textContent = data.current.completion_tokens;
+    document.getElementById('cur-elapsed').textContent = data.current.elapsed_s.toFixed(1) + 's';
+    document.getElementById('cur-prompt').textContent = data.current.prompt_excerpt || '(no excerpt)';  // textContent, so the excerpt needs no escaping
+    const dtContainer = document.getElementById('cur-draft-tokens');
+    if (data.current.draft_tokens && data.current.draft_tokens.length) {
+      dtContainer.innerHTML = '<strong style="color:#8b949e;font-size:0.8em">Draft tokens: </strong>' +
+        data.current.draft_tokens.map(function(t) { return '<span class="token">' + escapeHtml(t) + '</span>'; }).join('');  // built as markup, hence escapeHtml
+    } else {
+      dtContainer.innerHTML = '';
+    }
+  } else {
+    info.style.display = 'none';
+  }
+
+  // Charts: one series per metric, taken from perf_history in the order the payload lists it
+  const hist = data.perf_history || [];
+  const prefillData = hist.map(function(h) { return h.prefill_tok_s; });
+  const decodeData = hist.map(function(h) { return h.decode_tok_s; });
+  const acceptData = hist.map(function(h) { return h.accept_rate * 100; });  // scaled by 100 for the "Accept Rate %" series
+
+  const prefillMax = Math.max.apply(null, prefillData.concat([100]));  // y-axis never drops below 100 tok/s
+  drawChart('chart-prefill', [{data: prefillData, color: '#f0883e'}], prefillMax * 1.1);  // 10% headroom above the peak
+
+  const decodeMax = Math.max.apply(null, decodeData.concat(acceptData).concat([10]));  // decode tok/s and accept % share one axis, floor 10
+  drawChart('chart-decode', [
+    {data: decodeData, color: '#3fb950'},
+    {data: acceptData, color: '#58a6ff'}
+  ], decodeMax * 1.1);
+}
+
+function escapeHtml(s) {  // Escape a value for insertion via innerHTML; non-strings are stringified, quotes are covered so attribute contexts stay safe too.
+  return String(s).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&#39;');
+}
+
+// SSE connection: stream "status" events into update() and mirror the link state in #conn-status.
+function connectSSE() {
+  const connEl = document.getElementById('conn-status');
+  const es = new EventSource('/status/events');
+
+  es.onopen = function() {
+    connEl.textContent = 'connected';
+    connEl.className = 'connection-status connected';
+  };
+
+  es.addEventListener('status', function(e) {
+    try {
+      const data = JSON.parse(e.data);
+      update(data);
+    } catch (err) { console.error('status update failed:', err); }  // keep the stream alive, but surface malformed payloads and render errors
+  });
+
+  es.onerror = function() {
+    connEl.textContent = 'disconnected';
+    connEl.className = 'connection-status disconnected';
+    es.close();  // stop EventSource's own retry; the timer below opens a fresh connection instead
+    // Reconnect after 2 seconds
+    setTimeout(connectSSE, 2000);
+  };
+}
+
+connectSSE();
+</script>
+</body>
+</html>
diff --git a/server/src/common/dflash_spec_decode.cpp b/server/src/common/dflash_spec_decode.cpp
index 141e45e92..3b075493e 100644
--- a/server/src/common/dflash_spec_decode.cpp
+++ b/server/src/common/dflash_spec_decode.cpp
@@ -169,6 +169,11 @@ bool run_dflash_spec_decode(
             }
         }
 
+        // Notify observer with draft tokens for this step.
+        if (io.observer) {
+            io.observer("draft", draft_tok);
+        }
+
         // ── Verify pass: speculative target forward over q_len tokens ────
         if (!target.snapshot_kv()) {
             std::fprintf(stderr, "dflash-spec snapshot_kv failed\n");
@@ -234,6 +239,12 @@ bool run_dflash_spec_decode(
         n_generated += emitted;
         n_accept_sum += std::min(accept_n, emitted);
         n_draft_steps++;
+
+        // Notify observer with accepted tokens for this step.
+        if (io.observer) {
+            io.observer("verify", replay_tok);
+        }
+
         if (io.cancelled) break;
         if (hit_eos) break;
     }
diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h
index b808d0c39..af5f6ba72 100644
--- a/server/src/common/model_backend.h
+++ b/server/src/common/model_backend.h
@@ -27,6 +27,14 @@ namespace dflash::common {
 // Return true to continue generation, false to abort.
 using TokenCallback = std::function<bool(int32_t token)>;
 
+// Inference observer callback for live status updates. Called by backends
+// at each spec-decode step to report the step's phase and tokens. When empty,
+// backends skip the call (zero overhead).
+//   phase:  step label; the spec-decode loops currently pass "draft" or "verify"
+//   tokens: token ids for that step (the drafted tokens, or the tokens reported after verification)
+using InferenceObserver = std::function<void(const char * phase,
+                                             const std::vector<int32_t> & tokens)>;
+
 // ─── I/O handle passed to backend methods that need protocol output ─────
 struct DaemonIO {
     int stream_fd = -1;
@@ -37,6 +45,10 @@ struct DaemonIO {
     TokenCallback on_token;
     mutable bool cancelled = false;
 
+    // Optional inference observer for /status page. When set, backends call
+    // this at each spec-decode step with draft tokens and phase info.
+    InferenceObserver observer;
+
     // Write a single int32 to the stream fd (token or -1 sentinel).
     // Also invokes on_token if set. Sets cancelled=true if on_token
     // returns false (client disconnected).
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index 4a3d9674e..1c442d83e 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -1315,6 +1315,11 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
             }
         }
 
+        // Notify observer with draft tokens for this step.
+        if (io.observer) {
+            io.observer("draft", draft_tok);
+        }
+
         // 4. Verify: snapshot KV, run target forward over draft tokens
         if (!target->snapshot_kv()) {
             step_graph_destroy(draft_sg);
@@ -1391,6 +1396,12 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
         n_generated += emitted;
         n_accept_sum += std::min(accept_n, emitted);
         n_draft_steps++;
+
+        // Notify observer with accepted tokens for this step.
+        if (io.observer) {
+            io.observer("verify", replay_tok);
+        }
+
         if (io.cancelled) break;
         if (hit_eos) break;
     }
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index e751cad70..7a93f26ce 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -12,6 +12,8 @@
 #include <chrono>
 #include <cstdio>
 #include <cstring>
+#include <fstream>
+#include <sstream>
 
 #include <arpa/inet.h>
 #include <fcntl.h>
@@ -20,6 +22,7 @@
 #include <poll.h>
 #include <signal.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <unistd.h>
 
 namespace dflash::common {
@@ -44,6 +47,8 @@ static constexpr char kServerName[] = "luce-dflash";
 static const std::vector<std::string> kApiEndpoints = {
     "GET /health",
     "GET /props",
+    "GET /status",
+    "GET /status/events",
     "GET /v1/models",
     "POST /v1/chat/completions",
     "POST /v1/messages",
@@ -465,6 +470,55 @@ HttpServer::HttpServer(ModelBackend & backend,
                    config.disk_cache_cold_max_tokens}, backend)
 {
     disk_cache_.init();
+    status_html_path_ = resolve_status_html();
+}
+
+// Resolve path to share/status.html at startup.
+std::string HttpServer::resolve_status_html() {
+    // 1. DFLASH_SHARE_DIR env var
+    if (const char * dir = std::getenv("DFLASH_SHARE_DIR")) {
+        std::string path = std::string(dir) + "/status.html";
+        struct stat st;
+        if (::stat(path.c_str(), &st) == 0) return path;
+    }
+    // 2. ../share/ relative to /proc/self/exe
+    char exe_buf[1024] = {};
+    ssize_t len = ::readlink("/proc/self/exe", exe_buf, sizeof(exe_buf) - 1);
+    if (len > 0) {
+        exe_buf[len] = '\0';
+        std::string exe_dir(exe_buf);
+        auto slash = exe_dir.rfind('/');
+        if (slash != std::string::npos) {
+            exe_dir = exe_dir.substr(0, slash);
+            std::string path = exe_dir + "/../share/status.html";
+            struct stat st;
+            if (::stat(path.c_str(), &st) == 0) return path;
+        }
+    }
+    // 3. ./share/status.html (development)
+    {
+        struct stat st;
+        if (::stat("share/status.html", &st) == 0) return "share/status.html";
+    }
+    return {};
+}
+
+// Broadcast current status as SSE event to all connected /status/events clients.
+// Called from the inference worker, so sends never block: a client that errors out or
+// cannot take the whole event right now is closed and dropped (the page reconnects).
+void HttpServer::broadcast_status() {
+    const std::string event = status_.to_sse_event();
+    std::lock_guard<std::mutex> lk(sse_mu_);
+    const auto send_failed = [&event](int fd) {
+        // A short write would corrupt SSE framing, so it counts as a failure too.
+        const ssize_t sent = ::send(fd, event.data(), event.size(),
+                                    MSG_NOSIGNAL | MSG_DONTWAIT);
+        if (sent == static_cast<ssize_t>(event.size())) return false;
+        ::close(fd);
+        return true;
+    };
+    sse_fds_.erase(std::remove_if(sse_fds_.begin(), sse_fds_.end(), send_failed),
+                   sse_fds_.end());
 }
 
 HttpServer::~HttpServer() {
@@ -483,6 +537,13 @@ void HttpServer::shutdown() {
         worker_thread_.join();
     }
 
+    // Close SSE client connections.
+    {
+        std::lock_guard<std::mutex> lk(sse_mu_);
+        for (int fd : sse_fds_) ::close(fd);
+        sse_fds_.clear();
+    }
+
     // Drain any pending jobs.
     {
         std::lock_guard<std::mutex> lk(queue_mu_);
@@ -679,6 +740,60 @@ void HttpServer::handle_client(int fd) {
         return;
     }
 
+    // Status page: serve HTML file from disk.
+    if (hr.method == "GET" && hr.path == "/status") {
+        if (status_html_path_.empty()) {
+            send_error(fd, 404,
+                "status.html not found. Set DFLASH_SHARE_DIR or place it in share/status.html");
+            ::close(fd);
+            return;
+        }
+        std::ifstream ifs(status_html_path_);
+        if (!ifs.is_open()) {
+            send_error(fd, 500, "failed to open status.html");
+            ::close(fd);
+            return;
+        }
+        std::ostringstream oss;
+        oss << ifs.rdbuf();
+        send_response(fd, 200, "text/html; charset=utf-8", oss.str());
+        ::close(fd);
+        return;
+    }
+
+    // Status JSON snapshot (for non-SSE clients / debugging).
+    if (hr.method == "GET" && hr.path == "/status/json") {
+        send_response(fd, 200, "application/json", status_.to_json().dump() + "\n");
+        ::close(fd);
+        return;
+    }
+
+    // Status SSE stream: hold connection open and push updates.
+    if (hr.method == "GET" && hr.path == "/status/events") {
+        // Send SSE headers.
+        const char * headers =
+            "HTTP/1.1 200 OK\r\n"
+            "Content-Type: text/event-stream\r\n"
+            "Cache-Control: no-cache\r\n"
+            "Connection: keep-alive\r\n"
+            "Access-Control-Allow-Origin: *\r\n"
+            "\r\n";
+        if (!send_all(fd, headers, std::strlen(headers))) {
+            ::close(fd);
+            return;
+        }
+        // Send initial state immediately.
+        std::string initial = status_.to_sse_event();
+        send_all(fd, initial.data(), initial.size());
+        // Register for future broadcasts. The fd is NOT closed here — it stays
+        // open until the client disconnects (detected on next broadcast send).
+        {
+            std::lock_guard<std::mutex> lk(sse_mu_);
+            sse_fds_.push_back(fd);
+        }
+        return;  // Do NOT close fd — it's now owned by the SSE broadcast loop.
+    }
+
     // Models endpoint.
     if (hr.method == "GET" && hr.path == "/v1/models") {
         // Codex sends ?client_version= — serve the Codex-specific schema.
@@ -1114,6 +1229,20 @@ void HttpServer::worker_loop() {
         const auto & req = job->req;
         auto started_at = std::chrono::steady_clock::now();
 
+        // Track live status for /status page. RAII guard ensures idle on all paths.
+        std::string prompt_excerpt;
+        if (!req.prompt_tokens.empty()) {
+            // Decode first ~40 tokens as a prompt excerpt (cheap, bounded).
+            const int excerpt_len = std::min((int)req.prompt_tokens.size(), 40);
+            std::vector<int32_t> excerpt_toks(req.prompt_tokens.begin(),
+                                               req.prompt_tokens.begin() + excerpt_len);
+            prompt_excerpt = tokenizer_.decode(excerpt_toks);
+            if (prompt_excerpt.size() > 200) prompt_excerpt.resize(200);
+        }
+        status_.set_running(prompt_excerpt, (int)req.prompt_tokens.size(), req.stream);
+        broadcast_status();
+        StatusGuard status_guard{status_};
+
         auto finish_job = [&]() {
             std::lock_guard<std::mutex> lk(job->mu);
             job->done = true;
@@ -1444,6 +1573,17 @@ void HttpServer::worker_loop() {
         DaemonIO io;
         io.stream_fd = -1;  // no pipe — we write SSE directly
 
+        // Inference observer: updates status page with draft tokens per step.
+        io.observer = [&](const char * phase, const std::vector<int32_t> & tokens) {
+            std::vector<std::string> token_strs;
+            token_strs.reserve(tokens.size());
+            for (int32_t t : tokens) {
+                token_strs.push_back(tokenizer_.token_text(t));
+            }
+            status_.set_draft_tokens(token_strs);
+            broadcast_status();
+        };
+
         int completion_tokens = 0;
         bool client_disconnected = false;
 
@@ -1451,6 +1591,12 @@ void HttpServer::worker_loop() {
             if (client_disconnected) return false;
             completion_tokens++;
 
+            // Update status page every 10 tokens (low overhead).
+            if (completion_tokens % 10 == 0) {
+                status_.update_completion_tokens(completion_tokens);
+                broadcast_status();
+            }
+
             // Skip EOS/EOT/special tokens — don't forward to SSE.
             int32_t eos = tokenizer_.eos_id();
             int32_t eot = tokenizer_.eos_chat_id();
@@ -1534,6 +1680,10 @@ void HttpServer::worker_loop() {
             backend_.unpark("draft");   // reload decode draft (~3.3 GB)
         }
 
+        // Transition status to decode phase.
+        status_.set_decode();
+        broadcast_status();
+
         GenerateResult result;
         if (using_restore) {
             result = backend_.restore_and_generate_with_empty_spec_fallback(cache_slot, gen_req, io);
@@ -1630,6 +1780,25 @@ void HttpServer::worker_loop() {
         // message_delta usage, Responses response.completed usage).
         // See docs/specs/thinking-budget.md §6.3.
         GenTimings gen_timings{ result.prefill_s, result.decode_s };
+
+        // Record performance for /status page.
+        if (result.ok) {
+            PerfRecord perf;
+            perf.prompt_tokens = (int)req.prompt_tokens.size();
+            perf.completion_tokens = completion_tokens;
+            perf.prefill_tok_s = (result.prefill_s > 0.0)
+                ? (double)req.prompt_tokens.size() / result.prefill_s : 0.0;
+            perf.decode_tok_s = (result.decode_s > 0.0)
+                ? (double)completion_tokens / result.decode_s : 0.0;
+            perf.accept_rate = result.accept_rate;
+            perf.cache_hit = using_restore;
+            perf.pflash = pflash_compressed;
+            perf.spec_decode = result.spec_decode_ran;
+            perf.timestamp = std::chrono::steady_clock::now();
+            status_.record_perf(perf);
+            status_.update_completion_tokens(completion_tokens);
+            broadcast_status();
+        }
         if (req.stream && !client_disconnected) {
             auto final_chunks = emitter.emit_finish(completion_tokens, &gen_timings);
             for (const auto & chunk : final_chunks) {
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index 71c544acb..4c6aac239 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -23,6 +23,7 @@
 #include "common/pflash_drafter_ipc.h"
 #include "model_card.h"
 #include "adaptive_keep_ratio.h"
+#include "server_status.h"
 #include <nlohmann/json.hpp>
 
 #include <atomic>
@@ -289,6 +290,20 @@ class HttpServer {
     // Per-session adaptive keep_ratio bandit state.
     HttpServerSessions sessions_;
 
+    // Live status tracker (read by /status/json, written by worker thread).
+    ServerStatus status_;
+
+    // SSE client connections for /status/events push.
+    std::mutex             sse_mu_;
+    std::vector<int>       sse_fds_;
+
+    // Broadcast current status to all SSE clients. Removes dead fds.
+    void broadcast_status();
+
+    // Resolve and cache path to share/status.html.
+    std::string status_html_path_;
+    std::string resolve_status_html();
+
     // Track prompt tokens for each snapshot slot (for shutdown save).
     std::unordered_map<int, std::vector<int32_t>> slot_tokens_;
 
diff --git a/server/src/server/server_status.h b/server/src/server/server_status.h
new file mode 100644
index 000000000..6700cb159
--- /dev/null
+++ b/server/src/server/server_status.h
@@ -0,0 +1,189 @@
+// Server status tracking for the /status introspection page.
+//
+// Thread-safe status tracker: worker thread writes, HTTP client threads read.
+// Designed for minimal overhead on the inference hot path.
+
+#pragma once
+
+#include <nlohmann/json.hpp>
+
+#include <chrono>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <vector>
+
+namespace dflash::common {
+
+using json = nlohmann::json;
+
+// Performance record for one completed request.
+struct PerfRecord {
+    double prefill_tok_s    = 0.0;    // prefill throughput, tokens per second
+    double decode_tok_s     = 0.0;    // decode throughput, tokens per second
+    float  accept_rate      = 0.0f;   // presumably a 0..1 fraction (status.html multiplies it by 100) — confirm
+    int    prompt_tokens    = 0;      // prompt length in tokens
+    int    completion_tokens = 0;     // generated tokens for the request
+    bool   cache_hit        = false;  // NOTE(review): looks like "served via snapshot restore" — confirm against the caller
+    bool   pflash           = false;  // NOTE(review): looks like "prompt was pflash-compressed" — confirm against the caller
+    bool   spec_decode      = false;  // NOTE(review): looks like "speculative decoding ran" — confirm against the caller
+    std::chrono::steady_clock::time_point timestamp;  // filled in by the caller; not emitted by ServerStatus::to_json()
+};
+
+// Live inference phase.
+enum class InferencePhase {
+    IDLE,     // no request being tracked (initial state; restored by ServerStatus::set_idle())
+    PREFILL,  // entered via ServerStatus::set_running()
+    DECODE,   // entered via ServerStatus::set_decode()
+};
+
+static inline const char * phase_name(InferencePhase p) {
+    // Lower-case label for a phase; ServerStatus::to_json() emits it as the "phase" field.
+    if (p == InferencePhase::IDLE)    return "idle";
+    if (p == InferencePhase::PREFILL) return "prefill";
+    if (p == InferencePhase::DECODE)  return "decode";
+    // Any value outside the enumerators gets a visible placeholder.
+    return "unknown";
+}
+
+class ServerStatus {
+public:
+    static constexpr int kMaxHistory = 50;  // cap on perf_history_ entries
+
+    // Begin tracking a request (worker thread): phase -> PREFILL, completion count and draft tokens reset, start time recorded.
+    void set_running(const std::string & prompt_excerpt, int prompt_tokens,
+                     bool is_stream) {
+        std::lock_guard<std::mutex> lk(mu_);
+        phase_ = InferencePhase::PREFILL;
+        prompt_excerpt_ = prompt_excerpt;
+        prompt_tokens_ = prompt_tokens;
+        completion_tokens_ = 0;
+        is_stream_ = is_stream;
+        draft_tokens_.clear();
+        started_at_ = std::chrono::steady_clock::now();
+    }
+    // Switch the live phase to DECODE; every other live field is left untouched.
+    void set_decode() {
+        std::lock_guard<std::mutex> lk(mu_);
+        phase_ = InferencePhase::DECODE;
+    }
+    // Overwrite the completion-token count with n (an absolute count, not a delta).
+    void update_completion_tokens(int n) {
+        std::lock_guard<std::mutex> lk(mu_);
+        completion_tokens_ = n;
+    }
+    // Replace the token strings reported for the current step.
+    void set_draft_tokens(const std::vector<std::string> & tokens) {
+        std::lock_guard<std::mutex> lk(mu_);
+        draft_tokens_ = tokens;
+    }
+    // Return to IDLE and drop the excerpt and draft tokens; token counts and history are kept.
+    void set_idle() {
+        std::lock_guard<std::mutex> lk(mu_);
+        phase_ = InferencePhase::IDLE;
+        prompt_excerpt_.clear();
+        draft_tokens_.clear();
+    }
+    // Append one finished-request record (evicting the oldest at kMaxHistory) and bump total_requests_.
+    void record_perf(const PerfRecord & rec) {
+        std::lock_guard<std::mutex> lk(mu_);
+        if ((int)perf_history_.size() >= kMaxHistory) {
+            perf_history_.erase(perf_history_.begin());
+        }
+        perf_history_.push_back(rec);
+        total_requests_++;
+    }
+
+    // Snapshot current state as JSON (thread-safe): fields are copied under mu_, the JSON is built after unlocking.
+    json to_json() const {
+        InferencePhase phase;
+        std::string prompt_excerpt;
+        int prompt_tokens = 0;
+        int completion_tokens = 0;
+        bool is_stream = false;
+        std::vector<std::string> draft_tokens;
+        std::vector<PerfRecord> history;
+        int total_requests = 0;
+        double elapsed_s = 0.0;
+
+        {
+            std::lock_guard<std::mutex> lk(mu_);
+            phase = phase_;
+            prompt_excerpt = prompt_excerpt_;
+            prompt_tokens = prompt_tokens_;
+            completion_tokens = completion_tokens_;
+            is_stream = is_stream_;
+            draft_tokens = draft_tokens_;
+            history = perf_history_;
+            total_requests = total_requests_;
+            if (phase != InferencePhase::IDLE) {
+                elapsed_s = std::chrono::duration<double>(
+                    std::chrono::steady_clock::now() - started_at_).count();
+            }
+        }
+
+        json j;
+        j["phase"] = phase_name(phase);
+        j["total_requests"] = total_requests;
+
+        if (phase != InferencePhase::IDLE) {
+            j["current"] = {
+                {"prompt_excerpt", prompt_excerpt},
+                {"prompt_tokens", prompt_tokens},
+                {"completion_tokens", completion_tokens},
+                {"stream", is_stream},
+                {"elapsed_s", elapsed_s},
+                {"draft_tokens", draft_tokens},
+            };
+        } else {
+            j["current"] = nullptr;
+        }
+
+        json perf = json::array();
+        for (const auto & r : history) {
+            perf.push_back({
+                {"prefill_tok_s", r.prefill_tok_s},
+                {"decode_tok_s", r.decode_tok_s},
+                {"accept_rate", r.accept_rate},
+                {"prompt_tokens", r.prompt_tokens},
+                {"completion_tokens", r.completion_tokens},
+                {"cache_hit", r.cache_hit},
+                {"pflash", r.pflash},
+                {"spec_decode", r.spec_decode},
+            });
+        }
+        j["perf_history"] = perf;
+
+        return j;
+    }
+    // Format as SSE event string: "event: status\ndata: {json}\n\n"
+    // Stored strings may hold invalid UTF-8 (e.g. a byte-truncated excerpt); replace it with U+FFFD instead of letting dump() throw.
+    std::string to_sse_event() const {
+        std::string data = to_json().dump(-1, ' ', false, json::error_handler_t::replace);
+        return "event: status\ndata: " + data + "\n\n";
+    }
+
+private:
+    mutable std::mutex mu_;  // guards every member below; mutable so const readers can lock
+
+    // Live state.
+    InferencePhase phase_ = InferencePhase::IDLE;
+    std::string prompt_excerpt_;
+    int prompt_tokens_ = 0;
+    int completion_tokens_ = 0;
+    bool is_stream_ = false;
+    std::vector<std::string> draft_tokens_;
+    std::chrono::steady_clock::time_point started_at_;  // set by set_running(); basis for elapsed_s
+
+    // History.
+    std::vector<PerfRecord> perf_history_;  // oldest first, at most kMaxHistory entries
+    int total_requests_ = 0;                // counts record_perf() calls
+};
+
+// RAII guard that resets status to idle on scope exit.
+struct StatusGuard {
+    ServerStatus & status;  // tracker to reset; must outlive the guard
+    ~StatusGuard() { status.set_idle(); }  // NOTE(review): set_idle() only changes state; nothing here pushes the idle transition to SSE clients — confirm the caller broadcasts it
+};
+
+}  // namespace dflash::common

From 70a50cb136dd17aa9933ecf5a6ad106f8697f1f3 Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Sun, 31 May 2026 16:01:13 +0800
Subject: [PATCH 2/7] fix: ensure status.html is found from build directory

- Add CMake POST_BUILD rule to copy share/status.html into build/share/
- Add exe_dir/share/status.html as a search path (build dir layout)
- Keeps existing ../share/ and ./share/ fallbacks

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../plan.md                                   | 50 +++++++++++++++++++
 server/CMakeLists.txt                         |  9 ++++
 server/src/server/http_server.cpp             | 17 +++++--
 3 files changed, 72 insertions(+), 4 deletions(-)
 create mode 100644 .copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md

diff --git a/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md b/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md
new file mode 100644
index 000000000..a5a786fcc
--- /dev/null
+++ b/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md
@@ -0,0 +1,50 @@
+# Plan: Hybrid Routed+Cold Path for All MoE Layers
+
+## Problem
+When `cold_compute` is enabled and layers have cold experts, DeltaNet layers are forced off the
+routed fast path into the slower split path. This adds overhead from:
+- Extra sync at split path entry (line 365)
+- Hot graph rebuild/dispatch instead of reusing the pre-cached `rffn` graph
+- Less pipelined execution
+
+## Approach
+Extend both the **routed fast path** (DeltaNet) and the **split-path routed FFN sub-path**
+(attention layers) to handle cold experts inline — running the cold fused kernel on CPU in
+parallel with the GPU routed FFN dispatch.
+
+**Key property:** When all experts are hot at runtime (n_cold = 0), the cold branch is simply
+skipped. The path executes identically to today's routed fast path — zero overhead.
+
+## Changes
+
+### 1. Routed Fast Path (DeltaNet layers, lines 260-355)
+
+**Remove** the cold_compute guard from line 263:
+```cpp
+// Before:
+&& !(state.cold_compute && !hybrid.layers[(size_t)il].cold_expert_ids.empty())
+// After: removed — hybrid path handles cold inline
+```
+
+**Modify** the routing remap loop (lines 296-308) to also record cold IDs/weights.
+
+**After remap, before rffn dispatch** — read ffn_post to CPU if cold compute needed.
+
+**After rffn dispatch (async)** — run cold compute on CPU (parallel with GPU rffn).
+
+**Combine** — upload cold result instead of zeroing.
+
+### 2. Split-Path Routed FFN Sub-Path (attention layers, lines 441-495)
+
+Same pattern: remove cold_compute guard, add cold partition + D2H + cold compute + upload.
+
+### 3. Telemetry
+
+Add `hybrid_cold_compute_us` counter for the new inline cold compute timing.
+
+## Timing Analysis
+
+- Current split path per mixed layer: ~1630µs
+- Proposed hybrid: prefn(300µs) + max(cold(850µs), rffn(300µs)) + combine(50µs) ≈ 1200µs
+- Conservative estimate: 5-10% throughput improvement
+- All-hot case: zero overhead (cold branch skipped)
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index d42762f35..79a356115 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -726,6 +726,15 @@ if(DFLASH27B_TESTS)
         else()
             target_link_libraries(dflash_server PRIVATE hip::host)
         endif()
+
+        # Copy share/status.html next to the binary so it can be found at runtime.
+        file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/share")
+        add_custom_command(TARGET dflash_server POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${CMAKE_CURRENT_SOURCE_DIR}/share/status.html"
+                "${CMAKE_CURRENT_BINARY_DIR}/share/status.html"
+            COMMENT "Copying status.html to build/share/"
+        )
     endif()
 
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/ipc/backend_ipc_main.cpp")
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 7a93f26ce..525279fd2 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -481,7 +481,7 @@ std::string HttpServer::resolve_status_html() {
         struct stat st;
         if (::stat(path.c_str(), &st) == 0) return path;
     }
-    // 2. ../share/ relative to /proc/self/exe
+    // 2. share/ relative to /proc/self/exe (build dir or installed prefix)
     char exe_buf[1024] = {};
     ssize_t len = ::readlink("/proc/self/exe", exe_buf, sizeof(exe_buf) - 1);
     if (len > 0) {
@@ -490,9 +490,18 @@ std::string HttpServer::resolve_status_html() {
         auto slash = exe_dir.rfind('/');
         if (slash != std::string::npos) {
             exe_dir = exe_dir.substr(0, slash);
-            std::string path = exe_dir + "/../share/status.html";
-            struct stat st;
-            if (::stat(path.c_str(), &st) == 0) return path;
+            // 2a. <exe_dir>/share/status.html  (build directory layout)
+            {
+                std::string path = exe_dir + "/share/status.html";
+                struct stat st;
+                if (::stat(path.c_str(), &st) == 0) return path;
+            }
+            // 2b. <exe_dir>/../share/status.html  (installed prefix layout)
+            {
+                std::string path = exe_dir + "/../share/status.html";
+                struct stat st;
+                if (::stat(path.c_str(), &st) == 0) return path;
+            }
         }
     }
     // 3. ./share/status.html (development)

From ac807385a862ebbd55e6f4af5171d483322e66a2 Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Sun, 31 May 2026 16:30:25 +0800
Subject: [PATCH 3/7] feat(status): show full request details and live response
 output

- Add request params to status: model, format, temperature, top_p/k,
  max_output, thinking_enabled, session_id, cache/pflash/spec_decode flags
- Add incremental 'event: token' SSE events (browser accumulates output)
- Add messages JSON to status event (sent once per request)
- Redesigned HTML: two-column request/response view, params grid, feature
  tags (cache hit, pflash, spec decode, stream, thinking), live tok/s
- Output text is accumulated client-side; the server does not retain generated text

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 server/share/status.html          | 173 ++++++++++++++++++++++++++----
 server/src/server/http_server.cpp |  50 ++++++++-
 server/src/server/http_server.h   |   3 +
 server/src/server/server_status.h |  55 +++++++++-
 4 files changed, 256 insertions(+), 25 deletions(-)

diff --git a/server/share/status.html b/server/share/status.html
index eaf277adc..6c8b4fee4 100644
--- a/server/share/status.html
+++ b/server/share/status.html
@@ -21,15 +21,32 @@
 .stat { display: inline-block; margin-right: 24px; margin-bottom: 8px; }
 .stat-value { font-size: 1.4em; font-weight: 700; color: #f0f6fc; }
 .stat-label { font-size: 0.75em; color: #8b949e; }
-.prompt-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px;
-              padding: 8px 12px; font-family: monospace; font-size: 0.85em;
-              max-height: 80px; overflow: hidden; word-break: break-all;
-              color: #8b949e; margin-top: 8px; }
+.tag { display: inline-block; padding: 2px 8px; border-radius: 4px;
+       font-size: 0.7em; font-weight: 600; margin-right: 6px; margin-top: 4px; }
+.tag-green { background: #3fb95022; color: #3fb950; border: 1px solid #3fb95044; }
+.tag-orange { background: #f0883e22; color: #f0883e; border: 1px solid #f0883e44; }
+.tag-blue { background: #1f6feb22; color: #58a6ff; border: 1px solid #1f6feb44; }
+.tag-gray { background: #8b949e22; color: #8b949e; border: 1px solid #8b949e44; }
+.params-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(140px, 1fr));
+               gap: 8px; margin-top: 8px; }
+.param { background: #0d1117; border: 1px solid #30363d; border-radius: 4px;
+         padding: 6px 10px; }
+.param-label { font-size: 0.7em; color: #8b949e; text-transform: uppercase; }
+.param-value { font-size: 0.9em; color: #f0f6fc; font-weight: 500; }
+.text-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px;
+            padding: 8px 12px; font-family: monospace; font-size: 0.82em;
+            max-height: 300px; overflow-y: auto; word-break: break-word;
+            white-space: pre-wrap; color: #c9d1d9; margin-top: 8px; line-height: 1.5; }
+.text-box:empty::after { content: '(waiting...)'; color: #484f58; }
+.messages-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px;
+                padding: 8px 12px; font-family: monospace; font-size: 0.78em;
+                max-height: 200px; overflow-y: auto; word-break: break-word;
+                white-space: pre-wrap; color: #8b949e; margin-top: 8px; }
 .tokens-box { margin-top: 8px; }
 .token { display: inline-block; background: #1f6feb22; border: 1px solid #1f6feb;
          border-radius: 4px; padding: 2px 6px; margin: 2px; font-family: monospace;
          font-size: 0.8em; color: #79c0ff; }
-.chart-container { width: 100%; height: 200px; position: relative; }
+.chart-container { width: 100%; height: 180px; position: relative; }
 .chart-container svg { width: 100%; height: 100%; }
 .legend { display: flex; gap: 16px; margin-top: 8px; font-size: 0.75em; }
 .legend-item { display: flex; align-items: center; gap: 4px; }
@@ -38,7 +55,8 @@
                      border-radius: 4px; }
 .connected { background: #3fb95033; color: #3fb950; }
 .disconnected { background: #f8514933; color: #f85149; }
-#no-data { color: #484f58; text-align: center; padding: 40px; }
+.two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
+@media (max-width: 900px) { .two-col { grid-template-columns: 1fr; } }
 </style>
 </head>
 <body>
@@ -47,7 +65,7 @@ <h1>&#x26A1; DFlash Server Status
 </h1>
 
 <div class="card">
-  <h2>Current State</h2>
+  <h2>Current Request</h2>
   <div id="phase-section">
     <span id="phase-badge" class="badge badge-idle">idle</span>
     <span class="stat" style="margin-left:16px">
@@ -69,12 +87,28 @@ <h2>Current State</h2>
         <span class="stat-value" id="cur-elapsed">0.0s</span>
         <span class="stat-label">elapsed</span>
       </span>
+      <span class="stat">
+        <span class="stat-value" id="cur-toks">-</span>
+        <span class="stat-label">tok/s (live)</span>
+      </span>
     </div>
-    <div class="prompt-box" id="cur-prompt"></div>
+    <div id="cur-tags"></div>
+    <div class="params-grid" id="cur-params"></div>
     <div class="tokens-box" id="cur-draft-tokens"></div>
   </div>
 </div>
 
+<div class="two-col" id="req-resp-section" style="display:none;">
+  <div class="card">
+    <h2>Request Messages</h2>
+    <div class="messages-box" id="cur-messages"></div>
+  </div>
+  <div class="card">
+    <h2>Response Output</h2>
+    <div class="text-box" id="cur-output"></div>
+  </div>
+</div>
+
 <div class="card">
   <h2>Prefill Performance</h2>
   <div class="chart-container" id="chart-prefill"></div>
@@ -93,25 +127,27 @@ <h2>Decode Performance</h2>
 </div>
 
 <script>
+// Client-side state
+let outputText = '';
+let lastRequestId = null;
+
 function drawChart(containerId, datasets, yMax) {
   const container = document.getElementById(containerId);
   if (!datasets[0].data.length) {
-    container.innerHTML = '<div id="no-data">No data yet</div>';
+    container.innerHTML = '<div style="color:#484f58;text-align:center;padding:40px">No data yet</div>';
     return;
   }
-  const W = 800, H = 180, PAD = 40;
+  const W = 800, H = 160, PAD = 40;
   const n = datasets[0].data.length;
   const maxVal = yMax || Math.max(...datasets.flatMap(d => d.data), 1);
 
   let svg = '<svg viewBox="0 0 ' + W + ' ' + H + '" preserveAspectRatio="none">';
-  // Grid lines
   for (let i = 0; i <= 4; i++) {
     const y = PAD + (H - PAD * 2) * (1 - i / 4);
     const val = (maxVal * i / 4).toFixed(0);
     svg += '<line x1="' + PAD + '" y1="' + y + '" x2="' + (W - 10) + '" y2="' + y + '" stroke="#30363d" stroke-width="0.5"/>';
     svg += '<text x="' + (PAD - 4) + '" y="' + (y + 4) + '" fill="#8b949e" font-size="10" text-anchor="end">' + val + '</text>';
   }
-  // Data lines
   for (const ds of datasets) {
     let path = '';
     for (let i = 0; i < n; i++) {
@@ -120,7 +156,6 @@ <h2>Decode Performance</h2>
       path += (i === 0 ? 'M' : 'L') + x.toFixed(1) + ',' + y.toFixed(1);
     }
     svg += '<path d="' + path + '" fill="none" stroke="' + ds.color + '" stroke-width="2"/>';
-    // Dots for last 10 points
     const dotStart = Math.max(0, n - 10);
     for (let i = dotStart; i < n; i++) {
       const x = PAD + (W - PAD - 10) * i / Math.max(n - 1, 1);
@@ -132,22 +167,90 @@ <h2>Decode Performance</h2>
   container.innerHTML = svg;
 }
 
+function escapeHtml(s) {
+  return s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+}
+
+function formatMessages(messagesStr) {
+  if (!messagesStr) return '';
+  try {
+    const msgs = JSON.parse(messagesStr);
+    if (!Array.isArray(msgs)) return escapeHtml(messagesStr);
+    return msgs.map(function(m) {
+      const role = m.role || '?';
+      let content = '';
+      if (typeof m.content === 'string') content = m.content;
+      else if (Array.isArray(m.content)) {
+        content = m.content.map(function(p) {
+          if (p.type === 'text') return p.text || '';
+          return '[' + p.type + ']';
+        }).join('');
+      }
+      // Truncate long messages for display
+      if (content.length > 500) content = content.substring(0, 500) + '...';
+      return '<b style="color:#58a6ff">' + escapeHtml(role) + ':</b> ' + escapeHtml(content);
+    }).join('\n\n');
+  } catch (e) {
+    return escapeHtml(messagesStr);
+  }
+}
+
 function update(data) {
-  // Phase badge
   const badge = document.getElementById('phase-badge');
   badge.textContent = data.phase;
   badge.className = 'badge badge-' + data.phase;
-
   document.getElementById('total-req').textContent = data.total_requests;
 
-  // Current info
   const info = document.getElementById('current-info');
+  const reqResp = document.getElementById('req-resp-section');
+
   if (data.current) {
     info.style.display = 'block';
+    reqResp.style.display = 'grid';
+
+    // Reset output accumulator on new request
+    const curId = data.current.prompt_tokens + '_' + data.total_requests;
+    if (curId !== lastRequestId) {
+      outputText = '';
+      lastRequestId = curId;
+    }
+
     document.getElementById('cur-prompt-tokens').textContent = data.current.prompt_tokens;
     document.getElementById('cur-completion-tokens').textContent = data.current.completion_tokens;
-    document.getElementById('cur-elapsed').textContent = data.current.elapsed_s.toFixed(1) + 's';
-    document.getElementById('cur-prompt').textContent = data.current.prompt_excerpt || '(no excerpt)';
+    const elapsed = data.current.elapsed_s.toFixed(1);
+    document.getElementById('cur-elapsed').textContent = elapsed + 's';
+
+    // Live tok/s
+    const liveToks = data.current.elapsed_s > 0.5
+      ? (data.current.completion_tokens / data.current.elapsed_s).toFixed(1)
+      : '-';
+    document.getElementById('cur-toks').textContent = liveToks;
+
+    // Tags: cache, pflash, spec_decode, stream, thinking
+    let tags = '';
+    if (data.current.cache_hit) tags += '<span class="tag tag-green">cache hit</span>';
+    if (data.current.pflash) tags += '<span class="tag tag-orange">pflash</span>';
+    if (data.current.spec_decode) tags += '<span class="tag tag-blue">spec decode</span>';
+    if (data.current.stream) tags += '<span class="tag tag-gray">stream</span>';
+    if (data.current.thinking_enabled) tags += '<span class="tag tag-blue">thinking</span>';
+    document.getElementById('cur-tags').innerHTML = tags;
+
+    // Params grid
+    let params = '';
+    function addParam(label, value) {
+      if (value === undefined || value === null || value === '') return;
+      params += '<div class="param"><div class="param-label">' + label + '</div><div class="param-value">' + escapeHtml(String(value)) + '</div></div>';
+    }
+    addParam('Model', data.current.model);
+    addParam('Format', data.current.format);
+    addParam('Max Output', data.current.max_output);
+    addParam('Temperature', data.current.temperature);
+    addParam('Top P', data.current.top_p);
+    if (data.current.top_k > 0) addParam('Top K', data.current.top_k);
+    if (data.current.session_id) addParam('Session', data.current.session_id);
+    document.getElementById('cur-params').innerHTML = params;
+
+    // Draft tokens
     const dtContainer = document.getElementById('cur-draft-tokens');
     if (data.current.draft_tokens && data.current.draft_tokens.length) {
       dtContainer.innerHTML = '<strong style="color:#8b949e;font-size:0.8em">Draft tokens: </strong>' +
@@ -155,8 +258,24 @@ <h2>Decode Performance</h2>
     } else {
       dtContainer.innerHTML = '';
     }
+
+    // Messages
+    const msgEl = document.getElementById('cur-messages');
+    if (data.current.messages) {
+      msgEl.innerHTML = formatMessages(data.current.messages);
+    } else {
+      msgEl.innerHTML = '<span style="color:#484f58">(no messages)</span>';
+    }
+
+    // Output (accumulated client-side)
+    const outEl = document.getElementById('cur-output');
+    outEl.textContent = outputText;
+    outEl.scrollTop = outEl.scrollHeight;
   } else {
     info.style.display = 'none';
+    reqResp.style.display = 'none';
+    outputText = '';
+    lastRequestId = null;
   }
 
   // Charts
@@ -175,10 +294,6 @@ <h2>Decode Performance</h2>
   ], decodeMax * 1.1);
 }
 
-function escapeHtml(s) {
-  return s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
-}
-
 // SSE connection
 function connectSSE() {
   const connEl = document.getElementById('conn-status');
@@ -196,11 +311,23 @@ <h2>Decode Performance</h2>
     } catch (err) {}
   });
 
+  // Incremental token events — accumulate in browser
+  es.addEventListener('token', function(e) {
+    try {
+      const data = JSON.parse(e.data);
+      if (data.text) {
+        outputText += data.text;
+        const outEl = document.getElementById('cur-output');
+        outEl.textContent = outputText;
+        outEl.scrollTop = outEl.scrollHeight;
+      }
+    } catch (err) {}
+  });
+
   es.onerror = function() {
     connEl.textContent = 'disconnected';
     connEl.className = 'connection-status disconnected';
     es.close();
-    // Reconnect after 2 seconds
     setTimeout(connectSSE, 2000);
   };
 }
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 525279fd2..7124e67ef 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -530,6 +530,25 @@ void HttpServer::broadcast_status() {
     }
 }
 
+// Broadcast a token text delta as an incremental SSE event.
+void HttpServer::broadcast_token(const std::string & text) {
+    json j = {{"text", text}};
+    std::string event = "event: token\ndata: " + j.dump() + "\n\n";
+    std::lock_guard<std::mutex> lk(sse_mu_);
+    std::vector<int> dead;
+    for (int fd : sse_fds_) {
+        ssize_t sent = ::send(fd, event.data(), event.size(), MSG_NOSIGNAL);
+        if (sent <= 0) {
+            dead.push_back(fd);
+        }
+    }
+    for (int fd : dead) {
+        ::close(fd);
+        sse_fds_.erase(std::remove(sse_fds_.begin(), sse_fds_.end(), fd),
+                       sse_fds_.end());
+    }
+}
+
 HttpServer::~HttpServer() {
     shutdown();
 }
@@ -1248,7 +1267,24 @@ void HttpServer::worker_loop() {
             prompt_excerpt = tokenizer_.decode(excerpt_toks);
             if (prompt_excerpt.size() > 200) prompt_excerpt.resize(200);
         }
-        status_.set_running(prompt_excerpt, (int)req.prompt_tokens.size(), req.stream);
+        {
+            ServerStatus::RequestInfo info;
+            info.model = req.model;
+            info.format = api_format_name(req.format);
+            info.session_id = req.session_id;
+            info.max_output = req.max_output;
+            info.temperature = req.sampler.temp;
+            info.top_p = req.sampler.top_p;
+            info.top_k = req.sampler.top_k;
+            info.thinking_enabled = req.thinking_enabled;
+            status_.set_running(prompt_excerpt, (int)req.prompt_tokens.size(), req.stream, info);
+        }
+        // Store messages JSON for inspection; byte-truncated at 4096, so it may not parse as JSON.
+        if (!req.messages.is_null()) {
+            std::string msg_str = req.messages.dump();
+            if (msg_str.size() > 4096) msg_str.resize(4096);
+            status_.set_messages(msg_str);
+        }
         broadcast_status();
         StatusGuard status_guard{status_};
 
@@ -1578,6 +1614,10 @@ void HttpServer::worker_loop() {
             snap_slot,
             snap_cut);
 
+        // Update status page with cache/pflash/spec-decode flags.
+        status_.set_flags(using_restore, pflash_compressed, !config_.draft_path.empty());
+        broadcast_status();
+
         // Set up DaemonIO with on_token callback for streaming + disconnect.
         DaemonIO io;
         io.stream_fd = -1;  // no pipe — we write SSE directly
@@ -1615,6 +1655,7 @@ void HttpServer::worker_loop() {
 
             // Gemma4 thinking channel: map <|channel> → <think>, <channel|> → </think>\n
             if (raw == "<|channel>") {
+                broadcast_token("<think>");
                 if (req.stream) {
                     auto chunks = emitter.emit_token("<think>");
                     for (const auto & chunk : chunks)
@@ -1623,6 +1664,7 @@ void HttpServer::worker_loop() {
                 return true;
             }
             if (raw == "<channel|>") {
+                broadcast_token("</think>\n");
                 if (req.stream) {
                     auto chunks = emitter.emit_token("</think>\n");
                     for (const auto & chunk : chunks)
@@ -1639,6 +1681,7 @@ void HttpServer::worker_loop() {
             // reasoning_content with empty visible content. Forward the text
             // form into the emitter so parse_reasoning() can split correctly.
             if (raw == "<think>" || raw == "</think>") {
+                broadcast_token(raw == "</think>" ? "</think>\n" : "<think>");
                 if (req.stream) {
                     auto chunks = emitter.emit_token(
                         raw == "</think>" ? "</think>\n" : "<think>");
@@ -1657,6 +1700,11 @@ void HttpServer::worker_loop() {
 
             std::string text = tokenizer_.token_text(token);
 
+            // Send token text to status page clients (browser accumulates).
+            if (!text.empty()) {
+                broadcast_token(text);
+            }
+
             if (req.stream && !text.empty()) {
                 auto chunks = emitter.emit_token(text);
                 for (const auto & chunk : chunks) {
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index 4c6aac239..f40d8b221 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -300,6 +300,9 @@ class HttpServer {
     // Broadcast current status to all SSE clients. Removes dead fds.
     void broadcast_status();
 
+    // Broadcast incremental token text to SSE clients.
+    void broadcast_token(const std::string & text);
+
     // Resolve and cache path to share/status.html.
     std::string status_html_path_;
     std::string resolve_status_html();
diff --git a/server/src/server/server_status.h b/server/src/server/server_status.h
index 6700cb159..132d34364 100644
--- a/server/src/server/server_status.h
+++ b/server/src/server/server_status.h
@@ -50,9 +50,21 @@ class ServerStatus {
 public:
     static constexpr int kMaxHistory = 50;
 
+    // Request details passed at set_running time.
+    struct RequestInfo {
+        std::string model;
+        std::string format;       // "chat", "anthropic", "responses"
+        std::string session_id;
+        int         max_output       = 0;
+        float       temperature      = 0.0f;
+        float       top_p            = 1.0f;
+        int         top_k            = 0;
+        bool        thinking_enabled = false;
+    };
+
     // Called by worker thread to update live state.
     void set_running(const std::string & prompt_excerpt, int prompt_tokens,
-                     bool is_stream) {
+                     bool is_stream, const RequestInfo & info) {
         std::lock_guard<std::mutex> lk(mu_);
         phase_ = InferencePhase::PREFILL;
         prompt_excerpt_ = prompt_excerpt;
@@ -60,14 +72,30 @@ class ServerStatus {
         completion_tokens_ = 0;
         is_stream_ = is_stream;
         draft_tokens_.clear();
+        request_info_ = info;
+        cache_hit_ = false;
+        pflash_ = false;
+        spec_decode_ = false;
         started_at_ = std::chrono::steady_clock::now();
     }
 
+    void set_messages(const std::string & messages_json) {
+        std::lock_guard<std::mutex> lk(mu_);
+        messages_json_ = messages_json;
+    }
+
     void set_decode() {
         std::lock_guard<std::mutex> lk(mu_);
         phase_ = InferencePhase::DECODE;
     }
 
+    void set_flags(bool cache_hit, bool pflash, bool spec_decode) {
+        std::lock_guard<std::mutex> lk(mu_);
+        cache_hit_ = cache_hit;
+        pflash_ = pflash;
+        spec_decode_ = spec_decode;
+    }
+
     void update_completion_tokens(int n) {
         std::lock_guard<std::mutex> lk(mu_);
         completion_tokens_ = n;
@@ -105,6 +133,9 @@ class ServerStatus {
         std::vector<PerfRecord> history;
         int total_requests = 0;
         double elapsed_s = 0.0;
+        RequestInfo info;
+        bool cache_hit = false, pflash = false, spec_decode = false;
+        std::string messages_json;
 
         {
             std::lock_guard<std::mutex> lk(mu_);
@@ -116,6 +147,11 @@ class ServerStatus {
             draft_tokens = draft_tokens_;
             history = perf_history_;
             total_requests = total_requests_;
+            info = request_info_;
+            cache_hit = cache_hit_;
+            pflash = pflash_;
+            spec_decode = spec_decode_;
+            messages_json = messages_json_;
             if (phase != InferencePhase::IDLE) {
                 elapsed_s = std::chrono::duration<double>(
                     std::chrono::steady_clock::now() - started_at_).count();
@@ -134,6 +170,18 @@ class ServerStatus {
                 {"stream", is_stream},
                 {"elapsed_s", elapsed_s},
                 {"draft_tokens", draft_tokens},
+                {"model", info.model},
+                {"format", info.format},
+                {"max_output", info.max_output},
+                {"temperature", info.temperature},
+                {"top_p", info.top_p},
+                {"top_k", info.top_k},
+                {"thinking_enabled", info.thinking_enabled},
+                {"session_id", info.session_id},
+                {"cache_hit", cache_hit},
+                {"pflash", pflash},
+                {"spec_decode", spec_decode},
+                {"messages", messages_json},
             };
         } else {
             j["current"] = nullptr;
@@ -174,6 +222,11 @@ class ServerStatus {
     bool is_stream_ = false;
     std::vector<std::string> draft_tokens_;
     std::chrono::steady_clock::time_point started_at_;
+    RequestInfo request_info_;
+    bool cache_hit_ = false;
+    bool pflash_ = false;
+    bool spec_decode_ = false;
+    std::string messages_json_;
 
     // History.
     std::vector<PerfRecord> perf_history_;

From 55af9341c5d82d024975510222db39b179e6faf1 Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Sun, 31 May 2026 16:54:58 +0800
Subject: [PATCH 4/7] fix: prevent crash on incomplete UTF-8 in status JSON
 serialization

Token text can contain partial UTF-8 sequences (tokens split multi-byte
codepoints). Use json::error_handler_t::replace in all dump() calls on
status paths so invalid bytes become U+FFFD instead of throwing
type_error 316.

Also remove the unrelated .copilot/session-state plan.md that was added
in PATCH 2/7.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../plan.md                                   | 50 -------------------
 server/src/server/http_server.cpp             | 12 +++--
 server/src/server/server_status.h             |  3 +-
 3 files changed, 10 insertions(+), 55 deletions(-)
 delete mode 100644 .copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md

diff --git a/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md b/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md
deleted file mode 100644
index a5a786fcc..000000000
--- a/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# Plan: Hybrid Routed+Cold Path for All MoE Layers
-
-## Problem
-When `cold_compute` is enabled and layers have cold experts, DeltaNet layers are forced off the
-routed fast path into the slower split path. This adds overhead from:
-- Extra sync at split path entry (line 365)
-- Hot graph rebuild/dispatch instead of reusing the pre-cached `rffn` graph
-- Less pipelined execution
-
-## Approach
-Extend both the **routed fast path** (DeltaNet) and the **split-path routed FFN sub-path**
-(attention layers) to handle cold experts inline — running the cold fused kernel on CPU in
-parallel with the GPU routed FFN dispatch.
-
-**Key property:** When all experts are hot at runtime (n_cold = 0), the cold branch is simply
-skipped. The path executes identically to today's routed fast path — zero overhead.
-
-## Changes
-
-### 1. Routed Fast Path (DeltaNet layers, lines 260-355)
-
-**Remove** the cold_compute guard from line 263:
-```cpp
-// Before:
-&& !(state.cold_compute && !hybrid.layers[(size_t)il].cold_expert_ids.empty())
-// After: removed — hybrid path handles cold inline
-```
-
-**Modify** the routing remap loop (lines 296-308) to also record cold IDs/weights.
-
-**After remap, before rffn dispatch** — read ffn_post to CPU if cold compute needed.
-
-**After rffn dispatch (async)** — run cold compute on CPU (parallel with GPU rffn).
-
-**Combine** — upload cold result instead of zeroing.
-
-### 2. Split-Path Routed FFN Sub-Path (attention layers, lines 441-495)
-
-Same pattern: remove cold_compute guard, add cold partition + D2H + cold compute + upload.
-
-### 3. Telemetry
-
-Add `hybrid_cold_compute_us` counter for the new inline cold compute timing.
-
-## Timing Analysis
-
-- Current split path per mixed layer: ~1630µs
-- Proposed hybrid: prefn(300µs) + max(cold(850µs), rffn(300µs)) + combine(50µs) ≈ 1200µs
-- Conservative estimate: 5-10% throughput improvement
-- All-hot case: zero overhead (cold branch skipped)
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 7124e67ef..27e7e1310 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -532,8 +532,13 @@ void HttpServer::broadcast_status() {
 
 // Broadcast a token text delta as an incremental SSE event.
 void HttpServer::broadcast_token(const std::string & text) {
-    json j = {{"text", text}};
-    std::string event = "event: token\ndata: " + j.dump() + "\n\n";
+    // Token text may contain incomplete UTF-8 (tokens can split multi-byte
+    // codepoints). Serialize with json::error_handler_t::replace so invalid
+    // bytes become U+FFFD instead of dump() throwing type_error 316.
+    json j;
+    j["text"] = text;
+    std::string event = "event: token\ndata: " +
+        j.dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n";
     std::lock_guard<std::mutex> lk(sse_mu_);
     std::vector<int> dead;
     for (int fd : sse_fds_) {
@@ -791,7 +796,8 @@ void HttpServer::handle_client(int fd) {
 
     // Status JSON snapshot (for non-SSE clients / debugging).
     if (hr.method == "GET" && hr.path == "/status/json") {
-        send_response(fd, 200, "application/json", status_.to_json().dump() + "\n");
+        send_response(fd, 200, "application/json",
+            status_.to_json().dump(-1, ' ', false, json::error_handler_t::replace) + "\n");
         ::close(fd);
         return;
     }
diff --git a/server/src/server/server_status.h b/server/src/server/server_status.h
index 132d34364..98794e919 100644
--- a/server/src/server/server_status.h
+++ b/server/src/server/server_status.h
@@ -205,9 +205,8 @@ class ServerStatus {
         return j;
     }
 
-    // Format as SSE event string: "event: status\ndata: {json}\n\n"
     std::string to_sse_event() const {
-        std::string data = to_json().dump();
+        std::string data = to_json().dump(-1, ' ', false, json::error_handler_t::replace);
         return "event: status\ndata: " + data + "\n\n";
     }
 

From 2cdbab2eeac6d548c89c4c33065f36a19673cdc1 Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Sun, 31 May 2026 19:51:44 +0800
Subject: [PATCH 5/7] fix(status): address PR #322 review comments

- Add /status/json to kApiEndpoints registry
- Replace raw ::send() with sse_try_send() helper that handles partial
  writes via poll loop with a short 1s timeout (avoids stalling worker)
- Add sse_heartbeat() to prune disconnected SSE clients during idle
  periods (worker dequeue uses timed wait, sends heartbeat every 30s)
- Use $<TARGET_FILE_DIR:dflash_server> in CMake POST_BUILD copy rule
  for correct output path with multi-config generators
- Add install(FILES) rule for status.html
- Clear messages panel when a new request starts in the browser
- Use incremental DOM append (createTextNode) for token events instead
  of re-rendering full output text on each token

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 server/CMakeLists.txt             |  7 +++-
 server/share/status.html          | 13 +++++--
 server/src/server/http_server.cpp | 63 ++++++++++++++++++++++++++++---
 server/src/server/http_server.h   |  3 ++
 4 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index 79a356115..ced5b164b 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -728,13 +728,16 @@ if(DFLASH27B_TESTS)
         endif()
 
         # Copy share/status.html next to the binary so it can be found at runtime.
-        file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/share")
         add_custom_command(TARGET dflash_server POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E make_directory
+                "$<TARGET_FILE_DIR:dflash_server>/share"
             COMMAND ${CMAKE_COMMAND} -E copy_if_different
                 "${CMAKE_CURRENT_SOURCE_DIR}/share/status.html"
-                "${CMAKE_CURRENT_BINARY_DIR}/share/status.html"
+                "$<TARGET_FILE_DIR:dflash_server>/share/status.html"
             COMMENT "Copying status.html to build/share/"
         )
+        install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/share/status.html"
+                DESTINATION share)
     endif()
 
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/ipc/backend_ipc_main.cpp")
diff --git a/server/share/status.html b/server/share/status.html
index 6c8b4fee4..dbfe6f3aa 100644
--- a/server/share/status.html
+++ b/server/share/status.html
@@ -213,6 +213,9 @@ <h2>Decode Performance</h2>
     if (curId !== lastRequestId) {
       outputText = '';
       lastRequestId = curId;
+      // Clear output and messages for the new request
+      document.getElementById('cur-output').textContent = '';
+      document.getElementById('cur-messages').innerHTML = '';
     }
 
     document.getElementById('cur-prompt-tokens').textContent = data.current.prompt_tokens;
@@ -267,9 +270,11 @@ <h2>Decode Performance</h2>
       msgEl.innerHTML = '<span style="color:#484f58">(no messages)</span>';
     }
 
-    // Output (accumulated client-side)
+    // Output — only set if empty (incremental append handles tokens)
     const outEl = document.getElementById('cur-output');
-    outEl.textContent = outputText;
+    if (outEl.textContent === '' && outputText) {
+      outEl.textContent = outputText;
+    }
     outEl.scrollTop = outEl.scrollHeight;
   } else {
     info.style.display = 'none';
@@ -311,14 +316,14 @@ <h2>Decode Performance</h2>
     } catch (err) {}
   });
 
-  // Incremental token events — accumulate in browser
+  // Incremental token events — append to DOM incrementally (no full re-render)
   es.addEventListener('token', function(e) {
     try {
       const data = JSON.parse(e.data);
       if (data.text) {
         outputText += data.text;
         const outEl = document.getElementById('cur-output');
-        outEl.textContent = outputText;
+        outEl.appendChild(document.createTextNode(data.text));
         outEl.scrollTop = outEl.scrollHeight;
       }
     } catch (err) {}
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 27e7e1310..14833b9e1 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -49,6 +49,7 @@ static const std::vector<std::string> kApiEndpoints = {
     "GET /props",
     "GET /status",
     "GET /status/events",
+    "GET /status/json",
     "GET /v1/models",
     "POST /v1/chat/completions",
     "POST /v1/messages",
@@ -512,14 +513,42 @@ std::string HttpServer::resolve_status_html() {
     return {};
 }
 
+// Write `len` bytes to SSE client `fd` within a 1s overall deadline so a slow
+// client cannot stall the inference worker. Returns false on failure/timeout.
+static bool sse_try_send(int fd, const void * data, size_t len) {
+    const char * p = static_cast<const char *>(data);
+    size_t sent = 0;
+    const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(1);
+    while (sent < len) {
+        const long long remaining = std::chrono::duration_cast<std::chrono::milliseconds>(
+            deadline - std::chrono::steady_clock::now()).count();
+        if (remaining <= 0) return false;
+        // Wait for writability in slices of at most 50ms; retry poll() on EINTR.
+        struct pollfd pfd = {fd, POLLOUT, 0};
+        int ret;
+        do {
+            ret = poll(&pfd, 1, static_cast<int>(std::min<long long>(remaining, 50)));
+        } while (ret < 0 && errno == EINTR);
+        if (ret < 0 || (pfd.revents & (POLLERR | POLLHUP | POLLNVAL))) return false;
+        if (ret == 0) continue;
+        // MSG_DONTWAIT keeps a blocking socket from stalling past the deadline.
+        ssize_t n = ::send(fd, p + sent, len - sent, MSG_NOSIGNAL | MSG_DONTWAIT);
+        if (n < 0) {
+            if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) continue;
+            return false;
+        }
+        sent += static_cast<size_t>(n);
+    }
+    return true;
+}
+
 // Broadcast current status as SSE event to all connected /status/events clients.
 void HttpServer::broadcast_status() {
     std::string event = status_.to_sse_event();
     std::lock_guard<std::mutex> lk(sse_mu_);
     std::vector<int> dead;
     for (int fd : sse_fds_) {
-        ssize_t sent = ::send(fd, event.data(), event.size(), MSG_NOSIGNAL);
-        if (sent <= 0) {
+        if (!sse_try_send(fd, event.data(), event.size())) {
             dead.push_back(fd);
         }
     }
@@ -542,8 +571,24 @@ void HttpServer::broadcast_token(const std::string & text) {
     std::lock_guard<std::mutex> lk(sse_mu_);
     std::vector<int> dead;
     for (int fd : sse_fds_) {
-        ssize_t sent = ::send(fd, event.data(), event.size(), MSG_NOSIGNAL);
-        if (sent <= 0) {
+        if (!sse_try_send(fd, event.data(), event.size())) {
+            dead.push_back(fd);
+        }
+    }
+    for (int fd : dead) {
+        ::close(fd);
+        sse_fds_.erase(std::remove(sse_fds_.begin(), sse_fds_.end(), fd),
+                       sse_fds_.end());
+    }
+}
+
+// Send an SSE comment as a heartbeat to detect disconnected clients when idle.
+void HttpServer::sse_heartbeat() {
+    static const char ping[] = ":heartbeat\n\n";
+    std::lock_guard<std::mutex> lk(sse_mu_);
+    std::vector<int> dead;
+    for (int fd : sse_fds_) {
+        if (!sse_try_send(fd, ping, sizeof(ping) - 1)) {
             dead.push_back(fd);
         }
     }
@@ -2193,7 +2238,15 @@ void HttpServer::enqueue(ServerJob * job) {
 
 ServerJob * HttpServer::dequeue() {
     std::unique_lock<std::mutex> lk(queue_mu_);
-    queue_cv_.wait(lk, [this]() { return queue_head_ != nullptr || stopping_.load(); });
+    // Use timed wait so the worker periodically wakes to send SSE heartbeats.
+    while (!queue_head_ && !stopping_.load()) {
+        if (queue_cv_.wait_for(lk, std::chrono::seconds(30)) == std::cv_status::timeout) {
+            // Send SSE heartbeat (comment line) to detect disconnected clients.
+            lk.unlock();
+            sse_heartbeat();
+            lk.lock();
+        }
+    }
     if (!queue_head_) return nullptr;
     ServerJob * j = queue_head_;
     queue_head_ = j->next;
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index f40d8b221..a33578ce8 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -303,6 +303,9 @@ class HttpServer {
     // Broadcast incremental token text to SSE clients.
     void broadcast_token(const std::string & text);
 
+    // Send SSE heartbeat comment to prune disconnected clients.
+    void sse_heartbeat();
+
     // Resolve and cache path to share/status.html.
     std::string status_html_path_;
     std::string resolve_status_html();

From bdff1a2ca25202a542b337dbd6f0ba8a57e362ad Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Sun, 31 May 2026 20:08:03 +0800
Subject: [PATCH 6/7] fix(status): correct prefill_tok_s for cache hits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a prefix cache hit occurs, the backend only prefills the delta
tokens beyond the cached prefix. The previous calculation divided the
full prompt token count by the delta prefill time, giving either 0
(full cache hit, no delta) or a wildly inflated number (partial hit).

Now uses the actual number of tokens that were prefilled:
- Full cache hit: 0 tok/s (correct — no prefill work done)
- Partial cache hit: delta_tokens / prefill_time
- No cache hit: effective_prompt.size() / prefill_time

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 server/src/server/http_server.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 14833b9e1..ca76cb1e4 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -1894,8 +1894,14 @@ void HttpServer::worker_loop() {
             PerfRecord perf;
             perf.prompt_tokens = (int)req.prompt_tokens.size();
             perf.completion_tokens = completion_tokens;
+            // Use actual prefilled token count: on cache hit the backend only
+            // prefills the delta beyond the cached prefix, so dividing the full
+            // prompt size by delta time would be wrong.
+            const int prefill_tokens = using_restore
+                ? std::max(0, (int)effective_prompt.size() - prefix_len)
+                : (int)effective_prompt.size();
             perf.prefill_tok_s = (result.prefill_s > 0.0)
-                ? (double)req.prompt_tokens.size() / result.prefill_s : 0.0;
+                ? (double)prefill_tokens / result.prefill_s : 0.0;
             perf.decode_tok_s = (result.decode_s > 0.0)
                 ? (double)completion_tokens / result.decode_s : 0.0;
             perf.accept_rate = result.accept_rate;

From 5c6d51c32ef2492b1025800ae57e551f67efe2b4 Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Mon, 1 Jun 2026 18:34:51 +0800
Subject: [PATCH 7/7] fix(dflash): use non-blocking sends for SSE heartbeat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace blocking sse_try_send() (1s timeout per client) with MSG_DONTWAIT
send in sse_heartbeat(). The 12-byte heartbeat ping will succeed instantly
for any healthy client; slow clients with full buffers are pruned
immediately instead of stalling the worker thread.

This eliminates up to N×1s latency on idle-to-active transitions when
slow SSE clients are connected.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 server/src/server/http_server.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index ca76cb1e4..0cd5f2a85 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -583,12 +583,16 @@ void HttpServer::broadcast_token(const std::string & text) {
 }
 
 // Send an SSE comment as a heartbeat to detect disconnected clients when idle.
+// Uses non-blocking sends to avoid stalling the worker thread on slow clients.
 void HttpServer::sse_heartbeat() {
     static const char ping[] = ":heartbeat\n\n";
     std::lock_guard<std::mutex> lk(sse_mu_);
     std::vector<int> dead;
     for (int fd : sse_fds_) {
-        if (!sse_try_send(fd, ping, sizeof(ping) - 1)) {
+        // Non-blocking send: if the socket buffer can't accept 12 bytes
+        // immediately, the client is too far behind — treat as dead.
+        ssize_t n = ::send(fd, ping, sizeof(ping) - 1, MSG_NOSIGNAL | MSG_DONTWAIT);
+        if (n <= 0) {
             dead.push_back(fd);
         }
     }