diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index d42762f3..ced5b164 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -726,6 +726,18 @@ if(DFLASH27B_TESTS)
         else()
             target_link_libraries(dflash_server PRIVATE hip::host)
         endif()
+
+        # Stage share/status.html beside the dflash_server binary (<exe_dir>/share)
+        # after each link of the target, and install it to <prefix>/share.
+        add_custom_command(TARGET dflash_server POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E make_directory "$<TARGET_FILE_DIR:dflash_server>/share"
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${CMAKE_CURRENT_SOURCE_DIR}/share/status.html"
+                "$<TARGET_FILE_DIR:dflash_server>/share/status.html"
+            COMMENT "Copying status.html next to dflash_server (share/)"
+            VERBATIM)
+        install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/share/status.html"
+                DESTINATION share)
     endif()
 
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/ipc/backend_ipc_main.cpp")
diff --git a/server/share/status.html b/server/share/status.html
new file mode 100644
index 00000000..dbfe6f3a
--- /dev/null
+++ b/server/share/status.html
@@ -0,0 +1,343 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>DFlash Server Status</title>
+<style>
+* { box-sizing: border-box; margin: 0; padding: 0; }
+body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+       background: #0d1117; color: #c9d1d9; padding: 20px; }
+h1 { color: #58a6ff; margin-bottom: 16px; font-size: 1.5em; }
+.card { background: #161b22; border: 1px solid #30363d; border-radius: 8px;
+        padding: 16px; margin-bottom: 16px; }
+.card h2 { color: #8b949e; font-size: 0.85em; text-transform: uppercase;
+            letter-spacing: 0.05em; margin-bottom: 8px; }
+.badge { display: inline-block; padding: 3px 10px; border-radius: 12px;
+         font-size: 0.8em; font-weight: 600; }
+.badge-idle { background: #1f6feb33; color: #58a6ff; }
+.badge-prefill { background: #f0883e33; color: #f0883e; }
+.badge-decode { background: #3fb95033; color: #3fb950; }
+.stat { display: inline-block; margin-right: 24px; margin-bottom: 8px; }
+.stat-value { font-size: 1.4em; font-weight: 700; color: #f0f6fc; }
+.stat-label { font-size: 0.75em; color: #8b949e; }
+.tag { display: inline-block; padding: 2px 8px; border-radius: 4px;
+       font-size: 0.7em; font-weight: 600; margin-right: 6px; margin-top: 4px; }
+.tag-green { background: #3fb95022; color: #3fb950; border: 1px solid #3fb95044; }
+.tag-orange { background: #f0883e22; color: #f0883e; border: 1px solid #f0883e44; }
+.tag-blue { background: #1f6feb22; color: #58a6ff; border: 1px solid #1f6feb44; }
+.tag-gray { background: #8b949e22; color: #8b949e; border: 1px solid #8b949e44; }
+.params-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(140px, 1fr));
+               gap: 8px; margin-top: 8px; }
+.param { background: #0d1117; border: 1px solid #30363d; border-radius: 4px;
+         padding: 6px 10px; }
+.param-label { font-size: 0.7em; color: #8b949e; text-transform: uppercase; }
+.param-value { font-size: 0.9em; color: #f0f6fc; font-weight: 500; }
+.text-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px;
+            padding: 8px 12px; font-family: monospace; font-size: 0.82em;
+            max-height: 300px; overflow-y: auto; word-break: break-word;
+            white-space: pre-wrap; color: #c9d1d9; margin-top: 8px; line-height: 1.5; }
+.text-box:empty::after { content: '(waiting...)'; color: #484f58; }
+.messages-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px;
+                padding: 8px 12px; font-family: monospace; font-size: 0.78em;
+                max-height: 200px; overflow-y: auto; word-break: break-word;
+                white-space: pre-wrap; color: #8b949e; margin-top: 8px; }
+.tokens-box { margin-top: 8px; }
+.token { display: inline-block; background: #1f6feb22; border: 1px solid #1f6feb;
+         border-radius: 4px; padding: 2px 6px; margin: 2px; font-family: monospace;
+         font-size: 0.8em; color: #79c0ff; }
+.chart-container { width: 100%; height: 180px; position: relative; }
+.chart-container svg { width: 100%; height: 100%; }
+.legend { display: flex; gap: 16px; margin-top: 8px; font-size: 0.75em; }
+.legend-item { display: flex; align-items: center; gap: 4px; }
+.legend-dot { width: 10px; height: 10px; border-radius: 50%; }
+.connection-status { float: right; font-size: 0.75em; padding: 3px 8px;
+                     border-radius: 4px; }
+.connected { background: #3fb95033; color: #3fb950; }
+.disconnected { background: #f8514933; color: #f85149; }
+.two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
+@media (max-width: 900px) { .two-col { grid-template-columns: 1fr; } }
+</style>
+</head>
+<body>
+<h1>&#x26A1; DFlash Server Status
+  <span class="connection-status disconnected" id="conn-status">disconnected</span>
+</h1>
+
+<div class="card">
+  <h2>Current Request</h2>
+  <div id="phase-section">
+    <span id="phase-badge" class="badge badge-idle">idle</span>
+    <span class="stat" style="margin-left:16px">
+      <span class="stat-value" id="total-req">0</span>
+      <span class="stat-label">total requests</span>
+    </span>
+  </div>
+  <div id="current-info" style="display:none; margin-top:12px;">
+    <div>
+      <span class="stat">
+        <span class="stat-value" id="cur-prompt-tokens">0</span>
+        <span class="stat-label">prompt tokens</span>
+      </span>
+      <span class="stat">
+        <span class="stat-value" id="cur-completion-tokens">0</span>
+        <span class="stat-label">completion tokens</span>
+      </span>
+      <span class="stat">
+        <span class="stat-value" id="cur-elapsed">0.0s</span>
+        <span class="stat-label">elapsed</span>
+      </span>
+      <span class="stat">
+        <span class="stat-value" id="cur-toks">-</span>
+        <span class="stat-label">tok/s (live)</span>
+      </span>
+    </div>
+    <div id="cur-tags"></div>
+    <div class="params-grid" id="cur-params"></div>
+    <div class="tokens-box" id="cur-draft-tokens"></div>
+  </div>
+</div>
+
+<div class="two-col" id="req-resp-section" style="display:none;">
+  <div class="card">
+    <h2>Request Messages</h2>
+    <div class="messages-box" id="cur-messages"></div>
+  </div>
+  <div class="card">
+    <h2>Response Output</h2>
+    <div class="text-box" id="cur-output"></div>
+  </div>
+</div>
+
+<div class="card">
+  <h2>Prefill Performance</h2>
+  <div class="chart-container" id="chart-prefill"></div>
+  <div class="legend">
+    <div class="legend-item"><div class="legend-dot" style="background:#f0883e"></div>Prefill tok/s</div>
+  </div>
+</div>
+
+<div class="card">
+  <h2>Decode Performance</h2>
+  <div class="chart-container" id="chart-decode"></div>
+  <div class="legend">
+    <div class="legend-item"><div class="legend-dot" style="background:#3fb950"></div>Decode tok/s</div>
+    <div class="legend-item"><div class="legend-dot" style="background:#58a6ff"></div>Accept Rate %</div>
+  </div>
+</div>
+
+<script>
+// Client-side state
+let outputText = '';
+let lastRequestId = null;
+
+function drawChart(containerId, datasets, yMax) {
+  // Render datasets ([{data: number[], color}]) as SVG polylines inside #containerId.
+  // yMax pins the top of the Y axis; when absent or invalid the largest sample (>= 1) is used.
+  const container = document.getElementById(containerId);
+  const n = datasets.length ? datasets[0].data.length : 0;
+  if (!n) {
+    container.innerHTML = '<div style="color:#484f58;text-align:center;padding:40px">No data yet</div>';
+    return;
+  }
+  const W = 800, H = 160, PAD = 40;
+  // Missing or NaN samples are plotted as 0 so they never produce "NaN" labels or coordinates.
+  const num = function(v) { return Number.isFinite(v) ? v : 0; };
+  const maxVal = num(yMax) > 0 ? yMax : Math.max(...datasets.flatMap(d => d.data.map(num)), 1);
+  const px = function(i) { return (PAD + (W - PAD - 10) * i / Math.max(n - 1, 1)).toFixed(1); };
+  const py = function(v) { return (PAD + (H - PAD * 2) * (1 - Math.min(num(v) / maxVal, 1))).toFixed(1); };
+  let svg = '<svg viewBox="0 0 ' + W + ' ' + H + '" preserveAspectRatio="none">';
+  for (let i = 0; i <= 4; i++) {  // five horizontal grid lines with their Y-axis labels
+    const y = PAD + (H - PAD * 2) * (1 - i / 4);
+    const val = (maxVal * i / 4).toFixed(0);
+    svg += '<line x1="' + PAD + '" y1="' + y + '" x2="' + (W - 10) + '" y2="' + y + '" stroke="#30363d" stroke-width="0.5"/>';
+    svg += '<text x="' + (PAD - 4) + '" y="' + (y + 4) + '" fill="#8b949e" font-size="10" text-anchor="end">' + val + '</text>';
+  }
+  for (const ds of datasets) {
+    let path = '';
+    for (let i = 0; i < n; i++) {
+      path += (i === 0 ? 'M' : 'L') + px(i) + ',' + py(ds.data[i]);
+    }
+    svg += '<path d="' + path + '" fill="none" stroke="' + ds.color + '" stroke-width="2"/>';
+    for (let i = Math.max(0, n - 10); i < n; i++) {  // dots on the last (up to) 10 samples
+      svg += '<circle cx="' + px(i) + '" cy="' + py(ds.data[i]) + '" r="3" fill="' + ds.color + '"/>';
+    }
+  }
+  svg += '</svg>';
+  container.innerHTML = svg;
+}
+
+function escapeHtml(s) {  // HTML-escape any value (coerced to string); quotes too, so the result is attribute-safe
+  return String(s).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&#39;');
+}
+
+function formatMessages(messagesStr) {
+  // Turn the JSON-encoded chat messages into "role: content" HTML, joined by a blank line.
+  // Input that is not a parseable JSON array is shown verbatim (escaped) instead.
+  if (!messagesStr) return '';
+  const partText = (p) => (p.type === 'text' ? (p.text || '') : '[' + p.type + ']');
+  const render = (m) => {
+    let body = '';
+    if (typeof m.content === 'string') {
+      body = m.content;
+    } else if (Array.isArray(m.content)) {
+      body = m.content.map(partText).join('');
+    }
+    // Cap each message at 500 characters for display
+    const shown = body.length > 500 ? body.substring(0, 500) + '...' : body;
+    return '<b style="color:#58a6ff">' + escapeHtml(m.role || '?') + ':</b> ' + escapeHtml(shown);
+  };
+  try {
+    const parsed = JSON.parse(messagesStr);
+    return Array.isArray(parsed) ? parsed.map(render).join('\n\n') : escapeHtml(messagesStr);
+  } catch (e) {
+    return escapeHtml(messagesStr);
+  }
+}
+
+function update(data) {  // Re-render the page from one parsed 'status' event payload.
+  const badge = document.getElementById('phase-badge');
+  badge.textContent = data.phase;
+  badge.className = 'badge badge-' + data.phase;
+  document.getElementById('total-req').textContent = data.total_requests;
+
+  const info = document.getElementById('current-info');
+  const reqResp = document.getElementById('req-resp-section');
+
+  if (data.current) {
+    info.style.display = 'block';
+    reqResp.style.display = 'grid';
+
+    // A change in prompt_tokens/total_requests is treated as the start of a new request.
+    const curId = data.current.prompt_tokens + '_' + data.total_requests;
+    if (curId !== lastRequestId) {
+      outputText = '';
+      lastRequestId = curId;
+      // Clear output and messages for the new request
+      document.getElementById('cur-output').textContent = '';
+      document.getElementById('cur-messages').innerHTML = '';
+    }
+
+    document.getElementById('cur-prompt-tokens').textContent = data.current.prompt_tokens;
+    document.getElementById('cur-completion-tokens').textContent = data.current.completion_tokens;
+    const elapsed = data.current.elapsed_s.toFixed(1);
+    document.getElementById('cur-elapsed').textContent = elapsed + 's';
+
+    // Live tok/s: completion tokens per elapsed second; shown as '-' during the first 0.5s
+    const liveToks = data.current.elapsed_s > 0.5
+      ? (data.current.completion_tokens / data.current.elapsed_s).toFixed(1)
+      : '-';
+    document.getElementById('cur-toks').textContent = liveToks;
+
+    // Tags: one badge per truthy flag (cache hit, pflash, spec decode, stream, thinking)
+    let tags = '';
+    if (data.current.cache_hit) tags += '<span class="tag tag-green">cache hit</span>';
+    if (data.current.pflash) tags += '<span class="tag tag-orange">pflash</span>';
+    if (data.current.spec_decode) tags += '<span class="tag tag-blue">spec decode</span>';
+    if (data.current.stream) tags += '<span class="tag tag-gray">stream</span>';
+    if (data.current.thinking_enabled) tags += '<span class="tag tag-blue">thinking</span>';
+    document.getElementById('cur-tags').innerHTML = tags;
+
+    // Params grid: addParam skips undefined/null/'' values and HTML-escapes the value (not the label)
+    let params = '';
+    function addParam(label, value) {
+      if (value === undefined || value === null || value === '') return;
+      params += '<div class="param"><div class="param-label">' + label + '</div><div class="param-value">' + escapeHtml(String(value)) + '</div></div>';
+    }
+    addParam('Model', data.current.model);
+    addParam('Format', data.current.format);
+    addParam('Max Output', data.current.max_output);
+    addParam('Temperature', data.current.temperature);
+    addParam('Top P', data.current.top_p);
+    if (data.current.top_k > 0) addParam('Top K', data.current.top_k);
+    if (data.current.session_id) addParam('Session', data.current.session_id);
+    document.getElementById('cur-params').innerHTML = params;
+
+    // Draft tokens: each entry is passed to escapeHtml, so entries are expected to be strings
+    const dtContainer = document.getElementById('cur-draft-tokens');
+    if (data.current.draft_tokens && data.current.draft_tokens.length) {
+      dtContainer.innerHTML = '<strong style="color:#8b949e;font-size:0.8em">Draft tokens: </strong>' +
+        data.current.draft_tokens.map(function(t) { return '<span class="token">' + escapeHtml(t) + '</span>'; }).join('');
+    } else {
+      dtContainer.innerHTML = '';
+    }
+
+    // Messages: data.current.messages is a JSON string rendered by formatMessages
+    const msgEl = document.getElementById('cur-messages');
+    if (data.current.messages) {
+      msgEl.innerHTML = formatMessages(data.current.messages);
+    } else {
+      msgEl.innerHTML = '<span style="color:#484f58">(no messages)</span>';
+    }
+
+    // Output: 'token' events append text to the box; only restore from outputText when the box is empty
+    const outEl = document.getElementById('cur-output');
+    if (outEl.textContent === '' && outputText) {
+      outEl.textContent = outputText;
+    }
+    outEl.scrollTop = outEl.scrollHeight;
+  } else {
+    info.style.display = 'none';
+    reqResp.style.display = 'none';
+    outputText = '';
+    lastRequestId = null;
+  }
+
+  // Charts: one point per perf_history entry; accept_rate is scaled by 100 for the "Accept Rate %" series
+  const hist = data.perf_history || [];
+  const prefillData = hist.map(function(h) { return h.prefill_tok_s; });
+  const decodeData = hist.map(function(h) { return h.decode_tok_s; });
+  const acceptData = hist.map(function(h) { return h.accept_rate * 100; });
+  // Y axes get 10% headroom over the largest sample, with a floor of 100 (prefill) / 10 (decode)
+  const prefillMax = Math.max.apply(null, prefillData.concat([100]));
+  drawChart('chart-prefill', [{data: prefillData, color: '#f0883e'}], prefillMax * 1.1);
+
+  const decodeMax = Math.max.apply(null, decodeData.concat(acceptData).concat([10]));
+  drawChart('chart-decode', [
+    {data: decodeData, color: '#3fb950'},
+    {data: acceptData, color: '#58a6ff'}
+  ], decodeMax * 1.1);
+}
+
+// SSE connection: subscribe to /status/events; on any error close and re-subscribe after 2s.
+function connectSSE() {
+  const connEl = document.getElementById('conn-status');
+  const es = new EventSource('/status/events');
+
+  es.onopen = function() {
+    connEl.textContent = 'connected';
+    connEl.className = 'connection-status connected';
+  };
+
+  es.addEventListener('status', function(e) {
+    try {
+      const data = JSON.parse(e.data);
+      update(data);
+    } catch (err) { console.warn('status event dropped:', err); }  // keep the stream alive, but leave a trace
+  });
+
+  // Incremental token events — append to DOM incrementally (no full re-render)
+  es.addEventListener('token', function(e) {
+    try {
+      const data = JSON.parse(e.data);
+      if (data.text) {
+        outputText += data.text;
+        const outEl = document.getElementById('cur-output');
+        outEl.appendChild(document.createTextNode(data.text));
+        outEl.scrollTop = outEl.scrollHeight;
+      }
+    } catch (err) { console.warn('token event dropped:', err); }
+  });
+
+  es.onerror = function() {
+    connEl.textContent = 'disconnected';
+    connEl.className = 'connection-status disconnected';
+    es.close();
+    setTimeout(connectSSE, 2000);
+  };
+}
+
+connectSSE();
+</script>
+</body>
+</html>
diff --git a/server/src/common/dflash_spec_decode.cpp b/server/src/common/dflash_spec_decode.cpp
index 141e45e9..3b075493 100644
--- a/server/src/common/dflash_spec_decode.cpp
+++ b/server/src/common/dflash_spec_decode.cpp
@@ -169,6 +169,11 @@ bool run_dflash_spec_decode(
             }
         }
 
+        // Notify observer with draft tokens for this step.
+        if (io.observer) {
+            io.observer("draft", draft_tok);
+        }
+
         // ── Verify pass: speculative target forward over q_len tokens ────
         if (!target.snapshot_kv()) {
             std::fprintf(stderr, "dflash-spec snapshot_kv failed\n");
@@ -234,6 +239,12 @@ bool run_dflash_spec_decode(
         n_generated += emitted;
         n_accept_sum += std::min(accept_n, emitted);
         n_draft_steps++;
+
+        // Notify observer with accepted tokens for this step.
+        if (io.observer) {
+            io.observer("verify", replay_tok);
+        }
+
         if (io.cancelled) break;
         if (hit_eos) break;
     }
diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h
index b808d0c3..af5f6ba7 100644
--- a/server/src/common/model_backend.h
+++ b/server/src/common/model_backend.h
@@ -27,6 +27,14 @@ namespace dflash::common {
 // Return true to continue generation, false to abort.
 using TokenCallback = std::function<bool(int32_t token)>;
 
+// Inference observer callback for live status updates. The spec-decode loops
+// invoke it only when it is non-empty (`if (io.observer)`).
+//   phase:  step label; the spec-decode loops pass "draft" before the verify
+//           pass and "verify" after the step's tokens have been counted.
+//   tokens: token ids for that step (draft_tok / replay_tok at the call sites)
+using InferenceObserver = std::function<void(const char * phase,
+                                             const std::vector<int32_t> & tokens)>;
+
 // ─── I/O handle passed to backend methods that need protocol output ─────
 struct DaemonIO {
     int stream_fd = -1;
@@ -37,6 +45,10 @@ struct DaemonIO {
     TokenCallback on_token;
     mutable bool cancelled = false;
 
+    // Optional inference observer for /status page. When set, backends call
+    // this at each spec-decode step with draft tokens and phase info.
+    InferenceObserver observer;
+
     // Write a single int32 to the stream fd (token or -1 sentinel).
     // Also invokes on_token if set. Sets cancelled=true if on_token
     // returns false (client disconnected).
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
index 4a3d9674..1c442d83 100644
--- a/server/src/qwen35/qwen35_backend.cpp
+++ b/server/src/qwen35/qwen35_backend.cpp
@@ -1315,6 +1315,11 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
             }
         }
 
+        // Notify observer with draft tokens for this step.
+        if (io.observer) {
+            io.observer("draft", draft_tok);
+        }
+
         // 4. Verify: snapshot KV, run target forward over draft tokens
         if (!target->snapshot_kv()) {
             step_graph_destroy(draft_sg);
@@ -1391,6 +1396,12 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen,
         n_generated += emitted;
         n_accept_sum += std::min(accept_n, emitted);
         n_draft_steps++;
+
+        // Notify observer with accepted tokens for this step.
+        if (io.observer) {
+            io.observer("verify", replay_tok);
+        }
+
         if (io.cancelled) break;
         if (hit_eos) break;
     }
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index e751cad7..0cd5f2a8 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -12,6 +12,8 @@
 #include <chrono>
 #include <cstdio>
 #include <cstring>
+#include <fstream>
+#include <sstream>
 
 #include <arpa/inet.h>
 #include <fcntl.h>
@@ -20,6 +22,7 @@
 #include <poll.h>
 #include <signal.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <unistd.h>
 
 namespace dflash::common {
@@ -44,6 +47,9 @@ static constexpr char kServerName[] = "luce-dflash";
 static const std::vector<std::string> kApiEndpoints = {
     "GET /health",
     "GET /props",
+    "GET /status",
+    "GET /status/events",
+    "GET /status/json",
     "GET /v1/models",
     "POST /v1/chat/completions",
     "POST /v1/messages",
@@ -465,6 +471,136 @@ HttpServer::HttpServer(ModelBackend & backend,
                    config.disk_cache_cold_max_tokens}, backend)
 {
     disk_cache_.init();
+    status_html_path_ = resolve_status_html();
+}
+
+// Resolve path to share/status.html at startup. Candidates, in order:
+// $DFLASH_SHARE_DIR, <exe_dir>/share, <exe_dir>/../share, then ./share (cwd).
+// Returns the first candidate that is a regular file, or "" if none is.
+std::string HttpServer::resolve_status_html() {
+    // Only regular files qualify; a directory named status.html must not match.
+    auto is_file = [](const std::string & path) {
+        struct stat st;
+        return ::stat(path.c_str(), &st) == 0 && S_ISREG(st.st_mode);
+    };
+
+    // 1. DFLASH_SHARE_DIR env var (an empty value is ignored)
+    if (const char * dir = std::getenv("DFLASH_SHARE_DIR")) {
+        std::string path = std::string(dir) + "/status.html";
+        if (dir[0] != '\0' && is_file(path)) return path;
+    }
+    // 2. share/ relative to /proc/self/exe (build dir or installed prefix)
+    char exe_buf[4096] = {};
+    ssize_t len = ::readlink("/proc/self/exe", exe_buf, sizeof(exe_buf) - 1);
+    // readlink() truncates silently, so a completely filled buffer is not trusted.
+    if (len > 0 && static_cast<size_t>(len) < sizeof(exe_buf) - 1) {
+        std::string exe_dir(exe_buf, static_cast<size_t>(len));
+        auto slash = exe_dir.rfind('/');
+        if (slash != std::string::npos) {
+            exe_dir = exe_dir.substr(0, slash);
+            // 2a. <exe_dir>/share/status.html     (build directory layout)
+            // 2b. <exe_dir>/../share/status.html  (installed prefix layout)
+            const char * const layouts[] = {"/share/status.html",
+                                            "/../share/status.html"};
+            for (const char * rel : layouts) {
+                std::string path = exe_dir + rel;
+                if (is_file(path)) return path;
+            }
+        }
+    }
+    // 3. ./share/status.html (development)
+    if (is_file("share/status.html")) return "share/status.html";
+    return {};
+}
+
+// Send data to an SSE client fd, giving up after a 1s overall deadline so a
+// slow client cannot stall the caller. Returns false on error or timeout.
+static bool sse_try_send(int fd, const void * data, size_t len) {
+    const char * p = static_cast<const char *>(data);
+    size_t sent = 0;
+    auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(1);
+    while (sent < len) {
+        auto remaining = std::chrono::duration_cast<std::chrono::milliseconds>(
+            deadline - std::chrono::steady_clock::now()).count();
+        if (remaining <= 0) return false;
+        const int wait_ms = static_cast<int>(std::min<long long>(remaining, 50));  // <= 50 ms slices
+        struct pollfd pfd = {fd, POLLOUT, 0};
+        int ret;
+        do {
+            ret = poll(&pfd, 1, wait_ms);
+        } while (ret < 0 && errno == EINTR);
+        if (ret < 0 || (pfd.revents & (POLLERR | POLLHUP | POLLNVAL))) return false;
+        if (ret == 0) continue;
+        // MSG_DONTWAIT: never block inside send() itself, even on a blocking socket.
+        ssize_t n = ::send(fd, p + sent, len - sent, MSG_NOSIGNAL | MSG_DONTWAIT);
+        if (n < 0) {
+            if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) continue;
+            return false;
+        }
+        sent += static_cast<size_t>(n);
+    }
+    return true;
+}
+
+// Broadcast current status as SSE event to all connected /status/events clients.
+// Clients whose send fails are closed and dropped from sse_fds_ afterwards.
+void HttpServer::broadcast_status() {
+    const std::string payload = status_.to_sse_event();
+    std::lock_guard<std::mutex> lk(sse_mu_);
+    std::vector<int> failed;
+    for (const int client : sse_fds_) {
+        const bool ok = sse_try_send(client, payload.data(), payload.size());
+        if (!ok) failed.push_back(client);
+    }
+    for (const int client : failed) {
+        ::close(client);
+        auto tail = std::remove(sse_fds_.begin(), sse_fds_.end(), client);
+        sse_fds_.erase(tail, sse_fds_.end());
+    }
+}
+
+// Broadcast a token text delta as an incremental SSE "token" event.
+void HttpServer::broadcast_token(const std::string & text) {
+    // A token may end mid-codepoint, so the text can be invalid UTF-8; dumping
+    // with error_handler_t::replace substitutes U+FFFD instead of throwing.
+    json body;
+    body["text"] = text;
+    std::string payload = "event: token\ndata: ";
+    payload += body.dump(-1, ' ', false, json::error_handler_t::replace);
+    payload += "\n\n";
+    // Failed clients are closed and unregistered after the send pass.
+    std::lock_guard<std::mutex> lk(sse_mu_);
+    std::vector<int> failed;
+    for (const int client : sse_fds_) {
+        const bool ok = sse_try_send(client, payload.data(), payload.size());
+        if (!ok) failed.push_back(client);
+    }
+    for (const int client : failed) {
+        ::close(client);
+        auto tail = std::remove(sse_fds_.begin(), sse_fds_.end(), client);
+        sse_fds_.erase(tail, sse_fds_.end());
+    }
+}
+
+// Send an SSE comment as a heartbeat to detect disconnected clients when idle.
+// Uses non-blocking sends to avoid stalling the worker thread on slow clients.
+void HttpServer::sse_heartbeat() {
+    static const char ping[] = ":heartbeat\n\n";
+    const ssize_t ping_len = static_cast<ssize_t>(sizeof(ping) - 1);
+    std::lock_guard<std::mutex> lk(sse_mu_);
+    std::vector<int> dead;
+    for (int fd : sse_fds_) {
+        // Non-blocking send: anything but a complete write (an error, or a short
+        // write that would leave a torn SSE frame) marks the client as dead.
+        ssize_t n = ::send(fd, ping, sizeof(ping) - 1, MSG_NOSIGNAL | MSG_DONTWAIT);
+        if (n != ping_len) dead.push_back(fd);
+    }
+    // Close and unregister failed clients after the send pass.
+    for (int fd : dead) {
+        ::close(fd);
+        sse_fds_.erase(std::remove(sse_fds_.begin(), sse_fds_.end(), fd),
+                       sse_fds_.end());
+    }
 }
 
 HttpServer::~HttpServer() {
@@ -483,6 +619,13 @@ void HttpServer::shutdown() {
         worker_thread_.join();
     }
 
+    // Close SSE client connections.
+    {
+        std::lock_guard<std::mutex> lk(sse_mu_);
+        for (int fd : sse_fds_) ::close(fd);
+        sse_fds_.clear();
+    }
+
     // Drain any pending jobs.
     {
         std::lock_guard<std::mutex> lk(queue_mu_);
@@ -679,6 +822,61 @@ void HttpServer::handle_client(int fd) {
         return;
     }
 
+    // Status page: serve HTML file from disk.
+    if (hr.method == "GET" && hr.path == "/status") {
+        if (status_html_path_.empty()) {
+            send_error(fd, 404,
+                "status.html not found. Set DFLASH_SHARE_DIR or place it in share/status.html");
+            ::close(fd);
+            return;
+        }
+        std::ifstream ifs(status_html_path_);
+        if (!ifs.is_open()) {
+            send_error(fd, 500, "failed to open status.html");
+            ::close(fd);
+            return;
+        }
+        std::ostringstream oss;
+        oss << ifs.rdbuf();
+        send_response(fd, 200, "text/html; charset=utf-8", oss.str());
+        ::close(fd);
+        return;
+    }
+
+    // Status JSON snapshot (for non-SSE clients / debugging).
+    if (hr.method == "GET" && hr.path == "/status/json") {
+        send_response(fd, 200, "application/json",
+            status_.to_json().dump(-1, ' ', false, json::error_handler_t::replace) + "\n");
+        ::close(fd);
+        return;
+    }
+
+    // Status SSE stream: hold connection open and push updates.
+    if (hr.method == "GET" && hr.path == "/status/events") {
+        // Send SSE headers.
+        const char * headers =
+            "HTTP/1.1 200 OK\r\n"
+            "Content-Type: text/event-stream\r\n"
+            "Cache-Control: no-cache\r\n"
+            "Connection: keep-alive\r\n"
+            "Access-Control-Allow-Origin: *\r\n"
+            "\r\n";
+        if (!send_all(fd, headers, std::strlen(headers))) {
+            ::close(fd);
+            return;
+        }
+        // Send initial state immediately.
+        std::string initial = status_.to_sse_event();
+        send_all(fd, initial.data(), initial.size());
+        // Register for future broadcasts. The fd is NOT closed here — it stays
+        // open until the client disconnects (detected on next broadcast send).
+        {
+            std::lock_guard<std::mutex> lk(sse_mu_);
+            sse_fds_.push_back(fd);
+        }
+        return;  // Do NOT close fd — it's now owned by the SSE broadcast loop.
+    }
+
     // Models endpoint.
     if (hr.method == "GET" && hr.path == "/v1/models") {
         // Codex sends ?client_version= — serve the Codex-specific schema.
@@ -1114,6 +1312,37 @@ void HttpServer::worker_loop() {
         const auto & req = job->req;
         auto started_at = std::chrono::steady_clock::now();
 
+        // Track live status for /status page. RAII guard ensures idle on all paths.
+        std::string prompt_excerpt;
+        if (!req.prompt_tokens.empty()) {
+            // Decode first ~40 tokens as a prompt excerpt (cheap, bounded).
+            const int excerpt_len = std::min((int)req.prompt_tokens.size(), 40);
+            std::vector<int32_t> excerpt_toks(req.prompt_tokens.begin(),
+                                               req.prompt_tokens.begin() + excerpt_len);
+            prompt_excerpt = tokenizer_.decode(excerpt_toks);
+            if (prompt_excerpt.size() > 200) prompt_excerpt.resize(200);
+        }
+        {
+            ServerStatus::RequestInfo info;
+            info.model = req.model;
+            info.format = api_format_name(req.format);
+            info.session_id = req.session_id;
+            info.max_output = req.max_output;
+            info.temperature = req.sampler.temp;
+            info.top_p = req.sampler.top_p;
+            info.top_k = req.sampler.top_k;
+            info.thinking_enabled = req.thinking_enabled;
+            status_.set_running(prompt_excerpt, (int)req.prompt_tokens.size(), req.stream, info);
+        }
+        // Store messages JSON for request inspection (truncate to avoid huge payloads).
+        if (!req.messages.is_null()) {
+            std::string msg_str = req.messages.dump();
+            if (msg_str.size() > 4096) msg_str.resize(4096);
+            status_.set_messages(msg_str);
+        }
+        broadcast_status();
+        StatusGuard status_guard{status_};
+
         auto finish_job = [&]() {
             std::lock_guard<std::mutex> lk(job->mu);
             job->done = true;
@@ -1440,10 +1669,25 @@ void HttpServer::worker_loop() {
             snap_slot,
             snap_cut);
 
+        // Update status page with cache/pflash/spec-decode flags.
+        status_.set_flags(using_restore, pflash_compressed, !config_.draft_path.empty());
+        broadcast_status();
+
         // Set up DaemonIO with on_token callback for streaming + disconnect.
         DaemonIO io;
         io.stream_fd = -1;  // no pipe — we write SSE directly
 
+        // Inference observer: updates status page with draft tokens per step.
+        io.observer = [&](const char * phase, const std::vector<int32_t> & tokens) {
+            std::vector<std::string> token_strs;
+            token_strs.reserve(tokens.size());
+            for (int32_t t : tokens) {
+                token_strs.push_back(tokenizer_.token_text(t));
+            }
+            status_.set_draft_tokens(token_strs);
+            broadcast_status();
+        };
+
         int completion_tokens = 0;
         bool client_disconnected = false;
 
@@ -1451,6 +1695,12 @@ void HttpServer::worker_loop() {
             if (client_disconnected) return false;
             completion_tokens++;
 
+            // Update status page every 10 tokens (low overhead).
+            if (completion_tokens % 10 == 0) {
+                status_.update_completion_tokens(completion_tokens);
+                broadcast_status();
+            }
+
             // Skip EOS/EOT/special tokens — don't forward to SSE.
             int32_t eos = tokenizer_.eos_id();
             int32_t eot = tokenizer_.eos_chat_id();
@@ -1460,6 +1710,7 @@ void HttpServer::worker_loop() {
 
             // Gemma4 thinking channel: map <|channel> → <think>, <channel|> → </think>\n
             if (raw == "<|channel>") {
+                broadcast_token("<think>");
                 if (req.stream) {
                     auto chunks = emitter.emit_token("<think>");
                     for (const auto & chunk : chunks)
@@ -1468,6 +1719,7 @@ void HttpServer::worker_loop() {
                 return true;
             }
             if (raw == "<channel|>") {
+                broadcast_token("</think>\n");
                 if (req.stream) {
                     auto chunks = emitter.emit_token("</think>\n");
                     for (const auto & chunk : chunks)
@@ -1484,6 +1736,7 @@ void HttpServer::worker_loop() {
             // reasoning_content with empty visible content. Forward the text
             // form into the emitter so parse_reasoning() can split correctly.
             if (raw == "<think>" || raw == "</think>") {
+                broadcast_token(raw == "</think>" ? "</think>\n" : "<think>");
                 if (req.stream) {
                     auto chunks = emitter.emit_token(
                         raw == "</think>" ? "</think>\n" : "<think>");
@@ -1502,6 +1755,11 @@ void HttpServer::worker_loop() {
 
             std::string text = tokenizer_.token_text(token);
 
+            // Send token text to status page clients (browser accumulates).
+            if (!text.empty()) {
+                broadcast_token(text);
+            }
+
             if (req.stream && !text.empty()) {
                 auto chunks = emitter.emit_token(text);
                 for (const auto & chunk : chunks) {
@@ -1534,6 +1792,10 @@ void HttpServer::worker_loop() {
             backend_.unpark("draft");   // reload decode draft (~3.3 GB)
         }
 
+        // Transition status to decode phase.
+        status_.set_decode();
+        broadcast_status();
+
         GenerateResult result;
         if (using_restore) {
             result = backend_.restore_and_generate_with_empty_spec_fallback(cache_slot, gen_req, io);
@@ -1630,6 +1892,31 @@ void HttpServer::worker_loop() {
         // message_delta usage, Responses response.completed usage).
         // See docs/specs/thinking-budget.md §6.3.
         GenTimings gen_timings{ result.prefill_s, result.decode_s };
+
+        // Record performance for /status page.
+        if (result.ok) {
+            PerfRecord perf;
+            perf.prompt_tokens = (int)req.prompt_tokens.size();
+            perf.completion_tokens = completion_tokens;
+            // Use actual prefilled token count: on cache hit the backend only
+            // prefills the delta beyond the cached prefix, so dividing the full
+            // prompt size by delta time would be wrong.
+            const int prefill_tokens = using_restore
+                ? std::max(0, (int)effective_prompt.size() - prefix_len)
+                : (int)effective_prompt.size();
+            perf.prefill_tok_s = (result.prefill_s > 0.0)
+                ? (double)prefill_tokens / result.prefill_s : 0.0;
+            perf.decode_tok_s = (result.decode_s > 0.0)
+                ? (double)completion_tokens / result.decode_s : 0.0;
+            perf.accept_rate = result.accept_rate;
+            perf.cache_hit = using_restore;
+            perf.pflash = pflash_compressed;
+            perf.spec_decode = result.spec_decode_ran;
+            perf.timestamp = std::chrono::steady_clock::now();
+            status_.record_perf(perf);
+            status_.update_completion_tokens(completion_tokens);
+            broadcast_status();
+        }
         if (req.stream && !client_disconnected) {
             auto final_chunks = emitter.emit_finish(completion_tokens, &gen_timings);
             for (const auto & chunk : final_chunks) {
@@ -1961,7 +2248,15 @@ void HttpServer::enqueue(ServerJob * job) {
 
 ServerJob * HttpServer::dequeue() {
     std::unique_lock<std::mutex> lk(queue_mu_);
-    queue_cv_.wait(lk, [this]() { return queue_head_ != nullptr || stopping_.load(); });
+    // Use timed wait so the worker periodically wakes to send SSE heartbeats.
+    while (!queue_head_ && !stopping_.load()) {
+        if (queue_cv_.wait_for(lk, std::chrono::seconds(30)) == std::cv_status::timeout) {
+            // Send SSE heartbeat (comment line) to detect disconnected clients.
+            lk.unlock();
+            sse_heartbeat();
+            lk.lock();
+        }
+    }
     if (!queue_head_) return nullptr;
     ServerJob * j = queue_head_;
     queue_head_ = j->next;
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index 71c544ac..a33578ce 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -23,6 +23,7 @@
 #include "common/pflash_drafter_ipc.h"
 #include "model_card.h"
 #include "adaptive_keep_ratio.h"
+#include "server_status.h"
 #include <nlohmann/json.hpp>
 
 #include <atomic>
@@ -289,6 +290,26 @@ class HttpServer {
     // Per-session adaptive keep_ratio bandit state.
     HttpServerSessions sessions_;
 
+    // Live status tracker (read by /status/json, written by worker thread).
+    ServerStatus status_;
+
+    // SSE client connections for /status/events push.
+    std::mutex             sse_mu_;
+    std::vector<int>       sse_fds_;
+
+    // Broadcast current status to all SSE clients. Removes dead fds.
+    void broadcast_status();
+
+    // Broadcast incremental token text to SSE clients.
+    void broadcast_token(const std::string & text);
+
+    // Send SSE heartbeat comment to prune disconnected clients.
+    void sse_heartbeat();
+
+    // Resolve and cache path to share/status.html.
+    std::string status_html_path_;
+    std::string resolve_status_html();
+
     // Track prompt tokens for each snapshot slot (for shutdown save).
     std::unordered_map<int, std::vector<int32_t>> slot_tokens_;
 
diff --git a/server/src/server/server_status.h b/server/src/server/server_status.h
new file mode 100644
index 00000000..98794e91
--- /dev/null
+++ b/server/src/server/server_status.h
@@ -0,0 +1,271 @@
+// Server status tracking for the /status introspection page.
+//
+// Thread-safe status tracker: worker thread writes, HTTP client threads read.
+// Designed for minimal overhead on the inference hot path.
+
+#pragma once
+
+#include <nlohmann/json.hpp>
+
+#include <chrono>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <vector>
+
+namespace dflash::common {
+
+using json = nlohmann::json;
+
+// Performance record for one completed request.
+struct PerfRecord {
+    double prefill_tok_s    = 0.0;   // prefill throughput, tokens per second
+    double decode_tok_s     = 0.0;   // decode throughput, tokens per second
+    float  accept_rate      = 0.0f;
+    int    prompt_tokens    = 0;
+    int    completion_tokens = 0;
+    bool   cache_hit        = false;
+    bool   pflash           = false;
+    bool   spec_decode      = false;
+    // Kept with the record but not emitted by ServerStatus::to_json().
+    std::chrono::steady_clock::time_point timestamp;
+};
+
+// Live inference phase.
+enum class InferencePhase {
+    IDLE,
+    PREFILL,
+    DECODE,
+};
+
+// Lower-case phase name as emitted in the status JSON; a value outside the
+// enum maps to "unknown".
+static inline const char * phase_name(InferencePhase p) {
+    switch (p) {
+    case InferencePhase::IDLE:    return "idle";
+    case InferencePhase::PREFILL: return "prefill";
+    case InferencePhase::DECODE:  return "decode";
+    default:                      return "unknown";
+    }
+}
+
+// Mutex-protected live state of the current request plus a bounded history of
+// completed requests. Every public method locks mu_ before touching members.
+class ServerStatus {
+public:
+    // Upper bound on stored PerfRecord entries; the oldest is evicted first.
+    static constexpr int kMaxHistory = 50;
+
+    // Request details passed at set_running time.
+    struct RequestInfo {
+        std::string model;
+        std::string format;       // "chat", "anthropic", "responses"
+        std::string session_id;
+        int         max_output       = 0;
+        float       temperature      = 0.0f;
+        float       top_p            = 1.0f;
+        int         top_k            = 0;
+        bool        thinking_enabled = false;
+    };
+
+    // Called by worker thread to update live state.
+    // Begins a request: enters PREFILL, restarts the elapsed-time clock and
+    // resets the completion counter, draft tokens and cache/pflash/spec flags.
+    // messages_json_ is not touched here; set_messages() replaces it and
+    // set_idle() drops it.
+    void set_running(const std::string & prompt_excerpt, int prompt_tokens,
+                     bool is_stream, const RequestInfo & info) {
+        std::lock_guard<std::mutex> lk(mu_);
+        phase_ = InferencePhase::PREFILL;
+        prompt_excerpt_ = prompt_excerpt;
+        prompt_tokens_ = prompt_tokens;
+        completion_tokens_ = 0;
+        is_stream_ = is_stream;
+        draft_tokens_.clear();
+        request_info_ = info;
+        cache_hit_ = false;
+        pflash_ = false;
+        spec_decode_ = false;
+        started_at_ = std::chrono::steady_clock::now();
+    }
+
+    // Stores the request's messages; the string is emitted verbatim as the
+    // "messages" value of to_json() (a JSON string, not a nested document).
+    void set_messages(const std::string & messages_json) {
+        std::lock_guard<std::mutex> lk(mu_);
+        messages_json_ = messages_json;
+    }
+
+    // Switches the reported phase to DECODE.
+    void set_decode() {
+        std::lock_guard<std::mutex> lk(mu_);
+        phase_ = InferencePhase::DECODE;
+    }
+
+    // Sets the cache_hit / pflash / spec_decode flags reported for the current request.
+    void set_flags(bool cache_hit, bool pflash, bool spec_decode) {
+        std::lock_guard<std::mutex> lk(mu_);
+        cache_hit_ = cache_hit;
+        pflash_ = pflash;
+        spec_decode_ = spec_decode;
+    }
+
+    // Overwrites the live completion-token count with n (absolute value, not a delta).
+    void update_completion_tokens(int n) {
+        std::lock_guard<std::mutex> lk(mu_);
+        completion_tokens_ = n;
+    }
+
+    // Replaces the stored draft token strings (reported as "draft_tokens").
+    void set_draft_tokens(const std::vector<std::string> & tokens) {
+        std::lock_guard<std::mutex> lk(mu_);
+        draft_tokens_ = tokens;
+    }
+
+    // Returns to IDLE and drops all per-request text. messages_json_ is
+    // cleared as well: otherwise a finished request's conversation would stay
+    // in memory and be reported under a later request that never calls
+    // set_messages().
+    void set_idle() {
+        std::lock_guard<std::mutex> lk(mu_);
+        phase_ = InferencePhase::IDLE;
+        prompt_excerpt_.clear();
+        draft_tokens_.clear();
+        messages_json_.clear();
+    }
+
+    // Appends rec to the history, evicting the oldest entry once kMaxHistory
+    // is reached, and bumps the lifetime request counter.
+    void record_perf(const PerfRecord & rec) {
+        std::lock_guard<std::mutex> lk(mu_);
+        if ((int)perf_history_.size() >= kMaxHistory) {
+            perf_history_.erase(perf_history_.begin());
+        }
+        perf_history_.push_back(rec);
+        total_requests_++;
+    }
+
+    // Snapshot current state as JSON (thread-safe).
+    // Members are copied while holding mu_ and the JSON is built after the
+    // lock is released. Result keys: "phase", "total_requests", "current"
+    // (object while a request is active, null when idle) and "perf_history"
+    // (array, oldest first).
+    json to_json() const {
+        InferencePhase phase = InferencePhase::IDLE;
+        std::string prompt_excerpt;
+        int prompt_tokens = 0;
+        int completion_tokens = 0;
+        bool is_stream = false;
+        std::vector<std::string> draft_tokens;
+        std::vector<PerfRecord> history;
+        int total_requests = 0;
+        double elapsed_s = 0.0;
+        RequestInfo info;
+        bool cache_hit = false, pflash = false, spec_decode = false;
+        std::string messages_json;
+
+        {
+            std::lock_guard<std::mutex> lk(mu_);
+            phase = phase_;
+            prompt_excerpt = prompt_excerpt_;
+            prompt_tokens = prompt_tokens_;
+            completion_tokens = completion_tokens_;
+            is_stream = is_stream_;
+            draft_tokens = draft_tokens_;
+            history = perf_history_;
+            total_requests = total_requests_;
+            info = request_info_;
+            cache_hit = cache_hit_;
+            pflash = pflash_;
+            spec_decode = spec_decode_;
+            messages_json = messages_json_;
+            if (phase != InferencePhase::IDLE) {
+                elapsed_s = std::chrono::duration<double>(
+                    std::chrono::steady_clock::now() - started_at_).count();
+            }
+        }
+
+        json j;
+        j["phase"] = phase_name(phase);
+        j["total_requests"] = total_requests;
+
+        if (phase != InferencePhase::IDLE) {
+            j["current"] = {
+                {"prompt_excerpt", prompt_excerpt},
+                {"prompt_tokens", prompt_tokens},
+                {"completion_tokens", completion_tokens},
+                {"stream", is_stream},
+                {"elapsed_s", elapsed_s},
+                {"draft_tokens", draft_tokens},
+                {"model", info.model},
+                {"format", info.format},
+                {"max_output", info.max_output},
+                {"temperature", info.temperature},
+                {"top_p", info.top_p},
+                {"top_k", info.top_k},
+                {"thinking_enabled", info.thinking_enabled},
+                {"session_id", info.session_id},
+                {"cache_hit", cache_hit},
+                {"pflash", pflash},
+                {"spec_decode", spec_decode},
+                {"messages", messages_json},
+            };
+        } else {
+            j["current"] = nullptr;
+        }
+
+        json perf = json::array();
+        for (const auto & r : history) {
+            perf.push_back({
+                {"prefill_tok_s", r.prefill_tok_s},
+                {"decode_tok_s", r.decode_tok_s},
+                {"accept_rate", r.accept_rate},
+                {"prompt_tokens", r.prompt_tokens},
+                {"completion_tokens", r.completion_tokens},
+                {"cache_hit", r.cache_hit},
+                {"pflash", r.pflash},
+                {"spec_decode", r.spec_decode},
+            });
+        }
+        j["perf_history"] = perf;
+
+        return j;
+    }
+
+    // Wraps to_json() in one Server-Sent Events frame named "status". The
+    // compact dump (indent -1) contains no newlines, so the payload stays on a
+    // single "data:" line; invalid UTF-8 is replaced rather than thrown on.
+    std::string to_sse_event() const {
+        std::string data = to_json().dump(-1, ' ', false, json::error_handler_t::replace);
+        return "event: status\ndata: " + data + "\n\n";
+    }
+
+private:
+    mutable std::mutex mu_;
+
+    // Live state.
+    InferencePhase phase_ = InferencePhase::IDLE;
+    std::string prompt_excerpt_;
+    int prompt_tokens_ = 0;
+    int completion_tokens_ = 0;
+    bool is_stream_ = false;
+    std::vector<std::string> draft_tokens_;
+    std::chrono::steady_clock::time_point started_at_;
+    RequestInfo request_info_;
+    bool cache_hit_ = false;
+    bool pflash_ = false;
+    bool spec_decode_ = false;
+    std::string messages_json_;
+
+    // History.
+    std::vector<PerfRecord> perf_history_;
+    int total_requests_ = 0;
+};
+
+// RAII guard that resets status to idle on scope exit.
+struct StatusGuard {
+    ServerStatus & status;
+    ~StatusGuard() { status.set_idle(); }
+};
+
+}  // namespace dflash::common