From eb24fb45a39a660d26b570c24e22c52301fdd96c Mon Sep 17 00:00:00 2001 From: Howard Su Date: Sun, 31 May 2026 15:12:43 +0800 Subject: [PATCH 1/7] feat(server): add /status page with SSE push updates Add a real-time server status dashboard accessible at GET /status: - Serves standalone HTML from server/share/status.html (editable without recompile) - GET /status/events SSE endpoint pushes live JSON updates per spec-decode step - GET /status/json provides a snapshot for non-SSE clients Status tracking (server_status.h): - Current phase (idle/prefill/decode) with prompt excerpt and token counts - Draft tokens being verified (updated each spec-decode step) - Performance history (last 50 requests): prefill tok/s, decode tok/s, accept rate - RAII StatusGuard ensures status resets to idle on all exit paths Backend instrumentation (InferenceObserver on DaemonIO): - Observer callback in model_backend.h, called at each draft/verify step - Instrumented in qwen35_backend.cpp and generic dflash_spec_decode.cpp - Zero overhead when no SSE clients are connected (empty std::function check) Dashboard features (status.html): - Dark-themed responsive UI with phase badges and live counters - Draft token display updated per spec-decode step - SVG-based performance charts (prefill tok/s, decode tok/s, accept rate) - Auto-reconnecting EventSource with connection status indicator - No external CDN dependencies Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- server/share/status.html | 211 +++++++++++++++++++++++ server/src/common/dflash_spec_decode.cpp | 11 ++ server/src/common/model_backend.h | 12 ++ server/src/qwen35/qwen35_backend.cpp | 11 ++ server/src/server/http_server.cpp | 169 ++++++++++++++++++ server/src/server/http_server.h | 15 ++ server/src/server/server_status.h | 189 ++++++++++++++++++++ 7 files changed, 618 insertions(+) create mode 100644 server/share/status.html create mode 100644 server/src/server/server_status.h diff --git 
a/server/share/status.html b/server/share/status.html new file mode 100644 index 000000000..eaf277adc --- /dev/null +++ b/server/share/status.html @@ -0,0 +1,211 @@ + + + + + +DFlash Server Status + + + +

⚡ DFlash Server Status + disconnected +

+ +
+

Current State

+
+ idle + + 0 + total requests + +
+ +
+ +
+

Prefill Performance

+
+
+
Prefill tok/s
+
+
+ +
+

Decode Performance

+
+
+
Decode tok/s
+
Accept Rate %
+
+
+ + + + diff --git a/server/src/common/dflash_spec_decode.cpp b/server/src/common/dflash_spec_decode.cpp index 141e45e92..3b075493e 100644 --- a/server/src/common/dflash_spec_decode.cpp +++ b/server/src/common/dflash_spec_decode.cpp @@ -169,6 +169,11 @@ bool run_dflash_spec_decode( } } + // Notify observer with draft tokens for this step. + if (io.observer) { + io.observer("draft", draft_tok); + } + // ── Verify pass: speculative target forward over q_len tokens ──── if (!target.snapshot_kv()) { std::fprintf(stderr, "dflash-spec snapshot_kv failed\n"); @@ -234,6 +239,12 @@ bool run_dflash_spec_decode( n_generated += emitted; n_accept_sum += std::min(accept_n, emitted); n_draft_steps++; + + // Notify observer with accepted tokens for this step. + if (io.observer) { + io.observer("verify", replay_tok); + } + if (io.cancelled) break; if (hit_eos) break; } diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h index b808d0c39..af5f6ba72 100644 --- a/server/src/common/model_backend.h +++ b/server/src/common/model_backend.h @@ -27,6 +27,14 @@ namespace dflash::common { // Return true to continue generation, false to abort. using TokenCallback = std::function; +// Inference observer callback for live status updates. Called by backends +// at each spec-decode step to report phase/tokens. When empty, backends +// skip the call (zero overhead). +// phase: step name; the backends in this patch pass "draft" and "verify" +// tokens: token ids for that step (drafted ids for "draft", accepted ids for "verify") +using InferenceObserver = std::function & tokens)>; + // ─── I/O handle passed to backend methods that need protocol output ───── struct DaemonIO { int stream_fd = -1; @@ -37,6 +45,10 @@ struct DaemonIO { TokenCallback on_token; mutable bool cancelled = false; + // Optional inference observer for /status page. When set, backends call + // this at each spec-decode step with draft tokens and phase info. + InferenceObserver observer; + // Write a single int32 to the stream fd (token or -1 sentinel). 
// Also invokes on_token if set. Sets cancelled=true if on_token // returns false (client disconnected). diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp index 4a3d9674e..1c442d83e 100644 --- a/server/src/qwen35/qwen35_backend.cpp +++ b/server/src/qwen35/qwen35_backend.cpp @@ -1315,6 +1315,11 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen, } } + // Notify observer with draft tokens for this step. + if (io.observer) { + io.observer("draft", draft_tok); + } + // 4. Verify: snapshot KV, run target forward over draft tokens if (!target->snapshot_kv()) { step_graph_destroy(draft_sg); @@ -1391,6 +1396,12 @@ bool Qwen35Backend::do_spec_decode(int committed, int n_gen, n_generated += emitted; n_accept_sum += std::min(accept_n, emitted); n_draft_steps++; + + // Notify observer with accepted tokens for this step. + if (io.observer) { + io.observer("verify", replay_tok); + } + if (io.cancelled) break; if (hit_eos) break; } diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index e751cad70..7a93f26ce 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -20,6 +22,7 @@ #include #include #include +#include #include namespace dflash::common { @@ -44,6 +47,8 @@ static constexpr char kServerName[] = "luce-dflash"; static const std::vector kApiEndpoints = { "GET /health", "GET /props", + "GET /status", + "GET /status/events", "GET /v1/models", "POST /v1/chat/completions", "POST /v1/messages", @@ -465,6 +470,55 @@ HttpServer::HttpServer(ModelBackend & backend, config.disk_cache_cold_max_tokens}, backend) { disk_cache_.init(); + status_html_path_ = resolve_status_html(); +} + +// Resolve path to share/status.html at startup. +std::string HttpServer::resolve_status_html() { + // 1. 
DFLASH_SHARE_DIR env var + if (const char * dir = std::getenv("DFLASH_SHARE_DIR")) { + std::string path = std::string(dir) + "/status.html"; + struct stat st; + if (::stat(path.c_str(), &st) == 0) return path; + } + // 2. ../share/ relative to /proc/self/exe + char exe_buf[1024] = {}; + ssize_t len = ::readlink("/proc/self/exe", exe_buf, sizeof(exe_buf) - 1); + if (len > 0) { + exe_buf[len] = '\0'; + std::string exe_dir(exe_buf); + auto slash = exe_dir.rfind('/'); + if (slash != std::string::npos) { + exe_dir = exe_dir.substr(0, slash); + std::string path = exe_dir + "/../share/status.html"; + struct stat st; + if (::stat(path.c_str(), &st) == 0) return path; + } + } + // 3. ./share/status.html (development) + { + struct stat st; + if (::stat("share/status.html", &st) == 0) return "share/status.html"; + } + return {}; +} + +// Broadcast current status as SSE event to all connected /status/events clients. +void HttpServer::broadcast_status() { + std::string event = status_.to_sse_event(); + std::lock_guard lk(sse_mu_); + std::vector dead; + for (int fd : sse_fds_) { + ssize_t sent = ::send(fd, event.data(), event.size(), MSG_NOSIGNAL); + if (sent <= 0) { + dead.push_back(fd); + } + } + for (int fd : dead) { + ::close(fd); + sse_fds_.erase(std::remove(sse_fds_.begin(), sse_fds_.end(), fd), + sse_fds_.end()); + } } HttpServer::~HttpServer() { @@ -483,6 +537,13 @@ void HttpServer::shutdown() { worker_thread_.join(); } + // Close SSE client connections. + { + std::lock_guard lk(sse_mu_); + for (int fd : sse_fds_) ::close(fd); + sse_fds_.clear(); + } + // Drain any pending jobs. { std::lock_guard lk(queue_mu_); @@ -679,6 +740,60 @@ void HttpServer::handle_client(int fd) { return; } + // Status page: serve HTML file from disk. + if (hr.method == "GET" && hr.path == "/status") { + if (status_html_path_.empty()) { + send_error(fd, 404, + "status.html not found. 
Set DFLASH_SHARE_DIR or place it in share/status.html"); + ::close(fd); + return; + } + std::ifstream ifs(status_html_path_); + if (!ifs.is_open()) { + send_error(fd, 500, "failed to open status.html"); + ::close(fd); + return; + } + std::ostringstream oss; + oss << ifs.rdbuf(); + send_response(fd, 200, "text/html; charset=utf-8", oss.str()); + ::close(fd); + return; + } + + // Status JSON snapshot (for non-SSE clients / debugging). + if (hr.method == "GET" && hr.path == "/status/json") { + send_response(fd, 200, "application/json", status_.to_json().dump() + "\n"); + ::close(fd); + return; + } + + // Status SSE stream: hold connection open and push updates. + if (hr.method == "GET" && hr.path == "/status/events") { + // Send SSE headers. + const char * headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/event-stream\r\n" + "Cache-Control: no-cache\r\n" + "Connection: keep-alive\r\n" + "Access-Control-Allow-Origin: *\r\n" + "\r\n"; + if (!send_all(fd, headers, std::strlen(headers))) { + ::close(fd); + return; + } + // Send initial state immediately. + std::string initial = status_.to_sse_event(); + send_all(fd, initial.data(), initial.size()); + // Register for future broadcasts. The fd is NOT closed here — it stays + // open until the client disconnects (detected on next broadcast send). + { + std::lock_guard lk(sse_mu_); + sse_fds_.push_back(fd); + } + return; // Do NOT close fd — it's now owned by the SSE broadcast loop. + } + // Models endpoint. if (hr.method == "GET" && hr.path == "/v1/models") { // Codex sends ?client_version= — serve the Codex-specific schema. @@ -1114,6 +1229,20 @@ void HttpServer::worker_loop() { const auto & req = job->req; auto started_at = std::chrono::steady_clock::now(); + // Track live status for /status page. RAII guard ensures idle on all paths. + std::string prompt_excerpt; + if (!req.prompt_tokens.empty()) { + // Decode first ~40 tokens as a prompt excerpt (cheap, bounded). 
+ const int excerpt_len = std::min((int)req.prompt_tokens.size(), 40); + std::vector excerpt_toks(req.prompt_tokens.begin(), + req.prompt_tokens.begin() + excerpt_len); + prompt_excerpt = tokenizer_.decode(excerpt_toks); + if (prompt_excerpt.size() > 200) prompt_excerpt.resize(200); + } + status_.set_running(prompt_excerpt, (int)req.prompt_tokens.size(), req.stream); + broadcast_status(); + StatusGuard status_guard{status_}; + auto finish_job = [&]() { std::lock_guard lk(job->mu); job->done = true; @@ -1444,6 +1573,17 @@ void HttpServer::worker_loop() { DaemonIO io; io.stream_fd = -1; // no pipe — we write SSE directly + // Inference observer: updates status page with draft tokens per step. + io.observer = [&](const char * phase, const std::vector & tokens) { + std::vector token_strs; + token_strs.reserve(tokens.size()); + for (int32_t t : tokens) { + token_strs.push_back(tokenizer_.token_text(t)); + } + status_.set_draft_tokens(token_strs); + broadcast_status(); + }; + int completion_tokens = 0; bool client_disconnected = false; @@ -1451,6 +1591,12 @@ void HttpServer::worker_loop() { if (client_disconnected) return false; completion_tokens++; + // Update status page every 10 tokens (low overhead). + if (completion_tokens % 10 == 0) { + status_.update_completion_tokens(completion_tokens); + broadcast_status(); + } + // Skip EOS/EOT/special tokens — don't forward to SSE. int32_t eos = tokenizer_.eos_id(); int32_t eot = tokenizer_.eos_chat_id(); @@ -1534,6 +1680,10 @@ void HttpServer::worker_loop() { backend_.unpark("draft"); // reload decode draft (~3.3 GB) } + // Transition status to decode phase. + status_.set_decode(); + broadcast_status(); + GenerateResult result; if (using_restore) { result = backend_.restore_and_generate_with_empty_spec_fallback(cache_slot, gen_req, io); @@ -1630,6 +1780,25 @@ void HttpServer::worker_loop() { // message_delta usage, Responses response.completed usage). // See docs/specs/thinking-budget.md §6.3. 
GenTimings gen_timings{ result.prefill_s, result.decode_s }; + + // Record performance for /status page. + if (result.ok) { + PerfRecord perf; + perf.prompt_tokens = (int)req.prompt_tokens.size(); + perf.completion_tokens = completion_tokens; + perf.prefill_tok_s = (result.prefill_s > 0.0) + ? (double)req.prompt_tokens.size() / result.prefill_s : 0.0; + perf.decode_tok_s = (result.decode_s > 0.0) + ? (double)completion_tokens / result.decode_s : 0.0; + perf.accept_rate = result.accept_rate; + perf.cache_hit = using_restore; + perf.pflash = pflash_compressed; + perf.spec_decode = result.spec_decode_ran; + perf.timestamp = std::chrono::steady_clock::now(); + status_.record_perf(perf); + status_.update_completion_tokens(completion_tokens); + broadcast_status(); + } if (req.stream && !client_disconnected) { auto final_chunks = emitter.emit_finish(completion_tokens, &gen_timings); for (const auto & chunk : final_chunks) { diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 71c544acb..4c6aac239 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -23,6 +23,7 @@ #include "common/pflash_drafter_ipc.h" #include "model_card.h" #include "adaptive_keep_ratio.h" +#include "server_status.h" #include #include @@ -289,6 +290,20 @@ class HttpServer { // Per-session adaptive keep_ratio bandit state. HttpServerSessions sessions_; + // Live status tracker (read by /status/json, written by worker thread). + ServerStatus status_; + + // SSE client connections for /status/events push. + std::mutex sse_mu_; + std::vector sse_fds_; + + // Broadcast current status to all SSE clients. Removes dead fds. + void broadcast_status(); + + // Resolve and cache path to share/status.html. + std::string status_html_path_; + std::string resolve_status_html(); + // Track prompt tokens for each snapshot slot (for shutdown save). 
std::unordered_map> slot_tokens_; diff --git a/server/src/server/server_status.h b/server/src/server/server_status.h new file mode 100644 index 000000000..6700cb159 --- /dev/null +++ b/server/src/server/server_status.h @@ -0,0 +1,189 @@ +// Server status tracking for the /status introspection page. +// +// Thread-safe status tracker: worker thread writes, HTTP client threads read. +// Designed for minimal overhead on the inference hot path. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace dflash::common { + +using json = nlohmann::json; + +// Performance record for one completed request. +struct PerfRecord { + double prefill_tok_s = 0.0; + double decode_tok_s = 0.0; + float accept_rate = 0.0f; + int prompt_tokens = 0; + int completion_tokens = 0; + bool cache_hit = false; + bool pflash = false; + bool spec_decode = false; + std::chrono::steady_clock::time_point timestamp; +}; + +// Live inference phase. +enum class InferencePhase { + IDLE, + PREFILL, + DECODE, +}; + +static inline const char * phase_name(InferencePhase p) { + switch (p) { + case InferencePhase::IDLE: return "idle"; + case InferencePhase::PREFILL: return "prefill"; + case InferencePhase::DECODE: return "decode"; + default: return "unknown"; + } +} + +class ServerStatus { +public: + static constexpr int kMaxHistory = 50; + + // Called by worker thread to update live state. 
+ void set_running(const std::string & prompt_excerpt, int prompt_tokens, + bool is_stream) { + std::lock_guard lk(mu_); + phase_ = InferencePhase::PREFILL; + prompt_excerpt_ = prompt_excerpt; + prompt_tokens_ = prompt_tokens; + completion_tokens_ = 0; + is_stream_ = is_stream; + draft_tokens_.clear(); + started_at_ = std::chrono::steady_clock::now(); + } + + void set_decode() { + std::lock_guard lk(mu_); + phase_ = InferencePhase::DECODE; + } + + void update_completion_tokens(int n) { + std::lock_guard lk(mu_); + completion_tokens_ = n; + } + + void set_draft_tokens(const std::vector & tokens) { + std::lock_guard lk(mu_); + draft_tokens_ = tokens; + } + + void set_idle() { + std::lock_guard lk(mu_); + phase_ = InferencePhase::IDLE; + prompt_excerpt_.clear(); + draft_tokens_.clear(); + } + + void record_perf(const PerfRecord & rec) { + std::lock_guard lk(mu_); + if ((int)perf_history_.size() >= kMaxHistory) { + perf_history_.erase(perf_history_.begin()); + } + perf_history_.push_back(rec); + total_requests_++; + } + + // Snapshot current state as JSON (thread-safe). 
+ json to_json() const { + InferencePhase phase; + std::string prompt_excerpt; + int prompt_tokens = 0; + int completion_tokens = 0; + bool is_stream = false; + std::vector draft_tokens; + std::vector history; + int total_requests = 0; + double elapsed_s = 0.0; + + { + std::lock_guard lk(mu_); + phase = phase_; + prompt_excerpt = prompt_excerpt_; + prompt_tokens = prompt_tokens_; + completion_tokens = completion_tokens_; + is_stream = is_stream_; + draft_tokens = draft_tokens_; + history = perf_history_; + total_requests = total_requests_; + if (phase != InferencePhase::IDLE) { + elapsed_s = std::chrono::duration( + std::chrono::steady_clock::now() - started_at_).count(); + } + } + + json j; + j["phase"] = phase_name(phase); + j["total_requests"] = total_requests; + + if (phase != InferencePhase::IDLE) { + j["current"] = { + {"prompt_excerpt", prompt_excerpt}, + {"prompt_tokens", prompt_tokens}, + {"completion_tokens", completion_tokens}, + {"stream", is_stream}, + {"elapsed_s", elapsed_s}, + {"draft_tokens", draft_tokens}, + }; + } else { + j["current"] = nullptr; + } + + json perf = json::array(); + for (const auto & r : history) { + perf.push_back({ + {"prefill_tok_s", r.prefill_tok_s}, + {"decode_tok_s", r.decode_tok_s}, + {"accept_rate", r.accept_rate}, + {"prompt_tokens", r.prompt_tokens}, + {"completion_tokens", r.completion_tokens}, + {"cache_hit", r.cache_hit}, + {"pflash", r.pflash}, + {"spec_decode", r.spec_decode}, + }); + } + j["perf_history"] = perf; + + return j; + } + + // Format as SSE event string: "event: status\ndata: {json}\n\n" + std::string to_sse_event() const { + std::string data = to_json().dump(); + return "event: status\ndata: " + data + "\n\n"; + } + +private: + mutable std::mutex mu_; + + // Live state. 
+ InferencePhase phase_ = InferencePhase::IDLE; + std::string prompt_excerpt_; + int prompt_tokens_ = 0; + int completion_tokens_ = 0; + bool is_stream_ = false; + std::vector draft_tokens_; + std::chrono::steady_clock::time_point started_at_; + + // History. + std::vector perf_history_; + int total_requests_ = 0; +}; + +// RAII guard that resets status to idle on scope exit. +struct StatusGuard { + ServerStatus & status; + ~StatusGuard() { status.set_idle(); } +}; + +} // namespace dflash::common From 70a50cb136dd17aa9933ecf5a6ad106f8697f1f3 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Sun, 31 May 2026 16:01:13 +0800 Subject: [PATCH 2/7] fix: ensure status.html is found from build directory - Add CMake POST_BUILD rule to copy share/status.html into build/share/ - Add exe_dir/share/status.html as a search path (build dir layout) - Keeps existing ../share/ and ./share/ fallbacks Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../plan.md | 50 +++++++++++++++++++ server/CMakeLists.txt | 9 ++++ server/src/server/http_server.cpp | 17 +++++-- 3 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 .copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md diff --git a/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md b/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md new file mode 100644 index 000000000..a5a786fcc --- /dev/null +++ b/.copilot/session-state/02c7dc18-6cf5-4c5f-8ec8-b5c6c46234cc/plan.md @@ -0,0 +1,50 @@ +# Plan: Hybrid Routed+Cold Path for All MoE Layers + +## Problem +When `cold_compute` is enabled and layers have cold experts, DeltaNet layers are forced off the +routed fast path into the slower split path. 
This adds overhead from: +- Extra sync at split path entry (line 365) +- Hot graph rebuild/dispatch instead of reusing the pre-cached `rffn` graph +- Less pipelined execution + +## Approach +Extend both the **routed fast path** (DeltaNet) and the **split-path routed FFN sub-path** +(attention layers) to handle cold experts inline — running the cold fused kernel on CPU in +parallel with the GPU routed FFN dispatch. + +**Key property:** When all experts are hot at runtime (n_cold = 0), the cold branch is simply +skipped. The path executes identically to today's routed fast path — zero overhead. + +## Changes + +### 1. Routed Fast Path (DeltaNet layers, lines 260-355) + +**Remove** the cold_compute guard from line 263: +```cpp +// Before: +&& !(state.cold_compute && !hybrid.layers[(size_t)il].cold_expert_ids.empty()) +// After: removed — hybrid path handles cold inline +``` + +**Modify** the routing remap loop (lines 296-308) to also record cold IDs/weights. + +**After remap, before rffn dispatch** — read ffn_post to CPU if cold compute needed. + +**After rffn dispatch (async)** — run cold compute on CPU (parallel with GPU rffn). + +**Combine** — upload cold result instead of zeroing. + +### 2. Split-Path Routed FFN Sub-Path (attention layers, lines 441-495) + +Same pattern: remove cold_compute guard, add cold partition + D2H + cold compute + upload. + +### 3. Telemetry + +Add `hybrid_cold_compute_us` counter for the new inline cold compute timing. 
+ +## Timing Analysis + +- Current split path per mixed layer: ~1630µs +- Proposed hybrid: prefn(300µs) + max(cold(850µs), rffn(300µs)) + combine(50µs) ≈ 1200µs +- Conservative estimate: 5-10% throughput improvement +- All-hot case: zero overhead (cold branch skipped) diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index d42762f35..79a356115 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -726,6 +726,15 @@ if(DFLASH27B_TESTS) else() target_link_libraries(dflash_server PRIVATE hip::host) endif() + + # Copy share/status.html next to the binary so it can be found at runtime. + file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/share") + add_custom_command(TARGET dflash_server POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${CMAKE_CURRENT_SOURCE_DIR}/share/status.html" + "${CMAKE_CURRENT_BINARY_DIR}/share/status.html" + COMMENT "Copying status.html to build/share/" + ) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/ipc/backend_ipc_main.cpp") diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index 7a93f26ce..525279fd2 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -481,7 +481,7 @@ std::string HttpServer::resolve_status_html() { struct stat st; if (::stat(path.c_str(), &st) == 0) return path; } - // 2. ../share/ relative to /proc/self/exe + // 2. share/ relative to /proc/self/exe (build dir or installed prefix) char exe_buf[1024] = {}; ssize_t len = ::readlink("/proc/self/exe", exe_buf, sizeof(exe_buf) - 1); if (len > 0) { @@ -490,9 +490,18 @@ std::string HttpServer::resolve_status_html() { auto slash = exe_dir.rfind('/'); if (slash != std::string::npos) { exe_dir = exe_dir.substr(0, slash); - std::string path = exe_dir + "/../share/status.html"; - struct stat st; - if (::stat(path.c_str(), &st) == 0) return path; + // 2a. 
/share/status.html (build directory layout) + { + std::string path = exe_dir + "/share/status.html"; + struct stat st; + if (::stat(path.c_str(), &st) == 0) return path; + } + // 2b. /../share/status.html (installed prefix layout) + { + std::string path = exe_dir + "/../share/status.html"; + struct stat st; + if (::stat(path.c_str(), &st) == 0) return path; + } } } // 3. ./share/status.html (development) From ac807385a862ebbd55e6f4af5171d483322e66a2 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Sun, 31 May 2026 16:30:25 +0800 Subject: [PATCH 3/7] feat(status): show full request details and live response output - Add request params to status: model, format, temperature, top_p/k, max_output, thinking_enabled, session_id, cache/pflash/spec_decode flags - Add incremental 'event: token' SSE events (browser accumulates output) - Add messages JSON to status event (sent once per request) - Redesigned HTML: two-column request/response view, params grid, feature tags (cache hit, pflash, spec decode, stream, thinking), live tok/s - All state accumulated client-side; server stays stateless for output text Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- server/share/status.html | 173 ++++++++++++++++++++++++++---- server/src/server/http_server.cpp | 50 ++++++++- server/src/server/http_server.h | 3 + server/src/server/server_status.h | 55 +++++++++- 4 files changed, 256 insertions(+), 25 deletions(-) diff --git a/server/share/status.html b/server/share/status.html index eaf277adc..6c8b4fee4 100644 --- a/server/share/status.html +++ b/server/share/status.html @@ -21,15 +21,32 @@ .stat { display: inline-block; margin-right: 24px; margin-bottom: 8px; } .stat-value { font-size: 1.4em; font-weight: 700; color: #f0f6fc; } .stat-label { font-size: 0.75em; color: #8b949e; } -.prompt-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px; - padding: 8px 12px; font-family: monospace; font-size: 0.85em; - max-height: 80px; overflow: hidden; 
word-break: break-all; - color: #8b949e; margin-top: 8px; } +.tag { display: inline-block; padding: 2px 8px; border-radius: 4px; + font-size: 0.7em; font-weight: 600; margin-right: 6px; margin-top: 4px; } +.tag-green { background: #3fb95022; color: #3fb950; border: 1px solid #3fb95044; } +.tag-orange { background: #f0883e22; color: #f0883e; border: 1px solid #f0883e44; } +.tag-blue { background: #1f6feb22; color: #58a6ff; border: 1px solid #1f6feb44; } +.tag-gray { background: #8b949e22; color: #8b949e; border: 1px solid #8b949e44; } +.params-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(140px, 1fr)); + gap: 8px; margin-top: 8px; } +.param { background: #0d1117; border: 1px solid #30363d; border-radius: 4px; + padding: 6px 10px; } +.param-label { font-size: 0.7em; color: #8b949e; text-transform: uppercase; } +.param-value { font-size: 0.9em; color: #f0f6fc; font-weight: 500; } +.text-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px; + padding: 8px 12px; font-family: monospace; font-size: 0.82em; + max-height: 300px; overflow-y: auto; word-break: break-word; + white-space: pre-wrap; color: #c9d1d9; margin-top: 8px; line-height: 1.5; } +.text-box:empty::after { content: '(waiting...)'; color: #484f58; } +.messages-box { background: #0d1117; border: 1px solid #30363d; border-radius: 4px; + padding: 8px 12px; font-family: monospace; font-size: 0.78em; + max-height: 200px; overflow-y: auto; word-break: break-word; + white-space: pre-wrap; color: #8b949e; margin-top: 8px; } .tokens-box { margin-top: 8px; } .token { display: inline-block; background: #1f6feb22; border: 1px solid #1f6feb; border-radius: 4px; padding: 2px 6px; margin: 2px; font-family: monospace; font-size: 0.8em; color: #79c0ff; } -.chart-container { width: 100%; height: 200px; position: relative; } +.chart-container { width: 100%; height: 180px; position: relative; } .chart-container svg { width: 100%; height: 100%; } .legend { display: flex; gap: 16px; 
margin-top: 8px; font-size: 0.75em; } .legend-item { display: flex; align-items: center; gap: 4px; } @@ -38,7 +55,8 @@ border-radius: 4px; } .connected { background: #3fb95033; color: #3fb950; } .disconnected { background: #f8514933; color: #f85149; } -#no-data { color: #484f58; text-align: center; padding: 40px; } +.two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; } +@media (max-width: 900px) { .two-col { grid-template-columns: 1fr; } } @@ -47,7 +65,7 @@

⚡ DFlash Server Status

-

Current State

+

Current Request

idle @@ -69,12 +87,28 @@

Current State

0.0s elapsed
+ + - + tok/s (live) +
-
+
+
+ +

Prefill Performance

@@ -93,25 +127,27 @@

Decode Performance