From 8cef6bb9bfb841ee0ff1f39f84b4787f91192f84 Mon Sep 17 00:00:00 2001 From: weicj Date: Thu, 28 May 2026 17:43:24 +0800 Subject: [PATCH 1/3] feat(server): add draft residency policy --- server/src/common/model_backend.h | 2 + server/src/placement/draft_residency.h | 94 ++++++++++++++++++++++++++ server/src/qwen3/qwen3_backend.cpp | 4 ++ server/src/qwen35/qwen35_backend.cpp | 5 +- server/src/server/http_server.cpp | 33 +++++++-- server/src/server/http_server.h | 4 +- server/src/server/server_main.cpp | 27 ++++++-- server/test/test_server_unit.cpp | 78 +++++++++++++++++++++ 8 files changed, 236 insertions(+), 11 deletions(-) create mode 100644 server/src/placement/draft_residency.h diff --git a/server/src/common/model_backend.h b/server/src/common/model_backend.h index de439092d..b808d0c39 100644 --- a/server/src/common/model_backend.h +++ b/server/src/common/model_backend.h @@ -19,6 +19,7 @@ #include "ggml.h" #include "ggml-backend.h" #include "sampler.h" +#include "placement/draft_residency.h" namespace dflash::common { @@ -250,6 +251,7 @@ struct ModelBackend { std::string drafter_path; // GGUF path (for lazy-load) int drafter_gpu = 0; // backend-local GPU for PFlash drafter bool skip_park = false; // true on >=32GB GPUs + DraftResidencyAction residency_action = DraftResidencyAction::KeepLoaded; }; struct CompressResult { diff --git a/server/src/placement/draft_residency.h b/server/src/placement/draft_residency.h new file mode 100644 index 000000000..53bf4baf6 --- /dev/null +++ b/server/src/placement/draft_residency.h @@ -0,0 +1,94 @@ +// Drafter residency policy shared by draft-style runtime paths. +// +// The policy is intentionally scoped by draft use-case. PFlash compression can +// release its drafter immediately after prompt compression, while DFlash decode +// draft may need to stay resident across requests for latency. 
+ +#pragma once + +#include <string> + +namespace dflash::common { + +enum class DraftResidencyPolicy { + Auto, + Persistent, + RequestScoped, +}; + +enum class DraftResidencyUse { + PFlashCompress, + DFlashDecode, + MtpDecode, +}; + +enum class DraftResidencyAction { + KeepLoaded, + ReleaseAfterUse, +}; + +struct DraftResidencyContext { + DraftResidencyUse use = DraftResidencyUse::PFlashCompress; + bool low_vram_hint = false; + bool has_decode_draft = false; +}; + +inline const char * draft_residency_policy_name(DraftResidencyPolicy policy) { + switch (policy) { + case DraftResidencyPolicy::Auto: return "auto"; + case DraftResidencyPolicy::Persistent: return "persistent"; + case DraftResidencyPolicy::RequestScoped: return "request-scoped"; + } + return "auto"; +} + +inline bool parse_draft_residency_policy(const std::string & value, + DraftResidencyPolicy & out) { + if (value == "auto") { + out = DraftResidencyPolicy::Auto; + return true; + } + if (value == "persistent") { + out = DraftResidencyPolicy::Persistent; + return true; + } + if (value == "request-scoped" || value == "request_scoped") { + out = DraftResidencyPolicy::RequestScoped; + return true; + } + return false; +} + +inline DraftResidencyAction resolve_draft_residency_action( + DraftResidencyPolicy policy, + const DraftResidencyContext & ctx) { + if (policy == DraftResidencyPolicy::Persistent) { + return DraftResidencyAction::KeepLoaded; + } + if (policy == DraftResidencyPolicy::RequestScoped) { + return DraftResidencyAction::ReleaseAfterUse; + } + + switch (ctx.use) { + case DraftResidencyUse::PFlashCompress: + // In auto mode, only release the PFlash drafter when the operator gave + // a low-VRAM hint. That preserves the existing fast resident path while + // allowing small-card setups to make room for decode draft/target state. + return ctx.low_vram_hint + ? 
DraftResidencyAction::ReleaseAfterUse + : DraftResidencyAction::KeepLoaded; + case DraftResidencyUse::DFlashDecode: + // DFlash draft is latency-sensitive; keep it resident unless the + // operator explicitly opted into the low-VRAM/request-scoped path. + return (ctx.low_vram_hint && ctx.has_decode_draft) + ? DraftResidencyAction::ReleaseAfterUse + : DraftResidencyAction::KeepLoaded; + case DraftResidencyUse::MtpDecode: + // Placeholder use-case for future draft-style decode paths. Default to + // persistent until a concrete MTP residency lifecycle is wired. + return DraftResidencyAction::KeepLoaded; + } + return DraftResidencyAction::KeepLoaded; +} + +} // namespace dflash::common diff --git a/server/src/qwen3/qwen3_backend.cpp b/server/src/qwen3/qwen3_backend.cpp index e2adc7f65..253886978 100644 --- a/server/src/qwen3/qwen3_backend.cpp +++ b/server/src/qwen3/qwen3_backend.cpp @@ -955,6 +955,10 @@ ModelBackend::CompressResult Qwen3Backend::compress(const CompressRequest & req) drafter_ctx_, req.input_ids, req.keep_ratio); result.ok = true; + if (req.residency_action == DraftResidencyAction::ReleaseAfterUse) { + free_drafter(); + } + if (!req.skip_park && !was_parked) unpark("target"); return result; } diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp index e3b161d8c..4a3d9674e 100644 --- a/server/src/qwen35/qwen35_backend.cpp +++ b/server/src/qwen35/qwen35_backend.cpp @@ -402,8 +402,9 @@ ModelBackend::CompressResult Qwen35Backend::compress(const CompressRequest & req req.input_ids.size(), result.compressed_ids.size()); } - // Keep drafter loaded (own backend + weights persist), matching test_dflash. - // ~1.4 GB stays resident but avoids reload cost on subsequent compresses. 
+ if (req.residency_action == DraftResidencyAction::ReleaseAfterUse) { + free_drafter(); + } // Restore park state if (!req.skip_park) { diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index 362c2f4d8..b7adefcae 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -160,6 +160,7 @@ json build_props_body(const ServerConfig & config, {"bsa_enabled", (bsa_env != nullptr && *bsa_env && std::strcmp(bsa_env, "0") != 0)}, {"bsa_alpha", bsa_alpha}, {"lm_head_fix", (lmfix_env != nullptr && *lmfix_env && std::strcmp(lmfix_env, "0") != 0)}, + {"draft_residency", draft_residency_policy_name(config.draft_residency)}, }; } @@ -197,6 +198,7 @@ json build_props_body(const ServerConfig & config, {"kv_cache_k", config.kv_cache_k}, {"kv_cache_v", config.kv_cache_v}, {"lazy_draft", config.lazy_draft}, + {"draft_residency", draft_residency_policy_name(config.draft_residency)}, {"target_sharding", config.target_sharding}, // Prefill chunk size (bargs.chunk). 
Surfaced so snapshot // tooling captures the full config — bench consumers @@ -1216,6 +1218,15 @@ void HttpServer::worker_loop() { creq.drafter_path = config_.pflash_drafter_path; creq.drafter_gpu = config_.pflash_drafter_gpu; creq.skip_park = config_.pflash_skip_park; + const auto pflash_residency = + resolve_draft_residency_action( + config_.draft_residency, + DraftResidencyContext{ + DraftResidencyUse::PFlashCompress, + config_.lazy_draft, + !config_.draft_path.empty(), + }); + creq.residency_action = pflash_residency; ModelBackend::CompressResult cresult; if (config_.pflash_remote_drafter) { @@ -1229,6 +1240,9 @@ void HttpServer::worker_loop() { cresult.ok = pflash_remote_.compress( creq.input_ids, creq.keep_ratio, cresult.compressed_ids); + if (pflash_residency == DraftResidencyAction::ReleaseAfterUse) { + pflash_remote_.close(); + } } } else { cresult = backend_.compress(creq); @@ -1501,9 +1515,20 @@ void HttpServer::worker_loop() { return true; }; + const auto dflash_residency = + resolve_draft_residency_action( + config_.draft_residency, + DraftResidencyContext{ + DraftResidencyUse::DFlashDecode, + config_.lazy_draft, + !config_.draft_path.empty(), + }); + // Run generation (with or without restore). - // Lazy-draft: ensure decode draft is loaded before generate. - if (config_.lazy_draft) { + // Request-scoped draft residency ensures decode draft is loaded only + // around the generation window, leaving room for PFlash/target state. + if (dflash_residency == DraftResidencyAction::ReleaseAfterUse && + !config_.draft_path.empty()) { backend_.free_drafter(); // free pflash drafter (~1.4 GB) if loaded backend_.unpark("draft"); // reload decode draft (~3.3 GB) } @@ -1515,8 +1540,8 @@ void HttpServer::worker_loop() { result = backend_.generate_with_empty_spec_fallback(gen_req, io); } - // Lazy-draft: park decode draft after generate to free VRAM. 
- if (config_.lazy_draft) { + if (dflash_residency == DraftResidencyAction::ReleaseAfterUse && + !config_.draft_path.empty()) { backend_.park("draft"); } diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 999eb5d99..71c544acb 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -18,6 +18,7 @@ #include "prefix_cache.h" #include "disk_prefix_cache.h" #include "api_types.h" +#include "placement/draft_residency.h" #include "placement/remote_draft_config.h" #include "common/pflash_drafter_ipc.h" #include "model_card.h" @@ -149,7 +150,8 @@ struct ServerConfig { bool pflash_remote_drafter = false; // use IPC drafter for mixed backends RemoteDraftConfig pflash_remote; // IPC binary/work-dir for remote PFlash drafter bool pflash_skip_park = false; // skip park/unpark for >=32GB GPUs - bool lazy_draft = false; // park decode draft when idle to save VRAM + bool lazy_draft = false; // legacy alias for request-scoped draft residency + DraftResidencyPolicy draft_residency = DraftResidencyPolicy::Auto; // Disk prefix cache std::string disk_cache_dir; // empty = disabled diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index 0f31739ed..5f00d4dfe 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -19,6 +19,7 @@ #include "common/layer_split_utils.h" #include "common/peer_access.h" #include "placement/pflash_placement.h" +#include "placement/draft_residency.h" #include "gguf.h" @@ -211,7 +212,9 @@ static void print_usage(const char * prog) { " --prefill-keep-ratio Fraction of tokens to keep (default: 0.05)\n" " --prefill-drafter Drafter GGUF for compression (Qwen3-0.6B)\n" " --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n" - " --lazy-draft Park decode draft when idle to save VRAM\n" + " --draft-residency auto|persistent|request-scoped\n" + " Drafter lifetime policy (default: auto)\n" + " --lazy-draft Legacy alias for 
--draft-residency=request-scoped\n" "\n" "Disk KV cache:\n" " --kv-cache-dir Directory for ondisk KV cache (enables feature)\n" @@ -389,8 +392,19 @@ int main(int argc, char ** argv) { sconfig.pflash_drafter_path = argv[++i]; } else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) { sconfig.pflash_skip_park = true; + } else if (std::strcmp(argv[i], "--draft-residency") == 0 && i + 1 < argc) { + if (!parse_draft_residency_policy(argv[++i], sconfig.draft_residency)) { + std::fprintf(stderr, + "[server] unknown --draft-residency policy: '%s' " + "(expected: auto, persistent, request-scoped)\n", argv[i]); + print_usage(argv[0]); + return 1; + } + sconfig.lazy_draft = + (sconfig.draft_residency == DraftResidencyPolicy::RequestScoped); } else if (std::strcmp(argv[i], "--lazy-draft") == 0) { sconfig.lazy_draft = true; + sconfig.draft_residency = DraftResidencyPolicy::RequestScoped; } else if (std::strcmp(argv[i], "--chat-template-file") == 0 && i + 1 < argc) { const char * path = argv[++i]; std::FILE * f = std::fopen(path, "rb"); @@ -499,9 +513,12 @@ int main(int argc, char ** argv) { setenv("DFLASH27B_FA_WINDOW", "0", 0); } - // Lazy-draft requires both prefill-drafter AND decode draft to be useful. - if (sconfig.lazy_draft && !(pflash_enabled && bargs.draft_path)) { - std::fprintf(stderr, "[server] --lazy-draft ignored: requires both --prefill-drafter and --draft\n"); + if (sconfig.draft_residency == DraftResidencyPolicy::RequestScoped && + !(pflash_enabled || bargs.draft_path)) { + std::fprintf(stderr, + "[server] --draft-residency=request-scoped ignored: requires " + "--prefill-compression or --draft\n"); + sconfig.draft_residency = DraftResidencyPolicy::Auto; sconfig.lazy_draft = false; } @@ -784,6 +801,8 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[server] │ fp_use_bsa = %s\n", getenv("DFLASH_FP_USE_BSA") ? "ON" : "off"); std::fprintf(stderr, "[server] │ fp_alpha = %s\n", getenv("DFLASH_FP_ALPHA") ? 
getenv("DFLASH_FP_ALPHA") : "0.12 (default)"); } + std::fprintf(stderr, "[server] │ draft_residency = %s\n", + draft_residency_policy_name(sconfig.draft_residency)); if (bargs.draft_path) { std::fprintf(stderr, "[server] │ lazy_draft = %s\n", sconfig.lazy_draft ? "ON" : "off"); } diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index 275ec935b..d1b9212a9 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -23,6 +23,7 @@ #include "placement/placement_config.h" #include "common/layer_split_backend.h" #include "common/layer_split_utils.h" +#include "placement/draft_residency.h" #include #include @@ -892,6 +893,7 @@ static void test_pflash_config_defaults() { TEST_ASSERT(cfg.pflash_keep_ratio > 0.04f && cfg.pflash_keep_ratio < 0.06f); TEST_ASSERT(cfg.pflash_drafter_path.empty()); TEST_ASSERT(!cfg.pflash_skip_park); + TEST_ASSERT(cfg.draft_residency == DraftResidencyPolicy::Auto); } static void test_pflash_config_modes() { @@ -1038,6 +1040,77 @@ static void test_pflash_placement_usage_gate() { /*pflash_enabled=*/true, /*has_decode_draft=*/true)); } +static void test_draft_residency_parse() { + DraftResidencyPolicy policy = DraftResidencyPolicy::Auto; + TEST_ASSERT(parse_draft_residency_policy("auto", policy)); + TEST_ASSERT(policy == DraftResidencyPolicy::Auto); + TEST_ASSERT(parse_draft_residency_policy("persistent", policy)); + TEST_ASSERT(policy == DraftResidencyPolicy::Persistent); + TEST_ASSERT(parse_draft_residency_policy("request-scoped", policy)); + TEST_ASSERT(policy == DraftResidencyPolicy::RequestScoped); + TEST_ASSERT(parse_draft_residency_policy("request_scoped", policy)); + TEST_ASSERT(policy == DraftResidencyPolicy::RequestScoped); + TEST_ASSERT(!parse_draft_residency_policy("request", policy)); +} + +static void test_draft_residency_pflash_auto() { + auto action = resolve_draft_residency_action( + DraftResidencyPolicy::Auto, + DraftResidencyContext{ + DraftResidencyUse::PFlashCompress, + 
/*low_vram_hint=*/false, + /*has_decode_draft=*/false, + }); + TEST_ASSERT(action == DraftResidencyAction::KeepLoaded); + + action = resolve_draft_residency_action( + DraftResidencyPolicy::Auto, + DraftResidencyContext{ + DraftResidencyUse::PFlashCompress, + /*low_vram_hint=*/true, + /*has_decode_draft=*/true, + }); + TEST_ASSERT(action == DraftResidencyAction::ReleaseAfterUse); +} + +static void test_draft_residency_dflash_auto_and_request_scoped() { + auto action = resolve_draft_residency_action( + DraftResidencyPolicy::Auto, + DraftResidencyContext{ + DraftResidencyUse::DFlashDecode, + /*low_vram_hint=*/false, + /*has_decode_draft=*/true, + }); + TEST_ASSERT(action == DraftResidencyAction::KeepLoaded); + + action = resolve_draft_residency_action( + DraftResidencyPolicy::Auto, + DraftResidencyContext{ + DraftResidencyUse::DFlashDecode, + /*low_vram_hint=*/true, + /*has_decode_draft=*/true, + }); + TEST_ASSERT(action == DraftResidencyAction::ReleaseAfterUse); + + action = resolve_draft_residency_action( + DraftResidencyPolicy::RequestScoped, + DraftResidencyContext{ + DraftResidencyUse::DFlashDecode, + /*low_vram_hint=*/false, + /*has_decode_draft=*/true, + }); + TEST_ASSERT(action == DraftResidencyAction::ReleaseAfterUse); + + action = resolve_draft_residency_action( + DraftResidencyPolicy::Persistent, + DraftResidencyContext{ + DraftResidencyUse::DFlashDecode, + /*low_vram_hint=*/true, + /*has_decode_draft=*/true, + }); + TEST_ASSERT(action == DraftResidencyAction::KeepLoaded); +} + // ═══════════════════════════════════════════════════════════════════════ // Jinja chat template // ═══════════════════════════════════════════════════════════════════════ @@ -2292,6 +2365,7 @@ static void test_props_runtime_shape() { cfg.kv_cache_k = "tq3_0"; cfg.kv_cache_v = "tq3_0"; cfg.lazy_draft = false; + cfg.draft_residency = DraftResidencyPolicy::Persistent; cfg.target_sharding = false; cfg.chunk = 512; cfg.target_device = "auto:0"; @@ -2309,6 +2383,7 @@ static void 
test_props_runtime_shape() { TEST_ASSERT(rt["kv_cache_k"].get() == "tq3_0"); TEST_ASSERT(rt["kv_cache_v"].get() == "tq3_0"); TEST_ASSERT(rt["lazy_draft"].get() == false); + TEST_ASSERT(rt["draft_residency"].get() == "persistent"); TEST_ASSERT(rt["target_sharding"].get() == false); TEST_ASSERT(rt["chunk"].get() == 512); TEST_ASSERT(rt["target_device"].get() == "auto:0"); @@ -2643,6 +2718,9 @@ int main() { RUN_TEST(test_pflash_placement_auto_draft_follows_target); RUN_TEST(test_pflash_placement_disabled_never_remote); RUN_TEST(test_pflash_placement_usage_gate); + RUN_TEST(test_draft_residency_parse); + RUN_TEST(test_draft_residency_pflash_auto); + RUN_TEST(test_draft_residency_dflash_auto_and_request_scoped); std::fprintf(stderr, "\n── Jinja chat template ──\n"); RUN_TEST(test_jinja_render_basic); From ddcf3005f2420cd207843daddc72abf3bb40958c Mon Sep 17 00:00:00 2001 From: weicj Date: Mon, 1 Jun 2026 01:51:38 +0800 Subject: [PATCH 2/3] fix(server): keep draft residency props stable --- server/src/server/http_server.cpp | 1 + server/test/test_server_unit.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index b7adefcae..e751cad70 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -135,6 +135,7 @@ json build_props_body(const ServerConfig & config, {"bsa_enabled", nullptr}, {"bsa_alpha", nullptr}, {"lm_head_fix", nullptr}, + {"draft_residency", draft_residency_policy_name(config.draft_residency)}, }; } else { const char * bsa_env = std::getenv("DFLASH_FP_USE_BSA"); diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index d1b9212a9..57f998acd 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -2388,6 +2388,7 @@ static void test_props_runtime_shape() { TEST_ASSERT(rt["chunk"].get() == 512); TEST_ASSERT(rt["target_device"].get() == "auto:0"); TEST_ASSERT(rt["draft_device"].get() == "auto:0"); + 
TEST_ASSERT(body["pflash"]["draft_residency"].get() == "persistent"); // draft_device is null when no draft model is loaded. cfg.draft_device.clear(); From 2890525f3bb9aa63687517c02ef23cc9684bdf9f Mon Sep 17 00:00:00 2001 From: mrciffa Date: Tue, 2 Jun 2026 13:09:02 +0200 Subject: [PATCH 3/3] docs: document --draft-residency flag in root README Add the --draft-residency {auto,persistent,request-scoped} row to the Decode (DFlash + DDTree) flag table and reword --lazy-draft as its legacy alias, matching the CLI help in server_main.cpp. Pairs with PR #290. Co-Authored-By: WOZCODE --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a648d6a3f..1aa1dd06c 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,8 @@ DFLASH27B_KV_TQ3=1 \ | `--ddtree` | off (chain) | Enable tree verify | | `--ddtree-budget N` | `22` | Tree size. 22 on 3090 (default), 40 on 5090, re-sweep on GB10 | | `--fa-window N` | `2048` | Sliding FA window; `0` = full attention | -| `--lazy-draft` | off | Defer draft load until first request | +| `--draft-residency {auto,persistent,request-scoped}` | `auto` | When draft weights are evicted from VRAM. `request-scoped` parks/frees them after each request's draft work (frees VRAM for the target on tight GPUs); `persistent` keeps them resident across requests; `auto` preserves current behavior while honoring the low-VRAM / `--lazy-draft` hint. Reported at `/props.runtime.draft_residency`. | +| `--lazy-draft` | off | Legacy alias for `--draft-residency=request-scoped` (the decode draft is loaded only around each request's generation and parked again afterwards) | **Prefill compression (PFlash)**