Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,8 @@ DFLASH27B_KV_TQ3=1 \
| `--ddtree` | off (chain) | Enable tree verify |
| `--ddtree-budget N` | `22` | Tree size. 22 on 3090 (default), 40 on 5090, re-sweep on GB10 |
| `--fa-window N` | `2048` | Sliding FA window; `0` = full attention |
| `--lazy-draft` | off | Defer draft load until first request |
| `--draft-residency {auto,persistent,request-scoped}` | `auto` | When draft weights are evicted from VRAM. `request-scoped` parks/frees them after each request's draft work (frees VRAM for the target on tight GPUs); `persistent` keeps them resident across requests; `auto` preserves current behavior while honoring the low-VRAM / `--lazy-draft` hint. Reported at `/props.runtime.draft_residency`. |
| `--lazy-draft` | off | Legacy alias for `--draft-residency=request-scoped` (defer draft load until first request, release after) |

**Prefill compression (PFlash)**

Expand Down
2 changes: 2 additions & 0 deletions server/src/common/model_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "sampler.h"
#include "placement/draft_residency.h"

namespace dflash::common {

Expand Down Expand Up @@ -250,6 +251,7 @@ struct ModelBackend {
std::string drafter_path; // GGUF path (for lazy-load)
int drafter_gpu = 0; // backend-local GPU for PFlash drafter
bool skip_park = false; // true on >=32GB GPUs
DraftResidencyAction residency_action = DraftResidencyAction::KeepLoaded;
};

struct CompressResult {
Expand Down
94 changes: 94 additions & 0 deletions server/src/placement/draft_residency.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Drafter residency policy shared by draft-style runtime paths.
//
// The policy is intentionally scoped by draft use-case. PFlash compression can
// release its drafter immediately after prompt compression, while DFlash decode
// draft may need to stay resident across requests for latency.

#pragma once

#include <string>

namespace dflash::common {

// Operator-selected residency policy for drafter weights.
enum class DraftResidencyPolicy {
    Auto,           // decide per use-case from the DraftResidencyContext hints
    Persistent,     // always resolves to KeepLoaded
    RequestScoped,  // always resolves to ReleaseAfterUse
};

// The draft-style runtime path that is asking for a residency decision.
enum class DraftResidencyUse {
    PFlashCompress,
    DFlashDecode,
    MtpDecode,
};

// What the caller should do with the drafter once its work is finished.
enum class DraftResidencyAction {
    KeepLoaded,
    ReleaseAfterUse,
};

// Hints consulted by resolve_draft_residency_action() when the policy is Auto.
// Member order matters: the struct is usable with positional aggregate
// initialization.
struct DraftResidencyContext {
    DraftResidencyUse use = DraftResidencyUse::PFlashCompress;
    bool low_vram_hint = false;
    bool has_decode_draft = false;
};

// Returns the canonical textual name of `policy`.
// Any value that is not Persistent or RequestScoped (including Auto and values
// outside the declared enumerators) is reported as "auto".
inline const char * draft_residency_policy_name(DraftResidencyPolicy policy) {
    if (policy == DraftResidencyPolicy::Persistent) {
        return "persistent";
    }
    if (policy == DraftResidencyPolicy::RequestScoped) {
        return "request-scoped";
    }
    return "auto";
}

// Parses a textual policy name into `out`.
//
// Accepted spellings are "auto", "persistent", "request-scoped" and the
// underscore variant "request_scoped"; matching is exact and case-sensitive.
// Returns true and writes `out` on a match; returns false and leaves `out`
// untouched otherwise.
inline bool parse_draft_residency_policy(const std::string & value,
                                         DraftResidencyPolicy & out) {
    struct Spelling {
        const char * text;
        DraftResidencyPolicy policy;
    };
    const Spelling spellings[] = {
        {"auto",           DraftResidencyPolicy::Auto},
        {"persistent",     DraftResidencyPolicy::Persistent},
        {"request-scoped", DraftResidencyPolicy::RequestScoped},
        {"request_scoped", DraftResidencyPolicy::RequestScoped},
    };

    for (const Spelling & candidate : spellings) {
        if (value == candidate.text) {
            out = candidate.policy;
            return true;
        }
    }
    return false;
}

// Resolves `policy` plus the per-use hints in `ctx` into a concrete action.
//
// Persistent and RequestScoped are unconditional. Every other policy value is
// treated as Auto and decided per use-case:
//   - PFlashCompress: release only when a low-VRAM hint was given, so the
//     resident fast path is kept by default while small-card setups can make
//     room for decode draft/target state.
//   - DFlashDecode: latency-sensitive, so release only when a low-VRAM hint
//     was given and a decode draft is present.
//   - MtpDecode (and any unrecognized use): keep loaded; placeholder until a
//     concrete MTP residency lifecycle is wired.
inline DraftResidencyAction resolve_draft_residency_action(
        DraftResidencyPolicy policy,
        const DraftResidencyContext & ctx) {
    switch (policy) {
        case DraftResidencyPolicy::Persistent:
            return DraftResidencyAction::KeepLoaded;
        case DraftResidencyPolicy::RequestScoped:
            return DraftResidencyAction::ReleaseAfterUse;
        case DraftResidencyPolicy::Auto:
            break;
    }

    bool release_after_use = false;
    if (ctx.use == DraftResidencyUse::PFlashCompress) {
        release_after_use = ctx.low_vram_hint;
    } else if (ctx.use == DraftResidencyUse::DFlashDecode) {
        release_after_use = ctx.low_vram_hint && ctx.has_decode_draft;
    }

    return release_after_use ? DraftResidencyAction::ReleaseAfterUse
                             : DraftResidencyAction::KeepLoaded;
}

} // namespace dflash::common
4 changes: 4 additions & 0 deletions server/src/qwen3/qwen3_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,10 @@ ModelBackend::CompressResult Qwen3Backend::compress(const CompressRequest & req)
drafter_ctx_, req.input_ids, req.keep_ratio);
result.ok = true;

if (req.residency_action == DraftResidencyAction::ReleaseAfterUse) {
free_drafter();
}

if (!req.skip_park && !was_parked) unpark("target");
return result;
}
Expand Down
5 changes: 3 additions & 2 deletions server/src/qwen35/qwen35_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,9 @@ ModelBackend::CompressResult Qwen35Backend::compress(const CompressRequest & req
req.input_ids.size(), result.compressed_ids.size());
}

// Keep drafter loaded (own backend + weights persist), matching test_dflash.
// ~1.4 GB stays resident but avoids reload cost on subsequent compresses.
if (req.residency_action == DraftResidencyAction::ReleaseAfterUse) {
free_drafter();
}

// Restore park state
if (!req.skip_park) {
Expand Down
34 changes: 30 additions & 4 deletions server/src/server/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ json build_props_body(const ServerConfig & config,
{"bsa_enabled", nullptr},
{"bsa_alpha", nullptr},
{"lm_head_fix", nullptr},
{"draft_residency", draft_residency_policy_name(config.draft_residency)},
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P3: Duplicate draft_residency field initializer in both branches of the pflash if/else block. The expression draft_residency_policy_name(config.draft_residency) is branch-invariant and appears identically in both the !pflash_enabled (line 138) and pflash_enabled (line 164) branches. This creates a maintenance risk — any future change to the field name or value expression must be applied in two places.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At server/src/server/http_server.cpp, line 138:

<comment>Duplicate `draft_residency` field initializer in both branches of the pflash if/else block. The expression `draft_residency_policy_name(config.draft_residency)` is branch-invariant and appears identically in both the `!pflash_enabled` (line 138) and `pflash_enabled` (line 164) branches. This creates a maintenance risk — any future change to the field name or value expression must be applied in two places.</comment>

<file context>
@@ -135,6 +135,7 @@ json build_props_body(const ServerConfig & config,
             {"bsa_enabled",  nullptr},
             {"bsa_alpha",    nullptr},
             {"lm_head_fix",  nullptr},
+            {"draft_residency", draft_residency_policy_name(config.draft_residency)},
         };
     } else {
</file context>

};
} else {
const char * bsa_env = std::getenv("DFLASH_FP_USE_BSA");
Expand All @@ -160,6 +161,7 @@ json build_props_body(const ServerConfig & config,
{"bsa_enabled", (bsa_env != nullptr && *bsa_env && std::strcmp(bsa_env, "0") != 0)},
{"bsa_alpha", bsa_alpha},
{"lm_head_fix", (lmfix_env != nullptr && *lmfix_env && std::strcmp(lmfix_env, "0") != 0)},
{"draft_residency", draft_residency_policy_name(config.draft_residency)},
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
};
}

Expand Down Expand Up @@ -197,6 +199,7 @@ json build_props_body(const ServerConfig & config,
{"kv_cache_k", config.kv_cache_k},
{"kv_cache_v", config.kv_cache_v},
{"lazy_draft", config.lazy_draft},
{"draft_residency", draft_residency_policy_name(config.draft_residency)},
{"target_sharding", config.target_sharding},
// Prefill chunk size (bargs.chunk). Surfaced so snapshot
// tooling captures the full config — bench consumers
Expand Down Expand Up @@ -1216,6 +1219,15 @@ void HttpServer::worker_loop() {
creq.drafter_path = config_.pflash_drafter_path;
creq.drafter_gpu = config_.pflash_drafter_gpu;
creq.skip_park = config_.pflash_skip_park;
const auto pflash_residency =
resolve_draft_residency_action(
config_.draft_residency,
DraftResidencyContext{
DraftResidencyUse::PFlashCompress,
config_.lazy_draft,
!config_.draft_path.empty(),
});
creq.residency_action = pflash_residency;

ModelBackend::CompressResult cresult;
if (config_.pflash_remote_drafter) {
Expand All @@ -1229,6 +1241,9 @@ void HttpServer::worker_loop() {
cresult.ok = pflash_remote_.compress(
creq.input_ids, creq.keep_ratio,
cresult.compressed_ids);
if (pflash_residency == DraftResidencyAction::ReleaseAfterUse) {
pflash_remote_.close();
}
}
} else {
cresult = backend_.compress(creq);
Expand Down Expand Up @@ -1501,9 +1516,20 @@ void HttpServer::worker_loop() {
return true;
};

const auto dflash_residency =
resolve_draft_residency_action(
config_.draft_residency,
DraftResidencyContext{
DraftResidencyUse::DFlashDecode,
config_.lazy_draft,
!config_.draft_path.empty(),
});

// Run generation (with or without restore).
// Lazy-draft: ensure decode draft is loaded before generate.
if (config_.lazy_draft) {
// Request-scoped draft residency ensures decode draft is loaded only
// around the generation window, leaving room for PFlash/target state.
if (dflash_residency == DraftResidencyAction::ReleaseAfterUse &&
!config_.draft_path.empty()) {
backend_.free_drafter(); // free pflash drafter (~1.4 GB) if loaded
backend_.unpark("draft"); // reload decode draft (~3.3 GB)
}
Expand All @@ -1515,8 +1541,8 @@ void HttpServer::worker_loop() {
result = backend_.generate_with_empty_spec_fallback(gen_req, io);
}

// Lazy-draft: park decode draft after generate to free VRAM.
if (config_.lazy_draft) {
if (dflash_residency == DraftResidencyAction::ReleaseAfterUse &&
!config_.draft_path.empty()) {
backend_.park("draft");
}

Expand Down
4 changes: 3 additions & 1 deletion server/src/server/http_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "prefix_cache.h"
#include "disk_prefix_cache.h"
#include "api_types.h"
#include "placement/draft_residency.h"
#include "placement/remote_draft_config.h"
#include "common/pflash_drafter_ipc.h"
#include "model_card.h"
Expand Down Expand Up @@ -149,7 +150,8 @@ struct ServerConfig {
bool pflash_remote_drafter = false; // use IPC drafter for mixed backends
RemoteDraftConfig pflash_remote; // IPC binary/work-dir for remote PFlash drafter
bool pflash_skip_park = false; // skip park/unpark for >=32GB GPUs
bool lazy_draft = false; // park decode draft when idle to save VRAM
bool lazy_draft = false; // legacy alias for request-scoped draft residency
DraftResidencyPolicy draft_residency = DraftResidencyPolicy::Auto;

// Disk prefix cache
std::string disk_cache_dir; // empty = disabled
Expand Down
27 changes: 23 additions & 4 deletions server/src/server/server_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "common/layer_split_utils.h"
#include "common/peer_access.h"
#include "placement/pflash_placement.h"
#include "placement/draft_residency.h"

#include "gguf.h"

Expand Down Expand Up @@ -211,7 +212,9 @@ static void print_usage(const char * prog) {
" --prefill-keep-ratio <F> Fraction of tokens to keep (default: 0.05)\n"
" --prefill-drafter <path> Drafter GGUF for compression (Qwen3-0.6B)\n"
" --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n"
" --lazy-draft Park decode draft when idle to save VRAM\n"
" --draft-residency auto|persistent|request-scoped\n"
" Drafter lifetime policy (default: auto)\n"
" --lazy-draft Legacy alias for --draft-residency=request-scoped\n"
"\n"
"Disk KV cache:\n"
" --kv-cache-dir <path> Directory for ondisk KV cache (enables feature)\n"
Expand Down Expand Up @@ -389,8 +392,19 @@ int main(int argc, char ** argv) {
sconfig.pflash_drafter_path = argv[++i];
} else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) {
sconfig.pflash_skip_park = true;
} else if (std::strcmp(argv[i], "--draft-residency") == 0 && i + 1 < argc) {
if (!parse_draft_residency_policy(argv[++i], sconfig.draft_residency)) {
std::fprintf(stderr,
"[server] unknown --draft-residency policy: '%s' "
"(expected: auto, persistent, request-scoped)\n", argv[i]);
print_usage(argv[0]);
return 1;
}
sconfig.lazy_draft =
(sconfig.draft_residency == DraftResidencyPolicy::RequestScoped);
} else if (std::strcmp(argv[i], "--lazy-draft") == 0) {
sconfig.lazy_draft = true;
sconfig.draft_residency = DraftResidencyPolicy::RequestScoped;
} else if (std::strcmp(argv[i], "--chat-template-file") == 0 && i + 1 < argc) {
const char * path = argv[++i];
std::FILE * f = std::fopen(path, "rb");
Expand Down Expand Up @@ -499,9 +513,12 @@ int main(int argc, char ** argv) {
setenv("DFLASH27B_FA_WINDOW", "0", 0);
}

// Lazy-draft requires both prefill-drafter AND decode draft to be useful.
if (sconfig.lazy_draft && !(pflash_enabled && bargs.draft_path)) {
std::fprintf(stderr, "[server] --lazy-draft ignored: requires both --prefill-drafter and --draft\n");
if (sconfig.draft_residency == DraftResidencyPolicy::RequestScoped &&
!(pflash_enabled || bargs.draft_path)) {
std::fprintf(stderr,
"[server] --draft-residency=request-scoped ignored: requires "
"--prefill-compression or --draft\n");
sconfig.draft_residency = DraftResidencyPolicy::Auto;
sconfig.lazy_draft = false;
}

Expand Down Expand Up @@ -784,6 +801,8 @@ int main(int argc, char ** argv) {
std::fprintf(stderr, "[server] │ fp_use_bsa = %s\n", getenv("DFLASH_FP_USE_BSA") ? "ON" : "off");
std::fprintf(stderr, "[server] │ fp_alpha = %s\n", getenv("DFLASH_FP_ALPHA") ? getenv("DFLASH_FP_ALPHA") : "0.12 (default)");
}
std::fprintf(stderr, "[server] │ draft_residency = %s\n",
draft_residency_policy_name(sconfig.draft_residency));
if (bargs.draft_path) {
std::fprintf(stderr, "[server] │ lazy_draft = %s\n", sconfig.lazy_draft ? "ON" : "off");
}
Expand Down
Loading
Loading