From 8b5dc841b3c3c4eb2988640fd8d78ef8c04e2266 Mon Sep 17 00:00:00 2001 From: kitty Date: Thu, 18 Jun 2026 19:30:53 +0800 Subject: [PATCH 1/6] feat(qwen3-dflash): add draft-only batch scheduler --- Cargo.lock | 17 + Cargo.toml | 2 + docs/index.md | 1 + docs/models/qwen3/dflash.md | 434 +++++++++ openinfer-core/src/ops.rs | 3 +- .../csrc/shared/paged_attention.cu | 152 +++ openinfer-kernels/src/ffi/shared.rs | 36 + openinfer-kernels/src/ops.rs | 5 + openinfer-kernels/src/ops/dense_attention.rs | 204 ++++ openinfer-qwen3-4b-dflash/Cargo.toml | 33 + .../src/batch_buffers.rs | 145 +++ .../src/batch_forward.rs | 407 ++++++++ .../src/bin/qwen3_dflash_batch_bench.rs | 227 +++++ .../src/bin/qwen3_dflash_forward_bench.rs | 301 ++++++ .../src/bin/qwen3_dflash_forward_fixture.rs | 155 +++ openinfer-qwen3-4b-dflash/src/config.rs | 143 +++ openinfer-qwen3-4b-dflash/src/executor.rs | 640 +++++++++++++ openinfer-qwen3-4b-dflash/src/forward.rs | 886 ++++++++++++++++++ openinfer-qwen3-4b-dflash/src/lib.rs | 19 + openinfer-qwen3-4b-dflash/src/scheduler.rs | 422 +++++++++ openinfer-qwen3-4b-dflash/src/weights.rs | 274 ++++++ .../tests/hf_golden_gate.rs | 701 ++++++++++++++ .../qwen3-4b-dflash-hf-golden.safetensors | Bin 0 -> 82540 bytes .../accuracy/bench_qwen3_4b_dflash_forward.py | 169 ++++ ...pare_qwen3_4b_dflash_drafter_generation.py | 466 +++++++++ .../dump_qwen3_4b_dflash_hf_golden.py | 98 ++ 26 files changed, 5939 insertions(+), 1 deletion(-) create mode 100644 docs/models/qwen3/dflash.md create mode 100644 openinfer-kernels/src/ops/dense_attention.rs create mode 100644 openinfer-qwen3-4b-dflash/Cargo.toml create mode 100644 openinfer-qwen3-4b-dflash/src/batch_buffers.rs create mode 100644 openinfer-qwen3-4b-dflash/src/batch_forward.rs create mode 100644 openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_batch_bench.rs create mode 100644 openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs create mode 100644 
openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs create mode 100644 openinfer-qwen3-4b-dflash/src/config.rs create mode 100644 openinfer-qwen3-4b-dflash/src/executor.rs create mode 100644 openinfer-qwen3-4b-dflash/src/forward.rs create mode 100644 openinfer-qwen3-4b-dflash/src/lib.rs create mode 100644 openinfer-qwen3-4b-dflash/src/scheduler.rs create mode 100644 openinfer-qwen3-4b-dflash/src/weights.rs create mode 100644 openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs create mode 100644 test_data/qwen3-4b-dflash-hf-golden.safetensors create mode 100644 tools/accuracy/bench_qwen3_4b_dflash_forward.py create mode 100644 tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py create mode 100644 tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py diff --git a/Cargo.lock b/Cargo.lock index 6a41fdaf..d1433b01 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3893,6 +3893,23 @@ dependencies = [ "vllm-text", ] +[[package]] +name = "openinfer-qwen3-4b-dflash" +version = "0.1.0" +dependencies = [ + "anyhow", + "crossbeam-channel", + "cudarc", + "half", + "log", + "memmap2", + "openinfer-core", + "openinfer-kernels", + "safetensors", + "serde", + "serde_json", +] + [[package]] name = "openinfer-qwen35-4b" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 33876140..31d52f0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ members = [ "openinfer-deepseek-v2-lite", "openinfer-kimi-k2", "openinfer-qwen3-4b", + "openinfer-qwen3-4b-dflash", "openinfer-qwen35-4b", "openinfer-kv-cache", "openinfer-kv-offload", @@ -128,6 +129,7 @@ openinfer-engine = { path = "openinfer-engine" } openinfer-kernels = { path = "openinfer-kernels" } openinfer-kimi-k2 = { path = "openinfer-kimi-k2" } openinfer-qwen3-4b = { path = "openinfer-qwen3-4b" } +openinfer-qwen3-4b-dflash = { path = "openinfer-qwen3-4b-dflash" } openinfer-qwen35-4b = { path = "openinfer-qwen35-4b" } openinfer-deepseek-v2-lite = { path = "openinfer-deepseek-v2-lite" } openinfer-vllm-frontend = { 
path = "openinfer-vllm-frontend" } diff --git a/docs/index.md b/docs/index.md index a013b4fb..0c92240c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,6 +27,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | `models/qwen3/serving-perf-5090.md` | Qwen3-4B vs vLLM 0.22.1 tuning record: beats vLLM at **every measured QPS point** after unified-step attention fusion (decode rows as qo_len=1 prefill-plan entries + cta_tile_q dispatch fix). Also: batched step tail (#345), chunked prefill (default 1024), **cuBLAS 12.9 N=1025 cliff (build with CUDA ≥ 13)**, cublasLt per-shape tuning (buckets 8/16 restored), split-KV ≤bs32, two-stage argmax. | | `models/qwen3/roadmap.md` | Qwen3-4B roadmap (2026-06 review): line is the maturity bar; #220 RoPE OOB now fixed (sized cache + admission guard + kernel trap, gated by reject + in-window ITs); open set is per-row batch sampling, zero TP coverage, zero-adapter-only LoRA gate, dropped prefix-cache observability, stale docs, YaRN #8 follow-up. Sequenced Now/Next/Later + cleanup ledger. | | `models/qwen3/model-crate.md` | `openinfer-qwen3-4b` owns Qwen3 config/weights/executor/scheduler/tests/kernel plan; root sees generic `EngineHandle`; split-K retuned to `256/64`, with 4k/64 serving TPOT p50 at `6.46ms` on RTX 5090. | +| `models/qwen3/dflash.md` | `openinfer-qwen3-4b-dflash` supports only `z-lab/Qwen3-4B-DFlash-b16`: standalone model config/weights/forward plus transformers remote-code parity, with no generic DFlash framework or Qwen3 server/controller changes in this task. | | `models/qwen3/prefix-cache.md` | Prefix caching on by default for Qwen3-4B: full-block kvbm radix matching at the executor, suffix-only prefill. Repeated ~1900-token prompt TTFT 141.8 → 16.3ms p50 (8.7×); warm TTFT ≈ TPOT + ~5ms setup. Includes the RoPE scalar-path corruption fix and the drain-the-stream TTFT measurement pitfall. 
| | `models/qwen3/accuracy-gate.md` | Qwen3-4B instance of the logits golden gate (`tests/hf_golden_gate.rs`): 48 teacher-forced sequences / 816 positions vs a stored HF bf16 golden, replayed over bs=1 / batched eager / CUDA-graph. Strict guards: regret check + mean ≤ 0.06 + p99 ≤ 0.20; absolute max printed but not asserted (coverage-unstable). Methodology in `subsystems/correctness/`. | | `models/qwen3/kernels-crate.md` | Phase 1 split implemented and 5090-verified: Qwen3-4B kernel surface lives in `openinfer-kernels`; release build, test-target compile, accuracy gate, and bench snapshot pass. | diff --git a/docs/models/qwen3/dflash.md b/docs/models/qwen3/dflash.md new file mode 100644 index 00000000..493f5c21 --- /dev/null +++ b/docs/models/qwen3/dflash.md @@ -0,0 +1,434 @@ +# Qwen3-4B-DFlash model + +**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope. + +Last touched: 2026-06 + +## Boundary + +This task is model-specific. 
The boundary is: + +| Crate | Owns | +| --- | --- | +| `openinfer-qwen3-4b-dflash` | `Qwen3-4B-DFlash-b16` config, weights, draft forward, draft-only batch executor/scheduler, model-specific kernels/wrappers, and transformers parity tests | +| `openinfer-qwen3-4b` | Unchanged existing Qwen3 target serving, scheduler, KV, LoRA/offload/TP policy, and HF logits gate | + +Out of scope for this task: generic speculative decoding, a generic DFlash abstraction, OpenAI/server flags, LoRA/TP/KV-offload interactions, target verification, acceptance-length calculation, fallback token selection, and target hidden extraction from Qwen3. + +## Reference Model + +The authoritative reference is the Hugging Face repo `z-lab/Qwen3-4B-DFlash-b16`, not an inferred architecture from the target Qwen3 crate. The model card uses: + +```python +transformers==4.57.3 +AutoModel.from_pretrained(..., trust_remote_code=True) +draft.spec_generate(target, input_ids, ...) +``` + +The local checkpoint at `/home/hezhaozhao/models/Qwen3-4B-DFlash-b16` contains the same remote-code shape: + +| Field | Value | +| --- | --- | +| `architectures` | `DFlashDraftModel` | +| draft layers | `5` | +| target layers | `36` | +| hidden size | `2560` | +| intermediate size | `9728` | +| attention heads / KV heads | `32 / 8` | +| head dim | `128` | +| block size | `16` | +| mask token | `151669` | +| target hidden layers | `[1, 9, 17, 25, 33]` | +| vocab size | `151936` | + +Checkpoint keys are unprefixed relative to a target `model.` namespace: `layers.*`, `fc.weight`, `hidden_norm.weight`, and `norm.weight`. `fc.weight` is `[2560, 12800]`, i.e. one hidden-sized projection from five concatenated target hidden states. + +## Draft Forward + +The draft forward is not target Qwen3 attention with a different checkpoint. Its attention is dense and non-causal: + +1. `target_hidden = hidden_norm(fc(concat(selected target hidden states)))` +2. `hidden_states = noise_embedding` +3. 
for each of the five draft layers: + - RMSNorm `hidden_states` + - Q comes from normalized noise hidden + - K/V come from `cat(target_hidden, hidden_states)` + - Q/K get Qwen3 head RMSNorm and RoPE + - attention is non-causal over the whole `target_hidden + noise_hidden` span + - residual add + - post-attention RMSNorm + Qwen3 MLP + residual add +4. final `norm(hidden_states)` + +The crate should expose draft-model primitives, not speculative serving: + +```rust +pub struct DFlashDraftModel { ... } + +impl DFlashDraftModel { + pub fn load(model_path: &Path, device_ordinal: usize) -> anyhow::Result<Self>; + pub fn config(&self) -> &DFlashConfig; + pub fn target_layer_ids(&self) -> &[usize]; + pub fn forward( + &self, + noise_embedding: &HiddenStates, + selected_target_hidden: &DFlashTargetHidden, + position_ids: &[i32], + ) -> anyhow::Result<HiddenStates>; +} +``` + +The first version takes already-selected target hidden states as input and returns the final draft hidden states. Extracting those hidden states from `openinfer-qwen3-4b`, target verification, acceptance length calculation, and KV cropping are not part of this model implementation. + +## Draft-Only Batch Runner + +The batch path is intentionally internal. It is not an OpenAI-compatible text +generation surface because the DFlash draft model does not consume prompt token +ids and does not own a language-model head. Callers must provide device +`HiddenStates` for: + +| Input | Shape | +| --- | --- | +| `noise_embedding` | `[q_len, hidden_size]` | +| `target_hidden` | `[ctx_len, target_layer_count * hidden_size]` | +| `position_ids` | `ctx_len + q_len` host positions | + +The runner groups only exact-shape requests. The batch key is +`(q_len, ctx_len, past_len, cache_mode)`. `NoCache` requests use the real +batched path: compact D2D input staging, batched FC/context projection, batched +per-layer Q/K/V and MLP GEMMs, and FlashInfer +`BatchPrefillWithRaggedKVCache` in non-causal mode for attention. 
`DraftCache` +requests keep the same `DFlashDraftCache` lifecycle and are executed serially +inside the GPU owner thread in this step; cross-request draft-cache batching +needs a compact past-K/V layout and should be added with the target +verification loop. + +The public Rust surface is crate-local serving infrastructure, not server API: + +```rust +pub struct DFlashDraftHostRequest { ... } +pub struct DFlashDraftHostResponse { ... } +pub struct DFlashExecutor { ... } +pub struct DFlashSchedulerHandle { ... } +``` + +`DFlashSchedulerHandle` is a single-thread GPU owner with FCFS exact-shape +batching, a small `max_wait` coalescing window, and `max_total_tokens` +admission over `(ctx_len + q_len + past_len)` for each candidate batch. Its +public `submit` boundary uses host bf16 buffers and returns host bf16 output so +CUDA device tensors do not cross thread/context ownership boundaries. It also +owns per-request draft cache state through `reset_cache`, `crop_cache`, and +`cache_seq_len`, and these calls now error on unknown request ids instead of +silently treating them as empty state; `NoCache` requests use the real batched path, while host +`DraftCache` requests run serially until compact past-K/V batching lands. The +executor also exposes a borrowed compact batch view for same-thread controller +experiments. + +## Draft Cache + +Do not maintain separate public cache concepts for this crate. 
The reference +Python uses one `past_key_values_draft = DynamicCache()` in `spec_generate`, +then calls the drafter with: + +```python +position_ids=position_ids[:, past_key_values_draft.get_seq_length(): start + block_size] +past_key_values=past_key_values_draft +use_cache=True +past_key_values_draft.crop(start) +``` + +OpenInfer mirrors that boundary with one `DFlashDraftCache`: + +| State | Meaning | +| --- | --- | +| `prepare_step_context(...)` | Projects the current selected target hidden states and prepares per-layer context `K/V`; this replaces the old standalone `prepare_context_cache(...)` wording. | +| `forward_with_draft_cache(...)` | Runs one draft block, appends step context `K/V` and noise-token `K/V` to each layer's draft past state, and advances `seq_len`. | +| `crop(seq_len)` / `reset()` | Matches the reference `DynamicCache.crop(start)` lifecycle after target verification decides how far the draft state remains valid. | + +The first-step cached path is numerically identical to the standalone HF +remote-code forward because there is no existing past yet. Cross-step cached +parity must be validated only after the target verification/controller is added; +without the target loop, a second cached draft step is not the same numerical +problem as the old no-draft-cache substitution probe. + +## Correctness Gate + +The accuracy bar is transformers parity. 
For the draft crate that means: + +| Gate | Purpose | +| --- | --- | +| config/loader shape test | Reject wrong checkpoint layout early: `target_layer_ids`, `block_size`, `mask_token_id`, `fc.weight`, layer count, and attention/MLP shapes | +| draft-forward smoke | Load `/home/hezhaozhao/models/Qwen3-4B-DFlash-b16`, run a tiny GPU block with synthetic `noise_embedding`, selected target hidden states, and position ids, and catch shape/kernel failures | +| transformers forward parity | Compare the standalone draft forward against the HF remote-code model for fixed synthetic `noise_embedding`, selected target hidden states, and position ids | +| batch-vs-single parity | Compare two exact-shape batched rows against the bs1 forward output under the same DFlash tolerance | +| executor smoke | Submit request-tagged exact-shape `NoCache` requests and assert output shape/request ids | +| scheduler cache smoke | Submit host `DraftCache` request, then assert scheduler-owned `cache_seq_len`, `crop_cache`, and `reset_cache` behavior; also checks control messages preserve FIFO ordering behind pending submits | +| drafter generation parity | Run a greedy bs1 transformers target loop twice, once with the HF drafter and once with the OpenInfer drafter, then compare generated token ids/text and acceptance lengths | + +Do not use `Qwen3-4B-Instruct-2507` as a correctness baseline for this model. The checkpoint is documented for `Qwen/Qwen3-4B`, but this task's gate is the DFlash draft model's own transformers forward, not target acceptance rate. + +## Kernel Notes + +Existing Qwen3 target attention is causal/paged and does not match `Qwen3-4B-DFlash-b16` draft attention. The draft kernel path should follow vLLM/FlashAttention semantics where possible: Q/K/V in head-major logical shape, GQA expansion by `q_head / (num_q_heads / num_kv_heads)`, RoPE on Q and K, softmax over all context+draft keys, and no causal mask. 
+ +The reference implementation to mirror is vLLM's attention stack, especially `vllm.v1.attention.backends.flash_attn.FlashAttentionBackend` and `vllm.v1.attention.backends.flashinfer.FlashInferBackend`: both explicitly support `supports_non_causal()`, and their prefill/decode planners expose the causal flag and varlen context shape that DFlash needs. + +The batch runner uses FlashInfer `BatchPrefillWithRaggedKVCache` with +`MaskMode::kNone` for compact non-causal attention. That keeps the DFlash batch +path close to vLLM's varlen/non-causal attention semantics instead of looping +over single-request prefill. + +## Accuracy Scripts + +The DFlash scripts intentionally mirror the rest of the repository: + +| Script | Output | Use | +| --- | --- | --- | +| `tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py` | `test_data/qwen3-4b-dflash-hf-golden.safetensors` | Offline transformers remote-code forward oracle for the Rust gate | +| `openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs` | test pass/fail plus delta distribution | Release Rust gate that replays the stored oracle without Python | +| `tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py` | `target/accuracy/qwen3-dflash/drafter-generation.json` | End-to-end drafter-substitution evidence: same transformers target loop, HF drafter vs OpenInfer drafter | +| `tools/accuracy/bench_qwen3_4b_dflash_forward.py` + `qwen3_dflash_forward_bench` | `target/benchmarks/qwen3-dflash/forward.json` | Standalone forward latency comparison: transformers remote-code vs OpenInfer forward on the same synthetic fixture | +| `qwen3_dflash_batch_bench` | stdout JSON / redirected benchmark artifact | Draft-only batch sweep over bs `1,2,4,8,16,32`, reporting req/s, draft tok/s, and latency percentiles | +| `openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs` | safetensors with `openinfer_output` | Bridge used by the generation comparison script to call the Rust drafter from Python | + +The forward golden is generated 
by: + +```bash +.venv/bin/python tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py \ + --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ + --out test_data/qwen3-4b-dflash-hf-golden.safetensors +``` + +The Rust gate is: + +```bash +OPENINFER_DFLASH_TEST_MODEL_PATH=/home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ +cargo test --release -p openinfer-qwen3-4b-dflash --test hf_golden_gate -- --nocapture +``` + +The DFlash gate intentionally uses `OPENINFER_DFLASH_TEST_MODEL_PATH` rather +than the generic `OPENINFER_TEST_MODEL_PATH`, because the latter usually points +at the normal Qwen3 target checkpoint. The test also checks that +`config.json.architectures` contains `DFlashDraftModel` before running. + +The batch throughput probe is: + +```bash +cargo run --release -p openinfer-qwen3-4b-dflash --bin qwen3_dflash_batch_bench -- \ + --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ + --ctx-len 2 \ + --q-len 16 \ + --batch-sizes 1,2,4,8,16,32 \ + --warmup 5 \ + --iters 30 +``` + +Observed local batch runner sweep on the same WSL/CUDA `sm_120` setup, +`ctx_len=2`, `q_len=16`, warmup `5`, iters `30`: + +| Batch | mean ms | p50 ms | p90 ms | p99 ms | draft tok/s | req/s | +| ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| 1 | 2.175 | 2.175 | 2.230 | 2.308 | 7,358 | 460 | +| 2 | 2.488 | 2.446 | 2.607 | 2.947 | 12,859 | 804 | +| 4 | 3.790 | 3.794 | 3.928 | 4.014 | 16,886 | 1,055 | +| 8 | 4.651 | 4.571 | 5.184 | 5.419 | 27,518 | 1,720 | +| 16 | 7.260 | 7.223 | 7.582 | 8.302 | 35,264 | 2,204 | +| 32 | 13.221 | 13.080 | 14.237 | 15.073 | 38,725 | 2,420 | + +The current batch path improves draft-token throughput by `5.3x` from bs1 to +bs32 after moving the ragged attention plan into reusable batch buffers. This is +draft-model throughput only; it does not include target hidden production, +verification, acceptance, or fallback-token work. 
+ +On the local WSL setup used for the first run, the workspace-level vLLM git dependency and empty FlashInfer submodule required a narrower temporary workspace plus: + +```bash +LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib \ +OPENINFER_FLASHINFER_INCLUDE=/home/hezhaozhao/openinfer/.venv/lib/python3.12/site-packages/flashinfer/data/include \ +cargo test --release -p openinfer-qwen3-4b-dflash --test hf_golden_gate -- --nocapture +``` + +Observed result after the unified cache change: + +```text +dflash HF golden deltas: mean=0.034243, p99=0.125000, max=0.500000, n=7680 +dflash unified-cache one-shot HF golden deltas: mean=0.034243, p99=0.125000, max=0.500000, n=7680 +dflash draft-cache HF golden deltas: mean=0.034243, p99=0.125000, max=0.500000, n=7680 +test dflash_forward_matches_hf_remote_code ... ok +``` + +The drafter-substitution generation probe is: + +```bash +cargo build --release -p openinfer-qwen3-4b-dflash --bin qwen3_dflash_forward_fixture + +.venv/bin/python tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py \ + --target-model-path /path/to/Qwen3-4B \ + --draft-model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ + --openinfer-bin target/release/qwen3_dflash_forward_fixture \ + --out target/accuracy/qwen3-dflash/drafter-generation.json +``` + +The JSON report records each prompt's generated token ids/text, token/text hashes, +first mismatch if any, acceptance lengths, and optional OpenInfer-vs-HF draft +hidden deltas. It exits non-zero unless every case is `all_token_text_exact`. +This is the DFlash analogue of the DeepSeek-V2-Lite same-host generation +comparison, but scoped to the current standalone drafter boundary. 
+ +For performance, use the same synthetic fixture on both sides: + +```bash +cargo build --release -p openinfer-qwen3-4b-dflash --bin qwen3_dflash_forward_bench + +.venv/bin/python tools/accuracy/bench_qwen3_4b_dflash_forward.py \ + --draft-model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ + --openinfer-bin target/release/qwen3_dflash_forward_bench \ + --out target/benchmarks/qwen3-dflash/forward.json +``` + +The benchmark report includes transformers latency stats and OpenInfer latency +stats for the same bf16 fixture. It is a standalone draft-forward measurement, +not a full speculative-decoding throughput claim. + +Observed local benchmark on RTX 5070 Ti, WSL, CUDA `sm_120`, `ctx_len=2`, +`q_len=16`, warmup `5`, iters `30`, same generated bf16 fixture: + +| Engine | mean ms | p50 ms | p90 ms | p99 ms | +| --- | ---: | ---: | ---: | ---: | +| transformers remote-code | 4.294 | 3.612 | 5.067 | 15.360 | +| OpenInfer DFlash | 2.285 | 2.195 | 2.659 | 2.895 | + +OpenInfer is `1.65x` faster at p50 and `1.88x` faster by mean for this +standalone forward shape. The transformers p99 includes a single 15.36 ms tail +in this short run, so p99 should not be over-interpreted without a longer sweep. +The measured artifact is `target/benchmarks/qwen3-dflash/forward.json`. + +First optimization pass: `DFlashForwardScratch` reuses the forward buffer set +across repeated calls. The HF forward gate stayed identical: +`mean=0.034243`, `p99=0.125000`, `max=0.500000`, `n=7680`. The same forward +benchmark wrote `target/benchmarks/qwen3-dflash/forward-final.json`: + +| OpenInfer path | mean ms | p50 ms | p90 ms | p99 ms | +| --- | ---: | ---: | ---: | ---: | +| allocate buffers per forward | 2.285 | 2.195 | 2.659 | 2.895 | +| reuse `DFlashForwardScratch` | 2.125 | 2.035 | 2.410 | 2.936 | + +This pass improved OpenInfer p50 by `1.08x`. It is a necessary cleanup for the +future decode loop, but not enough by itself to prove DFlash value. 
+ +A follow-up attempt to move the cloned input hidden state into reusable scratch +was not kept: the current fused residual+RMSNorm op mutates the residual hidden +state in place, so separating input/output ping-pong buffers correctly requires +reworking that layer boundary rather than a local buffer-only patch. + +Second optimization pass: `DFlashForwardScratch` gained an explicit draft-side +target-hidden context K/V cache. `prepare_context_cache(...)` computes +`target_normed` plus each layer's context `K/V` and K norm+RoPE once; repeated +`forward_with_context_cache(...)` calls then only compute the noise-token K/V and +concat cached context with the current draft block. The HF gate now checks both +uncached and cached paths, and both stayed identical: +`mean=0.034243`, `p99=0.125000`, `max=0.500000`, `n=7680`. + +Cached benchmark artifact: `target/benchmarks/qwen3-dflash/forward-context-cache.json`. +The reported latency excludes the one-time `prepare_context_cache(...)` call, +matching the intended loop shape where context cache is updated explicitly when +target hidden changes. + +| OpenInfer path | mean ms | p50 ms | p90 ms | p99 ms | +| --- | ---: | ---: | ---: | ---: | +| allocate buffers per forward | 2.285 | 2.195 | 2.659 | 2.895 | +| reuse `DFlashForwardScratch` | 2.125 | 2.035 | 2.410 | 2.936 | +| reuse scratch + context K/V cache | 1.863 | 1.831 | 2.001 | 2.301 | + +The context cache improves p50 by `1.11x` over scratch-only and `1.20x` over the +initial implementation for this small `ctx_len=2`, `q_len=16` fixture. + +Third pass: the public cache shape was unified as `DFlashDraftCache`. The old +"context cache" is now just the step-context part of the same object, and the +cache also owns per-layer draft past K/V buffers plus `seq_len`, `crop`, and +`reset` state. 
The HF gate checks uncached, unified-cache one-shot, and first-step +draft-cache paths; all three retain the same delta distribution: +`mean=0.034243`, `p99=0.125000`, `max=0.500000`, `n=7680`. + +The cache internals now follow the `openinfer-kv-cache` separation more closely +without directly adopting its paged block manager: `DFlashDraftState` owns the +long-lived draft past K/V and sequence length, `DFlashStepContext` owns the +current target-hidden context K/V, and `ForwardBuffers` remains transient +scratch. The public object is still a single `DFlashDraftCache`, but a prepared +step is consumed by `forward_with_draft_cache(...)`; callers must prepare the +next step explicitly after `crop(start)`, mirroring the reference `DynamicCache` +lifecycle. + +The corresponding benchmark artifact is +`target/benchmarks/qwen3-dflash/forward-draft-cache.json`. This benchmark uses +the more honest `prepare_step_context + forward_with_draft_cache` timing inside +each measured iteration, so it should not be compared directly against the +previous context-cache number that excluded prepare time: + +| Engine/path | mean ms | p50 ms | p90 ms | p99 ms | +| --- | ---: | ---: | ---: | ---: | +| transformers remote-code | 5.564 | 4.429 | 9.078 | 18.713 | +| OpenInfer `DFlashDraftCache` first-step path | 2.311 | 2.209 | 2.479 | 3.519 | + +After the internal state/step/scratch refactor, the same benchmark wrote +`target/benchmarks/qwen3-dflash/forward-draft-cache-refactor.json` with no +accuracy change and no performance regression: + +| Engine/path | mean ms | p50 ms | p90 ms | p99 ms | +| --- | ---: | ---: | ---: | ---: | +| transformers remote-code | 4.242 | 3.861 | 5.616 | 6.922 | +| OpenInfer `DFlashDraftCache` refactor path | 2.228 | 2.155 | 2.454 | 2.541 | + +## Current Implementation + +The crate now exists as a standalone model implementation with config parsing, exact-key safetensor loading, a block draft forward, unified draft cache state, a tiny local GPU smoke test, and 
a HF remote-code golden gate. The attention path uses the existing Qwen3 Q/K RMSNorm+RoPE kernel and a FlashInfer single-prefill wrapper with `MaskMode::kNone`; context K currently reuses the Q/K kernel with a throwaway Q scratch buffer, so a future cleanup can split a K-only norm+RoPE helper without changing semantics. + +The local `.venv` uses `torch==2.9.0+cu129`, `transformers==4.57.3`, `safetensors`, `accelerate`, and `datasets` because the HF remote code imports `datasets` via `utils.py`. The generated fixture stores seed-pinned synthetic `noise_embedding`, selected `target_hidden`, `position_ids`, and HF final `output`; `openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs` replays those tensors through the Rust forward and compares deltas. + +An additional end-to-end generation probe used the same transformers target +model for verification and swapped only the drafter: + +| Prompt | Result | +| --- | --- | +| `Hello, my name is` | identical token ids/text; acceptance `[1, 2, 1, 2, 1, 1]` | +| `The capital of France is` | identical token ids/text; acceptance `[2, 1, 2, 2, 2]` | +| `Qwen is a language model that` | identical token ids/text; acceptance `[2, 2, 1, 1, 1, 1]` | +| `1, 1, 2, 3, 5,` | identical token ids/text; acceptance `[4, 1, 2, 2]` | + +The probe intentionally used a no-draft-cache loop on both sides because it +predates `DFlashDraftCache` and because `openinfer-qwen3-4b-dflash` still does +not own the target verification/controller. Within that older boundary, +OpenInfer DFlash produces the same greedy generation tokens as the transformers +DFlash drafter when the target/verification path is held fixed. The next +meaningful generation probe should use the real target loop and exercise +`DFlashDraftCache.crop(start)` after acceptance calculation. 
+ +## 2026-06-18 Batch Bench + +The current Codex runner needed an explicit runtime library path to see the WSL +CUDA driver: + +```bash +CUDA_VISIBLE_DEVICES=0 \ +LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib \ +OPENINFER_FLASHINFER_INCLUDE=/home/hezhaozhao/openinfer/.venv/lib/python3.12/site-packages/flashinfer/data/include \ +cargo run --release -p openinfer-qwen3-4b-dflash --bin qwen3_dflash_batch_bench -- \ + --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ + --ctx-len 2 \ + --q-len 16 \ + --batch-sizes 1,2,4,8 \ + --warmup 2 \ + --iters 5 +``` + +Observed result on the RTX 5070 Ti host: + +| Batch | mean ms | draft tok/s | req/s | +| ---: | ---: | ---: | ---: | +| 1 | 2.052 | 7,796 | 487 | +| 2 | 2.303 | 13,893 | 868 | +| 4 | 3.532 | 18,121 | 1,133 | +| 8 | 4.364 | 29,333 | 1,833 | + +This confirms the draft-only batch path still scales after the fail-closed +cache fix. It is draft throughput only; it does not include target hidden +production, verification, acceptance, or fallback-token work. 
diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs index efb544cb..df729753 100644 --- a/openinfer-core/src/ops.rs +++ b/openinfer-core/src/ops.rs @@ -23,7 +23,8 @@ pub use openinfer_kernels::ops::{ qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into, rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place, scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into, - scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, write_vec_into, + scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, RaggedPrefillPlan, + batch_prefill_ragged_nhd_noncausal_into, single_prefill_nhd_noncausal_into, write_vec_into, }; #[cfg(not(feature = "kernel-call-trace"))] pub use openinfer_kernels::ops::{ diff --git a/openinfer-kernels/csrc/shared/paged_attention.cu b/openinfer-kernels/csrc/shared/paged_attention.cu index 4506a60f..f21181ed 100644 --- a/openinfer-kernels/csrc/shared/paged_attention.cu +++ b/openinfer-kernels/csrc/shared/paged_attention.cu @@ -22,6 +22,7 @@ using namespace flashinfer; using DType = __nv_bfloat16; using IdType = int32_t; using ParamsT = BatchDecodeParams; +using BatchPrefillRaggedParamsT = BatchPrefillRaggedParams; using Variant = DefaultAttention(stream))); } +// --------------------------------------------------------------------------- +// Single-request non-causal prefill over contiguous NHD K/V. +// +// DFlash draft attention materializes K/V as token-major HiddenStates: +// q: [q_len, num_qo_heads, head_dim] +// k/v: [kv_len, num_kv_heads, head_dim] +// This wrapper mirrors vLLM's non-causal FlashAttention/FlashInfer semantics: +// no causal mask, no sliding window, and GQA handled by FlashInfer. 
+// ---------------------------------------------------------------------------
+int single_prefill_nhd_noncausal_cuda(
+    void* q,
+    void* output,
+    void* k,
+    void* v,
+    int32_t num_qo_heads,
+    int32_t num_kv_heads,
+    int32_t head_dim,
+    int32_t q_len,
+    int32_t kv_len,
+    float sm_scale,
+    void* stream)
+{
+    // Returns the FlashInfer dispatch status as int; 0 means success and any
+    // non-zero value is a failure.
+    //
+    // The dispatch below is compiled for HEAD_DIM_QK == HEAD_DIM_VO == 128
+    // only. Reject any other head_dim, non-positive sizes (they would wrap
+    // when cast to uint32_t) and head counts that do not form whole GQA
+    // groups before anything is launched.
+    constexpr int32_t kCompiledHeadDim = 128;
+    if (head_dim != kCompiledHeadDim || num_qo_heads <= 0 || num_kv_heads <= 0 ||
+        num_qo_heads % num_kv_heads != 0 || q_len <= 0 || kv_len <= 0) {
+        return static_cast<int>(cudaErrorInvalidValue);
+    }
+
+    // Token-major (NHD) layout: one token is num_heads * head_dim elements.
+    uint32_t q_stride_n = num_qo_heads * head_dim;
+    uint32_t q_stride_h = head_dim;
+    uint32_t kv_stride_n = num_kv_heads * head_dim;
+    uint32_t kv_stride_h = head_dim;
+
+    PrefillParamsT params(
+        reinterpret_cast<DType*>(q),
+        reinterpret_cast<DType*>(k),
+        reinterpret_cast<DType*>(v),
+        /*maybe_custom_mask=*/nullptr,
+        reinterpret_cast<DType*>(output),
+        /*lse=*/nullptr,
+        /*maybe_alibi_slopes=*/nullptr,
+        num_qo_heads,
+        num_kv_heads,
+        static_cast<uint32_t>(q_len),
+        static_cast<uint32_t>(kv_len),
+        q_stride_n,
+        q_stride_h,
+        kv_stride_n,
+        kv_stride_h,
+        static_cast<uint32_t>(head_dim),
+        /*window_left=*/-1,
+        /*logits_soft_cap=*/0.0f,
+        sm_scale,
+        /*rope_scale=*/1.0f,
+        /*rope_theta=*/1e6f);
+
+    // PosEncodingMode::kNone: no positional encoding is applied inside the
+    // attention kernel. MaskMode::kNone: every query attends to every key.
+    return static_cast<int>(
+        SinglePrefillWithKVCacheDispatched<
+            /*HEAD_DIM_QK=*/128,
+            /*HEAD_DIM_VO=*/128,
+            PosEncodingMode::kNone,
+            /*USE_FP16_QK_REDUCTION=*/false,
+            MaskMode::kNone,
+            Variant,
+            PrefillParamsT>(
+            params,
+            /*tmp=*/nullptr,
+            reinterpret_cast<cudaStream_t>(stream)));
+}
+
+// ---------------------------------------------------------------------------
+// Batched non-causal prefill over compact ragged NHD K/V.
+//
+// DFlash groups exact-shape draft requests into compact token-major tensors:
+//   q:   [sum(q_len), num_qo_heads, head_dim]
+//   k/v: [sum(kv_len), num_kv_heads, head_dim]
+// with q_indptr/kv_indptr separating requests. This maps directly to
+// FlashInfer BatchPrefillWithRaggedKVCache with MaskMode::kNone.
+// --------------------------------------------------------------------------- +int batch_prefill_ragged_nhd_noncausal_cuda( + void* q, + void* output, + void* k, + void* v, + int32_t* q_indptr, + int32_t* kv_indptr, + int32_t* request_indices, + int32_t* qo_tile_indices, + int32_t* kv_tile_indices, + int32_t* kv_chunk_size_ptr, + uint32_t* total_num_rows, + int32_t num_qo_heads, + int32_t num_kv_heads, + int32_t head_dim, + int32_t total_q_len, + int32_t batch_size, + int32_t padded_batch_size, + float sm_scale, + void* stream) +{ + uint32_t q_stride_n = num_qo_heads * head_dim; + uint32_t q_stride_h = head_dim; + uint32_t kv_stride_n = num_kv_heads * head_dim; + uint32_t kv_stride_h = head_dim; + + BatchPrefillRaggedParamsT params( + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + /*maybe_custom_mask=*/nullptr, + q_indptr, + kv_indptr, + /*maybe_mask_indptr=*/nullptr, + /*maybe_q_rope_offset=*/nullptr, + /*maybe_k_rope_offset=*/nullptr, + reinterpret_cast(output), + /*lse=*/nullptr, + /*maybe_alibi_slopes=*/nullptr, + num_qo_heads, + num_kv_heads, + q_stride_n, + q_stride_h, + kv_stride_n, + kv_stride_h, + /*window_left=*/-1, + /*logits_soft_cap=*/0.0f, + sm_scale, + /*rope_scale=*/1.0f, + /*rope_theta=*/1e6f); + + params.request_indices = request_indices; + params.qo_tile_indices = qo_tile_indices; + params.kv_tile_indices = kv_tile_indices; + params.o_indptr = q_indptr; + params.kv_chunk_size_ptr = kv_chunk_size_ptr; + params.total_num_rows = total_num_rows; + params.max_total_num_rows = static_cast(total_q_len); + params.padded_batch_size = static_cast(padded_batch_size); + params.partition_kv = false; + + return static_cast( + BatchPrefillWithRaggedKVCacheDispatched< + /*CTA_TILE_Q=*/16, + /*HEAD_DIM_QK=*/128, + /*HEAD_DIM_VO=*/128, + PosEncodingMode::kNone, + /*USE_FP16_QK_REDUCTION=*/false, + MaskMode::kNone, + Variant, + BatchPrefillRaggedParamsT>( + params, + /*tmp_v=*/nullptr, + /*tmp_s=*/nullptr, + /*enable_pdl=*/false, + 
reinterpret_cast(stream))); +} + // --------------------------------------------------------------------------- // Single-request prefill for HEAD_DIM=256 — wraps FlashInfer SinglePrefillWithKVCache. // diff --git a/openinfer-kernels/src/ffi/shared.rs b/openinfer-kernels/src/ffi/shared.rs index 46fcba96..4f7f554c 100644 --- a/openinfer-kernels/src/ffi/shared.rs +++ b/openinfer-kernels/src/ffi/shared.rs @@ -478,6 +478,42 @@ unsafe extern "C" { stream: CUstream, ) -> i32; + pub fn single_prefill_nhd_noncausal_cuda( + q: *const Half, + output: *mut Half, + k: *const Half, + v: *const Half, + num_qo_heads: i32, + num_kv_heads: i32, + head_dim: i32, + q_len: i32, + kv_len: i32, + sm_scale: f32, + stream: CUstream, + ) -> i32; + + pub fn batch_prefill_ragged_nhd_noncausal_cuda( + q: *const Half, + output: *mut Half, + k: *const Half, + v: *const Half, + q_indptr: *const i32, + kv_indptr: *const i32, + request_indices: *const i32, + qo_tile_indices: *const i32, + kv_tile_indices: *const i32, + kv_chunk_size_ptr: *const i32, + total_num_rows: *const u32, + num_qo_heads: i32, + num_kv_heads: i32, + head_dim: i32, + total_q_len: i32, + batch_size: i32, + padded_batch_size: i32, + sm_scale: f32, + stream: CUstream, + ) -> i32; + pub fn repeat_f32_for_reduce_scatter_cuda( local: *const f32, repeated: *mut f32, diff --git a/openinfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs index 983eb817..bb3d8778 100644 --- a/openinfer-kernels/src/ops.rs +++ b/openinfer-kernels/src/ops.rs @@ -3,6 +3,7 @@ mod attention; #[cfg(feature = "kimi-k2")] mod deepep; +mod dense_attention; mod elementwise; mod embedding; #[cfg(feature = "kimi-k2")] @@ -21,6 +22,10 @@ pub use attention::{ pub use deepep::{ DeepEp, DeepEpDispatchScratch, DeepEpPrefillCounts, deepep_info, deepep_unique_id, }; +pub use dense_attention::{ + RaggedPrefillPlan, batch_prefill_ragged_nhd_noncausal_into, + single_prefill_nhd_noncausal_into, +}; pub use elementwise::{ accumulate_bf16_token_scaled_to_f32_into, add_batch, 
add_batch_into, bf16_hidden_to_f32_into, extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into, diff --git a/openinfer-kernels/src/ops/dense_attention.rs b/openinfer-kernels/src/ops/dense_attention.rs new file mode 100644 index 00000000..bf438707 --- /dev/null +++ b/openinfer-kernels/src/ops/dense_attention.rs @@ -0,0 +1,204 @@ +use anyhow::Result; +use cudarc::driver::{CudaSlice, DevicePtr, DevicePtrMut}; + +use crate::ffi; +use crate::tensor::{DeviceContext, HiddenStates}; + +#[allow(clippy::too_many_arguments)] +pub fn single_prefill_nhd_noncausal_into( + ctx: &DeviceContext, + q: &HiddenStates, + k: &HiddenStates, + v: &HiddenStates, + out: &mut HiddenStates, + num_qo_heads: usize, + num_kv_heads: usize, + head_dim: usize, +) -> Result<()> { + let q_dim = num_qo_heads * head_dim; + let kv_dim = num_kv_heads * head_dim; + assert_eq!(q.hidden_dim, q_dim); + assert_eq!(k.hidden_dim, kv_dim); + assert_eq!(v.hidden_dim, kv_dim); + assert_eq!(v.seq_len, k.seq_len); + assert_eq!(out.hidden_dim, q_dim); + assert_eq!(out.seq_len, q.seq_len); + assert_eq!( + head_dim, 128, + "FlashInfer wrapper is instantiated for head_dim=128" + ); + + let (q_ptr, _gq) = q.data.device_ptr(&ctx.stream); + let (k_ptr, _gk) = k.data.device_ptr(&ctx.stream); + let (v_ptr, _gv) = v.data.device_ptr(&ctx.stream); + let (out_ptr, _go) = out.data.device_ptr_mut(&ctx.stream); + let sm_scale = 1.0f32 / (head_dim as f32).sqrt(); + let status = unsafe { + ffi::single_prefill_nhd_noncausal_cuda( + q_ptr as *const ffi::Half, + out_ptr as *mut ffi::Half, + k_ptr as *const ffi::Half, + v_ptr as *const ffi::Half, + num_qo_heads as i32, + num_kv_heads as i32, + head_dim as i32, + q.seq_len as i32, + k.seq_len as i32, + sm_scale, + ctx.stream.cu_stream(), + ) + }; + if status != 0 { + anyhow::bail!( + "single_prefill_nhd_noncausal_cuda failed: status={}, q_len={}, kv_len={}, q_heads={}, kv_heads={}, head_dim={}", + status, + q.seq_len, + k.seq_len, + num_qo_heads, + 
num_kv_heads, + head_dim + ); + } + Ok(()) +} + +pub struct RaggedPrefillPlan { + q_indptr: CudaSlice, + kv_indptr: CudaSlice, + request_indices: CudaSlice, + qo_tile_indices: CudaSlice, + kv_tile_indices: CudaSlice, + kv_chunk_size: CudaSlice, + total_num_rows: CudaSlice, + batch_size: usize, + total_q_len: usize, +} + +impl RaggedPrefillPlan { + pub fn new( + ctx: &DeviceContext, + q_lens: &[usize], + kv_lens: &[usize], + group_size: usize, + ) -> Result { + anyhow::ensure!(!q_lens.is_empty(), "ragged prefill batch is empty"); + anyhow::ensure!( + q_lens.len() == kv_lens.len(), + "q_lens len {} != kv_lens len {}", + q_lens.len(), + kv_lens.len() + ); + anyhow::ensure!(group_size > 0, "group_size must be positive"); + let mut q_indptr = Vec::with_capacity(q_lens.len() + 1); + let mut kv_indptr = Vec::with_capacity(kv_lens.len() + 1); + q_indptr.push(0i32); + kv_indptr.push(0i32); + for (&q_len, &kv_len) in q_lens.iter().zip(kv_lens.iter()) { + anyhow::ensure!(q_len > 0, "ragged prefill q_len must be positive"); + anyhow::ensure!(kv_len > 0, "ragged prefill kv_len must be positive"); + q_indptr.push(q_indptr.last().copied().unwrap() + q_len as i32); + kv_indptr.push(kv_indptr.last().copied().unwrap() + kv_len as i32); + } + let total_q_len = *q_indptr.last().unwrap() as usize; + let mut request_indices = Vec::new(); + let mut qo_tile_indices = Vec::new(); + let mut kv_tile_indices = Vec::new(); + const CTA_TILE_Q: usize = 16; + for (req_idx, &q_len) in q_lens.iter().enumerate() { + let packed_q_len = q_len * group_size; + let tiles = packed_q_len.div_ceil(CTA_TILE_Q); + for tile in 0..tiles { + request_indices.push(req_idx as i32); + qo_tile_indices.push(tile as i32); + kv_tile_indices.push(0i32); + } + } + let kv_chunk_size: Vec = kv_lens.iter().map(|&len| len as i32).collect(); + Ok(Self { + q_indptr: ctx.stream.clone_htod(&q_indptr)?, + kv_indptr: ctx.stream.clone_htod(&kv_indptr)?, + request_indices: ctx.stream.clone_htod(&request_indices)?, + qo_tile_indices: 
ctx.stream.clone_htod(&qo_tile_indices)?, + kv_tile_indices: ctx.stream.clone_htod(&kv_tile_indices)?, + kv_chunk_size: ctx.stream.clone_htod(&kv_chunk_size)?, + total_num_rows: ctx.stream.clone_htod(&[total_q_len as u32])?, + batch_size: q_lens.len(), + total_q_len, + }) + } +} + +#[allow(clippy::too_many_arguments)] +pub fn batch_prefill_ragged_nhd_noncausal_into( + ctx: &DeviceContext, + q: &HiddenStates, + k: &HiddenStates, + v: &HiddenStates, + out: &mut HiddenStates, + plan: &RaggedPrefillPlan, + num_qo_heads: usize, + num_kv_heads: usize, + head_dim: usize, +) -> Result<()> { + let q_dim = num_qo_heads * head_dim; + let kv_dim = num_kv_heads * head_dim; + assert_eq!(q.hidden_dim, q_dim); + assert_eq!(k.hidden_dim, kv_dim); + assert_eq!(v.hidden_dim, kv_dim); + assert_eq!(v.seq_len, k.seq_len); + assert_eq!(out.hidden_dim, q_dim); + assert_eq!(out.seq_len, q.seq_len); + assert_eq!(q.seq_len, plan.total_q_len); + assert_eq!( + head_dim, 128, + "FlashInfer ragged wrapper is instantiated for head_dim=128" + ); + + let (q_ptr, _gq) = q.data.device_ptr(&ctx.stream); + let (k_ptr, _gk) = k.data.device_ptr(&ctx.stream); + let (v_ptr, _gv) = v.data.device_ptr(&ctx.stream); + let (out_ptr, _go) = out.data.device_ptr_mut(&ctx.stream); + let (q_indptr, _) = plan.q_indptr.device_ptr(&ctx.stream); + let (kv_indptr, _) = plan.kv_indptr.device_ptr(&ctx.stream); + let (request_indices, _) = plan.request_indices.device_ptr(&ctx.stream); + let (qo_tile_indices, _) = plan.qo_tile_indices.device_ptr(&ctx.stream); + let (kv_tile_indices, _) = plan.kv_tile_indices.device_ptr(&ctx.stream); + let (kv_chunk_size, _) = plan.kv_chunk_size.device_ptr(&ctx.stream); + let (total_num_rows, _) = plan.total_num_rows.device_ptr(&ctx.stream); + let sm_scale = 1.0f32 / (head_dim as f32).sqrt(); + let status = unsafe { + ffi::batch_prefill_ragged_nhd_noncausal_cuda( + q_ptr as *const ffi::Half, + out_ptr as *mut ffi::Half, + k_ptr as *const ffi::Half, + v_ptr as *const ffi::Half, + q_indptr as 
*const i32, + kv_indptr as *const i32, + request_indices as *const i32, + qo_tile_indices as *const i32, + kv_tile_indices as *const i32, + kv_chunk_size as *const i32, + total_num_rows as *const u32, + num_qo_heads as i32, + num_kv_heads as i32, + head_dim as i32, + q.seq_len as i32, + plan.batch_size as i32, + plan.request_indices.len() as i32, + sm_scale, + ctx.stream.cu_stream(), + ) + }; + if status != 0 { + anyhow::bail!( + "batch_prefill_ragged_nhd_noncausal_cuda failed: status={}, total_q_len={}, batch_size={}, q_heads={}, kv_heads={}, head_dim={}", + status, + q.seq_len, + plan.batch_size, + num_qo_heads, + num_kv_heads, + head_dim + ); + } + Ok(()) +} diff --git a/openinfer-qwen3-4b-dflash/Cargo.toml b/openinfer-qwen3-4b-dflash/Cargo.toml new file mode 100644 index 00000000..47b11a25 --- /dev/null +++ b/openinfer-qwen3-4b-dflash/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "openinfer-qwen3-4b-dflash" +license = "Apache-2.0" +version = "0.1.0" +edition = "2024" + +[dependencies] +anyhow = { workspace = true } +crossbeam-channel = { workspace = true } +cudarc = { workspace = true } +half = { workspace = true } +log = { workspace = true } +memmap2 = { workspace = true } +openinfer-core = { workspace = true } +openinfer-kernels = { workspace = true } +safetensors = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } + +[[bin]] +name = "qwen3_dflash_forward_fixture" +path = "src/bin/qwen3_dflash_forward_fixture.rs" + +[[bin]] +name = "qwen3_dflash_forward_bench" +path = "src/bin/qwen3_dflash_forward_bench.rs" + +[[bin]] +name = "qwen3_dflash_batch_bench" +path = "src/bin/qwen3_dflash_batch_bench.rs" + +[lints] +workspace = true diff --git a/openinfer-qwen3-4b-dflash/src/batch_buffers.rs b/openinfer-qwen3-4b-dflash/src/batch_buffers.rs new file mode 100644 index 00000000..84739906 --- /dev/null +++ b/openinfer-qwen3-4b-dflash/src/batch_buffers.rs @@ -0,0 +1,145 @@ +use anyhow::Result; +use cudarc::driver::CudaSlice; +use 
openinfer_core::ops::RaggedPrefillPlan; +use openinfer_core::tensor::HiddenStates; + +use crate::weights::DFlashDraftModel; + +pub struct DFlashBatchBuffers { + pub(crate) max_batch_size: usize, + pub(crate) q_len: usize, + pub(crate) ctx_len: usize, + pub(crate) total_q_len: usize, + pub(crate) total_ctx_len: usize, + pub(crate) total_kv_len: usize, + pub(crate) noise: HiddenStates, + pub(crate) target_hidden: HiddenStates, + pub(crate) target_projected: HiddenStates, + pub(crate) target_normed: HiddenStates, + pub(crate) hidden: HiddenStates, + pub(crate) hidden_out: HiddenStates, + pub(crate) normed: HiddenStates, + pub(crate) q: HiddenStates, + pub(crate) q_ctx_scratch: HiddenStates, + pub(crate) k_ctx: HiddenStates, + pub(crate) k_noise: HiddenStates, + pub(crate) v_ctx: HiddenStates, + pub(crate) v_noise: HiddenStates, + pub(crate) k_all: HiddenStates, + pub(crate) v_all: HiddenStates, + pub(crate) attn_out: HiddenStates, + pub(crate) o_buf: HiddenStates, + pub(crate) gate_up: HiddenStates, + pub(crate) act_out: HiddenStates, + pub(crate) positions_q: CudaSlice, + pub(crate) positions_ctx: CudaSlice, + pub(crate) ragged_plan: Option, +} + +pub(crate) struct CachedRaggedPlan { + pub(crate) batch_size: usize, + pub(crate) plan: RaggedPrefillPlan, +} + +impl DFlashBatchBuffers { + pub(crate) fn new( + model: &DFlashDraftModel, + max_batch_size: usize, + q_len: usize, + ctx_len: usize, + ) -> Result { + anyhow::ensure!(max_batch_size > 0, "max_batch_size must be positive"); + anyhow::ensure!(q_len > 0, "q_len must be positive"); + anyhow::ensure!(ctx_len > 0, "ctx_len must be positive"); + let config = model.config(); + let ctx = model.device_context(); + let hidden = config.hidden_size; + let target_hidden_dim = config.hidden_size * config.target_layer_count(); + let q_dim = config.q_dim(); + let kv_dim = config.kv_dim(); + let total_q_len = max_batch_size * q_len; + let total_ctx_len = max_batch_size * ctx_len; + let total_kv_len = max_batch_size * (ctx_len + 
q_len); + Ok(Self { + max_batch_size, + q_len, + ctx_len, + total_q_len, + total_ctx_len, + total_kv_len, + noise: HiddenStates::zeros(ctx, hidden, total_q_len)?, + target_hidden: HiddenStates::zeros(ctx, target_hidden_dim, total_ctx_len)?, + target_projected: HiddenStates::zeros(ctx, hidden, total_ctx_len)?, + target_normed: HiddenStates::zeros(ctx, hidden, total_ctx_len)?, + hidden: HiddenStates::zeros(ctx, hidden, total_q_len)?, + hidden_out: HiddenStates::zeros(ctx, hidden, total_q_len)?, + normed: HiddenStates::zeros(ctx, hidden, total_q_len)?, + q: HiddenStates::zeros(ctx, q_dim, total_q_len)?, + q_ctx_scratch: HiddenStates::zeros(ctx, q_dim, total_ctx_len)?, + k_ctx: HiddenStates::zeros(ctx, kv_dim, total_ctx_len)?, + k_noise: HiddenStates::zeros(ctx, kv_dim, total_q_len)?, + v_ctx: HiddenStates::zeros(ctx, kv_dim, total_ctx_len)?, + v_noise: HiddenStates::zeros(ctx, kv_dim, total_q_len)?, + k_all: HiddenStates::zeros(ctx, kv_dim, total_kv_len)?, + v_all: HiddenStates::zeros(ctx, kv_dim, total_kv_len)?, + attn_out: HiddenStates::zeros(ctx, q_dim, total_q_len)?, + o_buf: HiddenStates::zeros(ctx, hidden, total_q_len)?, + gate_up: HiddenStates::zeros(ctx, 2 * config.intermediate_size, total_q_len)?, + act_out: HiddenStates::zeros(ctx, config.intermediate_size, total_q_len)?, + positions_q: ctx.stream.alloc_zeros(total_q_len)?, + positions_ctx: ctx.stream.alloc_zeros(total_ctx_len)?, + ragged_plan: None, + }) + } + + pub(crate) fn set_active_batch(&mut self, batch_size: usize) { + debug_assert!(batch_size <= self.max_batch_size); + self.total_q_len = batch_size * self.q_len; + self.total_ctx_len = batch_size * self.ctx_len; + self.total_kv_len = batch_size * (self.ctx_len + self.q_len); + self.noise.seq_len = self.total_q_len; + self.target_hidden.seq_len = self.total_ctx_len; + self.target_projected.seq_len = self.total_ctx_len; + self.target_normed.seq_len = self.total_ctx_len; + self.hidden.seq_len = self.total_q_len; + self.hidden_out.seq_len = 
self.total_q_len; + self.normed.seq_len = self.total_q_len; + self.q.seq_len = self.total_q_len; + self.q_ctx_scratch.seq_len = self.total_ctx_len; + self.k_ctx.seq_len = self.total_ctx_len; + self.k_noise.seq_len = self.total_q_len; + self.v_ctx.seq_len = self.total_ctx_len; + self.v_noise.seq_len = self.total_q_len; + self.k_all.seq_len = self.total_kv_len; + self.v_all.seq_len = self.total_kv_len; + self.attn_out.seq_len = self.total_q_len; + self.o_buf.seq_len = self.total_q_len; + self.gate_up.seq_len = self.total_q_len; + self.act_out.seq_len = self.total_q_len; + } + + pub(crate) fn prepare_ragged_plan( + &mut self, + model: &DFlashDraftModel, + batch_size: usize, + ) -> Result<()> { + let needs_rebuild = self + .ragged_plan + .as_ref() + .map(|cached| cached.batch_size != batch_size) + .unwrap_or(true); + if needs_rebuild { + let config = model.config(); + let q_lens = vec![self.q_len; batch_size]; + let kv_lens = vec![self.ctx_len + self.q_len; batch_size]; + let plan = RaggedPrefillPlan::new( + model.device_context(), + &q_lens, + &kv_lens, + config.num_attention_heads / config.num_key_value_heads, + )?; + self.ragged_plan = Some(CachedRaggedPlan { batch_size, plan }); + } + Ok(()) + } +} diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs new file mode 100644 index 00000000..ba9ef95b --- /dev/null +++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs @@ -0,0 +1,407 @@ +use anyhow::Result; +use half::bf16; +use openinfer_core::ops; +use openinfer_core::tensor::{DeviceContext, HiddenStates}; + +use crate::batch_buffers::DFlashBatchBuffers; +use crate::forward::DFlashTargetHidden; +use crate::weights::{DFlashDraftModel, DFlashLayer}; + +pub struct DFlashBatchInput<'a> { + pub noise_embedding: &'a HiddenStates, + pub target_hidden: DFlashTargetHidden<'a>, + pub position_ids: &'a [i32], +} + +pub struct DFlashHostBatchInput<'a> { + pub noise_embedding: &'a [bf16], + pub target_hidden: &'a [bf16], + pub 
position_ids: &'a [i32], +} + +impl DFlashDraftModel { + pub fn create_batch_buffers( + &self, + max_batch_size: usize, + q_len: usize, + ctx_len: usize, + ) -> Result { + DFlashBatchBuffers::new(self, max_batch_size, q_len, ctx_len) + } + + pub fn forward_batch<'a>( + &self, + requests: &[DFlashBatchInput<'_>], + bufs: &'a mut DFlashBatchBuffers, + ) -> Result<&'a HiddenStates> { + anyhow::ensure!(!requests.is_empty(), "DFlash batch is empty"); + anyhow::ensure!( + requests.len() <= bufs.max_batch_size, + "DFlash batch size {} exceeds buffer capacity {}", + requests.len(), + bufs.max_batch_size + ); + let q_len = bufs.q_len; + let ctx_len = bufs.ctx_len; + for req in requests { + let (actual_q, actual_ctx) = self.validate_forward_inputs( + req.noise_embedding, + &req.target_hidden, + req.position_ids, + )?; + anyhow::ensure!( + actual_q == q_len && actual_ctx == ctx_len, + "DFlash exact-shape batch expected q_len={}, ctx_len={} but got q_len={}, ctx_len={}", + q_len, + ctx_len, + actual_q, + actual_ctx + ); + } + bufs.set_active_batch(requests.len()); + compact_inputs(self.device_context(), requests, bufs)?; + self.forward_compact_batch(requests.len(), bufs)?; + Ok(&bufs.normed) + } + + pub fn forward_host_batch<'a>( + &self, + requests: &[DFlashHostBatchInput<'_>], + bufs: &'a mut DFlashBatchBuffers, + ) -> Result<&'a HiddenStates> { + anyhow::ensure!(!requests.is_empty(), "DFlash host batch is empty"); + anyhow::ensure!( + requests.len() <= bufs.max_batch_size, + "DFlash host batch size {} exceeds buffer capacity {}", + requests.len(), + bufs.max_batch_size + ); + let config = self.config(); + let noise_len = bufs.q_len * config.hidden_size; + let target_len = bufs.ctx_len * config.hidden_size * config.target_layer_count(); + let position_len = bufs.ctx_len + bufs.q_len; + for req in requests { + anyhow::ensure!( + req.noise_embedding.len() == noise_len, + "noise_embedding len {} != {}", + req.noise_embedding.len(), + noise_len + ); + anyhow::ensure!( + 
req.target_hidden.len() == target_len, + "target_hidden len {} != {}", + req.target_hidden.len(), + target_len + ); + anyhow::ensure!( + req.position_ids.len() == position_len, + "position_ids len {} != {}", + req.position_ids.len(), + position_len + ); + } + bufs.set_active_batch(requests.len()); + compact_host_inputs(self.device_context(), requests, bufs)?; + self.forward_compact_batch(requests.len(), bufs)?; + Ok(&bufs.normed) + } + + fn forward_compact_batch( + &self, + batch_size: usize, + bufs: &mut DFlashBatchBuffers, + ) -> Result<()> { + let config = self.config(); + ops::gemm_into_checked( + self.device_context(), + &self.fc, + &bufs.target_hidden, + &mut bufs.target_projected, + )?; + ops::rms_norm_batch_into( + self.device_context(), + &bufs.target_projected, + &self.hidden_norm, + config.rms_norm_eps, + &mut bufs.target_normed, + ); + copy_hidden( + self.device_context(), + &bufs.noise, + 0, + &mut bufs.hidden, + 0, + config.hidden_size, + bufs.total_q_len, + )?; + for layer in &self.layers { + self.forward_compact_batch_layer(layer, batch_size, bufs)?; + } + ops::rms_norm_batch_into( + self.device_context(), + &bufs.hidden, + &self.norm, + config.rms_norm_eps, + &mut bufs.normed, + ); + Ok(()) + } + + fn forward_compact_batch_layer( + &self, + layer: &DFlashLayer, + batch_size: usize, + bufs: &mut DFlashBatchBuffers, + ) -> Result<()> { + let config = self.config(); + let ctx = self.device_context(); + ops::rms_norm_batch_into( + ctx, + &bufs.hidden, + &layer.input_layernorm, + config.rms_norm_eps, + &mut bufs.normed, + ); + ops::gemm_into_checked(ctx, &layer.attention.q_proj, &bufs.normed, &mut bufs.q)?; + ops::gemm_into_checked( + ctx, + &layer.attention.k_proj, + &bufs.normed, + &mut bufs.k_noise, + )?; + ops::gemm_into_checked( + ctx, + &layer.attention.v_proj, + &bufs.normed, + &mut bufs.v_noise, + )?; + ops::qk_norm_rope_batch_decode_into( + ctx, + &mut bufs.q, + &mut bufs.k_noise, + &layer.attention.q_norm, + &layer.attention.k_norm, + 
&self.cos_cache, + &self.sin_cache, + &bufs.positions_q, + config.num_attention_heads, + config.num_key_value_heads, + config.head_dim, + config.rms_norm_eps, + ); + + ops::gemm_into_checked( + ctx, + &layer.attention.k_proj, + &bufs.target_normed, + &mut bufs.k_ctx, + )?; + ops::gemm_into_checked( + ctx, + &layer.attention.v_proj, + &bufs.target_normed, + &mut bufs.v_ctx, + )?; + ops::qk_norm_rope_batch_decode_into( + ctx, + &mut bufs.q_ctx_scratch, + &mut bufs.k_ctx, + &layer.attention.q_norm, + &layer.attention.k_norm, + &self.cos_cache, + &self.sin_cache, + &bufs.positions_ctx, + config.num_attention_heads, + config.num_key_value_heads, + config.head_dim, + config.rms_norm_eps, + ); + + compact_kv( + ctx, + &bufs.k_ctx, + &bufs.k_noise, + &mut bufs.k_all, + batch_size, + bufs.ctx_len, + bufs.q_len, + )?; + compact_kv( + ctx, + &bufs.v_ctx, + &bufs.v_noise, + &mut bufs.v_all, + batch_size, + bufs.ctx_len, + bufs.q_len, + )?; + bufs.prepare_ragged_plan(self, batch_size)?; + let cached_plan = bufs.ragged_plan.take().expect("ragged plan exists"); + let attention_result = ops::batch_prefill_ragged_nhd_noncausal_into( + ctx, + &bufs.q, + &bufs.k_all, + &bufs.v_all, + &mut bufs.attn_out, + &cached_plan.plan, + config.num_attention_heads, + config.num_key_value_heads, + config.head_dim, + ); + bufs.ragged_plan = Some(cached_plan); + attention_result?; + ops::gemm_into_checked( + ctx, + &layer.attention.o_proj, + &bufs.attn_out, + &mut bufs.o_buf, + )?; + openinfer_kernels::ops::fused_add_rms_norm_round_batch_into( + ctx, + &mut bufs.hidden, + &bufs.o_buf, + &layer.post_attention_layernorm, + config.rms_norm_eps, + &mut bufs.normed, + )?; + ops::gemm_into_checked( + ctx, + &layer.mlp.gate_up_proj, + &bufs.normed, + &mut bufs.gate_up, + )?; + ops::silu_mul_fused_batch_into(ctx, &bufs.gate_up, &mut bufs.act_out); + ops::gemm_into_checked(ctx, &layer.mlp.down_proj, &bufs.act_out, &mut bufs.o_buf)?; + ops::add_batch_into(ctx, &bufs.hidden, &bufs.o_buf, &mut 
bufs.hidden_out)?; + std::mem::swap(&mut bufs.hidden, &mut bufs.hidden_out); + Ok(()) + } +} + +fn compact_inputs( + ctx: &DeviceContext, + requests: &[DFlashBatchInput<'_>], + bufs: &mut DFlashBatchBuffers, +) -> Result<()> { + let hidden = bufs.noise.hidden_dim; + let target_hidden = bufs.target_hidden.hidden_dim; + let mut pos_q = Vec::with_capacity(bufs.total_q_len); + let mut pos_ctx = Vec::with_capacity(bufs.total_ctx_len); + for (i, req) in requests.iter().enumerate() { + copy_hidden( + ctx, + req.noise_embedding, + 0, + &mut bufs.noise, + i * bufs.q_len, + hidden, + bufs.q_len, + )?; + copy_hidden( + ctx, + req.target_hidden.concatenated, + 0, + &mut bufs.target_hidden, + i * bufs.ctx_len, + target_hidden, + bufs.ctx_len, + )?; + pos_ctx.extend_from_slice(&req.position_ids[..bufs.ctx_len]); + pos_q.extend_from_slice(&req.position_ids[bufs.ctx_len..]); + } + let mut dst_q = bufs.positions_q.slice_mut(..pos_q.len()); + ctx.stream.memcpy_htod(&pos_q, &mut dst_q)?; + let mut dst_ctx = bufs.positions_ctx.slice_mut(..pos_ctx.len()); + ctx.stream.memcpy_htod(&pos_ctx, &mut dst_ctx)?; + Ok(()) +} + +fn compact_host_inputs( + ctx: &DeviceContext, + requests: &[DFlashHostBatchInput<'_>], + bufs: &mut DFlashBatchBuffers, +) -> Result<()> { + let hidden = bufs.noise.hidden_dim; + let target_hidden = bufs.target_hidden.hidden_dim; + let mut pos_q = Vec::with_capacity(bufs.total_q_len); + let mut pos_ctx = Vec::with_capacity(bufs.total_ctx_len); + for (i, req) in requests.iter().enumerate() { + let noise_offset = i * bufs.q_len * hidden; + let mut noise_dst = bufs + .noise + .data + .slice_mut(noise_offset..noise_offset + req.noise_embedding.len()); + ctx.stream + .memcpy_htod(req.noise_embedding, &mut noise_dst)?; + + let target_offset = i * bufs.ctx_len * target_hidden; + let mut target_dst = bufs + .target_hidden + .data + .slice_mut(target_offset..target_offset + req.target_hidden.len()); + ctx.stream.memcpy_htod(req.target_hidden, &mut target_dst)?; + + 
pos_ctx.extend_from_slice(&req.position_ids[..bufs.ctx_len]); + pos_q.extend_from_slice(&req.position_ids[bufs.ctx_len..]); + } + let mut dst_q = bufs.positions_q.slice_mut(..pos_q.len()); + ctx.stream.memcpy_htod(&pos_q, &mut dst_q)?; + let mut dst_ctx = bufs.positions_ctx.slice_mut(..pos_ctx.len()); + ctx.stream.memcpy_htod(&pos_ctx, &mut dst_ctx)?; + Ok(()) +} + +fn compact_kv( + ctx: &DeviceContext, + ctx_part: &HiddenStates, + noise_part: &HiddenStates, + out: &mut HiddenStates, + batch_size: usize, + ctx_len: usize, + q_len: usize, +) -> Result<()> { + let dim = ctx_part.hidden_dim; + for i in 0..batch_size { + copy_hidden( + ctx, + ctx_part, + i * ctx_len, + out, + i * (ctx_len + q_len), + dim, + ctx_len, + )?; + copy_hidden( + ctx, + noise_part, + i * q_len, + out, + i * (ctx_len + q_len) + ctx_len, + dim, + q_len, + )?; + } + Ok(()) +} + +pub(crate) fn copy_hidden( + ctx: &DeviceContext, + src: &HiddenStates, + src_token_offset: usize, + dst: &mut HiddenStates, + dst_token_offset: usize, + hidden_dim: usize, + token_count: usize, +) -> Result<()> { + debug_assert_eq!(src.hidden_dim, hidden_dim); + debug_assert_eq!(dst.hidden_dim, hidden_dim); + debug_assert!(src_token_offset + token_count <= src.seq_len); + debug_assert!(dst_token_offset + token_count <= dst.seq_len); + let len = hidden_dim * token_count; + let src_offset = hidden_dim * src_token_offset; + let dst_offset = hidden_dim * dst_token_offset; + let src_view = src.data.slice(src_offset..src_offset + len); + let mut dst_view = dst.data.slice_mut(dst_offset..dst_offset + len); + ctx.stream.memcpy_dtod(&src_view, &mut dst_view)?; + Ok(()) +} diff --git a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_batch_bench.rs b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_batch_bench.rs new file mode 100644 index 00000000..9ba748ae --- /dev/null +++ b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_batch_bench.rs @@ -0,0 +1,227 @@ +use std::path::PathBuf; +use std::time::Instant; + +use anyhow::{Context, 
Result, bail};
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+use openinfer_qwen3_4b_dflash::{DFlashBatchInput, DFlashDraftModel, DFlashTargetHidden};
+use serde::Serialize;
+
+/// Benchmarks `DFlashDraftModel::forward_batch` once per requested batch size
+/// and prints a JSON report with latency statistics and derived throughput.
+fn main() -> Result<()> {
+    let args = Args::parse()?;
+    let model = DFlashDraftModel::load(&args.model_path, args.device)?;
+    let config = model.config();
+    let ctx = model.device_context();
+    let mut reports = Vec::new();
+
+    for &batch_size in &args.batch_sizes {
+        let mut noises = Vec::with_capacity(batch_size);
+        let mut targets = Vec::with_capacity(batch_size);
+        let mut positions = Vec::with_capacity(batch_size);
+        for i in 0..batch_size {
+            // A distinct seed per request keeps the synthetic inputs different
+            // but reproducible across runs.
+            let noise = deterministic_bf16(args.q_len * config.hidden_size, 0xD4A5_0000 + i as u64);
+            let target = deterministic_bf16(
+                args.ctx_len * config.hidden_size * config.target_layer_count(),
+                0xC0DE_0000 + i as u64,
+            );
+            noises.push(HiddenStates {
+                data: ctx.stream.clone_htod(&noise).context("noise h2d")?,
+                hidden_dim: config.hidden_size,
+                seq_len: args.q_len,
+            });
+            targets.push(HiddenStates {
+                data: ctx.stream.clone_htod(&target).context("target h2d")?,
+                hidden_dim: config.hidden_size * config.target_layer_count(),
+                seq_len: args.ctx_len,
+            });
+            positions.push(
+                (0..(args.ctx_len + args.q_len))
+                    .map(|pos| pos as i32)
+                    .collect::<Vec<_>>(),
+            );
+        }
+        let mut bufs = model.create_batch_buffers(batch_size, args.q_len, args.ctx_len)?;
+        let inputs = build_inputs(&noises, &targets, &positions);
+        for _ in 0..args.warmup {
+            let _ = model.forward_batch(&inputs, &mut bufs)?;
+            ctx.sync()?;
+        }
+        let mut latencies_ms = Vec::with_capacity(args.iters);
+        for _ in 0..args.iters {
+            // Sync before starting the clock so the sample covers only this
+            // iteration's work.
+            ctx.sync()?;
+            let started = Instant::now();
+            let _ = model.forward_batch(&inputs, &mut bufs)?;
+            ctx.sync()?;
+            latencies_ms.push(started.elapsed().as_secs_f64() * 1000.0);
+        }
+        let stats = Stats::from(&latencies_ms);
+        let mean_s = stats.mean / 1000.0;
+        reports.push(BatchReport {
+            batch_size,
+            ctx_len: args.ctx_len,
+            q_len: args.q_len,
+            warmup: args.warmup,
+            iters: args.iters,
+            draft_tokens_per_s: (batch_size * args.q_len) as f64 / mean_s,
+            requests_per_s: batch_size as f64 / mean_s,
+            latency_ms: stats,
+        });
+    }
+
+    let report = Report {
+        schema: 1,
+        engine: "openinfer-qwen3-4b-dflash-batch",
+        model_path: args.model_path.to_string_lossy().into_owned(),
+        device: args.device,
+        hidden_size: config.hidden_size,
+        target_layer_count: config.target_layer_count(),
+        reports,
+    };
+    println!("{}", serde_json::to_string_pretty(&report)?);
+    Ok(())
+}
+
+/// Zips the per-request noise, target-hidden and position buffers into the
+/// borrowed batch inputs consumed by `forward_batch`.
+///
+/// The three slices are zipped, so the result is as long as the shortest one.
+fn build_inputs<'a>(
+    noises: &'a [HiddenStates],
+    targets: &'a [HiddenStates],
+    positions: &'a [Vec<i32>],
+) -> Vec<DFlashBatchInput<'a>> {
+    noises
+        .iter()
+        .zip(targets.iter())
+        .zip(positions.iter())
+        .map(|((noise, target), position_ids)| DFlashBatchInput {
+            noise_embedding: noise,
+            target_hidden: DFlashTargetHidden {
+                concatenated: target,
+            },
+            position_ids,
+        })
+        .collect()
+}
+
+/// Command-line options of the batch benchmark.
+#[derive(Clone)]
+struct Args {
+    model_path: PathBuf,
+    device: usize,
+    ctx_len: usize,
+    q_len: usize,
+    warmup: usize,
+    iters: usize,
+    batch_sizes: Vec<usize>,
+}
+
+impl Args {
+    /// Parses `std::env::args`, starting from the built-in defaults.
+    ///
+    /// # Errors
+    /// Fails on an unknown flag, a flag without a value, an unparsable number,
+    /// a zero `--ctx-len`/`--q-len`/`--iters`, or an empty or zero-containing
+    /// `--batch-sizes` list.
+    fn parse() -> Result<Self> {
+        let mut model_path = PathBuf::from("/home/hezhaozhao/models/Qwen3-4B-DFlash-b16");
+        let mut device = 0usize;
+        let mut ctx_len = 2usize;
+        let mut q_len = 16usize;
+        let mut warmup = 5usize;
+        let mut iters = 30usize;
+        let mut batch_sizes = vec![1, 2, 4, 8, 16, 32];
+        let mut args = std::env::args().skip(1);
+        while let Some(arg) = args.next() {
+            match arg.as_str() {
+                "--model-path" => model_path = PathBuf::from(next_value(&mut args, &arg)?),
+                "--device" => device = next_value(&mut args, &arg)?.parse()?,
+                "--ctx-len" => ctx_len = next_value(&mut args, &arg)?.parse()?,
+                "--q-len" => q_len = next_value(&mut args, &arg)?.parse()?,
+                "--warmup" => warmup = next_value(&mut args, &arg)?.parse()?,
+                "--iters" => iters = next_value(&mut args, &arg)?.parse()?,
+                "--batch-sizes" => {
+                    // Trim each token so a quoted list such as "1, 2, 4" parses.
+                    batch_sizes = next_value(&mut args, &arg)?
+                        .split(',')
+                        .map(|token| token.trim().parse())
+                        .collect::<Result<Vec<_>, _>>()?;
+                }
+                _ => bail!("unknown argument {arg}"),
+            }
+        }
+        if ctx_len == 0 || q_len == 0 || iters == 0 {
+            bail!("--ctx-len, --q-len, and --iters must be greater than zero");
+        }
+        if batch_sizes.is_empty() || batch_sizes.contains(&0) {
+            bail!("--batch-sizes must contain positive batch sizes");
+        }
+        Ok(Self {
+            model_path,
+            device,
+            ctx_len,
+            q_len,
+            warmup,
+            iters,
+            batch_sizes,
+        })
+    }
+}
+
+/// Returns the value following `flag`, or an error naming the flag when the
+/// argument list ends first.
+fn next_value(args: &mut impl Iterator<Item = String>, flag: &str) -> Result<String> {
+    args.next()
+        .with_context(|| format!("{flag} requires a value"))
+}
+
+/// Generates `len` reproducible pseudo-random bf16 values in `[-0.125, 0.125]`.
+///
+/// Uses a 64-bit linear congruential generator seeded with `seed`; the upper
+/// 32 bits of each state are mapped onto the output range.
+fn deterministic_bf16(len: usize, seed: u64) -> Vec<bf16> {
+    let mut state = seed;
+    let mut out = Vec::with_capacity(len);
+    for _ in 0..len {
+        state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
+        let bits = ((state >> 32) as u32) as f32 / (u32::MAX as f32);
+        out.push(bf16::from_f32((bits * 2.0 - 1.0) * 0.125));
+    }
+    out
+}
+
+/// Top-level JSON document printed by the benchmark.
+#[derive(Serialize)]
+struct Report {
+    schema: u32,
+    engine: &'static str,
+    model_path: String,
+    device: usize,
+    hidden_size: usize,
+    target_layer_count: usize,
+    reports: Vec<BatchReport>,
+}
+
+/// Measurements for a single batch size.
+#[derive(Serialize)]
+struct BatchReport {
+    batch_size: usize,
+    ctx_len: usize,
+    q_len: usize,
+    warmup: usize,
+    iters: usize,
+    draft_tokens_per_s: f64,
+    requests_per_s: f64,
+    latency_ms: Stats,
+}
+
+/// Summary statistics over a set of samples.
+#[derive(Serialize)]
+struct Stats {
+    mean: f64,
+    p50: f64,
+    p90: f64,
+    p99: f64,
+    min: f64,
+    max: f64,
+}
+
+impl Stats {
+    /// Summarises `values`.
+    ///
+    /// # Panics
+    /// Panics if `values` is empty; `Args::parse` guarantees at least one
+    /// iteration.
+    fn from(values: &[f64]) -> Self {
+        let mut sorted = values.to_vec();
+        // `total_cmp` is a total order over f64, so sorting cannot panic.
+        sorted.sort_by(f64::total_cmp);
+        let mean = sorted.iter().sum::<f64>() / sorted.len() as f64;
+        Self {
+            mean,
+            p50: percentile(&sorted, 0.50),
+            p90: percentile(&sorted, 0.90),
+            p99: percentile(&sorted, 0.99),
+            min: sorted[0],
+            max: sorted[sorted.len() - 1],
+        }
+    }
+}
+
+/// Nearest-rank percentile of an ascending, non-empty slice; `q` is a fraction
+/// in `[0, 1]`.
+fn percentile(sorted: &[f64], q: f64) -> f64 {
+    let idx = ((sorted.len() - 1) as f64 * q).round() as usize;
+    sorted[idx.min(sorted.len() - 1)]
+}
diff --git
a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs
new file mode 100644
index 00000000..cb5bd0c9
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs
@@ -0,0 +1,301 @@
+use std::path::PathBuf;
+use std::time::Instant;
+
+use anyhow::{Context, Result, bail};
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+use openinfer_qwen3_4b_dflash::{DFlashDraftModel, DFlashTargetHidden};
+use safetensors::{Dtype, SafeTensors};
+use serde::Serialize;
+
+/// Benchmarks a single-request DFlash draft forward pass and prints a JSON
+/// latency report.
+///
+/// Inputs come from `--fixture` when given, otherwise from a deterministic
+/// generator. With `--draft-cache` each step resets the cache, rebuilds the
+/// step context and runs `forward_with_draft_cache`; otherwise each step runs
+/// `forward_with_cache`.
+fn main() -> Result<()> {
+    let args = Args::parse()?;
+    let model = DFlashDraftModel::load(&args.model_path, args.device)?;
+    let config = model.config();
+    let ctx = model.device_context();
+
+    let (noise, target_hidden, positions, ctx_len, q_len) = if let Some(fixture) = &args.fixture {
+        let bytes = std::fs::read(fixture)
+            .with_context(|| format!("failed to read fixture {}", fixture.display()))?;
+        let st = SafeTensors::deserialize(&bytes).context("parse fixture")?;
+        let noise = read_bf16(&st, "noise_embedding", &[1, args.q_len, config.hidden_size])?;
+        let target_hidden = read_bf16(
+            &st,
+            "target_hidden",
+            &[
+                1,
+                args.ctx_len,
+                config.hidden_size * config.target_layer_count(),
+            ],
+        )?;
+        let positions = read_i32(&st, "position_ids", &[1, args.ctx_len + args.q_len])?;
+        (noise, target_hidden, positions, args.ctx_len, args.q_len)
+    } else {
+        let noise = deterministic_bf16(args.q_len * config.hidden_size, 0xD4A5_4B16);
+        let target_hidden = deterministic_bf16(
+            args.ctx_len * config.hidden_size * config.target_layer_count(),
+            0xD4A5_C0DE,
+        );
+        let positions = (0..(args.ctx_len + args.q_len))
+            .map(|pos| pos as i32)
+            .collect::<Vec<_>>();
+        (noise, target_hidden, positions, args.ctx_len, args.q_len)
+    };
+
+    let noise = HiddenStates {
+        data: ctx.stream.clone_htod(&noise).context("noise h2d")?,
+        hidden_dim: config.hidden_size,
+        seq_len: q_len,
+    };
+    let target_hidden = HiddenStates {
+        data: ctx
+            .stream
+            .clone_htod(&target_hidden)
+            .context("target hidden h2d")?,
+        hidden_dim: config.hidden_size * config.target_layer_count(),
+        seq_len: ctx_len,
+    };
+    ctx.sync()?;
+
+    let mut cache = model.create_draft_cache(q_len, ctx_len, ctx_len + q_len)?;
+    if args.draft_cache {
+        model.prepare_step_context(
+            DFlashTargetHidden {
+                concatenated: &target_hidden,
+            },
+            &positions,
+            &mut cache,
+        )?;
+        ctx.sync()?;
+    }
+
+    // One benchmark step followed by a device sync. The warmup and timed loops
+    // share it so both always execute exactly the same work.
+    let mut run_step = || -> Result<()> {
+        if args.draft_cache {
+            cache.reset();
+            model.prepare_step_context(
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &positions,
+                &mut cache,
+            )?;
+            let _out = model.forward_with_draft_cache(&noise, &positions, &mut cache)?;
+        } else {
+            let _out = model.forward_with_cache(
+                &noise,
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &positions,
+                &mut cache,
+            )?;
+        }
+        ctx.sync()?;
+        Ok(())
+    };
+
+    for _ in 0..args.warmup {
+        run_step()?;
+    }
+
+    let mut latencies_ms = Vec::with_capacity(args.iters);
+    for _ in 0..args.iters {
+        // Sync before starting the clock so the sample covers only this step.
+        ctx.sync()?;
+        let started = Instant::now();
+        run_step()?;
+        latencies_ms.push(started.elapsed().as_secs_f64() * 1000.0);
+    }
+
+    let report = Report {
+        schema: 1,
+        engine: "openinfer-qwen3-4b-dflash",
+        model_path: args.model_path.to_string_lossy().into_owned(),
+        device: args.device,
+        ctx_len: args.ctx_len,
+        q_len: args.q_len,
+        hidden_size: config.hidden_size,
+        target_layer_count: config.target_layer_count(),
+        draft_cache: args.draft_cache,
+        warmup: args.warmup,
+        iters: args.iters,
+        latency_ms: Stats::from(&latencies_ms),
+    };
+    println!("{}", serde_json::to_string_pretty(&report)?);
+    Ok(())
+}
+
+/// Command-line options of the single-request forward benchmark.
+#[derive(Clone)]
+struct Args {
+    model_path: PathBuf,
+    fixture: Option<PathBuf>,
+    device: usize,
+    ctx_len: usize,
+    q_len: usize,
+    warmup: usize,
+    iters: usize,
+    draft_cache: bool,
+}
+
+impl Args {
+    /// Parses `std::env::args`, starting from the built-in defaults.
+    ///
+    /// `--context-cache` is accepted as an alias of `--draft-cache`.
+    ///
+    /// # Errors
+    /// Fails on an unknown flag, a flag without a value, an unparsable number,
+    /// or a zero `--ctx-len`, `--q-len` or `--iters`.
+    fn parse() -> Result<Self> {
+        let mut model_path = PathBuf::from("/home/hezhaozhao/models/Qwen3-4B-DFlash-b16");
+        let mut fixture = None;
+        let mut device = 0usize;
+        let mut ctx_len = 2usize;
+        let mut q_len = 16usize;
+        let mut warmup = 5usize;
+        let mut iters = 30usize;
+        let mut draft_cache = false;
+        let mut args = std::env::args().skip(1);
+        while let Some(arg) = args.next() {
+            match arg.as_str() {
+                "--model-path" => model_path = PathBuf::from(next_value(&mut args, &arg)?),
+                "--fixture" => fixture = Some(PathBuf::from(next_value(&mut args, &arg)?)),
+                "--device" => device = next_value(&mut args, &arg)?.parse()?,
+                "--ctx-len" => ctx_len = next_value(&mut args, &arg)?.parse()?,
+                "--q-len" => q_len = next_value(&mut args, &arg)?.parse()?,
+                "--warmup" => warmup = next_value(&mut args, &arg)?.parse()?,
+                "--iters" => iters = next_value(&mut args, &arg)?.parse()?,
+                "--draft-cache" | "--context-cache" => draft_cache = true,
+                _ => bail!("unknown argument {arg}"),
+            }
+        }
+        if ctx_len == 0 {
+            bail!("--ctx-len must be greater than zero");
+        }
+        if q_len == 0 {
+            bail!("--q-len must be greater than zero");
+        }
+        if iters == 0 {
+            bail!("--iters must be greater than zero");
+        }
+        Ok(Self {
+            model_path,
+            fixture,
+            device,
+            ctx_len,
+            q_len,
+            warmup,
+            iters,
+            draft_cache,
+        })
+    }
+}
+
+/// Returns the value following `flag`, or an error naming the flag when the
+/// argument list ends first.
+fn next_value(args: &mut impl Iterator<Item = String>, flag: &str) -> Result<String> {
+    args.next()
+        .with_context(|| format!("{flag} requires a value"))
+}
+
+/// Generates `len` reproducible pseudo-random bf16 values in `[-0.125, 0.125]`.
+///
+/// Uses a 64-bit linear congruential generator seeded with `seed`; the upper
+/// 32 bits of each state are mapped onto the output range.
+fn deterministic_bf16(len: usize, seed: u64) -> Vec<bf16> {
+    let mut state = seed;
+    let mut out = Vec::with_capacity(len);
+    for _ in 0..len {
+        state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
+        let bits = ((state >> 32) as u32) as f32 / (u32::MAX as f32);
+        let value = (bits * 2.0 - 1.0) * 0.125;
+        out.push(bf16::from_f32(value));
+    }
+    out
+}
+
+/// JSON document printed by the benchmark.
+#[derive(Serialize)]
+struct Report {
+    schema: u32,
+    engine: &'static str,
+    model_path: String,
+    device: usize,
+    ctx_len: usize,
+    q_len: usize,
+    hidden_size: usize,
+    target_layer_count: usize,
+    draft_cache: bool,
+    warmup: usize,
+    iters: usize,
+    latency_ms: Stats,
+}
+
+/// Summary statistics over a set of samples.
+#[derive(Serialize)]
+struct Stats {
+    mean: f64,
+    p50: f64,
+    p90: f64,
+    p99: f64,
+    min: f64,
+    max: f64,
+}
+
+impl Stats {
+    /// Summarises `values`.
+    ///
+    /// # Panics
+    /// Panics if `values` is empty; `Args::parse` guarantees at least one
+    /// iteration.
+    fn from(values: &[f64]) -> Self {
+        let mut sorted = values.to_vec();
+        // `total_cmp` is a total order over f64, so sorting cannot panic.
+        sorted.sort_by(f64::total_cmp);
+        let mean = sorted.iter().sum::<f64>() / sorted.len() as f64;
+        Self {
+            mean,
+            p50: percentile(&sorted, 0.50),
+            p90: percentile(&sorted, 0.90),
+            p99: percentile(&sorted, 0.99),
+            min: sorted[0],
+            max: sorted[sorted.len() - 1],
+        }
+    }
+}
+
+/// Reads tensor `name` as little-endian bf16 values after checking that its
+/// dtype is BF16 and its shape equals `shape`.
+fn read_bf16(st: &SafeTensors<'_>, name: &str, shape: &[usize]) -> Result<Vec<bf16>> {
+    let view = st
+        .tensor(name)
+        .with_context(|| format!("missing tensor {name}"))?;
+    if view.dtype() != Dtype::BF16 {
+        bail!("{name} must be BF16, got {:?}", view.dtype());
+    }
+    if view.shape() != shape {
+        bail!(
+            "{name} shape mismatch: expected {shape:?}, got {:?}",
+            view.shape()
+        );
+    }
+    Ok(view
+        .data()
+        .chunks_exact(2)
+        .map(|chunk| bf16::from_bits(u16::from_le_bytes([chunk[0], chunk[1]])))
+        .collect())
+}
+
+/// Reads tensor `name` as little-endian i32 values after checking that its
+/// dtype is I32 and its shape equals `shape`.
+fn read_i32(st: &SafeTensors<'_>, name: &str, shape: &[usize]) -> Result<Vec<i32>> {
+    let view = st
+        .tensor(name)
+        .with_context(|| format!("missing tensor {name}"))?;
+    if view.dtype() != Dtype::I32 {
+        bail!("{name} must be I32, got {:?}", view.dtype());
+    }
+    if view.shape() != shape {
+        bail!(
+            "{name} shape mismatch: expected {shape:?}, got {:?}",
+            view.shape()
+        );
+    }
+    Ok(view
+        .data()
+        .chunks_exact(4)
+        .map(|chunk| i32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
+        .collect())
+}
+
+/// Nearest-rank percentile of an ascending, non-empty slice; `q` is a fraction
+/// in `[0, 1]`.
+fn percentile(sorted: &[f64], q: f64) -> f64 {
+    let idx = ((sorted.len() - 1) as f64 * q).round() as usize;
+    sorted[idx.min(sorted.len() - 1)]
+}
diff --git
a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs
new file mode 100644
index 00000000..08ff66a3
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs
@@ -0,0 +1,155 @@
+use std::collections::HashMap;
+use std::path::PathBuf;
+
+use anyhow::{Context, Result, bail};
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+use openinfer_qwen3_4b_dflash::{DFlashDraftModel, DFlashTargetHidden};
+use safetensors::{Dtype, SafeTensors, tensor::TensorView};
+
+/// Runs one DFlash draft forward pass on the tensors stored in `--fixture` and
+/// writes the result to `--out` as the safetensors entry `openinfer_output`.
+fn main() -> Result<()> {
+    let args = Args::parse()?;
+    let fixture_bytes = std::fs::read(&args.fixture).with_context(|| {
+        format!(
+            "failed to read input fixture {}",
+            args.fixture.display()
+        )
+    })?;
+    let st = SafeTensors::deserialize(&fixture_bytes).context("parse input fixture")?;
+    let model = DFlashDraftModel::load(&args.model_path, args.device)?;
+    let config = model.config();
+    let ctx = model.device_context();
+
+    let noise = bf16_tensor(&st, "noise_embedding")?;
+    let target_hidden = bf16_tensor(&st, "target_hidden")?;
+    let positions = i32_tensor(&st, "position_ids")?;
+
+    if noise.1.len() != 3 || noise.1[0] != 1 || noise.1[2] != config.hidden_size {
+        bail!(
+            "noise_embedding shape mismatch: expected [1, q_len, {}], got {:?}",
+            config.hidden_size,
+            noise.1
+        );
+    }
+    if target_hidden.1.len() != 3
+        || target_hidden.1[0] != 1
+        || target_hidden.1[2] != config.hidden_size * config.target_layer_count()
+    {
+        bail!(
+            "target_hidden shape mismatch: expected [1, ctx_len, {}], got {:?}",
+            config.hidden_size * config.target_layer_count(),
+            target_hidden.1
+        );
+    }
+    // q_len and ctx_len are taken from the fixture itself, not the CLI.
+    let q_len = noise.1[1];
+    let ctx_len = target_hidden.1[1];
+    ensure_shape("position_ids", &positions.1, &[1, ctx_len + q_len])?;
+
+    let noise_embedding = HiddenStates {
+        data: ctx.stream.clone_htod(&noise.0)?,
+        hidden_dim: config.hidden_size,
+        seq_len: q_len,
+    };
+    let target_hidden = HiddenStates {
+        data: ctx.stream.clone_htod(&target_hidden.0)?,
+        hidden_dim: config.hidden_size * config.target_layer_count(),
+        seq_len: ctx_len,
+    };
+    let out = model.forward(
+        &noise_embedding,
+        DFlashTargetHidden {
+            concatenated: &target_hidden,
+        },
+        &positions.0,
+    )?;
+    ctx.sync()?;
+    let out = ctx.stream.clone_dtoh(&out.data)?;
+    ctx.sync()?;
+
+    let out_bytes = bf16_bytes(&out);
+    let tensors = HashMap::from([(
+        "openinfer_output".to_string(),
+        TensorView::new(Dtype::BF16, vec![1, q_len, config.hidden_size], &out_bytes)?,
+    )]);
+    safetensors::serialize_to_file(tensors, None, &args.out)
+        .with_context(|| format!("failed to write output {}", args.out.display()))?;
+    Ok(())
+}
+
+/// Command-line options of the fixture runner.
+struct Args {
+    model_path: PathBuf,
+    fixture: PathBuf,
+    out: PathBuf,
+    device: usize,
+}
+
+impl Args {
+    /// Parses `std::env::args`; `--fixture` and `--out` are mandatory.
+    fn parse() -> Result<Self> {
+        let mut model_path = None;
+        let mut fixture = None;
+        let mut out = None;
+        let mut device = 0usize;
+        let mut args = std::env::args().skip(1);
+        while let Some(arg) = args.next() {
+            match arg.as_str() {
+                "--model-path" => model_path = Some(PathBuf::from(next_value(&mut args, &arg)?)),
+                "--fixture" => fixture = Some(PathBuf::from(next_value(&mut args, &arg)?)),
+                "--out" => out = Some(PathBuf::from(next_value(&mut args, &arg)?)),
+                "--device" => device = next_value(&mut args, &arg)?.parse()?,
+                _ => bail!("unknown argument {arg}"),
+            }
+        }
+        Ok(Self {
+            model_path: model_path
+                .unwrap_or_else(|| PathBuf::from("/home/hezhaozhao/models/Qwen3-4B-DFlash-b16")),
+            fixture: fixture.context("--fixture is required")?,
+            out: out.context("--out is required")?,
+            device,
+        })
+    }
+}
+
+/// Returns the value following `flag`, or an error naming the flag when the
+/// argument list ends first.
+fn next_value(args: &mut impl Iterator<Item = String>, flag: &str) -> Result<String> {
+    args.next()
+        .with_context(|| format!("{flag} requires a value"))
+}
+
+/// Fails unless `got` equals `expected`.
+fn ensure_shape(name: &str, got: &[usize], expected: &[usize]) -> Result<()> {
+    if got != expected {
+        bail!("{name} shape mismatch: expected {expected:?}, got {got:?}");
+    }
+    Ok(())
+}
+
+/// Reads tensor `name` as little-endian bf16 values and returns them together
+/// with the tensor shape.
+fn bf16_tensor(st: &SafeTensors<'_>, name: &str) -> Result<(Vec<bf16>, Vec<usize>)> {
+    let view = st
+        .tensor(name)
+        .with_context(|| format!("missing tensor {name}"))?;
+    if view.dtype() != Dtype::BF16 {
+        bail!("{name} must be BF16, got {:?}", view.dtype());
+    }
+    let values = view
+        .data()
+        .chunks_exact(2)
+        .map(|chunk| bf16::from_bits(u16::from_le_bytes([chunk[0], chunk[1]])))
+        .collect();
+    Ok((values, view.shape().to_vec()))
+}
+
+/// Reads tensor `name` as little-endian i32 values and returns them together
+/// with the tensor shape.
+fn i32_tensor(st: &SafeTensors<'_>, name: &str) -> Result<(Vec<i32>, Vec<usize>)> {
+    let view = st
+        .tensor(name)
+        .with_context(|| format!("missing tensor {name}"))?;
+    if view.dtype() != Dtype::I32 {
+        bail!("{name} must be I32, got {:?}", view.dtype());
+    }
+    let values = view
+        .data()
+        .chunks_exact(4)
+        .map(|chunk| i32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
+        .collect();
+    Ok((values, view.shape().to_vec()))
+}
+
+/// Serialises bf16 values to little-endian bytes.
+fn bf16_bytes(values: &[bf16]) -> Vec<u8> {
+    let mut out = Vec::with_capacity(values.len() * 2);
+    for value in values {
+        out.extend(value.to_bits().to_le_bytes());
+    }
+    out
+}
diff --git a/openinfer-qwen3-4b-dflash/src/config.rs b/openinfer-qwen3-4b-dflash/src/config.rs
new file mode 100644
index 00000000..bebd1657
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/config.rs
@@ -0,0 +1,143 @@
+use anyhow::{Context, Result, bail};
+use serde::Deserialize;
+use std::fs;
+use std::path::Path;
+
+/// The nested `dflash_config` object of the model's `config.json`.
+#[derive(Clone, Debug, Deserialize)]
+pub struct DFlashInnerConfig {
+    pub mask_token_id: u32,
+    pub target_layer_ids: Vec<usize>,
+}
+
+/// Deserialized `config.json` of a DFlash draft model.
+#[derive(Clone, Debug, Deserialize)]
+pub struct DFlashConfig {
+    pub architectures: Vec<String>,
+    pub attention_bias: bool,
+    pub attention_dropout: f32,
+    pub block_size: usize,
+    pub dflash_config: DFlashInnerConfig,
+    pub hidden_size: usize,
+    pub intermediate_size: usize,
+    pub num_attention_heads: usize,
+    pub num_hidden_layers: usize,
+    pub num_key_value_heads: usize,
+    pub num_target_layers: usize,
+    pub head_dim: usize,
+    pub max_position_embeddings: usize,
+    pub rms_norm_eps: f32,
+    pub rope_theta: f32,
+    pub tie_word_embeddings: bool,
+    pub vocab_size: usize,
+}
+
+impl DFlashConfig {
+    /// Loads and validates `<model_path>/config.json`.
+    ///
+    /// # Errors
+    /// Fails when the file cannot be read or parsed, or when
+    /// [`DFlashConfig::validate`] rejects it.
+    pub fn from_model_dir(model_path: &Path) -> Result<Self> {
+        let config_path = model_path.join("config.json");
+        let content = fs::read_to_string(&config_path)
+            .with_context(|| format!("failed to read {}", config_path.display()))?;
+        let config: Self = serde_json::from_str(&content)
+            .with_context(|| format!("failed to parse {}", config_path.display()))?;
+        config.validate()?;
+        Ok(config)
+    }
+
+    /// Checks that the config describes the one checkpoint layout this crate
+    /// supports: 5 draft layers, `block_size` 16, `mask_token_id` 151669 and
+    /// `target_layer_ids` `[1, 9, 17, 25, 33]`, plus basic dimension sanity.
+    pub fn validate(&self) -> Result<()> {
+        if !self
+            .architectures
+            .iter()
+            .any(|name| name == "DFlashDraftModel")
+        {
+            bail!("DFlash config architectures must include DFlashDraftModel");
+        }
+        if self.attention_bias {
+            bail!("DFlash v1 expects bias-free Qwen3 projections");
+        }
+        if self.attention_dropout != 0.0 {
+            bail!("DFlash inference expects attention_dropout=0");
+        }
+        if self.num_hidden_layers == 0 {
+            bail!("DFlash draft must have at least one layer");
+        }
+        if self.num_hidden_layers != 5 {
+            bail!(
+                "openinfer-qwen3-4b-dflash supports only Qwen3-4B-DFlash-b16 with 5 draft layers, got {}",
+                self.num_hidden_layers
+            );
+        }
+        if self.block_size != 16 {
+            bail!(
+                "openinfer-qwen3-4b-dflash supports only Qwen3-4B-DFlash-b16 block_size=16, got {}",
+                self.block_size
+            );
+        }
+        if self.dflash_config.mask_token_id != 151669 {
+            bail!(
+                "openinfer-qwen3-4b-dflash supports only Qwen3-4B-DFlash-b16 mask_token_id=151669, got {}",
+                self.dflash_config.mask_token_id
+            );
+        }
+        if self.hidden_size == 0 || self.head_dim == 0 {
+            bail!("DFlash hidden_size/head_dim must be positive");
+        }
+        if self.num_attention_heads == 0 || self.num_key_value_heads == 0 {
+            bail!("DFlash attention/KV head counts must be positive");
+        }
+        if self.num_attention_heads % self.num_key_value_heads != 0 {
+            bail!("DFlash GQA requires attention heads divisible by KV heads");
+        }
+        if self.dflash_config.target_layer_ids.len() != self.num_hidden_layers {
+            bail!(
+                "DFlash target_layer_ids len {} must match draft layers {}",
+                self.dflash_config.target_layer_ids.len(),
+                self.num_hidden_layers
+            );
+        }
+        if self
+            .dflash_config
+            .target_layer_ids
+            .iter()
+            .any(|&layer| layer >= self.num_target_layers)
+        {
+            bail!("DFlash target_layer_ids must be within num_target_layers");
+        }
+        if self.dflash_config.target_layer_ids.as_slice() != [1, 9, 17, 25, 33] {
+            bail!(
+                "openinfer-qwen3-4b-dflash supports only Qwen3-4B-DFlash-b16 target_layer_ids=[1, 9, 17, 25, 33], got {:?}",
+                self.dflash_config.target_layer_ids
+            );
+        }
+        Ok(())
+    }
+
+    /// Number of target layers whose hidden states feed the draft model.
+    pub fn target_layer_count(&self) -> usize {
+        self.dflash_config.target_layer_ids.len()
+    }
+
+    /// Width of the query projection: `num_attention_heads * head_dim`.
+    pub fn q_dim(&self) -> usize {
+        self.num_attention_heads * self.head_dim
+    }
+
+    /// Width of the key/value projection: `num_key_value_heads * head_dim`.
+    pub fn kv_dim(&self) -> usize {
+        self.num_key_value_heads * self.head_dim
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const LOCAL_DFLASH: &str = "/home/hezhaozhao/models/Qwen3-4B-DFlash-b16";
+
+    // Requires a local checkpoint; skipped (passes trivially) when it is absent.
+    #[test]
+    fn parses_local_dflash_config() {
+        let path = Path::new(LOCAL_DFLASH);
+        if !path.exists() {
+            eprintln!("skipping: {LOCAL_DFLASH} does not exist");
+            return;
+        }
+        let config = DFlashConfig::from_model_dir(path).expect("config");
+        assert_eq!(config.num_hidden_layers, 5);
+        assert_eq!(config.block_size, 16);
+        assert_eq!(config.dflash_config.mask_token_id, 151669);
+        assert_eq!(config.dflash_config.target_layer_ids, [1, 9, 17, 25, 33]);
+        assert_eq!(config.hidden_size, 2560);
+        assert_eq!(config.intermediate_size, 9728);
+    }
+}
diff --git a/openinfer-qwen3-4b-dflash/src/executor.rs b/openinfer-qwen3-4b-dflash/src/executor.rs
new file mode 100644
index 00000000..818226c7
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/executor.rs
@@ -0,0 +1,640 @@
+use std::collections::HashMap;
+use std::collections::hash_map::Entry;
+use std::path::Path;
+use std::time::{Duration, Instant};
+
+use anyhow::Result;
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+
+use crate::batch_buffers::DFlashBatchBuffers;
+use crate::batch_forward::{DFlashBatchInput, DFlashHostBatchInput, copy_hidden};
+use crate::forward::{DFlashDraftCache, DFlashTargetHidden};
+use crate::weights::DFlashDraftModel;
+
+/// Caller-chosen identifier of a draft request; also the key of its draft cache.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
+pub struct DFlashRequestId(pub u64);
+
+/// Selects the execution path of a request.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub enum DFlashCacheMode {
+    /// Batched forward pass without a per-request cache.
+    NoCache,
+    /// Per-request draft cache; such requests are executed one at a time.
+    DraftCache,
+}
+
+/// Shape signature of a request. All requests of one batch must share it.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub struct DFlashBatchKey {
+    pub q_len: usize,
+    pub ctx_len: usize,
+    pub past_len: usize,
+    pub cache_mode: DFlashCacheMode,
+}
+
+/// A draft request whose tensors are already `HiddenStates`.
+pub struct DFlashDraftRequest {
+    pub request_id: DFlashRequestId,
+    pub noise_embedding: HiddenStates,
+    pub target_hidden: HiddenStates,
+    pub position_ids: Vec<i32>,
+    pub cache_mode: DFlashCacheMode,
+}
+
+/// A draft request whose tensors are host vectors with explicit lengths.
+pub struct DFlashDraftHostRequest {
+    pub request_id: DFlashRequestId,
+    pub noise_embedding: Vec<bf16>,
+    pub target_hidden: Vec<bf16>,
+    pub position_ids: Vec<i32>,
+    pub q_len: usize,
+    pub ctx_len: usize,
+    pub cache_mode: DFlashCacheMode,
+}
+
+/// Per-request result holding a `HiddenStates` output.
+pub struct DFlashDraftResponse {
+    pub request_id: DFlashRequestId,
+    pub output: HiddenStates,
+    pub cache_seq_len: usize,
+    pub batch_size: usize,
+    pub elapsed: Duration,
+}
+
+/// Per-request result holding a host-side output vector.
+pub struct DFlashDraftHostResponse {
+    pub request_id: DFlashRequestId,
+    pub output: Vec<bf16>,
+    pub hidden_dim: usize,
+    pub seq_len: usize,
+    pub cache_seq_len: usize,
+    pub batch_size: usize,
+    pub elapsed: Duration,
+}
+
+/// Whole-batch result: request `i` occupies rows `i * q_len .. (i + 1) * q_len`
+/// of `output`.
+pub struct DFlashDraftBatchResponse {
+    pub request_ids: Vec<DFlashRequestId>,
+    pub output: HiddenStates,
+    pub cache_seq_lens: Vec<usize>,
+    pub batch_size: usize,
+    pub q_len: usize,
+    pub elapsed: Duration,
+}
+
+/// Like [`DFlashDraftBatchResponse`], but borrowing the output from the
+/// executor's reusable batch buffers instead of copying it.
+pub struct DFlashDraftBatchView<'a> {
+    pub request_ids: Vec<DFlashRequestId>,
+    pub output: &'a HiddenStates,
+    pub cache_seq_lens: Vec<usize>,
+    pub batch_size: usize,
+    pub q_len: usize,
+    pub elapsed: Duration,
+}
+
+/// Capacity limits of a [`DFlashExecutor`].
+pub struct DFlashExecutorOptions {
+    pub max_batch_size: usize,
+    pub max_step_context_len: usize,
+    pub max_seq_len: usize,
+}
+
+impl Default for DFlashExecutorOptions {
+    fn default() -> Self {
+        Self {
+            max_batch_size: 32,
+            max_step_context_len: 16,
+            max_seq_len: 4096,
+        }
+    }
+}
+
+/// Owns a draft model plus the reusable batch buffers and per-request draft
+/// caches needed to execute draft requests.
+pub struct DFlashExecutor {
+    model: DFlashDraftModel,
+    options: DFlashExecutorOptions,
+    /// Batch buffers keyed by `(max_batch_size, q_len, ctx_len)`.
+    buffers: HashMap<(usize, usize, usize), DFlashBatchBuffers>,
+    /// Draft caches keyed by request id; created on first use.
+    caches: HashMap<DFlashRequestId, DFlashDraftCache>,
+}
+
+impl DFlashExecutor {
+    /// Loads the draft model from `model_path` onto device `device_ordinal`.
+    pub fn load(
+        model_path: &Path,
+        device_ordinal: usize,
+        options: DFlashExecutorOptions,
+    ) -> Result<Self> {
+        let model = DFlashDraftModel::load(model_path, device_ordinal)?;
+        Ok(Self {
+            model,
+            options,
+            buffers: HashMap::new(),
+            caches: HashMap::new(),
+        })
+    }
+
+    /// The underlying draft model.
+    pub fn model(&self) -> &DFlashDraftModel {
+        &self.model
+    }
+
+    /// Largest batch accepted by the `execute_*` methods.
+    pub fn max_batch_size(&self) -> usize {
+        self.options.max_batch_size
+    }
+
+    /// Validates `req` against the model and returns its batch key.
+    ///
+    /// `past_len` is the current length of the request's draft cache, or 0 when
+    /// no cache exists for it yet.
+    pub fn batch_key(&self, req: &DFlashDraftRequest) -> Result<DFlashBatchKey> {
+        let target = DFlashTargetHidden {
+            concatenated: &req.target_hidden,
+        };
+        let (q_len, ctx_len) =
+            self.model
+                .validate_forward_inputs(&req.noise_embedding, &target, &req.position_ids)?;
+        let past_len = self
+            .caches
+            .get(&req.request_id)
+            .map(DFlashDraftCache::seq_len)
+            .unwrap_or(0);
+        Ok(DFlashBatchKey {
+            q_len,
+            ctx_len,
+            past_len,
+            cache_mode: req.cache_mode,
+        })
+    }
+
+    /// Host-request counterpart of [`DFlashExecutor::batch_key`]: checks the
+    /// vector lengths against the declared `q_len`/`ctx_len`.
+    pub fn host_batch_key(&self, req: &DFlashDraftHostRequest) -> Result<DFlashBatchKey> {
+        let config = self.model.config();
+        anyhow::ensure!(
+            req.noise_embedding.len() == req.q_len * config.hidden_size,
+            "noise_embedding len {} != q_len * hidden_size {}",
+            req.noise_embedding.len(),
+            req.q_len * config.hidden_size
+        );
+        anyhow::ensure!(
+            req.target_hidden.len()
+                == req.ctx_len * config.hidden_size * config.target_layer_count(),
+            "target_hidden len {} != ctx_len * target_layer_count * hidden_size {}",
+            req.target_hidden.len(),
+            req.ctx_len * config.hidden_size * config.target_layer_count()
+        );
+        anyhow::ensure!(
+            req.position_ids.len() == req.ctx_len + req.q_len,
+            "position_ids len {} != ctx_len + q_len {}",
+            req.position_ids.len(),
+            req.ctx_len + req.q_len
+        );
+        let past_len = self
+            .caches
+            .get(&req.request_id)
+            .map(DFlashDraftCache::seq_len)
+            .unwrap_or(0);
+        Ok(DFlashBatchKey {
+            q_len: req.q_len,
+            ctx_len: req.ctx_len,
+            past_len,
+            cache_mode: req.cache_mode,
+        })
+    }
+
+    /// Executes a batch and returns one response per request.
+    pub fn execute_batch(
+        &mut self,
+        requests: Vec<DFlashDraftRequest>,
+    ) -> Result<Vec<DFlashDraftResponse>> {
+        let batch = self.execute_batch_compact(requests)?;
+        self.split_compact_response(batch)
+    }
+
+    /// Executes a batch of host requests and returns one compact response.
+    ///
+    /// `DraftCache` batches are executed request by request; `NoCache` batches
+    /// go through `forward_host_batch`.
+    pub fn execute_host_batch_compact(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+    ) -> Result<DFlashDraftBatchResponse> {
+        let key = self.validate_host_batch(&requests)?;
+        if key.cache_mode == DFlashCacheMode::DraftCache {
+            return self.execute_cached_host_requests_serial_compact(requests, key);
+        }
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let request_ids = requests
+            .iter()
+            .map(|request| request.request_id)
+            .collect::<Vec<_>>();
+        let bufs = Self::ensure_buffers(
+            &mut self.buffers,
+            &self.model,
+            self.options.max_batch_size,
+            key.q_len,
+            key.ctx_len,
+        )?;
+        let inputs = requests
+            .iter()
+            .map(|req| DFlashHostBatchInput {
+                noise_embedding: &req.noise_embedding,
+                target_hidden: &req.target_hidden,
+                position_ids: &req.position_ids,
+            })
+            .collect::<Vec<_>>();
+        let batch_output = self.model.forward_host_batch(&inputs, bufs)?;
+        self.model.device_context().sync()?;
+        // `elapsed` is taken before the output copy below.
+        let elapsed = started.elapsed();
+        let mut output = HiddenStates::zeros(
+            self.model.device_context(),
+            batch_output.hidden_dim,
+            batch_output.seq_len,
+        )?;
+        copy_hidden(
+            self.model.device_context(),
+            batch_output,
+            0,
+            &mut output,
+            0,
+            batch_output.hidden_dim,
+            batch_output.seq_len,
+        )?;
+        Ok(DFlashDraftBatchResponse {
+            request_ids,
+            output,
+            cache_seq_lens: vec![0; batch_size],
+            batch_size,
+            q_len: key.q_len,
+            elapsed,
+        })
+    }
+
+    /// Executes host requests and returns one `HiddenStates` response each.
+    pub fn execute_host_batch(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+    ) -> Result<Vec<DFlashDraftResponse>> {
+        let batch = self.execute_host_batch_compact(requests)?;
+        self.split_compact_response(batch)
+    }
+
+    /// Executes host requests and copies each output back to the host.
+    pub fn execute_host_batch_host(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+    ) -> Result<Vec<DFlashDraftHostResponse>> {
+        let batch = self.execute_host_batch_compact(requests)?;
+        self.split_compact_host_response(batch)
+    }
+
+    /// Executes a `NoCache` host batch and returns a view borrowing the output
+    /// from the executor's batch buffers, skipping the copy done by
+    /// [`DFlashExecutor::execute_host_batch_compact`].
+    pub fn execute_host_batch_view(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+    ) -> Result<DFlashDraftBatchView<'_>> {
+        let key = self.validate_host_batch(&requests)?;
+        anyhow::ensure!(
+            key.cache_mode == DFlashCacheMode::NoCache,
+            "borrowed host batch view currently supports only NoCache mode"
+        );
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let request_ids = requests
+            .iter()
+            .map(|request| request.request_id)
+            .collect::<Vec<_>>();
+        let bufs = Self::ensure_buffers(
+            &mut self.buffers,
+            &self.model,
+            self.options.max_batch_size,
+            key.q_len,
+            key.ctx_len,
+        )?;
+        let inputs = requests
+            .iter()
+            .map(|req| DFlashHostBatchInput {
+                noise_embedding: &req.noise_embedding,
+                target_hidden: &req.target_hidden,
+                position_ids: &req.position_ids,
+            })
+            .collect::<Vec<_>>();
+        let output = self.model.forward_host_batch(&inputs, bufs)?;
+        self.model.device_context().sync()?;
+        Ok(DFlashDraftBatchView {
+            request_ids,
+            output,
+            cache_seq_lens: vec![0; batch_size],
+            batch_size,
+            q_len: key.q_len,
+            elapsed: started.elapsed(),
+        })
+    }
+
+    /// Executes a batch and returns one compact response, dispatching on the
+    /// batch's cache mode.
+    pub fn execute_batch_compact(
+        &mut self,
+        requests: Vec<DFlashDraftRequest>,
+    ) -> Result<DFlashDraftBatchResponse> {
+        anyhow::ensure!(!requests.is_empty(), "DFlash executor batch is empty");
+        anyhow::ensure!(
+            requests.len() <= self.options.max_batch_size,
+            "DFlash executor batch size {} exceeds max_batch_size {}",
+            requests.len(),
+            self.options.max_batch_size
+        );
+        let key = self.batch_key(&requests[0])?;
+        for req in &requests[1..] {
+            let req_key = self.batch_key(req)?;
+            anyhow::ensure!(
+                req_key == key,
+                "DFlash executor requires exact-shape batch: first={key:?}, got={req_key:?}"
+            );
+        }
+        match key.cache_mode {
+            DFlashCacheMode::NoCache => self.execute_uncached_batch_compact(requests, key),
+            DFlashCacheMode::DraftCache => {
+                self.execute_cached_requests_serial_compact(requests, key)
+            }
+        }
+    }
+
+    /// Resets the draft cache of `request_id`; fails when no such cache exists.
+    pub fn reset_cache(&mut self, request_id: DFlashRequestId) -> Result<()> {
+        let Some(cache) = self.caches.get_mut(&request_id) else {
+            anyhow::bail!("unknown DFlash cache request_id {:?}", request_id);
+        };
+        cache.reset();
+        Ok(())
+    }
+
+    /// Crops the draft cache of `request_id` to `seq_len`; fails when no such
+    /// cache exists.
+    pub fn crop_cache(&mut self, request_id: DFlashRequestId, seq_len: usize) -> Result<()> {
+        let Some(cache) = self.caches.get_mut(&request_id) else {
+            anyhow::bail!("unknown DFlash cache request_id {:?}", request_id);
+        };
+        cache.crop(seq_len)?;
+        Ok(())
+    }
+
+    /// Current length of the draft cache of `request_id`.
+    pub fn cache_seq_len(&self, request_id: DFlashRequestId) -> Result<usize> {
+        self.caches
+            .get(&request_id)
+            .map(DFlashDraftCache::seq_len)
+            .ok_or_else(|| anyhow::anyhow!("unknown DFlash cache request_id {:?}", request_id))
+    }
+
+    /// Checks that a host batch is non-empty, within `max_batch_size`, and that
+    /// every request shares the first request's batch key, which is returned.
+    fn validate_host_batch(&self, requests: &[DFlashDraftHostRequest]) -> Result<DFlashBatchKey> {
+        anyhow::ensure!(!requests.is_empty(), "DFlash host executor batch is empty");
+        anyhow::ensure!(
+            requests.len() <= self.options.max_batch_size,
+            "DFlash host executor batch size {} exceeds max_batch_size {}",
+            requests.len(),
+            self.options.max_batch_size
+        );
+        let key = self.host_batch_key(&requests[0])?;
+        for req in &requests[1..] {
+            let req_key = self.host_batch_key(req)?;
+            anyhow::ensure!(
+                req_key == key,
+                "DFlash host executor requires exact-shape batch: first={key:?}, got={req_key:?}"
+            );
+        }
+        Ok(key)
+    }
+
+    /// Returns the batch buffers for `(max_batch_size, q_len, ctx_len)`,
+    /// allocating them on first use.
+    ///
+    /// Takes the fields rather than `&mut self` so callers can keep borrowing
+    /// `self.model` while the returned buffers are alive.
+    fn ensure_buffers<'a>(
+        buffers: &'a mut HashMap<(usize, usize, usize), DFlashBatchBuffers>,
+        model: &DFlashDraftModel,
+        max_batch_size: usize,
+        q_len: usize,
+        ctx_len: usize,
+    ) -> Result<&'a mut DFlashBatchBuffers> {
+        match buffers.entry((max_batch_size, q_len, ctx_len)) {
+            Entry::Occupied(slot) => Ok(slot.into_mut()),
+            Entry::Vacant(slot) => {
+                Ok(slot.insert(model.create_batch_buffers(max_batch_size, q_len, ctx_len)?))
+            }
+        }
+    }
+
+    /// Returns the draft cache of `request_id`, creating it on first use with
+    /// the limits from `options`.
+    ///
+    /// Takes the fields rather than `&mut self` for the same reason as
+    /// `ensure_buffers`.
+    fn ensure_cache<'a>(
+        caches: &'a mut HashMap<DFlashRequestId, DFlashDraftCache>,
+        model: &DFlashDraftModel,
+        options: &DFlashExecutorOptions,
+        request_id: DFlashRequestId,
+        q_len: usize,
+    ) -> Result<&'a mut DFlashDraftCache> {
+        match caches.entry(request_id) {
+            Entry::Occupied(slot) => Ok(slot.into_mut()),
+            Entry::Vacant(slot) => Ok(slot.insert(model.create_draft_cache(
+                q_len,
+                options.max_step_context_len,
+                options.max_seq_len,
+            )?)),
+        }
+    }
+
+    /// Runs a `NoCache` batch through `forward_batch` and copies the first
+    /// `batch_size * q_len` output rows into a fresh `HiddenStates`.
+    fn execute_uncached_batch_compact(
+        &mut self,
+        requests: Vec<DFlashDraftRequest>,
+        key: DFlashBatchKey,
+    ) -> Result<DFlashDraftBatchResponse> {
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let request_ids = requests
+            .iter()
+            .map(|request| request.request_id)
+            .collect::<Vec<_>>();
+        let bufs = Self::ensure_buffers(
+            &mut self.buffers,
+            &self.model,
+            self.options.max_batch_size,
+            key.q_len,
+            key.ctx_len,
+        )?;
+        let inputs = requests
+            .iter()
+            .map(|req| DFlashBatchInput {
+                noise_embedding: &req.noise_embedding,
+                target_hidden: DFlashTargetHidden {
+                    concatenated: &req.target_hidden,
+                },
+                position_ids: &req.position_ids,
+            })
+            .collect::<Vec<_>>();
+        let batch_output = self.model.forward_batch(&inputs, bufs)?;
+        self.model.device_context().sync()?;
+        // `elapsed` is taken before the output copy below.
+        let elapsed = started.elapsed();
+        let mut output = HiddenStates::zeros(
+            self.model.device_context(),
+            self.model.config().hidden_size,
+            batch_size * key.q_len,
+        )?;
+        copy_hidden(
+            self.model.device_context(),
+            batch_output,
+            0,
+            &mut output,
+            0,
+            self.model.config().hidden_size,
+            batch_size * key.q_len,
+        )?;
+        Ok(DFlashDraftBatchResponse {
+            request_ids,
+            output,
+            cache_seq_lens: vec![0; batch_size],
+            batch_size,
+            q_len: key.q_len,
+            elapsed,
+        })
+    }
+
+    /// Runs `DraftCache` requests one at a time, each against its own cache,
+    /// and packs the outputs into one `HiddenStates`.
+    fn execute_cached_requests_serial_compact(
+        &mut self,
+        requests: Vec<DFlashDraftRequest>,
+        key: DFlashBatchKey,
+    ) -> Result<DFlashDraftBatchResponse> {
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let mut request_ids = Vec::with_capacity(batch_size);
+        let mut cache_seq_lens = Vec::with_capacity(batch_size);
+        let mut output = HiddenStates::zeros(
+            self.model.device_context(),
+            self.model.config().hidden_size,
+            batch_size * key.q_len,
+        )?;
+        for (i, req) in requests.into_iter().enumerate() {
+            let cache = Self::ensure_cache(
+                &mut self.caches,
+                &self.model,
+                &self.options,
+                req.request_id,
+                key.q_len,
+            )?;
+            self.model.prepare_step_context(
+                DFlashTargetHidden {
+                    concatenated: &req.target_hidden,
+                },
+                &req.position_ids,
+                cache,
+            )?;
+            let out = self.model.forward_with_draft_cache(
+                &req.noise_embedding,
+                &req.position_ids,
+                cache,
+            )?;
+            self.model.device_context().sync()?;
+            copy_hidden(
+                self.model.device_context(),
+                out,
+                0,
+                &mut output,
+                i * key.q_len,
+                self.model.config().hidden_size,
+                key.q_len,
+            )?;
+            request_ids.push(req.request_id);
+            cache_seq_lens.push(cache.seq_len());
+        }
+        Ok(DFlashDraftBatchResponse {
+            request_ids,
+            output,
+            cache_seq_lens,
+            batch_size,
+            q_len: key.q_len,
+            elapsed: started.elapsed(),
+        })
+    }
+
+    /// Host-request variant of `execute_cached_requests_serial_compact`: each
+    /// request's vectors are uploaded first, then processed the same way.
+    fn execute_cached_host_requests_serial_compact(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+        key: DFlashBatchKey,
+    ) -> Result<DFlashDraftBatchResponse> {
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let config = self.model.config();
+        let mut request_ids = Vec::with_capacity(batch_size);
+        let mut cache_seq_lens = Vec::with_capacity(batch_size);
+        let mut output = HiddenStates::zeros(
+            self.model.device_context(),
+            config.hidden_size,
+            batch_size * key.q_len,
+        )?;
+        for (i, req) in requests.into_iter().enumerate() {
+            let cache = Self::ensure_cache(
+                &mut self.caches,
+                &self.model,
+                &self.options,
+                req.request_id,
+                key.q_len,
+            )?;
+            let noise_embedding = HiddenStates {
+                data: self
+                    .model
+                    .device_context()
+                    .stream
+                    .clone_htod(&req.noise_embedding)?,
+                hidden_dim: config.hidden_size,
+                seq_len: key.q_len,
+            };
+            let target_hidden = HiddenStates {
+                data: self
+                    .model
+                    .device_context()
+                    .stream
+                    .clone_htod(&req.target_hidden)?,
+                hidden_dim: config.hidden_size * config.target_layer_count(),
+                seq_len: key.ctx_len,
+            };
+            self.model.prepare_step_context(
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &req.position_ids,
+                cache,
+            )?;
+            let out =
+                self.model
+                    .forward_with_draft_cache(&noise_embedding, &req.position_ids, cache)?;
+            self.model.device_context().sync()?;
+            copy_hidden(
+                self.model.device_context(),
+                out,
+                0,
+                &mut output,
+                i * key.q_len,
+                config.hidden_size,
+                key.q_len,
+            )?;
+            request_ids.push(req.request_id);
+            cache_seq_lens.push(cache.seq_len());
+        }
+        Ok(DFlashDraftBatchResponse {
+            request_ids,
+            output,
+            cache_seq_lens,
+            batch_size,
+            q_len: key.q_len,
+            elapsed: started.elapsed(),
+        })
+    }
+
+    /// Splits a compact response into per-request responses, copying each
+    /// request's `q_len` rows into its own `HiddenStates`.
+    fn split_compact_response(
+        &self,
+        batch: DFlashDraftBatchResponse,
+    ) -> Result<Vec<DFlashDraftResponse>> {
+        let mut responses = Vec::with_capacity(batch.batch_size);
+        for i in 0..batch.batch_size {
+            let mut output = HiddenStates::zeros(
+                self.model.device_context(),
+                self.model.config().hidden_size,
+                batch.q_len,
+            )?;
+            copy_hidden(
+                self.model.device_context(),
+                &batch.output,
+                i * batch.q_len,
+                &mut output,
+                0,
+                self.model.config().hidden_size,
+                batch.q_len,
+            )?;
+            responses.push(DFlashDraftResponse {
+                request_id: batch.request_ids[i],
+                output,
+                cache_seq_len: batch.cache_seq_lens[i],
+                batch_size: batch.batch_size,
+                elapsed: batch.elapsed,
+            });
+        }
+        Ok(responses)
+    }
+
+    /// Copies the whole compact output to the host once, then slices it into
+    /// per-request vectors of `hidden_dim * q_len` values.
+    fn split_compact_host_response(
+        &self,
+        batch: DFlashDraftBatchResponse,
+    ) -> Result<Vec<DFlashDraftHostResponse>> {
+        let host = self
+            .model
+            .device_context()
+            .stream
+            .clone_dtoh(&batch.output.data)?;
+        self.model.device_context().sync()?;
+        let row_len = batch.output.hidden_dim * batch.q_len;
+        let mut responses = Vec::with_capacity(batch.batch_size);
+        for i in 0..batch.batch_size {
+            responses.push(DFlashDraftHostResponse {
+                request_id: batch.request_ids[i],
+                output: host[i * row_len..(i + 1) * row_len].to_vec(),
+                hidden_dim: batch.output.hidden_dim,
+                seq_len: batch.q_len,
+                cache_seq_len: batch.cache_seq_lens[i],
+                batch_size: batch.batch_size,
+                elapsed: batch.elapsed,
+            });
+        }
+        Ok(responses)
+    }
+}
diff --git a/openinfer-qwen3-4b-dflash/src/forward.rs b/openinfer-qwen3-4b-dflash/src/forward.rs
new file mode 100644
index 00000000..7323e130
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/forward.rs
@@ -0,0 +1,886 @@
+use anyhow::Result;
+use
cudarc::driver::CudaSlice; +use openinfer_core::ops; +use openinfer_core::tensor::HiddenStates; + +use crate::weights::{DFlashDraftModel, DFlashLayer}; + +pub struct DFlashTargetHidden<'a> { + /// HF reference layout: `[seq_len, target_layer_count * hidden_size]`. + pub concatenated: &'a HiddenStates, +} + +pub struct DFlashDraftCache { + pub(crate) q_len: usize, + pub(crate) state: DFlashDraftState, + pub(crate) step: DFlashStepContext, + pub(crate) scratch: ForwardBuffers, +} + +pub(crate) struct DFlashDraftState { + pub(crate) max_seq_len: usize, + pub(crate) seq_len: usize, + pub(crate) layers: Vec, +} + +pub(crate) struct DFlashStepContext { + pub(crate) max_len: usize, + pub(crate) len: usize, + pub(crate) valid: bool, + pub(crate) layers: Vec, +} + +pub(crate) struct DFlashLayerStepContext { + pub(crate) k_ctx: HiddenStates, + pub(crate) v_ctx: HiddenStates, +} + +pub(crate) struct DFlashLayerPastKv { + pub(crate) k_past: HiddenStates, + pub(crate) v_past: HiddenStates, +} + +pub(crate) struct ForwardBuffers { + pub(crate) hidden_out: HiddenStates, + pub(crate) target_projected: HiddenStates, + pub(crate) target_normed: HiddenStates, + pub(crate) normed: HiddenStates, + pub(crate) q: HiddenStates, + pub(crate) q_ctx_scratch: HiddenStates, + pub(crate) k_ctx: HiddenStates, + pub(crate) k_noise: HiddenStates, + pub(crate) v_ctx: HiddenStates, + pub(crate) v_noise: HiddenStates, + pub(crate) k_all: HiddenStates, + pub(crate) v_all: HiddenStates, + pub(crate) attn_out: HiddenStates, + pub(crate) o_buf: HiddenStates, + pub(crate) gate_up: HiddenStates, + pub(crate) act_out: HiddenStates, + pub(crate) positions_q: CudaSlice, + pub(crate) positions_ctx: CudaSlice, +} + +impl DFlashDraftModel { + pub fn create_draft_cache( + &self, + q_len: usize, + max_step_context_len: usize, + max_seq_len: usize, + ) -> Result { + anyhow::ensure!(q_len > 0, "DFlash scratch requires q_len greater than zero"); + anyhow::ensure!( + max_step_context_len > 0, + "DFlash cache requires 
max_step_context_len greater than zero" + ); + anyhow::ensure!( + max_seq_len >= max_step_context_len + q_len, + "DFlash cache max_seq_len {} must fit at least one step: context {} + q_len {}", + max_seq_len, + max_step_context_len, + q_len + ); + Ok(DFlashDraftCache { + q_len, + state: DFlashDraftState::new(self, max_seq_len)?, + step: DFlashStepContext::new(self, max_step_context_len)?, + scratch: ForwardBuffers::new(self, q_len, max_step_context_len)?, + }) + } + + pub fn forward( + &self, + noise_embedding: &HiddenStates, + target_hidden: DFlashTargetHidden<'_>, + position_ids: &[i32], + ) -> Result { + let (q_len, ctx_len) = + self.validate_forward_inputs(noise_embedding, &target_hidden, position_ids)?; + let mut bufs = ForwardBuffers::new(self, q_len, ctx_len)?; + self.project_target_hidden(target_hidden, &mut bufs)?; + self.run_forward(noise_embedding, ctx_len, position_ids, &mut bufs)?; + Ok(bufs.normed) + } + + pub fn forward_with_cache<'a>( + &self, + noise_embedding: &HiddenStates, + target_hidden: DFlashTargetHidden<'_>, + position_ids: &[i32], + cache: &'a mut DFlashDraftCache, + ) -> Result<&'a HiddenStates> { + let (q_len, ctx_len) = + self.validate_forward_inputs(noise_embedding, &target_hidden, position_ids)?; + anyhow::ensure!( + cache.q_len == q_len && cache.step.max_len >= ctx_len, + "DFlash cache shape mismatch: cache q_len={}, max_step_context_len={} but input q_len={}, ctx_len={}", + cache.q_len, + cache.step.max_len, + q_len, + ctx_len + ); + cache.reset(); + self.prepare_step_context(target_hidden, position_ids, cache)?; + self.run_forward(noise_embedding, ctx_len, position_ids, &mut cache.scratch)?; + cache.step.valid = false; + Ok(&cache.scratch.normed) + } + + pub fn prepare_step_context( + &self, + target_hidden: DFlashTargetHidden<'_>, + position_ids: &[i32], + cache: &mut DFlashDraftCache, + ) -> Result<()> { + let config = &self.config; + let ctx_len = target_hidden.concatenated.seq_len; + anyhow::ensure!( + ctx_len <= 
cache.step.max_len, + "DFlash step context length {} exceeds cache capacity {}", + ctx_len, + cache.step.max_len + ); + anyhow::ensure!( + cache.state.seq_len + ctx_len + cache.q_len <= cache.state.max_seq_len, + "DFlash draft cache would exceed capacity: past {} + ctx {} + q {} > {}", + cache.state.seq_len, + ctx_len, + cache.q_len, + cache.state.max_seq_len + ); + anyhow::ensure!( + ctx_len > 0, + "DFlash step context must contain at least one token" + ); + anyhow::ensure!( + position_ids.len() >= ctx_len, + "position_ids len {} < ctx_len {}", + position_ids.len(), + ctx_len + ); + anyhow::ensure!( + target_hidden.concatenated.hidden_dim + == config.target_layer_count() * config.hidden_size, + "target_hidden hidden_dim {} != {}", + target_hidden.concatenated.hidden_dim, + config.target_layer_count() * config.hidden_size + ); + set_step_context_len(&mut cache.scratch, &mut cache.step.layers, ctx_len); + let mut positions_ctx = cache.scratch.positions_ctx.slice_mut(..ctx_len); + self.ctx + .stream + .memcpy_htod(&position_ids[..ctx_len], &mut positions_ctx)?; + + ops::gemm_into_checked( + &self.ctx, + &self.fc, + target_hidden.concatenated, + &mut cache.scratch.target_projected, + )?; + ops::rms_norm_batch_into( + &self.ctx, + &cache.scratch.target_projected, + &self.hidden_norm, + config.rms_norm_eps, + &mut cache.scratch.target_normed, + ); + for (layer, cached) in self.layers.iter().zip(cache.step.layers.iter_mut()) { + ops::gemm_into_checked( + &self.ctx, + &layer.attention.k_proj, + &cache.scratch.target_normed, + &mut cached.k_ctx, + )?; + ops::gemm_into_checked( + &self.ctx, + &layer.attention.v_proj, + &cache.scratch.target_normed, + &mut cached.v_ctx, + )?; + ops::qk_norm_rope_batch_decode_into( + &self.ctx, + &mut cache.scratch.q_ctx_scratch, + &mut cached.k_ctx, + &layer.attention.q_norm, + &layer.attention.k_norm, + &self.cos_cache, + &self.sin_cache, + &cache.scratch.positions_ctx, + config.num_attention_heads, + config.num_key_value_heads, + 
config.head_dim, + config.rms_norm_eps, + ); + } + cache.step.len = ctx_len; + cache.step.valid = true; + Ok(()) + } + + pub fn forward_with_draft_cache<'a>( + &self, + noise_embedding: &HiddenStates, + position_ids: &[i32], + cache: &'a mut DFlashDraftCache, + ) -> Result<&'a HiddenStates> { + anyhow::ensure!(cache.step.valid, "DFlash step context is not prepared"); + anyhow::ensure!( + noise_embedding.hidden_dim == self.config.hidden_size, + "noise_embedding hidden_dim {} != {}", + noise_embedding.hidden_dim, + self.config.hidden_size + ); + anyhow::ensure!( + noise_embedding.seq_len == cache.q_len, + "noise_embedding q_len {} != scratch q_len {}", + noise_embedding.seq_len, + cache.q_len + ); + anyhow::ensure!( + position_ids.len() == cache.step.len + cache.q_len, + "position_ids len {} != step_context_len + q_len {}", + position_ids.len(), + cache.step.len + cache.q_len + ); + anyhow::ensure!( + cache.state.seq_len + cache.step.len + cache.q_len <= cache.state.max_seq_len, + "DFlash draft cache would exceed capacity: past {} + ctx {} + q {} > {}", + cache.state.seq_len, + cache.step.len, + cache.q_len, + cache.state.max_seq_len + ); + let past_len = cache.state.seq_len; + self.run_forward_with_draft_cache(noise_embedding, past_len, position_ids, cache)?; + cache.step.valid = false; + Ok(&cache.scratch.normed) + } + + pub(crate) fn validate_forward_inputs( + &self, + noise_embedding: &HiddenStates, + target_hidden: &DFlashTargetHidden<'_>, + position_ids: &[i32], + ) -> Result<(usize, usize)> { + let config = &self.config; + anyhow::ensure!( + noise_embedding.hidden_dim == config.hidden_size, + "noise_embedding hidden_dim {} != {}", + noise_embedding.hidden_dim, + config.hidden_size + ); + let ctx_len = target_hidden.concatenated.seq_len; + let q_len = noise_embedding.seq_len; + anyhow::ensure!( + ctx_len > 0, + "DFlash forward requires at least one target-hidden token" + ); + anyhow::ensure!( + q_len > 0, + "DFlash forward requires at least one noise token" + 
); + anyhow::ensure!( + target_hidden.concatenated.hidden_dim + == config.target_layer_count() * config.hidden_size, + "target_hidden hidden_dim {} != {}", + target_hidden.concatenated.hidden_dim, + config.target_layer_count() * config.hidden_size + ); + anyhow::ensure!( + position_ids.len() == ctx_len + q_len, + "position_ids len {} != ctx_len + q_len {}", + position_ids.len(), + ctx_len + q_len + ); + Ok((q_len, ctx_len)) + } + + fn project_target_hidden( + &self, + target_hidden: DFlashTargetHidden<'_>, + bufs: &mut ForwardBuffers, + ) -> Result<()> { + let config = &self.config; + ops::gemm_into_checked( + &self.ctx, + &self.fc, + target_hidden.concatenated, + &mut bufs.target_projected, + )?; + ops::rms_norm_batch_into( + &self.ctx, + &bufs.target_projected, + &self.hidden_norm, + config.rms_norm_eps, + &mut bufs.target_normed, + ); + Ok(()) + } + + pub(crate) fn run_forward( + &self, + noise_embedding: &HiddenStates, + ctx_len: usize, + position_ids: &[i32], + bufs: &mut ForwardBuffers, + ) -> Result<()> { + let q_len = noise_embedding.seq_len; + let mut positions_q = bufs.positions_q.slice_mut(..q_len); + self.ctx + .stream + .memcpy_htod(&position_ids[ctx_len..], &mut positions_q)?; + let mut positions_ctx = bufs.positions_ctx.slice_mut(..ctx_len); + self.ctx + .stream + .memcpy_htod(&position_ids[..ctx_len], &mut positions_ctx)?; + + let mut hidden = clone_hidden(&self.ctx, noise_embedding)?; + for layer in &self.layers { + self.forward_layer(layer, &mut hidden, bufs)?; + } + ops::rms_norm_batch_into( + &self.ctx, + &hidden, + &self.norm, + self.config.rms_norm_eps, + &mut bufs.normed, + ); + Ok(()) + } + + fn run_forward_with_draft_cache( + &self, + noise_embedding: &HiddenStates, + past_len: usize, + position_ids: &[i32], + cache: &mut DFlashDraftCache, + ) -> Result<()> { + let ctx_len = cache.step.len; + let q_len = noise_embedding.seq_len; + let total_len = past_len + ctx_len + q_len; + let mut positions_q = 
cache.scratch.positions_q.slice_mut(..q_len); + self.ctx + .stream + .memcpy_htod(&position_ids[ctx_len..], &mut positions_q)?; + + let mut hidden = clone_hidden(&self.ctx, noise_embedding)?; + for layer_idx in 0..self.layers.len() { + let layer = &self.layers[layer_idx]; + self.forward_layer_with_draft_cache( + layer, + past_len, + total_len, + &cache.step.layers[layer_idx], + &mut cache.state.layers[layer_idx], + &mut hidden, + &mut cache.scratch, + )?; + } + ops::rms_norm_batch_into( + &self.ctx, + &hidden, + &self.norm, + self.config.rms_norm_eps, + &mut cache.scratch.normed, + ); + cache.state.seq_len = total_len; + set_past_seq_len(&mut cache.state.layers, total_len); + Ok(()) + } + + pub(crate) fn forward_layer( + &self, + layer: &DFlashLayer, + hidden: &mut HiddenStates, + bufs: &mut ForwardBuffers, + ) -> Result<()> { + let config = &self.config; + let q_len = hidden.seq_len; + let ctx_len = bufs.target_normed.seq_len; + + ops::rms_norm_batch_into( + &self.ctx, + hidden, + &layer.input_layernorm, + config.rms_norm_eps, + &mut bufs.normed, + ); + + ops::gemm_into_checked( + &self.ctx, + &layer.attention.q_proj, + &bufs.normed, + &mut bufs.q, + )?; + ops::gemm_into_checked( + &self.ctx, + &layer.attention.k_proj, + &bufs.normed, + &mut bufs.k_noise, + )?; + ops::gemm_into_checked( + &self.ctx, + &layer.attention.v_proj, + &bufs.normed, + &mut bufs.v_noise, + )?; + + ops::qk_norm_rope_batch_decode_into( + &self.ctx, + &mut bufs.q, + &mut bufs.k_noise, + &layer.attention.q_norm, + &layer.attention.k_norm, + &self.cos_cache, + &self.sin_cache, + &bufs.positions_q, + config.num_attention_heads, + config.num_key_value_heads, + config.head_dim, + config.rms_norm_eps, + ); + ops::gemm_into_checked( + &self.ctx, + &layer.attention.k_proj, + &bufs.target_normed, + &mut bufs.k_ctx, + )?; + ops::gemm_into_checked( + &self.ctx, + &layer.attention.v_proj, + &bufs.target_normed, + &mut bufs.v_ctx, + )?; + // Normalize and rotate context K with its own positions. 
Q has already + // been prepared above; q_ctx_scratch only reuses the shared Q/K kernel. + ops::qk_norm_rope_batch_decode_into( + &self.ctx, + &mut bufs.q_ctx_scratch, + &mut bufs.k_ctx, + &layer.attention.q_norm, + &layer.attention.k_norm, + &self.cos_cache, + &self.sin_cache, + &bufs.positions_ctx, + config.num_attention_heads, + config.num_key_value_heads, + config.head_dim, + config.rms_norm_eps, + ); + concat_kv( + &self.ctx, + &bufs.k_ctx, + &bufs.k_noise, + ctx_len, + q_len, + &mut bufs.k_all, + )?; + concat_kv( + &self.ctx, + &bufs.v_ctx, + &bufs.v_noise, + ctx_len, + q_len, + &mut bufs.v_all, + )?; + + ops::single_prefill_nhd_noncausal_into( + &self.ctx, + &bufs.q, + &bufs.k_all, + &bufs.v_all, + &mut bufs.attn_out, + config.num_attention_heads, + config.num_key_value_heads, + config.head_dim, + )?; + ops::gemm_into_checked( + &self.ctx, + &layer.attention.o_proj, + &bufs.attn_out, + &mut bufs.o_buf, + )?; + openinfer_kernels::ops::fused_add_rms_norm_round_batch_into( + &self.ctx, + hidden, + &bufs.o_buf, + &layer.post_attention_layernorm, + config.rms_norm_eps, + &mut bufs.normed, + )?; + + ops::gemm_into_checked( + &self.ctx, + &layer.mlp.gate_up_proj, + &bufs.normed, + &mut bufs.gate_up, + )?; + ops::silu_mul_fused_batch_into(&self.ctx, &bufs.gate_up, &mut bufs.act_out); + ops::gemm_into_checked( + &self.ctx, + &layer.mlp.down_proj, + &bufs.act_out, + &mut bufs.o_buf, + )?; + ops::add_batch_into(&self.ctx, hidden, &bufs.o_buf, &mut bufs.hidden_out)?; + std::mem::swap(hidden, &mut bufs.hidden_out); + Ok(()) + } + + fn forward_layer_with_draft_cache( + &self, + layer: &DFlashLayer, + past_len: usize, + total_len: usize, + step_context: &DFlashLayerStepContext, + past: &mut DFlashLayerPastKv, + hidden: &mut HiddenStates, + bufs: &mut ForwardBuffers, + ) -> Result<()> { + let config = &self.config; + let q_len = hidden.seq_len; + let ctx_len = bufs.target_normed.seq_len; + + ops::rms_norm_batch_into( + &self.ctx, + hidden, + &layer.input_layernorm, + 
config.rms_norm_eps, + &mut bufs.normed, + ); + + ops::gemm_into_checked( + &self.ctx, + &layer.attention.q_proj, + &bufs.normed, + &mut bufs.q, + )?; + ops::gemm_into_checked( + &self.ctx, + &layer.attention.k_proj, + &bufs.normed, + &mut bufs.k_noise, + )?; + ops::gemm_into_checked( + &self.ctx, + &layer.attention.v_proj, + &bufs.normed, + &mut bufs.v_noise, + )?; + + ops::qk_norm_rope_batch_decode_into( + &self.ctx, + &mut bufs.q, + &mut bufs.k_noise, + &layer.attention.q_norm, + &layer.attention.k_norm, + &self.cos_cache, + &self.sin_cache, + &bufs.positions_q, + config.num_attention_heads, + config.num_key_value_heads, + config.head_dim, + config.rms_norm_eps, + ); + + append_kv( + &self.ctx, + &step_context.k_ctx, + &bufs.k_noise, + past_len, + ctx_len, + q_len, + &mut past.k_past, + )?; + append_kv( + &self.ctx, + &step_context.v_ctx, + &bufs.v_noise, + past_len, + ctx_len, + q_len, + &mut past.v_past, + )?; + past.k_past.seq_len = total_len; + past.v_past.seq_len = total_len; + + ops::single_prefill_nhd_noncausal_into( + &self.ctx, + &bufs.q, + &past.k_past, + &past.v_past, + &mut bufs.attn_out, + config.num_attention_heads, + config.num_key_value_heads, + config.head_dim, + )?; + ops::gemm_into_checked( + &self.ctx, + &layer.attention.o_proj, + &bufs.attn_out, + &mut bufs.o_buf, + )?; + openinfer_kernels::ops::fused_add_rms_norm_round_batch_into( + &self.ctx, + hidden, + &bufs.o_buf, + &layer.post_attention_layernorm, + config.rms_norm_eps, + &mut bufs.normed, + )?; + + ops::gemm_into_checked( + &self.ctx, + &layer.mlp.gate_up_proj, + &bufs.normed, + &mut bufs.gate_up, + )?; + ops::silu_mul_fused_batch_into(&self.ctx, &bufs.gate_up, &mut bufs.act_out); + ops::gemm_into_checked( + &self.ctx, + &layer.mlp.down_proj, + &bufs.act_out, + &mut bufs.o_buf, + )?; + ops::add_batch_into(&self.ctx, hidden, &bufs.o_buf, &mut bufs.hidden_out)?; + std::mem::swap(hidden, &mut bufs.hidden_out); + Ok(()) + } +} + +impl DFlashDraftCache { + pub fn seq_len(&self) -> usize { 
+ self.state.seq_len + } + + pub fn reset(&mut self) { + self.state.seq_len = 0; + self.step.len = 0; + self.step.valid = false; + set_past_seq_len(&mut self.state.layers, 0); + } + + pub fn crop(&mut self, seq_len: usize) -> Result<()> { + anyhow::ensure!( + seq_len <= self.state.seq_len, + "cannot crop DFlash draft cache from {} to larger length {}", + self.state.seq_len, + seq_len + ); + self.state.seq_len = seq_len; + self.step.valid = false; + self.step.len = 0; + set_past_seq_len(&mut self.state.layers, seq_len); + Ok(()) + } +} + +impl DFlashDraftState { + fn new(model: &DFlashDraftModel, max_seq_len: usize) -> Result { + let config = &model.config; + let kv_dim = config.kv_dim(); + let mut layers = Vec::with_capacity(config.num_hidden_layers); + for _ in 0..config.num_hidden_layers { + layers.push(DFlashLayerPastKv { + k_past: HiddenStates::zeros(&model.ctx, kv_dim, max_seq_len)?, + v_past: HiddenStates::zeros(&model.ctx, kv_dim, max_seq_len)?, + }); + } + Ok(Self { + max_seq_len, + seq_len: 0, + layers, + }) + } +} + +impl DFlashStepContext { + fn new(model: &DFlashDraftModel, max_len: usize) -> Result { + let config = &model.config; + let kv_dim = config.kv_dim(); + let mut layers = Vec::with_capacity(config.num_hidden_layers); + for _ in 0..config.num_hidden_layers { + layers.push(DFlashLayerStepContext { + k_ctx: HiddenStates::zeros(&model.ctx, kv_dim, max_len)?, + v_ctx: HiddenStates::zeros(&model.ctx, kv_dim, max_len)?, + }); + } + Ok(Self { + max_len, + len: 0, + valid: false, + layers, + }) + } +} + +impl ForwardBuffers { + pub(crate) fn new(model: &DFlashDraftModel, q_len: usize, ctx_len: usize) -> Result { + let config = &model.config; + let ctx = &model.ctx; + let hidden = config.hidden_size; + let q_dim = config.q_dim(); + let kv_dim = config.kv_dim(); + Ok(Self { + hidden_out: HiddenStates::zeros(ctx, hidden, q_len)?, + target_projected: HiddenStates::zeros(ctx, hidden, ctx_len)?, + target_normed: HiddenStates::zeros(ctx, hidden, ctx_len)?, + 
normed: HiddenStates::zeros(ctx, hidden, q_len)?, + q: HiddenStates::zeros(ctx, q_dim, q_len)?, + q_ctx_scratch: HiddenStates::zeros(ctx, q_dim, ctx_len)?, + k_ctx: HiddenStates::zeros(ctx, kv_dim, ctx_len)?, + k_noise: HiddenStates::zeros(ctx, kv_dim, q_len)?, + v_ctx: HiddenStates::zeros(ctx, kv_dim, ctx_len)?, + v_noise: HiddenStates::zeros(ctx, kv_dim, q_len)?, + k_all: HiddenStates::zeros(ctx, kv_dim, ctx_len + q_len)?, + v_all: HiddenStates::zeros(ctx, kv_dim, ctx_len + q_len)?, + attn_out: HiddenStates::zeros(ctx, q_dim, q_len)?, + o_buf: HiddenStates::zeros(ctx, hidden, q_len)?, + gate_up: HiddenStates::zeros(ctx, 2 * config.intermediate_size, q_len)?, + act_out: HiddenStates::zeros(ctx, config.intermediate_size, q_len)?, + positions_q: ctx.stream.alloc_zeros(q_len)?, + positions_ctx: ctx.stream.alloc_zeros(ctx_len)?, + }) + } +} + +pub(crate) fn clone_hidden( + ctx: &openinfer_core::tensor::DeviceContext, + input: &HiddenStates, +) -> Result { + let mut out = HiddenStates::zeros(ctx, input.hidden_dim, input.seq_len)?; + let src = input.data.slice(..input.hidden_dim * input.seq_len); + let mut dst = out.data.slice_mut(..input.hidden_dim * input.seq_len); + ctx.stream.memcpy_dtod(&src, &mut dst)?; + Ok(out) +} + +pub(crate) fn concat_kv( + ctx: &openinfer_core::tensor::DeviceContext, + ctx_part: &HiddenStates, + noise_part: &HiddenStates, + ctx_len: usize, + q_len: usize, + out: &mut HiddenStates, +) -> Result<()> { + debug_assert_eq!(ctx_part.seq_len, ctx_len); + debug_assert_eq!(noise_part.seq_len, q_len); + debug_assert_eq!(ctx_part.hidden_dim, noise_part.hidden_dim); + debug_assert_eq!(out.hidden_dim, ctx_part.hidden_dim); + debug_assert_eq!(out.seq_len, ctx_len + q_len); + let ctx_src = ctx_part.data.slice(..ctx_part.hidden_dim * ctx_len); + let mut ctx_dst = out.data.slice_mut(..ctx_part.hidden_dim * ctx_len); + ctx.stream.memcpy_dtod(&ctx_src, &mut ctx_dst)?; + let noise_src = noise_part.data.slice(..noise_part.hidden_dim * q_len); + let offset = 
ctx_part.hidden_dim * ctx_len; + let mut noise_dst = out + .data + .slice_mut(offset..offset + noise_part.hidden_dim * q_len); + ctx.stream.memcpy_dtod(&noise_src, &mut noise_dst)?; + Ok(()) +} + +pub(crate) fn append_kv( + ctx: &openinfer_core::tensor::DeviceContext, + ctx_part: &HiddenStates, + noise_part: &HiddenStates, + past_len: usize, + ctx_len: usize, + q_len: usize, + out: &mut HiddenStates, +) -> Result<()> { + debug_assert_eq!(ctx_part.seq_len, ctx_len); + debug_assert_eq!(noise_part.seq_len, q_len); + debug_assert_eq!(ctx_part.hidden_dim, noise_part.hidden_dim); + debug_assert_eq!(out.hidden_dim, ctx_part.hidden_dim); + debug_assert!(past_len + ctx_len + q_len <= out.data.len()); + let ctx_src = ctx_part.data.slice(..ctx_part.hidden_dim * ctx_len); + let ctx_offset = ctx_part.hidden_dim * past_len; + let mut ctx_dst = out + .data + .slice_mut(ctx_offset..ctx_offset + ctx_part.hidden_dim * ctx_len); + ctx.stream.memcpy_dtod(&ctx_src, &mut ctx_dst)?; + let noise_src = noise_part.data.slice(..noise_part.hidden_dim * q_len); + let noise_offset = ctx_part.hidden_dim * (past_len + ctx_len); + let mut noise_dst = out + .data + .slice_mut(noise_offset..noise_offset + noise_part.hidden_dim * q_len); + ctx.stream.memcpy_dtod(&noise_src, &mut noise_dst)?; + Ok(()) +} + +pub(crate) fn set_step_context_len( + bufs: &mut ForwardBuffers, + layers: &mut [DFlashLayerStepContext], + ctx_len: usize, +) { + bufs.target_projected.seq_len = ctx_len; + bufs.target_normed.seq_len = ctx_len; + bufs.q_ctx_scratch.seq_len = ctx_len; + bufs.k_ctx.seq_len = ctx_len; + bufs.v_ctx.seq_len = ctx_len; + for layer in layers { + layer.k_ctx.seq_len = ctx_len; + layer.v_ctx.seq_len = ctx_len; + } +} + +pub(crate) fn set_past_seq_len(layers: &mut [DFlashLayerPastKv], seq_len: usize) { + for layer in layers { + layer.k_past.seq_len = seq_len; + layer.v_past.seq_len = seq_len; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use half::bf16; + use std::path::Path; + + const LOCAL_DFLASH: 
&str = "/home/hezhaozhao/models/Qwen3-4B-DFlash-b16"; + + #[test] + fn draft_forward_smoke_local_model() { + let path = Path::new(LOCAL_DFLASH); + if !path.exists() { + eprintln!("skipping: {LOCAL_DFLASH} does not exist"); + return; + } + + let model = DFlashDraftModel::load(path, 0).expect("load model"); + let config = model.config(); + let ctx_len = 1; + let q_len = 1; + let noise_host = vec![bf16::ZERO; config.hidden_size * q_len]; + let target_host = + vec![bf16::ZERO; config.hidden_size * config.target_layer_count() * ctx_len]; + let noise_embedding = HiddenStates { + data: model.ctx.stream.clone_htod(&noise_host).expect("noise h2d"), + hidden_dim: config.hidden_size, + seq_len: q_len, + }; + let target_hidden = HiddenStates { + data: model + .ctx + .stream + .clone_htod(&target_host) + .expect("target h2d"), + hidden_dim: config.hidden_size * config.target_layer_count(), + seq_len: ctx_len, + }; + + let out = model + .forward( + &noise_embedding, + DFlashTargetHidden { + concatenated: &target_hidden, + }, + &[0, 1], + ) + .expect("forward"); + model.ctx.sync().expect("sync"); + assert_eq!(out.hidden_dim, config.hidden_size); + assert_eq!(out.seq_len, q_len); + } +} diff --git a/openinfer-qwen3-4b-dflash/src/lib.rs b/openinfer-qwen3-4b-dflash/src/lib.rs new file mode 100644 index 00000000..4b64dcfc --- /dev/null +++ b/openinfer-qwen3-4b-dflash/src/lib.rs @@ -0,0 +1,19 @@ +mod batch_buffers; +mod batch_forward; +mod config; +mod executor; +mod forward; +mod scheduler; +mod weights; + +pub use batch_buffers::DFlashBatchBuffers; +pub use batch_forward::DFlashBatchInput; +pub use config::{DFlashConfig, DFlashInnerConfig}; +pub use executor::{ + DFlashBatchKey, DFlashCacheMode, DFlashDraftBatchResponse, DFlashDraftHostRequest, + DFlashDraftHostResponse, DFlashDraftRequest, DFlashDraftResponse, DFlashExecutor, + DFlashExecutorOptions, DFlashRequestId, +}; +pub use forward::{DFlashDraftCache, DFlashTargetHidden}; +pub use scheduler::{DFlashSchedulerHandle, 
DFlashSchedulerOptions}; +pub use weights::DFlashDraftModel; diff --git a/openinfer-qwen3-4b-dflash/src/scheduler.rs b/openinfer-qwen3-4b-dflash/src/scheduler.rs new file mode 100644 index 00000000..a803ad15 --- /dev/null +++ b/openinfer-qwen3-4b-dflash/src/scheduler.rs @@ -0,0 +1,422 @@ +use std::collections::VecDeque; +use std::path::{Path, PathBuf}; +use std::thread; +use std::time::{Duration, Instant}; + +use anyhow::Result; +use crossbeam_channel as channel; + +use crate::executor::{ + DFlashBatchKey, DFlashDraftHostRequest, DFlashDraftHostResponse, DFlashExecutor, + DFlashExecutorOptions, DFlashRequestId, +}; + +pub struct DFlashSchedulerOptions { + pub executor: DFlashExecutorOptions, + pub max_wait: Duration, + pub max_total_tokens: usize, +} + +impl Default for DFlashSchedulerOptions { + fn default() -> Self { + Self { + executor: DFlashExecutorOptions::default(), + max_wait: Duration::from_micros(200), + max_total_tokens: 512, + } + } +} + +#[derive(Clone)] +pub struct DFlashSchedulerHandle { + submit_tx: channel::Sender, +} + +enum SchedulerMessage { + Submit { + request: DFlashDraftHostRequest, + response_tx: channel::Sender>, + }, + ResetCache { + request_id: DFlashRequestId, + response_tx: channel::Sender>, + }, + CropCache { + request_id: DFlashRequestId, + seq_len: usize, + response_tx: channel::Sender>, + }, + CacheSeqLen { + request_id: DFlashRequestId, + response_tx: channel::Sender>, + }, +} + +struct PendingRequest { + request: DFlashDraftHostRequest, + response_tx: channel::Sender>, + queued_at: Instant, +} + +enum PendingItem { + Submit(PendingRequest), + Control(SchedulerControl), +} + +enum SchedulerControl { + ResetCache { + request_id: DFlashRequestId, + response_tx: channel::Sender>, + }, + CropCache { + request_id: DFlashRequestId, + seq_len: usize, + response_tx: channel::Sender>, + }, + CacheSeqLen { + request_id: DFlashRequestId, + response_tx: channel::Sender>, + }, +} + +impl DFlashSchedulerHandle { + pub fn start( + model_path: 
&Path, + device_ordinal: usize, + options: DFlashSchedulerOptions, + ) -> Result { + let (submit_tx, submit_rx) = channel::unbounded(); + let (init_tx, init_rx) = channel::bounded(1); + let model_path = PathBuf::from(model_path); + let max_wait = options.max_wait; + let max_total_tokens = options.max_total_tokens; + thread::Builder::new() + .name("qwen3-dflash-scheduler".into()) + .spawn(move || { + let mut executor = + match DFlashExecutor::load(&model_path, device_ordinal, options.executor) { + Ok(executor) => executor, + Err(err) => { + let _ = init_tx.send(Err(err)); + return; + } + }; + let _ = init_tx.send(Ok(())); + scheduler_loop(&mut executor, submit_rx, max_wait, max_total_tokens); + }) + .expect("failed to spawn DFlash scheduler thread"); + init_rx + .recv() + .map_err(|_| anyhow::anyhow!("DFlash scheduler initialization channel closed"))??; + Ok(Self { submit_tx }) + } + + pub fn submit(&self, request: DFlashDraftHostRequest) -> Result { + let (response_tx, response_rx) = channel::bounded(1); + self.submit_tx + .send(SchedulerMessage::Submit { + request, + response_tx, + }) + .map_err(|_| anyhow::anyhow!("DFlash scheduler is closed"))?; + response_rx + .recv() + .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))? + } + + pub fn submit_with_enqueued_ack( + &self, + request: DFlashDraftHostRequest, + ack_tx: channel::Sender<()>, + ) -> Result { + let (response_tx, response_rx) = channel::bounded(1); + self.submit_tx + .send(SchedulerMessage::Submit { + request, + response_tx, + }) + .map_err(|_| anyhow::anyhow!("DFlash scheduler is closed"))?; + let _ = ack_tx.send(()); + response_rx + .recv() + .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))? 
+ } + + pub fn reset_cache(&self, request_id: DFlashRequestId) -> Result<()> { + let (response_tx, response_rx) = channel::bounded(1); + self.submit_tx + .send(SchedulerMessage::ResetCache { + request_id, + response_tx, + }) + .map_err(|_| anyhow::anyhow!("DFlash scheduler is closed"))?; + response_rx + .recv() + .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))? + } + + pub fn crop_cache(&self, request_id: DFlashRequestId, seq_len: usize) -> Result<()> { + let (response_tx, response_rx) = channel::bounded(1); + self.submit_tx + .send(SchedulerMessage::CropCache { + request_id, + seq_len, + response_tx, + }) + .map_err(|_| anyhow::anyhow!("DFlash scheduler is closed"))?; + response_rx + .recv() + .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))? + } + + pub fn cache_seq_len(&self, request_id: DFlashRequestId) -> Result { + let (response_tx, response_rx) = channel::bounded(1); + self.submit_tx + .send(SchedulerMessage::CacheSeqLen { + request_id, + response_tx, + }) + .map_err(|_| anyhow::anyhow!("DFlash scheduler is closed"))?; + response_rx + .recv() + .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))? 
+ } +} + +fn scheduler_loop( + executor: &mut DFlashExecutor, + submit_rx: channel::Receiver, + max_wait: Duration, + max_total_tokens: usize, +) { + let mut pending: VecDeque = VecDeque::new(); + loop { + if pending.is_empty() { + match submit_rx.recv() { + Ok(msg) => handle_message_or_enqueue(msg, &mut pending), + Err(_) => break, + } + } + while let Ok(msg) = submit_rx.try_recv() { + handle_message_or_enqueue(msg, &mut pending); + } + if pending.is_empty() { + continue; + } + let head_wait = pending + .front() + .and_then(PendingItem::queued_elapsed) + .unwrap_or(max_wait); + if pending.len() == 1 && head_wait < max_wait { + let timeout = max_wait - head_wait; + if let Ok(msg) = submit_rx.recv_timeout(timeout) { + handle_message_or_enqueue(msg, &mut pending); + while let Ok(msg) = submit_rx.try_recv() { + handle_message_or_enqueue(msg, &mut pending); + } + } + } + drain_one_batch(executor, &mut pending, max_total_tokens); + } + for pending in pending { + pending.send_stopped(); + } +} + +fn handle_message_or_enqueue(msg: SchedulerMessage, pending: &mut VecDeque) { + match msg { + SchedulerMessage::Submit { + request, + response_tx, + } => pending.push_back(PendingItem::Submit(PendingRequest { + request, + response_tx, + queued_at: Instant::now(), + })), + SchedulerMessage::ResetCache { + request_id, + response_tx, + } => pending.push_back(PendingItem::Control(SchedulerControl::ResetCache { + request_id, + response_tx, + })), + SchedulerMessage::CropCache { + request_id, + seq_len, + response_tx, + } => pending.push_back(PendingItem::Control(SchedulerControl::CropCache { + request_id, + seq_len, + response_tx, + })), + SchedulerMessage::CacheSeqLen { + request_id, + response_tx, + } => pending.push_back(PendingItem::Control(SchedulerControl::CacheSeqLen { + request_id, + response_tx, + })), + } +} + +fn drain_one_batch( + executor: &mut DFlashExecutor, + pending: &mut VecDeque, + max_total_tokens: usize, +) { + let Some(first) = pending.pop_front() else { + 
return; + }; + let PendingItem::Submit(first) = first else { + if let PendingItem::Control(control) = first { + control.execute(executor); + } + return; + }; + let key = match executor.host_batch_key(&first.request) { + Ok(key) => key, + Err(err) => { + let _ = first.response_tx.send(Err(err)); + return; + } + }; + let max_batch_size = executor_max_batch_size(executor); + let mut batch = vec![first]; + let mut total_tokens = key.q_len + key.ctx_len + key.past_len; + if total_tokens > max_total_tokens { + let err = anyhow::anyhow!( + "DFlash scheduler request total tokens {} exceeds max_total_tokens {}", + total_tokens, + max_total_tokens + ); + let first = batch.pop().expect("first request exists"); + let _ = first.response_tx.send(Err(err)); + return; + } + let mut i = 0; + while i < pending.len() && batch.len() < max_batch_size { + if !matches!(pending.get(i), Some(PendingItem::Submit(_))) { + break; + } + let matches = pending + .get(i) + .map(|candidate| { + let PendingItem::Submit(candidate) = candidate else { + return false; + }; + request_matches_key( + executor, + &candidate.request, + key, + total_tokens, + max_total_tokens, + ) + }) + .unwrap_or(false); + if matches { + total_tokens += key.q_len + key.ctx_len + key.past_len; + match pending.remove(i).expect("pending index exists") { + PendingItem::Submit(request) => batch.push(request), + PendingItem::Control(_) => unreachable!("control items are batch barriers"), + } + } else { + i += 1; + } + } + let response_txs = batch + .iter() + .map(|req| req.response_tx.clone()) + .collect::>(); + let requests = batch.into_iter().map(|pending| pending.request).collect(); + match executor.execute_host_batch_host(requests) { + Ok(responses) => { + for (response_tx, response) in response_txs.into_iter().zip(responses.into_iter()) { + let _ = response_tx.send(Ok(response)); + } + } + Err(err) => { + let message = err.to_string(); + for response_tx in response_txs { + let _ = 
response_tx.send(Err(anyhow::anyhow!(message.clone()))); + } + } + } +} + +fn request_matches_key( + executor: &DFlashExecutor, + request: &DFlashDraftHostRequest, + key: DFlashBatchKey, + current_total_tokens: usize, + max_total_tokens: usize, +) -> bool { + executor + .host_batch_key(request) + .map(|candidate| { + let candidate_tokens = candidate.q_len + candidate.ctx_len + candidate.past_len; + candidate == key && current_total_tokens + candidate_tokens <= max_total_tokens + }) + .unwrap_or(false) +} + +fn executor_max_batch_size(executor: &DFlashExecutor) -> usize { + executor.max_batch_size() +} + +impl PendingItem { + fn queued_elapsed(&self) -> Option { + match self { + PendingItem::Submit(request) => Some(request.queued_at.elapsed()), + PendingItem::Control(_) => None, + } + } + + fn send_stopped(self) { + match self { + PendingItem::Submit(request) => { + let _ = request + .response_tx + .send(Err(anyhow::anyhow!("DFlash scheduler stopped"))); + } + PendingItem::Control(control) => control.send_stopped(), + } + } +} + +impl SchedulerControl { + fn execute(self, executor: &mut DFlashExecutor) { + match self { + SchedulerControl::ResetCache { + request_id, + response_tx, + } => { + let _ = response_tx.send(executor.reset_cache(request_id)); + } + SchedulerControl::CropCache { + request_id, + seq_len, + response_tx, + } => { + let _ = response_tx.send(executor.crop_cache(request_id, seq_len)); + } + SchedulerControl::CacheSeqLen { + request_id, + response_tx, + } => { + let _ = response_tx.send(executor.cache_seq_len(request_id)); + } + } + } + + fn send_stopped(self) { + match self { + SchedulerControl::ResetCache { response_tx, .. } + | SchedulerControl::CropCache { response_tx, .. } => { + let _ = response_tx.send(Err(anyhow::anyhow!("DFlash scheduler stopped"))); + } + SchedulerControl::CacheSeqLen { response_tx, .. 
} => { + let _ = response_tx.send(Err(anyhow::anyhow!("DFlash scheduler stopped"))); + } + } + } +} diff --git a/openinfer-qwen3-4b-dflash/src/weights.rs b/openinfer-qwen3-4b-dflash/src/weights.rs new file mode 100644 index 00000000..c0f4f7aa --- /dev/null +++ b/openinfer-qwen3-4b-dflash/src/weights.rs @@ -0,0 +1,274 @@ +use anyhow::{Context, Result, bail}; +use log::info; +use openinfer_core::tensor::{DeviceContext, DeviceMatrix, DeviceVec}; +use openinfer_core::weight_loader::{ + deserialize_shards, load_shard_info, load_tensor_1d, load_tensor_2d, mmap_shards, + precompute_rope, +}; +use std::collections::HashMap; +use std::path::Path; + +use crate::config::DFlashConfig; + +pub(crate) struct DFlashAttention { + pub(crate) q_proj: DeviceMatrix, + pub(crate) k_proj: DeviceMatrix, + pub(crate) v_proj: DeviceMatrix, + pub(crate) o_proj: DeviceMatrix, + pub(crate) q_norm: DeviceVec, + pub(crate) k_norm: DeviceVec, +} + +pub(crate) struct DFlashMlp { + pub(crate) gate_up_proj: DeviceMatrix, + pub(crate) down_proj: DeviceMatrix, +} + +pub(crate) struct DFlashLayer { + pub(crate) input_layernorm: DeviceVec, + pub(crate) attention: DFlashAttention, + pub(crate) post_attention_layernorm: DeviceVec, + pub(crate) mlp: DFlashMlp, +} + +pub struct DFlashDraftModel { + pub(crate) ctx: DeviceContext, + pub(crate) config: DFlashConfig, + pub(crate) layers: Vec, + pub(crate) fc: DeviceMatrix, + pub(crate) hidden_norm: DeviceVec, + pub(crate) norm: DeviceVec, + pub(crate) cos_cache: DeviceVec, + pub(crate) sin_cache: DeviceVec, +} + +// SAFETY: The model owns one CUDA context/stream and is intended to run on one +// worker thread at a time, matching other OpenInfer model structs. 
+unsafe impl Send for DFlashDraftModel {} +unsafe impl Sync for DFlashDraftModel {} + +impl DFlashDraftModel { + pub fn load(model_path: &Path, device_ordinal: usize) -> Result { + info!( + "Loading Qwen3-4B DFlash draft model from {}", + model_path.display() + ); + let ctx = DeviceContext::new_with_device(device_ordinal)?; + let config = DFlashConfig::from_model_dir(model_path)?; + let model_path_str = model_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("DFlash model path must be valid UTF-8"))?; + let (shard_paths, weight_map) = load_shard_info(model_path_str)?; + let mmaps = mmap_shards(&shard_paths)?; + let shards = deserialize_shards(&mmaps)?; + + let fc = load_tensor_2d(&ctx, &shards, &weight_map, "fc.weight") + .context("load DFlash fc.weight")?; + ensure_matrix_shape( + "fc.weight", + &fc, + config.hidden_size, + config.hidden_size * config.target_layer_count(), + )?; + let hidden_norm = load_tensor_1d(&ctx, &shards, &weight_map, "hidden_norm.weight")?; + let norm = load_tensor_1d(&ctx, &shards, &weight_map, "norm.weight")?; + ensure_vec_len("hidden_norm.weight", &hidden_norm, config.hidden_size)?; + ensure_vec_len("norm.weight", &norm, config.hidden_size)?; + + let mut layers = Vec::with_capacity(config.num_hidden_layers); + for layer_idx in 0..config.num_hidden_layers { + layers.push(load_layer(&ctx, &shards, &weight_map, &config, layer_idx)?); + } + let (cos_cache, sin_cache) = precompute_rope( + &ctx, + config.head_dim, + config.max_position_embeddings, + config.rope_theta, + )?; + + Ok(Self { + ctx, + config, + layers, + fc, + hidden_norm, + norm, + cos_cache, + sin_cache, + }) + } + + pub fn config(&self) -> &DFlashConfig { + &self.config + } + + pub fn target_layer_ids(&self) -> &[usize] { + &self.config.dflash_config.target_layer_ids + } + + pub fn mask_token_id(&self) -> u32 { + self.config.dflash_config.mask_token_id + } + + pub fn device_context(&self) -> &DeviceContext { + &self.ctx + } +} + +fn load_layer( + ctx: &DeviceContext, + shards: 
&[safetensors::SafeTensors<'_>], + weight_map: &HashMap, + config: &DFlashConfig, + layer_idx: usize, +) -> Result { + let prefix = format!("layers.{layer_idx}"); + let q_proj = load_tensor_2d( + ctx, + shards, + weight_map, + &format!("{prefix}.self_attn.q_proj.weight"), + )?; + let k_proj = load_tensor_2d( + ctx, + shards, + weight_map, + &format!("{prefix}.self_attn.k_proj.weight"), + )?; + let v_proj = load_tensor_2d( + ctx, + shards, + weight_map, + &format!("{prefix}.self_attn.v_proj.weight"), + )?; + let o_proj = load_tensor_2d( + ctx, + shards, + weight_map, + &format!("{prefix}.self_attn.o_proj.weight"), + )?; + ensure_matrix_shape("q_proj", &q_proj, config.q_dim(), config.hidden_size)?; + ensure_matrix_shape("k_proj", &k_proj, config.kv_dim(), config.hidden_size)?; + ensure_matrix_shape("v_proj", &v_proj, config.kv_dim(), config.hidden_size)?; + ensure_matrix_shape("o_proj", &o_proj, config.hidden_size, config.q_dim())?; + + let gate_proj = load_tensor_2d( + ctx, + shards, + weight_map, + &format!("{prefix}.mlp.gate_proj.weight"), + )?; + let up_proj = load_tensor_2d( + ctx, + shards, + weight_map, + &format!("{prefix}.mlp.up_proj.weight"), + )?; + let gate_up_proj = DeviceMatrix::vstack(ctx, &[&gate_proj, &up_proj])?; + let down_proj = load_tensor_2d( + ctx, + shards, + weight_map, + &format!("{prefix}.mlp.down_proj.weight"), + )?; + ensure_matrix_shape( + "gate_up_proj", + &gate_up_proj, + 2 * config.intermediate_size, + config.hidden_size, + )?; + ensure_matrix_shape( + "down_proj", + &down_proj, + config.hidden_size, + config.intermediate_size, + )?; + + let input_layernorm = load_tensor_1d( + ctx, + shards, + weight_map, + &format!("{prefix}.input_layernorm.weight"), + )?; + let post_attention_layernorm = load_tensor_1d( + ctx, + shards, + weight_map, + &format!("{prefix}.post_attention_layernorm.weight"), + )?; + let q_norm = load_tensor_1d( + ctx, + shards, + weight_map, + &format!("{prefix}.self_attn.q_norm.weight"), + )?; + let k_norm = 
load_tensor_1d( + ctx, + shards, + weight_map, + &format!("{prefix}.self_attn.k_norm.weight"), + )?; + ensure_vec_len("input_layernorm", &input_layernorm, config.hidden_size)?; + ensure_vec_len( + "post_attention_layernorm", + &post_attention_layernorm, + config.hidden_size, + )?; + ensure_vec_len("q_norm", &q_norm, config.head_dim)?; + ensure_vec_len("k_norm", &k_norm, config.head_dim)?; + + Ok(DFlashLayer { + input_layernorm, + attention: DFlashAttention { + q_proj, + k_proj, + v_proj, + o_proj, + q_norm, + k_norm, + }, + post_attention_layernorm, + mlp: DFlashMlp { + gate_up_proj, + down_proj, + }, + }) +} + +fn ensure_matrix_shape(name: &str, matrix: &DeviceMatrix, rows: usize, cols: usize) -> Result<()> { + if matrix.rows != rows || matrix.cols != cols { + bail!( + "{name} shape mismatch: expected [{rows}, {cols}], got [{}, {}]", + matrix.rows, + matrix.cols + ); + } + Ok(()) +} + +fn ensure_vec_len(name: &str, vector: &DeviceVec, len: usize) -> Result<()> { + if vector.len != len { + bail!("{name} length mismatch: expected {len}, got {}", vector.len); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + const LOCAL_DFLASH: &str = "/home/hezhaozhao/models/Qwen3-4B-DFlash-b16"; + + #[test] + fn loads_local_dflash_weights() { + let path = Path::new(LOCAL_DFLASH); + if !path.exists() { + eprintln!("skipping: {LOCAL_DFLASH} does not exist"); + return; + } + let model = DFlashDraftModel::load(path, 0).expect("load model"); + assert_eq!(model.layers.len(), 5); + assert_eq!(model.fc.rows, 2560); + assert_eq!(model.fc.cols, 12800); + } +} diff --git a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs new file mode 100644 index 00000000..69ead63b --- /dev/null +++ b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs @@ -0,0 +1,701 @@ +//! HuggingFace remote-code golden gate for the standalone Qwen3-4B-DFlash draft. +//! +//! The fixture is generated by: +//! +//! ```ignore +//! 
.venv/bin/python tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py \ +//! --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ +//! --out test_data/qwen3-4b-dflash-hf-golden.safetensors +//! ``` + +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Barrier}; + +use half::bf16; +use openinfer_core::tensor::HiddenStates; +use openinfer_qwen3_4b_dflash::{ + DFlashBatchInput, DFlashCacheMode, DFlashDraftHostRequest, DFlashDraftModel, + DFlashDraftRequest, DFlashExecutor, DFlashExecutorOptions, DFlashRequestId, + DFlashSchedulerHandle, DFlashSchedulerOptions, DFlashTargetHidden, +}; +use safetensors::{Dtype, SafeTensors}; + +const LOCAL_DFLASH: &str = "/home/hezhaozhao/models/Qwen3-4B-DFlash-b16"; +const GOLDEN: &str = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../test_data/qwen3-4b-dflash-hf-golden.safetensors" +); + +const MEAN_TOL: f32 = 0.12; +const P99_TOL: f32 = 0.35; + +#[test] +fn dflash_forward_matches_hf_remote_code() { + let Some(model_path) = model_path_or_skip("dflash golden gate") else { + return; + }; + let golden_path = Path::new(GOLDEN); + if !golden_path.exists() { + eprintln!("skipping dflash golden gate: {GOLDEN} does not exist"); + return; + } + + let bytes = std::fs::read(golden_path).expect("read golden"); + let st = SafeTensors::deserialize(&bytes).expect("parse golden"); + let model = DFlashDraftModel::load(&model_path, 0).expect("load dflash"); + let config = model.config(); + let ctx = model.device_context(); + + let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]); + let target_hidden = bf16_tensor( + &st, + "target_hidden", + &[1, 2, config.hidden_size * config.target_layer_count()], + ); + let expected = bf16_tensor(&st, "output", &[1, 3, config.hidden_size]); + let positions = i32_tensor(&st, "position_ids", &[1, 5]); + + let noise_embedding = HiddenStates { + data: ctx.stream.clone_htod(&noise).expect("noise h2d"), + hidden_dim: config.hidden_size, + seq_len: 3, + }; + let target_hidden = HiddenStates { + 
data: ctx.stream.clone_htod(&target_hidden).expect("target h2d"), + hidden_dim: config.hidden_size * config.target_layer_count(), + seq_len: 2, + }; + let uncached = model + .forward( + &noise_embedding, + DFlashTargetHidden { + concatenated: &target_hidden, + }, + &positions, + ) + .expect("forward"); + ctx.sync().expect("sync"); + let uncached = ctx.stream.clone_dtoh(&uncached.data).expect("output d2h"); + ctx.sync().expect("sync"); + assert_deltas("dflash HF golden deltas", &uncached, &expected); + + let mut cache = model + .create_draft_cache(3, 2, 8) + .expect("create draft cache"); + let cached_one_shot = model + .forward_with_cache( + &noise_embedding, + DFlashTargetHidden { + concatenated: &target_hidden, + }, + &positions, + &mut cache, + ) + .expect("cached one-shot forward"); + ctx.sync().expect("sync"); + let cached_one_shot = ctx + .stream + .clone_dtoh(&cached_one_shot.data) + .expect("output d2h"); + ctx.sync().expect("sync"); + assert_deltas( + "dflash unified-cache one-shot HF golden deltas", + &cached_one_shot, + &expected, + ); + + cache.reset(); + model + .prepare_step_context( + DFlashTargetHidden { + concatenated: &target_hidden, + }, + &positions, + &mut cache, + ) + .expect("prepare step context"); + let cached = model + .forward_with_draft_cache(&noise_embedding, &positions, &mut cache) + .expect("cached forward"); + ctx.sync().expect("sync"); + let cached = ctx.stream.clone_dtoh(&cached.data).expect("output d2h"); + ctx.sync().expect("sync"); + assert_deltas("dflash draft-cache HF golden deltas", &cached, &expected); + assert_eq!(cache.seq_len(), 5); + cache.crop(2).expect("crop draft cache"); + assert_eq!(cache.seq_len(), 2); +} + +#[test] +fn dflash_batched_forward_matches_single_forward() { + let Some(model_path) = model_path_or_skip("dflash batch gate") else { + return; + }; + let golden_path = Path::new(GOLDEN); + if !golden_path.exists() { + eprintln!("skipping dflash batch gate: {GOLDEN} does not exist"); + return; + } + + let bytes 
= std::fs::read(golden_path).expect("read golden"); + let st = SafeTensors::deserialize(&bytes).expect("parse golden"); + let model = DFlashDraftModel::load(&model_path, 0).expect("load dflash"); + let config = model.config(); + let ctx = model.device_context(); + + let noise0 = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]); + let target0 = bf16_tensor( + &st, + "target_hidden", + &[1, 2, config.hidden_size * config.target_layer_count()], + ); + let positions0 = i32_tensor(&st, "position_ids", &[1, 5]); + let mut noise1 = noise0.clone(); + for (i, value) in noise1.iter_mut().enumerate() { + if i % 13 == 0 { + *value = bf16::from_f32(value.to_f32() + 0.015625); + } + } + let mut target1 = target0.clone(); + for (i, value) in target1.iter_mut().enumerate() { + if i % 31 == 0 { + *value = bf16::from_f32(value.to_f32() - 0.03125); + } + } + let mut positions1 = positions0.clone(); + for value in &mut positions1 { + *value += 2; + } + let noise_a = HiddenStates { + data: ctx.stream.clone_htod(&noise0).expect("noise h2d"), + hidden_dim: config.hidden_size, + seq_len: 3, + }; + let target_a = HiddenStates { + data: ctx.stream.clone_htod(&target0).expect("target h2d"), + hidden_dim: config.hidden_size * config.target_layer_count(), + seq_len: 2, + }; + let noise_b = HiddenStates { + data: ctx.stream.clone_htod(&noise1).expect("noise h2d"), + hidden_dim: config.hidden_size, + seq_len: 3, + }; + let target_b = HiddenStates { + data: ctx.stream.clone_htod(&target1).expect("target h2d"), + hidden_dim: config.hidden_size * config.target_layer_count(), + seq_len: 2, + }; + + let single = model + .forward( + &noise_a, + DFlashTargetHidden { + concatenated: &target_a, + }, + &positions0, + ) + .expect("single forward"); + ctx.sync().expect("sync"); + let single = ctx.stream.clone_dtoh(&single.data).expect("single d2h"); + let single_row1 = model + .forward( + &noise_b, + DFlashTargetHidden { + concatenated: &target_b, + }, + &positions1, + ) + .expect("single 
row1 forward"); + ctx.sync().expect("sync"); + let single_row1 = ctx + .stream + .clone_dtoh(&single_row1.data) + .expect("single row1 d2h"); + + let mut bufs = model.create_batch_buffers(2, 3, 2).expect("batch buffers"); + let batch = model + .forward_batch( + &[ + DFlashBatchInput { + noise_embedding: &noise_a, + target_hidden: DFlashTargetHidden { + concatenated: &target_a, + }, + position_ids: &positions0, + }, + DFlashBatchInput { + noise_embedding: &noise_b, + target_hidden: DFlashTargetHidden { + concatenated: &target_b, + }, + position_ids: &positions1, + }, + ], + &mut bufs, + ) + .expect("batch forward"); + ctx.sync().expect("sync"); + let batch = ctx.stream.clone_dtoh(&batch.data).expect("batch d2h"); + let row_len = config.hidden_size * 3; + assert_deltas("dflash batch row0 vs single", &batch[..row_len], &single); + assert_deltas( + "dflash batch row1 vs single", + &batch[row_len..2 * row_len], + &single_row1, + ); +} + +#[test] +fn dflash_executor_returns_request_tagged_batch_outputs() { + let Some(model_path) = model_path_or_skip("dflash executor gate") else { + return; + }; + let golden_path = Path::new(GOLDEN); + if !golden_path.exists() { + eprintln!("skipping dflash executor gate: {GOLDEN} does not exist"); + return; + } + + let bytes = std::fs::read(golden_path).expect("read golden"); + let st = SafeTensors::deserialize(&bytes).expect("parse golden"); + let mut executor = DFlashExecutor::load( + &model_path, + 0, + DFlashExecutorOptions { + max_batch_size: 2, + max_step_context_len: 2, + max_seq_len: 8, + }, + ) + .expect("load executor"); + let hidden_size = executor.model().config().hidden_size; + let target_layer_count = executor.model().config().target_layer_count(); + let ctx = executor.model().device_context(); + let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, hidden_size]); + let target = bf16_tensor( + &st, + "target_hidden", + &[1, 2, hidden_size * target_layer_count], + ); + let positions = i32_tensor(&st, "position_ids", &[1, 
5]); + let mk_req = |request_id| DFlashDraftRequest { + request_id: DFlashRequestId(request_id), + noise_embedding: HiddenStates { + data: ctx.stream.clone_htod(&noise).expect("noise h2d"), + hidden_dim: hidden_size, + seq_len: 3, + }, + target_hidden: HiddenStates { + data: ctx.stream.clone_htod(&target).expect("target h2d"), + hidden_dim: hidden_size * target_layer_count, + seq_len: 2, + }, + position_ids: positions.clone(), + cache_mode: DFlashCacheMode::NoCache, + }; + let responses = executor + .execute_batch(vec![mk_req(7), mk_req(8)]) + .expect("execute batch"); + assert_eq!(responses.len(), 2); + assert_eq!(responses[0].request_id, DFlashRequestId(7)); + assert_eq!(responses[1].request_id, DFlashRequestId(8)); + assert_eq!(responses[0].output.hidden_dim, hidden_size); + assert_eq!(responses[0].output.seq_len, 3); + assert_eq!(responses[0].batch_size, 2); +} + +#[test] +fn dflash_scheduler_accepts_host_requests() { + let Some(model_path) = model_path_or_skip("dflash scheduler gate") else { + return; + }; + let golden_path = Path::new(GOLDEN); + if !golden_path.exists() { + eprintln!("skipping dflash scheduler gate: {GOLDEN} does not exist"); + return; + } + + let bytes = std::fs::read(golden_path).expect("read golden"); + let st = SafeTensors::deserialize(&bytes).expect("parse golden"); + let config = + openinfer_qwen3_4b_dflash::DFlashConfig::from_model_dir(&model_path).expect("load config"); + let noise0 = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]); + let target0 = bf16_tensor( + &st, + "target_hidden", + &[1, 2, config.hidden_size * config.target_layer_count()], + ); + let positions0 = i32_tensor(&st, "position_ids", &[1, 5]); + let mut noise1 = noise0.clone(); + for (i, value) in noise1.iter_mut().enumerate() { + if i % 13 == 0 { + *value = bf16::from_f32(value.to_f32() + 0.015625); + } + } + let mut target1 = target0.clone(); + for (i, value) in target1.iter_mut().enumerate() { + if i % 31 == 0 { + *value = 
bf16::from_f32(value.to_f32() - 0.03125); + } + } + let mut positions1 = positions0.clone(); + for value in &mut positions1 { + *value += 2; + } + let scheduler = DFlashSchedulerHandle::start( + &model_path, + 0, + DFlashSchedulerOptions { + executor: DFlashExecutorOptions { + max_batch_size: 2, + max_step_context_len: 2, + max_seq_len: 8, + }, + max_wait: std::time::Duration::from_millis(50), + max_total_tokens: 16, + }, + ) + .expect("start scheduler"); + let barrier = Arc::new(Barrier::new(3)); + let scheduler0 = scheduler.clone(); + let barrier0 = Arc::clone(&barrier); + let t0 = std::thread::spawn(move || { + barrier0.wait(); + scheduler0.submit(DFlashDraftHostRequest { + request_id: DFlashRequestId(42), + noise_embedding: noise0, + target_hidden: target0, + position_ids: positions0, + q_len: 3, + ctx_len: 2, + cache_mode: DFlashCacheMode::NoCache, + }) + }); + let barrier1 = Arc::clone(&barrier); + let t1 = std::thread::spawn(move || { + barrier1.wait(); + scheduler.submit(DFlashDraftHostRequest { + request_id: DFlashRequestId(43), + noise_embedding: noise1, + target_hidden: target1, + position_ids: positions1, + q_len: 3, + ctx_len: 2, + cache_mode: DFlashCacheMode::NoCache, + }) + }); + barrier.wait(); + let response0 = t0 + .join() + .expect("join scheduler request 0") + .expect("submit 0"); + let response1 = t1 + .join() + .expect("join scheduler request 1") + .expect("submit 1"); + assert_eq!(response0.request_id, DFlashRequestId(42)); + assert_eq!(response1.request_id, DFlashRequestId(43)); + assert_eq!(response0.hidden_dim, config.hidden_size); + assert_eq!(response1.hidden_dim, config.hidden_size); + assert_eq!(response0.seq_len, 3); + assert_eq!(response1.seq_len, 3); + assert_eq!(response0.output.len(), config.hidden_size * 3); + assert_eq!(response1.output.len(), config.hidden_size * 3); + assert_eq!(response0.batch_size, 2); + assert_eq!(response1.batch_size, 2); + assert_eq!(response0.cache_seq_len, 0); + assert_eq!(response1.cache_seq_len, 0); 
+} + +#[test] +fn dflash_scheduler_manages_draft_cache() { + let Some(model_path) = model_path_or_skip("dflash scheduler cache gate") else { + return; + }; + let golden_path = Path::new(GOLDEN); + if !golden_path.exists() { + eprintln!("skipping dflash scheduler cache gate: {GOLDEN} does not exist"); + return; + } + + let bytes = std::fs::read(golden_path).expect("read golden"); + let st = SafeTensors::deserialize(&bytes).expect("parse golden"); + let config = + openinfer_qwen3_4b_dflash::DFlashConfig::from_model_dir(&model_path).expect("load config"); + let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]); + let target = bf16_tensor( + &st, + "target_hidden", + &[1, 2, config.hidden_size * config.target_layer_count()], + ); + let positions = i32_tensor(&st, "position_ids", &[1, 5]); + let scheduler = DFlashSchedulerHandle::start( + &model_path, + 0, + DFlashSchedulerOptions { + executor: DFlashExecutorOptions { + max_batch_size: 2, + max_step_context_len: 2, + max_seq_len: 8, + }, + max_wait: std::time::Duration::from_millis(10), + max_total_tokens: 16, + }, + ) + .expect("start scheduler"); + let request_id = DFlashRequestId(99); + let response = scheduler + .submit(DFlashDraftHostRequest { + request_id, + noise_embedding: noise, + target_hidden: target, + position_ids: positions, + q_len: 3, + ctx_len: 2, + cache_mode: DFlashCacheMode::DraftCache, + }) + .expect("submit cached request"); + assert_eq!(response.request_id, request_id); + assert_eq!(response.cache_seq_len, 5); + assert_eq!( + scheduler.cache_seq_len(request_id).expect("cache seq len"), + 5 + ); + scheduler.crop_cache(request_id, 2).expect("crop cache"); + assert_eq!( + scheduler.cache_seq_len(request_id).expect("cache seq len"), + 2 + ); + scheduler.reset_cache(request_id).expect("reset cache"); + assert_eq!( + scheduler.cache_seq_len(request_id).expect("cache seq len"), + 0 + ); +} + +#[test] +fn dflash_scheduler_control_messages_are_fifo() { + let Some(model_path) = 
model_path_or_skip("dflash scheduler fifo gate") else { + return; + }; + let golden_path = Path::new(GOLDEN); + if !golden_path.exists() { + eprintln!("skipping dflash scheduler fifo gate: {GOLDEN} does not exist"); + return; + } + + let bytes = std::fs::read(golden_path).expect("read golden"); + let st = SafeTensors::deserialize(&bytes).expect("parse golden"); + let config = + openinfer_qwen3_4b_dflash::DFlashConfig::from_model_dir(&model_path).expect("load config"); + let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]); + let target = bf16_tensor( + &st, + "target_hidden", + &[1, 2, config.hidden_size * config.target_layer_count()], + ); + let positions = i32_tensor(&st, "position_ids", &[1, 5]); + let scheduler = DFlashSchedulerHandle::start( + &model_path, + 0, + DFlashSchedulerOptions { + executor: DFlashExecutorOptions { + max_batch_size: 2, + max_step_context_len: 2, + max_seq_len: 8, + }, + max_wait: std::time::Duration::from_millis(100), + max_total_tokens: 16, + }, + ) + .expect("start scheduler"); + let request_id = DFlashRequestId(123); + let submitter = scheduler.clone(); + let (ack_tx, ack_rx) = crossbeam_channel::bounded(1); + let submit = std::thread::spawn(move || { + submitter.submit_with_enqueued_ack( + DFlashDraftHostRequest { + request_id, + noise_embedding: noise, + target_hidden: target, + position_ids: positions, + q_len: 3, + ctx_len: 2, + cache_mode: DFlashCacheMode::DraftCache, + }, + ack_tx, + ) + }); + ack_rx.recv().expect("submit should be enqueued"); + let seq_len = scheduler + .cache_seq_len(request_id) + .expect("cache seq len must follow pending submit"); + let response = submit + .join() + .expect("join cached submit") + .expect("cached submit"); + assert_eq!(response.cache_seq_len, 5); + assert_eq!(seq_len, 5); + scheduler.reset_cache(request_id).expect("reset cache"); + assert_eq!( + scheduler.cache_seq_len(request_id).expect("cache seq len"), + 0 + ); +} + +#[test] +fn 
dflash_cache_control_rejects_unknown_request_ids() { + let Some(model_path) = model_path_or_skip("dflash cache rejection gate") else { + return; + }; + let mut executor = DFlashExecutor::load( + &model_path, + 0, + DFlashExecutorOptions { + max_batch_size: 2, + max_step_context_len: 2, + max_seq_len: 8, + }, + ) + .expect("load executor"); + let unknown = DFlashRequestId(777); + let reset_err = executor.reset_cache(unknown).expect_err("reset must fail"); + assert!( + reset_err + .to_string() + .contains("unknown DFlash cache request_id"), + "unexpected reset error: {reset_err}" + ); + let crop_err = executor.crop_cache(unknown, 1).expect_err("crop must fail"); + assert!( + crop_err + .to_string() + .contains("unknown DFlash cache request_id"), + "unexpected crop error: {crop_err}" + ); + let seq_err = executor + .cache_seq_len(unknown) + .expect_err("cache seq len must fail"); + assert!( + seq_err + .to_string() + .contains("unknown DFlash cache request_id"), + "unexpected seq len error: {seq_err}" + ); + + let scheduler = DFlashSchedulerHandle::start(&model_path, 0, DFlashSchedulerOptions::default()) + .expect("start scheduler"); + let reset_err = scheduler + .reset_cache(unknown) + .expect_err("scheduler reset must fail"); + assert!( + reset_err + .to_string() + .contains("unknown DFlash cache request_id"), + "unexpected scheduler reset error: {reset_err}" + ); + let crop_err = scheduler + .crop_cache(unknown, 1) + .expect_err("scheduler crop must fail"); + assert!( + crop_err + .to_string() + .contains("unknown DFlash cache request_id"), + "unexpected scheduler crop error: {crop_err}" + ); + let seq_err = scheduler + .cache_seq_len(unknown) + .expect_err("scheduler cache seq len must fail"); + assert!( + seq_err + .to_string() + .contains("unknown DFlash cache request_id"), + "unexpected scheduler seq len error: {seq_err}" + ); +} + +fn assert_deltas(label: &str, actual: &[bf16], expected: &[bf16]) { + assert_eq!(actual.len(), expected.len()); + let mut deltas = 
actual + .iter() + .zip(expected.iter()) + .map(|(got, want)| (got.to_f32() - want.to_f32()).abs()) + .collect::>(); + deltas.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let mean = deltas.iter().sum::() / deltas.len() as f32; + let p99 = deltas[((deltas.len() as f32 * 0.99).floor() as usize).min(deltas.len() - 1)]; + let max = deltas[deltas.len() - 1]; + eprintln!( + "{label}: mean={mean:.6}, p99={p99:.6}, max={max:.6}, n={}", + deltas.len() + ); + assert!(mean <= MEAN_TOL, "mean delta {mean} > {MEAN_TOL}"); + assert!(p99 <= P99_TOL, "p99 delta {p99} > {P99_TOL}; max={max}"); +} + +fn tensor<'a>(st: &'a SafeTensors<'_>, name: &str, dtype: Dtype, shape: &[usize]) -> &'a [u8] { + let view = st + .tensor(name) + .unwrap_or_else(|err| panic!("golden missing {name}: {err}")); + assert_eq!(view.dtype(), dtype, "{name} dtype mismatch"); + assert_eq!(view.shape(), shape, "{name} shape mismatch"); + view.data() +} + +fn bf16_tensor(st: &SafeTensors<'_>, name: &str, shape: &[usize]) -> Vec { + tensor(st, name, Dtype::BF16, shape) + .chunks_exact(2) + .map(|chunk| bf16::from_bits(u16::from_le_bytes([chunk[0], chunk[1]]))) + .collect() +} + +fn i32_tensor(st: &SafeTensors<'_>, name: &str, shape: &[usize]) -> Vec { + tensor(st, name, Dtype::I32, shape) + .chunks_exact(4) + .map(|chunk| i32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])) + .collect() +} + +fn model_path_or_skip(label: &str) -> Option { + let path = std::env::var("OPENINFER_DFLASH_TEST_MODEL_PATH") + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from(LOCAL_DFLASH)); + let config_path = path.join("config.json"); + if !config_path.exists() { + eprintln!( + "skipping {label}: {}/config.json does not exist; set OPENINFER_DFLASH_TEST_MODEL_PATH to run it", + path.display() + ); + return None; + } + let config_text = std::fs::read_to_string(&config_path).unwrap_or_else(|err| { + panic!( + "failed to read DFlash config {}: {err}", + config_path.display() + ) + }); + let config: serde_json::Value = 
serde_json::from_str(&config_text).unwrap_or_else(|err| { + panic!( + "failed to parse DFlash config {}: {err}", + config_path.display() + ) + }); + let is_dflash = config + .get("architectures") + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .any(|item| item.as_str() == Some("DFlashDraftModel")) + }) + .unwrap_or(false); + if !is_dflash { + eprintln!( + "skipping {label}: {} is not a DFlashDraftModel checkpoint; set OPENINFER_DFLASH_TEST_MODEL_PATH", + path.display() + ); + return None; + } + Some(path) +} diff --git a/test_data/qwen3-4b-dflash-hf-golden.safetensors b/test_data/qwen3-4b-dflash-hf-golden.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6007c5c65cf0113fc36f4f3124855cde3ad7dadc GIT binary patch literal 82540 zcmbTeb#zr%^e&1dxLa@uP6LDxg5>Nu7h2riEydkw@eo3S6Cg+kZV3dpoIU5lifi#g zDcS<1P@J~F`_lV+<2W289j9Z=`&&T9uHHn)pWl0e$=8i~sYpLDfP6 zQ(j?Yuc4#98#Z!C*vL^m#^R3w!%|*6xKg$1l~S(XXY~008Bb8kwTJZ$8{A`fuhISS zGnM-f8xmHzf7rzSy@sWHRQ~U~qbj!=7dEtN#o#&>>opkMYgGS=z5g55=wT!K^#6ao zLQtiel>)!%Gp0&V%GgGQg{6$OYPFE+A%Q``)q?#e1^5piHfq3VjOTyd`S<>PM^6}@ za+{5+ri^7&|6VDdb*U0ity_TqfAcr&yYEJYjm9i>2@D7d?1oDW9X4Q8SdXwFy~FzU z9WbHU7KaurZ^Dj~V@cbeaFI5l}U- zdhmZ&`k$r!UyJg;ymUMdDU0v$}kzEAykTo>OvmKIm`p@u+hB@bWJj&eYvoz+^)>x-+5bG*7o>q<77rKPC5 z){`hM$MO6Jwbgx8kato5=g}$B#}mY}H81Dma}vfIsW@%aKefL;qWYX#x9P9?Oz%@O z_LJ_qL;duV{3=y7P}Muwv2sVhb8~PdI!Mo)K73oY&^f(K=lB~gDlMEz&KJ(fJ!Fd% z)U{kjeJCw2;taG#9>`L?uG^`m-qRS~pg#H!SEl^>MBd9h{h5-e6_@d*!r13hdh+9E zoW&M)f0o+xTpoI-@H73>>8tOxibhEzor^i}v-fp2ccH+IHBx|*#P)1o%NT$k*6(CsNd=n&mXY0WNf%R@L)XX_I7)kFG4Mr#v}r5qAY!|0&A(lBQ{2WfN7#qIe|4wPvUtMPoo&1KS?655c`YB%lWrlC?g z1v65bcT<9%XV#e#%H6o0ex*2m!EI=TKH*i8%yZ-{W#W7`Q9bn1TUrJ6vfYj4q8v#t zIUQ#tUsi4^C-e^L^))4V8f${Y=vZFpUCoCjKdLYcee?}JYV$l@L5rk<3D$MGPy1@L zoZ$otllgLye6*BSr-q!_dU!oXP|Y6ZzM2`oGZA-}ONQzV-lv;5vr~(`^gY+*i9AqC zvC(=w7=I_p1}=r_cDcURq8z+e3hPwuB0o!4n+1KF9~HPpqIEhK*7roT&Yh)wc_+G} zp01E7oJA*a5#A>mP}33WYnIa{N^iG`!(G&$QZYABD;sr;+DvQ0|r 
zV!lQDWGf%g&D792i0Wy=$K<(liH>k{^wc(>$9aAPEXmJ#xi3{9bTOrqw>pzca9Z<~ z zz-zHm?Kz*b9b=6Zk5hp&YaR^&1|4&DqqDkm3^47PCd({Yib_A>7UKoZDJ@IYQ7I*O zj&qknbha~04tvkjBK@0|>SJBteIzd>f{$r)Q<)EIMIc{RTUAf%M_w;WWg_j%e=CddJuud^kJhieu(L?>v!7Sb%7%8UgX&601ZG}n>Kd=~4lmETBSDhdo7 zp)WDVbNR7mJxxG8rsH+!{hz2Vx8o~Pjw7@L zH|H`OiLbBJ3;6iM?I?BhXTC2T^{`fQlIVa_#Fo$^k|2+0Iv=HS8o{6S1Xd}HZHKvU z&b{bY+V6&o!Mc|?(Q=)V<_JVx)(-syU6GkD^69 zi>h!Edg;Daz(+aT605uj3^L9+!2bNv>Aj%_Ex9S2;FEL!6e+LIH(gB)Y4|xxA zezy=^1Zth46OzxS#y;!f^FUC*av74MSm zG|!nPNj#sga|oE^8W?Uq<&;T$mQUbgt!A_T2WqfU$IQTJEo&TXd$F=KZqGc6G>hvJobNq;won_8!Vk!J!c zvY9zt(Nxoc(n~vQAbN3jnzaxq}TMeHr5i-LMl>LBUF>O0v`+OZs(w0aF!*fhFjEP8_*N;dsLj=LjiDQZ5>sSGZQ;kwcQcr+7O*pBaV1=AcWl)#OFSk3jA zdr7-HaWWD$wF-S-$2`KFd%y~bT8pPTmpNDua6j$BU-_zx(eV@@k2C{!(rEPKRK7&5 zfqQ?@ehQ+qyocJ*OmOf6caBs>r57@H=o#jrHyr?K$71ewJ5OaBkhvtSA?xYMD=Ad- zYYWY!?P#XEn_cfG-6eg|?aSl~_0v%FU@h&b{+Qttd_`}d%OZ5N-gal}uihcA$6j0r*W9gOz4f4z&S7f&QF{$9*b3Mt-CR^OSC){p-Jjv#&CYVfc4AA zwZU-p-7x;-D7~iCrm&{Bxp=BX&@^u9R?`_84&?kr*SldllZvbBdgVKP!>zQ2RH9KF z<0R;3@q(Kh@)+onkID1+70=Mp`dFrMAWha#Np!YQG@YY(nkVnLBTttI=_(mrh5pK< z8)c$Y(xPg$lX!r^4S|mlz=rSms+QDdd`o+Ah%*7#f8|`@VPLx)auJGT2S%TXMgco& z=uVxZ9*uLS^A4WDVQPSV^JIn0rfg=FT%#@eqqL({)DH8Q-gb8yd2iDt`dgN9cL@eZ ze~}#yVSe5_g=G&_)QwIZ)I=tm9;i}7U%1QkZ;9k3;K*S0v5{(^ZC%+Q)pW03p=i#- zC#1G|WDfr+%ba|=Pfl?^-h*)#qKvix#mal}N1u$K$70;3?8D!1UQ|(28U&{QOV>NS zBpttX4?`me=i<{?kNn2hzLYG` zBzN75QkGKdSNBJKsM~o9WrbE+4Gg~`?NKkOG={rLS0{>!dIzHW&p=ZIa(-QV_?$am zvr8w61S)lh+8l~GY#@htyOT|4x%0FJuC(6iuS1;A?ia~y7HUnXp&YETU)#7P_>%6G zjm}2?D95!px1`8gT-}LY^vFZ10PpgHw$pPoQ`^d9DK6`@F{QSh zytz2LIj?QO*DL7*t;cJ(2cl)>f?AA^!{g{w`bG z^s<8IW5sUCIUY`#>`K}%;0`@YK3K8ssQWp*mUp>J&~LMOzeeyEcaS6jQb>8c6%3%Hs(t^}N<`?|3sompPJJ=22xf zddn>@>(!TY@KWiDJKwJ@fXe%!V}F&I+7i{9$jfx06bJ4UL7!znUFD+XdP?_zfp2pU zy^3nyh>=>JtDBs^+{-do?>gyBR@q9KaqaC?&TNqj{7&ccK+aAdJ>_kC%$ksu)tra^ zkyY+CIidM=iR{y9BvfA0aTz-kYrllw>nS-!tN4ZUhF_vfrm{u32>!53}950~mI*?r~iz4HoXFeRzwu$t8p)~id73`goV1lY$V|BbrufQ5 z^u2eM-s38q16r@Lb3HW;HtpkJaDHjEks2up-@_^fzsBk 
z%j;emtE$v_Q8=MqaAP{F!{=k8*N%4W@3o6gu}U?tK;fP&LZQ z8F`=640RE#fhHGRLRwqK*(YteJaUryT3b8wVX)jo9i;g%yU*QW+*J!gMP!v4^g#-6 zZf`rRP-oyntX9=~QUEB?fvRzYHz#oKi{{~?K>h@+1$VZ^vxdKuA2^t2(i**>3;C#? z1S{`HZ4^b{mf;@!EByEY8mH+)mmsfT=_FIRfh^WxI)JljM~VT9R&;Yfn?C^l{HZ;} z!vT7Qx4Km@4{#H3R%vt$RP&sY+Sho9C0!#Dgkp^nbRWeX#ll1DTOBlNdj3q4Zd9j4Vu@% z=z+cEc{le53;*sG(0k4zE+wm_m!8uNbkqA2w~~jN&EAo&(0~D2kUKh^^&h7pROja8 zKF*Jvsn#y&vft@n`ICOu(miT64!g-hv9O`VpMLCeGcR?#8+Opj@8PGeq3 z6y4;l8cSg;l9DY2aXZY(Y$$-sJOw;^0QYqV-E@LBA+Kl&&S=IPWv4Eo{B|p~1%9ng zp6fQ#!u-bFCE49AVA;&JJXht%P@qfnn)KjQ(p@7cUaN8wor3PKqlC5a@&OK4rK(WI z)9I7$2D80(*I-6#$w#R0;uy_xYd7wt}U_FzSIl2np67HP+qO?fPu@o z3=m}pGMuG!(9Hn6n**M$sEshoYr)T%pnY=cK%OPTG+e7n554AIb>0GVTEdsaa%~+h zUHC0!WQSAn6WIH3;3>2g`lA+*j}B>%_*q)n&kPkIJ#^Svx|Uh8DIfm%o; zlc1He*aJX)k|OY8HIc5wqWZ4MAUQ)_I2U)6+o-zcZX8tTFi#=)wq5XawUh5SeUO>f zu=&w>O>8gS3BOJF{dzzsKf4dAAU%4nF=yvwnF56|UcQxaoKsTU>fBgXasp861+vXX zv>CYBiuZA#=Fl^Gf|qD}_c*@4#VKc|aC@B&e953$bOz19x;Nw>!I=?M33tE2na*)e zLA?%km`us2ppE44b^S>;$+wiBmr`M#r9U}aP}%?LUF1eTB28QbCYqzd2E8zaC!yB= zaaUlLN=a+Vi(WtNEYU^IJ6sC#qsTodj}?;Vt;;D$g42#Rd4LrwSD%LanCL zN)(2bd_j+-3zgD*T7hvATZqmTHbdsm}11mBw|{#sn#I(5PRgWN>un~}Vm zR&p4!w#)j!&82Sg-Su zme0YRQRsxC+6@|blg^<8cv8csc@{K)@&XFxPN;!6bka60Z2sXpe4g$(v694X^Z;8Z zm;AH_%KfN2Up^7=jW=l~$?hD}yY6N_i#wmF@mkMqAoG!t4@W0&N5@Rl%%&PorUr0) z{z#uX^Cci(B|fWv$VTKYU!g7jMM4Bc#W$QH6fWo4&u(YTj$6@QqR~i@43t$AkJE~h z9UY&{T_nHy>k+t;SNwx>6qUA%=eWb9G|l7pG8FlTk2ynq!1Nz=7FB|F3+1KgkAi$q zkq_GR`~p7aG#zp_N-mDjL+*7c4epqsmCQ_J#1?AcJui1Wx?Jzm9v-62Ij!-*Pu7+o z*Uuc)0{nuPy1%(wfWO~MP6_59e(L@vuk|7@b-b!)v*!ip&<1c($9NT$@Gf%JYF*4m zYoP2JIzv-635<9bN~?r40drV)2rY3sQ4`d0vS&1vktgzmcjq{@2G5nd+)d5e^Sk4x`7 z(QMooDMW-Grvm7cDBj^+3>HhRn`nwNS_bO`q#Tpv8ESAUX6G65rFP`yf;Jn!qX;B~ zGu;IgsJ@z+tD7@Cj-Pmn@-*Ir>?jd6zlX0uGqsdG1s2Vdr5^cwl*EOgf+$`7o%0xozYo!~1f;;XaZ*dkHch0xh) z_zb1uQ_?_ZK~ZngG-jS0(kr2V$t|a%Aa5?uzf9m0V2~kjTg5dN%;b?Sn7O5Ni%N&?;?K@U`HG*&0(}$7ZQ;BMabc`! 
zStI}(G(WeepEweF%?}(A1J%4p7htR#^o@Lv9$YDdcre|UlE{pv!MhH^3%Y+3mn8=K5v|-o+5lPgI+_Qiv59>!KS%YYJaDVPw+A7c8BHn9VlQ&4r}QKBB_C%M zS3%$V(L%W<)0`UG6&}75@U*8^HjCjHTrP`ruO}DK{CbOrVh()Zggc{VA85+HS4C9e z6PbTpi@La_HJ-9N!PpJGFPWU>vdHPLv$>L6 zQJ?V&?MvyvU>l&znCgH{Z+LUsVDMp#_R>mxU(<0>r#f{1BTBE`sStJ))7vgs^VO1? z7Xh)8v<{TRJT7c1AoF{JD{OP>>2awHkE+@RbGT3D@pDfNtYdoQvvYWYY#=|{Eeo_J zl=vx4Le`T=y*UFffD7%QJD^7rc!C>88A4y^2wJD>k+V;7!r|L}v@4Q;4D<))XAmz$ z22>c;n3G$%|7aR@^dB0d53xFfXsl%7QQ(%`P=Kqw!KjZxdLFBCP(~v+X(tP?BeaCK z(qanGV6JY9>j-G54;qHLE()*w03#ovk+R=SWdpP|R7ydxi?7LMQXJWMtmi4I3KWcS z2k~ebt|E=ym(Cn%&ehzV)E2ltS>M6QU2#5oX25|qlc|#3+;{uvUUvtH{4UARlHqhO zg;9V7w@^#doD$f%$t5*WL902zQ(R-<)9=xRU1b*f>s~qg_ znz2B$EOt4y)@@Mqb7U0f)$v@)oWW@3%NO0id-k!ET#(Jsa^?Rxh#@y%R)=bF4I`X|*XE=#-a&3uqe}MWJEoJcfLvjdPR%4ec2?t&URO4+D5JApJEMNRl#gq2AFjx~_zUvK z>E2D;NJnyZyOUh{;@$v@-g4)Fo1TG1&QMA^l$o|+Jh${SRxA-Skd6x4=TL3GV}#Av zgLEzf3zv~>xN6%JWdmh5);o{;%L=3z>A0*osMat(L0xna)!^<M)qv9qkzxjHuo=dIB@;H>dHOxL-MwF=gDGWF3OP`aZqnj(4#UECEu zu9iNeo4~)9a8c9X-+z?t$!Se<_=!QtWy`>8zT|QeLdEgjUvveM?Ve(|vX-$UJgtFd zjr3pM!PC9j>;eVD=t=&HJ^svgO$tAN3wm=PhdY;bAaqt&Tw@=wWiM^gX;R&E!9LM5 zsGzreR1#U>;p6!bW??F_hMLY_;6Y!08ybZ=-Jne+$VPxchR6r>eGa{do$@%(I<5{3 zd4VLmCpt8XGk{rs=4H}Ui*pfFbTPQa5U9HL&^2DTRpDa%f!Dh$H34hA4jSQ-oTN+K z2-y1@W~(^I(+|=Tc>kK#a2s5=46^w3oZ!YdDITW+H}J-CdQ)3B>RF&gMVbjWl9Qjx zUo?*A=uGrkZc|qlB$w3hG0&TTgk^LYvXK0`jR*1|{Xp4KQB}bJS?n%(2aL~Wugf!c zIj41E;5KidV#aF>W#g)v*Y-lrkV_I!TVME$lju6ot7*849@56>@mS<3z4Qn1H3PYm zH!Jj_>RQUek8qtea1?1xUwDW`vVnt5eN^Z*DTV4dtp|VvLGbriQR_81wZ5lYZXA?D z4?c=M_=h`bVJ9~iwODh^UJSLRZu(lYnA>iUZq*x9fe%r2nGV$x?xZ&F`2|(5{@e@- zWr19DLUjjn=F`BNzxA{0YXX2Y%aPoXA#Yf=|^O4S=85^CqdkW z_j3=orM9%u8b(F9v9rhBESF^slGeo%EkDA|j@ItFOI+H?lQD;Dq2zrv6b`=~ufv{t zIjH7FaCISCM_lY$Yy{)a;!IH3m*`Kug{-0$m)AHwN=-R~83p})0o~gTSQSK(P9Cly zA1Sjsx?fV`B*O`)mlg2U>C8N>gI(5~sO9Y%>PEvoPXaPspdC6+m+3h3aLW#}64d8Xc%%yzMihq~O0SB#Yq=`WGIL;T@t zuh2hQ4Rdybu6kx-@8q6_>sU_2KHwPSwI1w)57ou`k$k-sP!}7))N}Z>?2=&T3~lG) 
zK(dcA*^{4BJp31ZLj`pil|p`=!_K4^w1K)fB?2GJT+U{|Lr|Da2y8d!NNJa0R=#hDx|%-hg$ z+n~e`A!%&Nlki)!sJ!_i;lQuiz`*NLU#@6s`-3B#25WvAIr1o|mb<(gHT(_t)cWoX zpti3m>ZU>6exf&Yj(xZe@aUCv<#KdV{P}U{HXvmyCl!2ebLcxibjbqgDjD<`P1jYr zoi9o;o(MgE0GcV6{YrCmowkKu&TI!DJ^tNOfbY|{8b?8T*sW}{A{{_Frc7ml1NHO) z7^^==L*x4)QEG#9B}lgDG0CCxJ*&CCL;<5u&`3^S>zuRzC;mYb(EaK7(1C2uPvDp% zNGJ~RUcIEJldrhjG#dG427b)#cmq%}tu{qH%+%ZqV0=Y3 z>mGMMQ?i@v{OJVhJgA2E@+W6B2e>TK!q=Fs!}17R+FjSnDLL<%4ut)agSZf!Xj?ia zF-}2@ET`RqtCfO|$pDWRMGY~Vdyu?@gU2ek3DgWJbR!*u-U`QAv4>n-qP4oSK6JTu zM};nN|CR-`1S5I`^>NXC286D!^Wo1QHEc#oU5R zLctxxZ&#D-vd5VLrPYrDkgf!%zZ{f;)LwIQOZ47e%-1i}U$bNPHBr-OLDPWpa2RDV zF3@*6WmT_Ckn`k^{cNQf-f-Ec%QTi(@=PSk2|5*sSDsz%#lKOKKB39-2lg;$X)Y

^`^bch= zX`s;Kz~%S(85HRzpnPtqi_-sT$8)IX>RiK)hnvlZE2O6|n#Qxx$=!9e^NMdv9!*7u z^#V@zorhYQ!M^CN(ex8-g3jD5QTlhy#!%~pG4@D|{251@{hx$G$9r2n?f$rh(Sd}n?wKWwF)g?Hc8^`%=X|1CX z+=|N?U;Z5xw?HiyaVkOy6BpurPBQhx>y@F;G(oq}PQ9=Fcngi80<;X{-lvu-aX6Lc ziZYgW0Gan_JIpZlm*`(0!aN>}uP^1VZf3Jds=@b1a9Mj(OF;XT;o-Vi9@9PSyiKL& zPynRK8t1)%zHJR`?n!g`6<3z^Pz?>?gWpJ~nN8C<9oS+qmE%v82yYSw?u_INNWx}w zM$P3E((CS4bXZp8Oj$TB{YzQh`sm?*+}gU1bHhct?rWV#Igr&((Q(+b8H7Y_uiHfD zA@MmtAK4X`x4SL082TrYugPXzF4+5~JyaeJ?Jw+umBkD!(jO^Nvzi$3pcl>|RVa=5 z{*m8^!@X%B*gvQKCYyk*A#jU^cj*Lp+f@7=pOaW_>;AwQr6ia*Qk3@M1WYZkXm4i) zZRA++%g?~p3!21+{kR|cV;%2+p5MVGM6g~dYH<_aq4qq8n&6D7$W%SdQ>By+nx^PeBw54IjuEG!vA}G5l^0Z`MN_!R;%MNEF51K|UlL7jyw;Jb@R} zJ4p}i7=saP;3znv?&N2}<*d_~z--s4?}thyO5)D>W^Kin%EO3j^~=&*D$2YXsY=&JhhQ|iwhb%ZQ+ z{&Z7CXJkaQSIG3-~$<4%{_#QYon_Z*# zG(_j{RGdv4>W<-aI$xp9^Ye*S&TiPME2DRj1!r(9ec}LDF?>o)eO+& zJ)j-t@?K9yT?maBCzIqI1!`Fa-%CU19xDC}-IJo|$O(J{$^0YdIzPq^OI;$UeVmmu z^>~5?>qRKCcnzUZT8bxlOLB}X0&b+|H5#G+Pyiq17IF(JJXm}1&-$Aqvd{Zfp0gKz zrMd*6GOB7v_XE$Mkz%x#^y51E2mQb+wH(xcG5GTNl9pd+sI7$a5Ko|KU7WNmj2=sb z)A|B+kX?VJmin#T3Z6R4^Qp3(N?T>O(*-)97S`%FSp)}B&qm+E){;%o)oY>e)0mtz z!#RpGtqZ6VXM*1!2`84$ti5C2Hq3yYQ36pjv%6>F`d$$r(C@JAI%}-9XG|8#wv8IG2_lTpKRa z;OE9VF>*ula5?rUQCql-Q9!sHyptc`?7~Re%Fmtr>d~L|i0ly$M}zsB0}1BQCRL$smgp6}g1o6EH<#4fnB(0`P>5@>YZOQawWHG(em5_zfM<@7*;vV0V91?N zcCDx_dU%u8w_P|^PB}}G%Y#QcL3vN--{J6P=*Hw^S+1k#7da{a;#6ftMMmhn;7e{t zsIpzySL>xl469*j>v{P&YZ~5KwLz z_lI`-O@pNlcBTtLCq+<6y&T#Jo~Ec|neKjlZK)*-n8OjAbXi6?EaAqTqDdtANwJgZ6KlI-c-LBSIhKj7l zPbApP(GJ*Yx`mzD`dpC$ffbG%)@vH&G}e#eXa3;N+|kYAHbEXS9~tvP+(AKjqu(h| za@$hkgE<@m^m#8?O-9D3!VUIGhn!Db0|2 zPKAP9PJ8eg`OPJ)-gqF-lazJQbreF$bV93Y3)!P-IRxs!m!Ollhm)Y+N)xD-_nHT< z-VBwUPXigxI+zHZCY?D!!f|$`Gk2z0c*TB58LrC`)cY^86g_v|{gE4KC*FYXztl~f zLw0j1Jt}XZ^rmxNBvg0?!Yb0)*Hp%|aO2Ry1yI?(+z*|S2l}{yROLJBYb&@tfmXF8 z0s5wi?t%i3)C%+%dU}JmA@9fO_;wmk^LaEnIZ9XPSUF7J>M2gi_)1bbbH~|;iW#bH zfZLV$5>hp&3SC-(dmi#&uAxc6@H}elGB1u=K?VVZ16(;u^)4r-qLkXfF5^eP;U9+ znM=zxWe0QyRs&}Rs1k3`Mv8qgu-6fIz@2a)OZ1`J6R*6S{-8xXAh|U$ 
zD~WQ$u!nixIVXL!q*TK>;8pH%uA$S&xG_{4>bxC4gm>*pk<<~V9bH)hr5uQ}#!H|( zKWZZX4ZTcK-1%DrU9c8~ax4{*U0hYqOMdZ}yx`)6c*5iZeBxZVpoiK@CO|n9Lbbb@9XbS)rx6(r( zN4!+k$ueI;^s{8*X4fTN^Nnh-oUQ8v;2UvAc;&3OC^cehhQ%7Rvk4RU(Q46k~=RltyqASo>7vWqk z$y}(t)tU=Vp_qDsX0MRAKBKv~6QDYj_;$)>&+N$MgS(mt}`3Q|iqI64J;5FN6Po!FTHI+$gSJNy` z09&_5hwh^5bVf4DYaRk!&=C7XS)q9LQ+ZVW4&+*6sFiz`b_({Jy|KWsMp%1?XGmdg zK{1&9OwcNq4-eIRW{*VcF-qC%5XnPcJS8=pv-4fdX+NA8kE63lOQDh_R$jX0pni%$ zkv)(qdi8J?`>p;Xy}2-0>noiEH-?L)U^9unlLq>jV&SC9@HSqBKabNhy(;2-)}`15 zTt*wUzJ~K^=L^mPT;_>9kB`VZJ%U~Otk}b>LeZKDXBMPW z3oR>wNEVXdlfTD^Q%?B}lqHhMPI5AG6!mre!0Sy>YhIm*`(7tspgr?&HX!8+38tC2 z_7=MP9S>TzkjYP7o&h`5yJ@L|_$?n7fNT3iHp52MPRk3; zbrZk^%@k^(@iwEKb7Te(gX*m1p-0y0Bwb>sYdd>^vf?R?Z1#|4yWf6*W=*mSIFm^< zNCKV9l7_3;Qnm%pwlmBYPGfy)-=i3ur!VLoK+(3YId^2fZs4+J9iAl0$X)DF=dezL z%Kz_~NpnqyVDrQ>-$ zU*HC&n7J#Cke#bDxg}1GRlt0Av!js3jYXc~V@C6~BRy<{4bvZWDY|Z^t!c7hS7@oSO>1sx z8as+R^ENzT_{3Q)Tjf`q9r~y#_89&aU%jiD;jW%Yr19XyXF5~dwsf!XBxLm)q@dl% z0cN%2Gye9P39>W57~cR#$Le3;jbMv?F6<(F;nK+Tnt%fmzzM&wkM2jmhuMrK5^32G z)737heGf^?0kC~H6OS1_iL@a#)L~9Mt7Xv%Gt5v9rZC`VYh)$yn3KoOX<%wP8-hDp zh?#aw6MKj6Les1<@5$GM*umB2Vd|Ju?lN9xeW;5q z%l~qH+uLR{CsaHcY${t;yYqWD)cj=|>rPwPCc{7MG_O&kAGE)&H;17$`c{KY3lpb3 zbb$TA=CSL|1+FeX=|k!Up3h;QX@re(2Wl4h&kJ^eVLoo+kszhUGdE8)z1hP{Wf1D2 zovC8encc{m<4hh~P!?ld$H@e(euu(sxGZ|_Z)Ox0ZEvmb*=FMzP zEoN)lwLAlzRMI}8dPJs#S%mNAwx>*G8i+mQI{cj-VfLC$%+MW6u~*Pu=b6pAncJI% zz{SJ(f06p?6x`Vr&16zTC;iJUfMvg7|GT9vV``aUP`hW1Kh9dOF*hiW9b^jI45owe zgNnuTsqSi91FwoFP}~YS$DRix1)4~yZf62H`kM`$`^dlilkC<DZu4pt<+r40Jh-<6o$-naw@4 zlJ_3m)567AH89_>diIhad+z0fjs9XBIpYF_~+aWzrX8yF#zHlP$#^sHa_SK1zr=sHbf)AmK0Q)M9owyi5cg zvX8hs>i#+QM!Mk2>FhDBDq&`(oysjt9#l^%JYTs^_dB8XFI~+fY0O@JESvD`uW+~> z<@ChU4EfA{;N(qyz*TLqpNK0OV3L6GR@e5CbUnf%> zyV?JMfl(6+}VKW}JYct;duSkKSYyi4y z33zl0PQm!uB4(4v&lWN<<{CUe2D8EKWG9#}`1d(z;?4H7KH*|!H3iwm(3;)Q(=F@; zZl!<1ajr$>@8N-VxqA>?Uc`ptw8lj!jQ*ym&1UQIO7|z=(n2s|Sq@|mkfIEbHQ0W` z3$R1e7#{T}sII?tx6U$;kQtn#{B{Vx))`Qco46bMVss}_uZ7GBDi4%TgZHY4v?UGF 
z1n~F2oZ7^IQ%=foGhG}!GZbKYdE>RIi+9E_Yq^oBV>^T4g>LF;w@b6T% zxM_<1arHjXvwaMfG9yAlq4J`5i39{-9AV-vEWY}`Q)nJhX9tGP#mZENh~ zjyCbYhj zceVG-ajZ%?sJ@Ig+;qnCzB{o7selrJxaS&X0IxG+p;-EJ6Mloa$?mw&1F3bQeZUL` zNo!VMZ+{*i(qJxNXUYIu%#7mkZf++EbF+|1_v&_UIP}vTa~pM6+UCXc_hkN&()Js3 zN>3m+o`*!#mug~e#$bHukk3rOjQ_%xe?aF?w*Kh1p(0uf`J*51XPixISJDQ&AI&nX z>@09rwENDl*1V4p(HMD>l-g{Cmek3V=8b5^!ZkdBb{kz z)fnUG0^sLMw}V!*iO}A8aq=|)Sm zy{tBMp%OlO!=VuRAXBXjt=So;_dMW>TiE-U!SQA}Uqjwq*GvSbF181`h;g8x432`+yhP4oy4dc78!v?1;)_Bj$z^!g zAbZiyZ#&OhNLI)*gqe2|yZK z(afWkG{=0va|#I*XL{;EU4&<0@Xkn>`S10sjKQ;hQJT{XHY@p(kN}XRp4hH;;HEv z^IYffXv|I7&}^I$o?$BLx(`$?5;Ho0GqE`RM%&#!_8oX)GfsixxgC=vJ@Bp?C21xU z#&EkF+Q*mi{ItGtc2HA&#Sxm;z6BRoa2J{bxI#^+>`A&s>)OLI(he~V>{5((0N(c^ zkwffRoCIiS3~KF|;@q7LXFrn_wb4e(;^%*N|5U|pPlR~_RGkVn=tw*-#BSnLQ^20& z+GeZF;cVt0z8_*zn~dPxH@ZN}>q_w5P0p&NY)j1AJaBy;GZ)!-kcomL|3ia-oo`W_ z=QU2&LglW4!VNGj`Liyj^T6sfHVR0439GkKi@+TQAUT+Js~E>zkc~LKp9!_?z#pF1fi?r3gmYTR?dcyaDtXNsUcjA9Tf5j4<*SU{ zUg*{Iwvl!>qi{-SZE^v-5qrP`sXMUcuEqje>Pa(Xd?PXD5tP}khX!|`zp7$JMw^1r z=4os_sHSD8ggC8;yZJ_yziK+1DVc`%jhYHyvCkQ#J3XB+7i4OJ&$F7JoyWkhWx%fV zI4OKqPl1^pb2?y?kJ;-^cL{qG$B-IN$Jxm7n$E^!wkGl`GXZP+m8)rEO{*EqdG{io z255#owJ5CrQTb6zV0G)^RpS+BW5NA_Sc%-I)#2v2V)tE6!0ng8dqRxTD^NFIG!ovX zrZa_lm?pTxgW%7SNZBj%N`vSqWyAw<#6vJGO zwk@E}rb{|#>s-*gop8z{lZ}xP#@Ec&a$xQbwvvexU(*a)E|CvwPy0y@Xlr16N)7Gh z=HUD^;OQCYv#QX~sq`|CyEjil@1`-6IUO`^U5urXPB(jLC2ulSq2woPVHR*fG22mI z=qQ{!DFys#WJW-Lb+B)pL}cytkWni9Wh?u}&1hyI>+zT*&8r!qd83^_B|Ual&H{~R z0k;}JKmNfFkY5zDe`rO9z9}e&r1{mqxhbw6)KoLXRRb9cISyR``mi^o2S& zouU7q(`KhIG+7k7zB!G+NtPdMF7p%CWTFj0RmW?6?#kz(gXrDFCLv3GNw3j!Gt32?UaDdH=`E<*b^J5t?vZ?h zbfPm)g-;#@rrkhkY$@+9U{DTI5u^WE-XItIo!&cZxx7iIi`{in5U*k>96QdXL#Im* zaOe%KN#B}O7&+tbF>=pLSGZa;#arS?MO64N`V5&)Fz>Ei!m!C>Hko~>`Jqq&i*&{j zLB3eI&L6zTjzzlK0_V900M~Xqr+6HE@bBt(q@dHDU%F+$H-qtR9cNIxxpcBE$+^uM zy@n+5iH6kv4(rn!zHua0I>yvHx>VXCVO@(oidmSYnv&CAgnnPD(@1wbn^Qg6rXKcM5-2Qxn1!y2Mk{X(ho&c~JCvfx|M@`&*MAxMIs;l>MU} zJ_C`zrDoyWK9(E$Lzrke!;W&);*#__@YW7@(E6f`ltv+H=c^Ev3rcstHM~G9zv1e` 
zn(I>?Q8*lB;-9$j!D;=$M#O&c6KtlRNhNQBjtV}q8DN@Z{vV&t3gRhS`V;X(h#}Ob z5IHL7VLa_7HTFFtOy3h7D!LuIlU1%n%wFNXl}?hH&g4zj&|d|y%$Iav?k}X9t4ob1 zHPO_LrO@5@&(sITh57Z>FpW0XW3iUR^_-$lCS97 z!0^74lFZvl4I8QmCz%l3Ac`ky3%YemhN)TCt#H9%I?+EQOPJT^4aZK+!m3`Fn;ZjAeg9h#1!KNlP|#OKCqxAZm8Y8(4)sLOq=VALW6<+T z@U>~6h5MkFO5rfj&uVED*0h;c4|dnam}C;}z`m%!30o0qc7*w`tTB#^ro$arX*_vpzS%(xllv5eUS!QC5PP94UC7VusePlDk#lawpzyPIu zmv9vsppVJk*9NCgrt5VCx%z_sE&qC0JDi7iYlZ)LlL-FA*LQc>@kpw>jQ);3AHD-h zPUj=HBq)tKdH|+)Oi?bhR5;dM)a*>Zo9Cu5$t$HVLaZ5x7r5jKhZ%ikCbRvh$-;iF zmiD$O`WC%}i}kTC0!9Z3;Zao5rXc4f!DaOC={x6=W$*nFI?2SV>k*7B(i-;P;El-&OQu|w4eFSqHST53>=e}A`gq<~^qvmD zy8f1|K0+N%>Y+;FTAyouH-|m$7S<2Dqc3ejsjh&kGaq%lh^5yfve)&Cb@5k%S=P3m zvqoqR<$Vzyg?dpsOegClN0<#9f5Qsnzpe+Y2$4R&55r~l0{)6xIak|fbPKd>SRrg@ zub^9Q4u6$5T?IefGWm6&>C$9f`hh$4$lS2McF{lawMVoh?0Byqs~yM}z9ZlBJ6^!v z>Y;@7@L9-mWUV(8%=pN4!+J}?Q6_otFZ(vaHzA@+G#T}zJ{mKA9R_qxPLL^}(mAAi z6Fuc65Yb|G|4)?L$>9)kz3bor(_J~9@I6twgQ-6gtq=ReZ_}FOO~#R@n`1w^2b_jy z;E(a3kHNCN?lzl4j!^T>|H$d4w37@?ew8dUt z2JLPO{+7LR5;bcos9*xDYbxCLIrPzF*yv-I$ExxsFXPeg@g1rQsA73ac65Zl9o^P} zVGB)$w`ql@(lA^Z4D(8bcFwR8Ub~u~>btq`Ex$jD5;a-3qM)w)|F4(!JAy*t6**76 z@?jY3`0xf=N!jrIs54q*E!4pD^kKf>=fh-|@bjDXby!4p|D)COkEzc(0GcoBYm(8?b`Cs>6}mW4Mg7^VxUrQ<2%^(EAn|D=)M>;6y|Oa!UcL_eRX zX{{<)beU|k!u}PqSRcYoZ|a0-wO<_;hRqfZf5-Zk`rS+=y2WZH!g^n%KMqC7?gZ<= zBVWr$eu-YOrs#*&!&S0YQ|S1x3>>wDb+__vPIxEy(?;qu^n?j`)_d9`%;f(kw^_~P zu)TNuQr>b=nAwv-Ign@wEQ&b}lP zPWyk~d(#Gmm2`Vl9i;w}9$@O{^SaHY^k+36_Wdrr5JU(1oA1(xX~-?@wBnX3jG;<2 zus_{66tAI@$q$0NmO?39OeSHk{==#N#3Tz=P8L2 zStm>dhDyh}F(Z-gtJ$ENf1}r7EN57m1~LInaggNKY5dl*zAz~F9(5aJv)SKteu|Zh zVJG2R@;tghJDoIT9utHT{<5ND5qt0MgnU(|%Nm?6w$|LGHHWsSoXr~u1thkl6~ zH-Km{-i>$nB?THAx|^fEM7rw*OM2Gk;0GA{UTtdoq>mNSUeXL(x+Xb%HY$?Fpj*yy zrNiO)=~?>3{pX(Zap5GJz#CLxLh3ZTE2nIM{^)B(<-tiKjHx?#>^-2j=B_kYqnee8 zr2}7WK>=)o4KbC|Hrinu4ENdRU@krpg(FQk$!#Qa6_0*4HcS~bwCBn1mG>p=jI5QX z8u*9>S<4Z)G9N_WWSiaC7oPD8R-AhjcA~?!j{B^Eadba6WO|e!P;pXjSJOp=VT|h z`RI+ub#-uDbHX1tV4+hbJ2PK8fd2oW=A(|bM^{};Hm0J>N^Y(KyHhyY$BCGyZTwP8 
zN!{I7)&s1)U6T4z2l)@8onWSR;X5+Rza72^yBedb_O~D})dIEIX&wI(nV@X`cd2Lh zKuSrfRtK`E0neD{)9EZp<*I}SsRXzNx_+j|4F3aq%NE|TWZ_o((Dkx5ylrM*9Q^Ss zktJt1N7912)}oP*0!cmN{N?kx!$IzUVI8ny2Gk5^ANc>Q1koX_|DB9wZKZctbLk7= zWz8D?O4ZE-Ys=G^vx}da5+?XWD}>^A%l-w|lPUBWL8j%y@tpsOe!Bjml~B1~2upzo zD$reZ1*>U}C3KWe!iN4Y*GI0C9lW4NgQfPnz73M-ZWaBupgKrluI`l5_9{GN0X41b zT}#e;UX8+q@cRyWPFF_T@r${onx>P@S_%!m8cM|p|AD3rySrk3C$T(NSQJj3oD60) z)a6XP%`_NiRg};%WPF|7b+vpChXGqI|CW{qxm651z^R9`2A^3O-nxYENMBPg*+dob z-smG=$0pj3#L<=VK8kPww2F^lzpeQ!wdm}QbOLBBC-G@icse+W_H_XLtRI=ZHDq71 zGj-q>ooKnKXe>faLQ`1AUE+5f|5YfQ=0A4Fi0!*wHGJ}$az|k^t|ofKZ{&#I3;XCn z>rM8qBuY&x_G=k?kQQHkQQEMJ)xw9MgqvizM6>a)g>DUaaw9nKbF_&u>_-oM-KxVd zOG-+1st9!qWpu8k^y*f^NpI;6%|R@=rT?QVE3=;i>PsGu@E?Ykt&sE~t23MY&N(dv zc6*)9tp%IpRUyWI@mWOb}Y58$&S}S!x#)YD(@%x*U+kvk@#*{( z7}70J+2v?H)u$;miRo4em5R>-@z49h@ohmA?ftQ+XxJq@DPZ69UW4=qyM6M)YNB`F5ue@mV=1XBz#WiYt=B0-n6RWY~rYCQZ(2b%%K|i zBa zX;!!|EVXVSFk2CS!E$JRkQ}v@)W{CjqTzUa@vG55u-P^IR~dZOL24X_M`OuN7Uq;M z^O=|j)SREVKu>*HKVEydcgPL0;$g0^h-UX$T{&V?f;7~X+9&)zy5$B-%V4Z^Vn+2+ zJqtRWs-MCaf0IRU#YRMtL%T-cMi{x0P`g1g8>BTcys>}i=JS1T*+4k;r`E$3$~;9)VHb1yp00kl zktkisXNAcZGiD*^DPqhU;X~a=Rp~{S$G^xq+v!fbDd@tl+d4NWiC_92$^>eus7&vI zpS%>-(6s(VS#67L8fx-cJkUkL-}-kPB4!tDOD@GV_!Rq9nidG2Fb>dw|> zu=>Ua%TOV`5c??*?h3wlHSoG#MtO87JH@26d#F6WkONZkP|R?}g4f$+zm3)W5l;`3 z=IuIKId_d*?aQvSTz9|A@0u)HXknN%J1P-gP_kZGa&Mp5|74I(39I|N{wH_Rz78jc ziR{n_zeh@ili_aJWS-F_BuN}}b-0SGc`qqtsi+xRZLe{nzSHMmP?zitY6#=PdZ<_( ziFX}?2X1c|=__Fx*6@Tb!?u^pWql50GEM*U0okprWYCvbCo){ch?Co*-~4^+;;-2v zPC-jIz;B^KcCju7p=L2JSA5#zrL*clzJ-%=IxXY8yLi29g!^(mJZ8?vtduIiFT#X94_D||FYD+9!z#u z_>IOiq)Mb27^$Iwe6Z++bOb#FEtc0~`je-2+(l%~(xO$SrYrwv(Cb~^e7$XTTrstFJS9egJn70aHWst;|*nV0Yk+DrmENP0M2dz&1l3h(kAd975~G(63z1Z2lFSPn0f9PT@=NN+KDgCsZA*Kn*PFP45RWa>q;ke^F`wm`PR@ zeKuDJ)|59)Mz>x0unyWnfZo_dZ)jnVnCiEB8Oz9x@>axGfUouN7hD6X>$>2hO8D_X z=I~jpnr|*uuWM7^DQtV7uNDiY`xsp=mC=Xa^|PZ)I*;#5r(Jca+a*<~f!PU8 zIHJG1+reGUVkyI!x&p5`$>eUO)1!y|;-}@FrlJtLfif zLoRtC_+%0hWv5#p@hB<%b%B+mnmYv<;`jXt4T7=$06B=X;Tfqeo9wEb)CAWOyDS;* 
zrBW&zd-J=kXMSK~xN$aW1y1-PK?(aTPyBefP)G}-)avX!aP9(@%(3l>ojPrM&qZ~z^m2RiR{`ZUBJMVGo| ztvHGKQESVc(~lo2;Xy)Un|P*VT68{c`wx;k2*`)>p?_1u_fRR3!A@vLUj_trLbGur#&m5+B~WQt(JVSPGU7>c;qlMF zh0D9cveQ22G-uMv=-mU!ZY0>B(Ev_!DVIhD+Z<5;HGPvhy>_mjWwIgcX?0sdWJqhZ zP%UTUO~x`8vr1T#^IjD19A2_Zr`<6u zhKgFPZX5j_;@N7d2#fmF)FP$!7g@Qhwj94cBK%1c?H_Q(75e`l(_^_thR8$B6fR+h zK5$>7n*A%>#Dv#&u+rjTH$u-T$k7Yogt#jd6MrG(Meosp-zx zUCwz`)Vj0iGg)Ez?`sAffHFG-e32|X>lR>tf9X0hTD|Exe=N_qv11fHag)hkUxBB6 zp_6r$Ea2x(!Isjvl*GE>kC!2wz&Tsx>fl}fM4d>f zAM4t%Vz^eu>^~7rjpgvOJdMDcGfSx~i}pyO4> zJG^Z#=vq{^##S7Bw@EW<0npDETK_k?lB#l4z; z+cA;CGZ6*<{G9$Ae+xa~7Cw3q?Db{eS-&AGaK7#?Z0`Wy-<5qDgQcuM^{cHLs1@I! zdHgi8rE;#jEYPPD{5p4}Z%^MxF z@2wyhxC1(21DNuAflya48mk-zLp|e$`%C1A&s$6Cl=|6Jdx|}r*10yAGmyk=vW69X z3T6tl4cF>_*u}T{H5ImH!2?3nofN+0=Y$)gf22Y*l?fh|^pZBBBD#pLh(=l{d>S+W z*RKSxyx`9V0qnFlOyVh1#|HXg0lj4A;L`7h>8Uz;mDAwJx5QaOR3yA$Z|mpyk(*jB z{9b+!l4f9EVPy(i3+cul zt}6AhHFQ+8$A)vVcfz(Fq2uJYS~A|a{|V;uIX1S=`k`V@h1pKBcy{blzV9_VXI~ig zg=9T8Q>RukT#PSXgo5YnMV@iUegGM#@GrtD|AY5u@SjO$*1uP@c>fP{(vA%CQs2@v ztg6L8Nr~3l?bMhY1`Xuk`wvS4U(FI-(tT|sqJnj&6O~1GlRaweQ&>$@+QH#rscwtI zg;EmcmQLG&p2aV8ZQ{qtmwL~2(AQ-{Q}=mA<;Bk03QBIGg_qE&Cu&VfF*IQr(Awv|-4jtv7nY zYIP{O;u>mwYL*LyKBh zh|LuDA(r$2yPu@zBoDY{GWlXE&0N5qZ?UV+G!rplmiDux`O=HAydkJaP1u>!dK;xX zBi)kOY%hx1L(T3R*%;J-6bDX(J9RZ#f{r?s9*|8S!7|}#*m=_JO_xE|xu{?5txwP} z62T@ddH?LFU)^M?-|if2oBBY#5-!&!s0A7Qb3u7}8H@AT7FZGV=}q|a=HdI~ZI`+7 z(Lncl*wD?i_I{rG1#h^|YH4cTXwv(5LlwWd!Vj0ii&vinjpQj_wyDg3A8{>y~lMW4Wu=7x{Ex|$A!Zm&*D$NFz@16Fwf;X&|N%%xY@V1eD z#V0ouobH^~^B}FoHXA-UGQ1xxvQPP`a#n{rwAR>I+OV5181y2x=GPLz-#p`x76_91 zwNA?H@GUwN3fe5Kh6)$2|4?_{Ro8;`X9h3HF_`9#XrqqA+pg2}*wYK-au&vKChDF- zWuxLs_iA5wSzqeIvg?X)tR9oWM478H*>Yj0#Gk3;(jIOY_?s9x{HvHfRJ}cRd>GFuoN^X$m$9kB&V3zPc zyzY;%3ui4I(QT9EB?rSDedP7~;Eij8>zlj1Zh{{gmIQ+o2{TYBN6(dOA$xV1+f9Vu zN#5v=`yh-XXTMfYx1i&lKS( z1I(Z^9VUCIr^;;e;G9$RXRJ9lHPqL1j%1QsdQnRU=kY8xU2Q)U=8;qz-P4sp>-!S# zd4}HRIC`)WY^=oGufahpVd?RgUq{KcwGQ%=w03wtJSgbBpyE#99{#M0JwQ9^05>`8 
z?nOszR=7`6*)cK#Q^_E8@yjibj@R{Wh^0V3`z7c`734v>4bDFKuWI=2K>IsvQ@7oC$C{42Ni);D86^W1d4pzBX;*oJ4FVW1tnR9Cu1@3?XPTUo{v ztJ<*eUl71LINUO9v@3lV-9Qp);Cx@oCA3QBtms236At<8_PT9wpSWCj{1uv>JsVE9 z%Rp@zUSr3u69rOnbIoz8b4Qx!>*UI6!l&1oS2~8>Ua8O5vxYOn>ztVqDC}ji^QK_4 zoaAT+*x&3~zHk$)sy15p7I@4u-RK8vsQ=;xbB3GjRWcM~TzUFqYT!rO`it%_mjg9q zD)Fjj*pQt{Ywy9$O8I_N3#~+Z@6K*4m4QJGi_j~(g|+Y$o2mh zraD?DhF$q>w=H?SeEwuqpKQ-yI)gHY9b%Wj{spMlOHXbq6%}7SIlI}w@D1H2RJ9sm zyn8C!c>B-oQLvZ(rJY)i+vjpHL5DeBk}TN`8;gFh6g_CB)?r@Q1bW;IjWi>wW?OWN zE&5yh7UFLs*9;cYEjHAK`M$2c?~k@s1%A5^?D>@at%Fa>NT$tDZf1fuYWcarJk&!f zF=0a){4;!ET(~6cNrs{yl^WeGpD#sT@E2K)H!1D5p-jF&Z{BgY(~XZzPD*RCdPUF{ zn`v5bZ9n!iX(~ZQZD!-#b~=t@`nRmK(Y8Rg#oou0oV2DWl}FHO8uDpV>s0&4ZAFRy zN~&5r@c1Im>qg7(j>=|Bl*-_oo>*{RX1`^oDrJ&OPKQSrB z%4*)M8|c1>)}XI!l^(M)%=u5S+?>1jqX%-#4IncxkgA9m!@sEc9}E^dEZ?~E?8H>` z*M;({{NVQ3cur+HY<>~3d;sdn^RBriC;R&ztl~kinc11lZ;pP{KGExXAs7s%UrYwK zu3iNoP|ck~JTN)>hQLYdI~5 zGV6X#%91s9-cCoq*%A3W%CA|YZP9zuDtKV=!F$#+_P{FZFY*Pd!XtZ|eJ>2FekPN& zpr!(`ceW(GJiR%|sNdRooyOmLTnXl1_O~I?ERf`O3-EGhP&Kzl9fJFk)yC-6_|k4O zIl85+$39snU%IBQq6Bu_E_1WfBKMV!4AQaR$Mm=8FP*NVTmefH6y!O3?OJRsyFWp(|}!nW<7h*J~m+@5!?yXP1+so^V4Sh@%uJsows& zE`w!_8b$dy|EUAf>^j@cL@6r?-aiE&A0cU(9CS}!1cL-FYp}zmMCrQa)=@do7`@>l zf6~~eWU$k?p>Doxa6^bGuP`U>b-Ym0B+El?6;l|POky9)Uor;7{F%Na)9o9p>C%P& zxX+p2LtO&T$P6Mmss~*faQJU}NIJXY?j84oF7z#dBv)?&4b$r^TK6{1T1Sr=@_kHwn(j60$PCX$dRlw!5-a)9$kC z)=)0GBDRpK`B2Nr1XtYd+HQJDa`{)H#TIibrHpH3qhkNVH*9lPvHsDrKAyXSsdkQm zbT7GSOnz7b^BV&uPvPGQjwlnu;3&6ZLv*LDcjIHJsEyy|{t7lxeS5&Jpk6Rn&#flQ zQ3y+$K_)6u@3a32Zjrq!c~HYL@YcV^hPsL9K@U04U+K%_(DQ`*qK9^ZOxP%y$>hFu zvepVqq-n9>0ic^#tRS2^mwpB+*lPD=7%MV}j;){N8~q`Qfm#=GG@!haE-ft17V2v%(R8}MCb6PFpDW*Qfp6fyHy&+_w6$5d@Skbm9c;DTJYGN>g1oCf>WU2c} zHsb@ov}rK2*+jiub^~0vU+FQ?R@iC-`7D@i%_Ng9a&OuamqC)d4b~m+KgcG4MEkq6 zI!n{Uii7MrYfkEli{g7e4h~}n3$U9~oZaktpPlN*xyyjv+`uo6B^IvMLF6IIxIwON z@U^V6<<`(Q(iHAPZ1#Yh(zo%T^_iGpfTQan(M)t98A{z1njJAaKfF_qB>i$`zH3OZFVo(F8$h-BW2Hv?v;`9Q@I43JV7JpU1D%#uB9{&^#U>aN#-5a!ZpUDVqr;BVU 
zYx%r(1v{RWTW&daxSecJdwow1GmoH=6t(2~TWq;IEgk5$Z46iWnRgfgMt$u15e+BH zU6h^P{QUBuMc5+tSZY(pR|}2d=jenhsG%%(DOvBoH7CkwQ>N#)!mEt8QP#$#rG6n3 zTxy?d;{LGY(w#f&SE8gWv~TIB?C&O9>gc4s@3zWjeE4-(%O-jU=DR7j9{>2Y%a7lg z2e*my)Er7NnDa?Dhuug+2TDdiLKo#ZxXaq9iwBGOfq^p=c@qV{->I4h%>I5`tE zC6?2Z+K`w|v4!@kX7-)f!#P$%ufmKU1&N@mrl7&&auUR{FgA>LD8%&5IrJhO2&&ySEtT&`c*Gm0q|NT9c-s;Bp%`_RV``EyKb%oSaPr) zmeKLm!YC?(R#4c-X?eXSze{>B#hB<5o2CDY>cx-5%hh8&GD^~?t`-fm=d`8u01q>z z5G=k>Ki9FrXZWS3a@hSB%;RS(*dMM&ESqK{hS!kPWJllBcwHP#cG+wmwL4{Oj|_${ zrPj*f3aR9t$WJJ!Kg&q%rpJQIb1nuUa8?Gxsh2wRcYb9S|>+yER+M;%}mgVxBi=&Q+ zN%-gpAKPUxQz0^}S-`wIEf@S{oi5fh`1}m?MOQ>srw&RYtnp41*dy2j)3oq_WcNXy z+fX#KN)tL}Z{W=bQ2TkFcf5gJH`8)%ba2bfA`;eu9bD1Q(p+lFWa4lkvh)LNzLdbO z_VNi@!FUSld^l8JyxisV3?Soi3NiV19y^YprWQ(RVm`D}<^) zn~c^s#dca1VpV0(cq4l;xU8F8XKXW=mU5F)Bp0;OAlO`4>14T4~T`i@y?^X27mu(j3tc{d}Cocnw{2_&+ z6Ou6~D-GmlH%)75W|v;elKYux%j9Q#`!Nlzq`j-Bf@ZLX$*2j*C4>F|qspS0ZMB`0 z$lY`QS_V6-p9KG7WnMwUc@oj1CS9y-IMZ%%GE(A?7P%!dfSsvg6F8+e$e7N?a=w(l z)?Yp*f=*ZNG>hMl&h~k*+NIJ5E_C~B9L)KGRY3ne6rT@e)yG>uqHZ8PP*@;ovn;|fm;8TT+!~}tUi^W z+)ZnXh5Z6jnI)5g@-jqox$HJvt7>lyr!Xe&5%CxRd1>%pM1 zkS8l*8(qBB)wi*wMrg|YtTa|wkq-0Z@S}=+wKTt@TC{@%nmfyf$iz;C0 zIeoh*yWbEM04*r!;dSW_!~HvW;F7s5!8WTwB&wlB?PL2dxTsyEoo;ZW?Yg_dYMzAY zzDkZZQQGr1tE2HQJw_-iFmRx;DP-*lSq(RQ&RhORc7lZ+YO7Upm;TzWwTv>?EOga z3Tnn;c@5_Lx9h0A^@^J=%WZ1#i>m?t2(kLtwUDjBFP?Ul=mZ)erFiqgvOu23674N_ z&t7zMT{kiurR{N0So*k>oVgk9yif-xIqa;v_mG2}TE#b{r1|2VBdG6=h z50qKTo&{6wv)FLU>i{lfoMV&x11;NTUqpMicQX7wV|G_7}V3J%Kwdm#Nm zluMX8JUQy>;=(uFP$m#8bDK4_+r>S6AGqsiKNIOK9wC#te~FbBxWAot&sbHag@fi6 z&|mTuaiqPyhu4{AyMuI5S)FTRKq%yPP_XEEVwTHs>EhaQ1}n-du9yy&_ob5c;kzcf za@J7eg63H3udaHSKl)GZQ^#SJG|eNG-o#H_cgyR}0Am%k)tX?FiCz7{77@N@Hn{kj zTSu*DOB*cNiJ7ac1QWs&b)Ft`Rq-Gl^nm^wZNayU(FH*PA#)L(CO(d{5%!OavSL)} zKH%qG;m`3Pi~BmLtclhaCHG-;*R9vPZj0vR$#JfNRd+9f*IUKX1YxdX8BG_E}BdeYiNr8SLqhK*I06p~k~@$O~gD-tV8bHdxGxvWI2&Nf8T z1yyW${NJ%zHj^&k3)s^6V5{Ve=7SS|ad)jH>sZ^KYAa=W73aE_T*PBllGK*Ny&PMD z2Q9!kJ0GqjlY!+#~%MXiD5=BAIlK5Mm-n72KMg3-Yw{AX&=LM{6- 
z*c3DfI>=hiRcd_MFEYml>QGr16t>Iufu0;>1^X}_e;zg@S;$H*V|K-M8x+ELq zr0vi=%-2ipdVzo2f}D5qyS9VFtAH@(YF@o7Hzg+?ceJL07ZxRtwOS7Z*(JC2)F+x- z?_phM+~nZ2J_&9}c6xJaao5pTe1d~A1P?S)_o9KN#xG=&^4#dLhRUT<(6FI?iTHDC1Ne3d?6F1Edwh2Lvhh^ zy{38KDUU1`KPTED7h?m#NryoQ&&yg))l0V2UD6?-{napp9=hKSI%i9)2I@$9epg1z zL>6fhyPlSLhF`i2Sbhn#(#!5SH^$PrM14^bEm_@O?hAKT^2%tu{s1`&*Q%?tvH4v2ftmS?|B_p2uK=|A2V6lE42+ z;=l{*^bHX9zwQEgfdSx={I(%TtshGbB1da@PFZd1Jf3L_zhRcHG$!RsXDiFu_{whU z9$OHU<;;`~GU%7?UuHG#a_L-2y=D3I3hGvzea8t&=gM1lX=1xAIdu$~B$v+CMUox7 zHGquL8eJ?6s8CrbXn%?oa)(7%u>WiNJ{~(x=-ff&2*9j$K~XDpS+EqW{0X+iZ9ZY@$}gAD_3p(KWpJMp@`?YtnrThtR=(wEWcG z%yxyt8Fl-|zOX^~tWSAn0$J#@R6527&1^sDI2ktew*E0{ePQFNl7;WY3 zHGNsqhEJn-9j9Yhrzu*QwO&J1_+4_@bmHouV5`d}C%_z+Ejbu8k1OD|fIPC>YnENR zldaq1R_az127&K3pw58#V^W;FS|+rH&p835Ia%r59VYQz-AQHJs4d!Pg^1R@z<(w6 zaj@Pc&H4Y(O0af2-RqhI-0}fPtEc9ZL~eB`N9JmwjBxM8KDJhHrD9qQhFBguJI?y2 z*Eg*-E0TbJydR}tHr*&IF6mG}k3`+<%lOMt4^Z)X@c*P}Ly#xFRFs*S3f`5$JH^}E zb*B)e&(hP?(dy&ZcUke^M?ENifhlsjWd2T6m`vp}+3tSYPdA_?$W%#&z4f(zb}(2E z6CCKevD$R2qqIKLU#XKgjZ%(kM9fYXtCaaqILo*8x1R0RHg@ zdU#6BD*TRu9Cux0?)01zm~HrIytO+dX$ls>^NiA?-jd z|C7c#Fj|EQHcB=`+p*ZyGC_yZ4WCZG*WD=p^SG1rJ3R-QDWLP*2<6@xSoRT1g~suY zKCg@QKWU*gtvFo2w^hR;ce|4EyA>wpq=!=%;$8d2Pto%ZEFo02N>O6LI8DzvS#9xd zxO>S~S~~gLP15h>q+E;+*cdm}cA=l0N7pFh@PEXpaoQ+6>Y5UDr?`5Yo%ZbG7WWEj zQZmcyu9Jl+WlhngKbME@wi}|mY=MsC4#E;)VaY5xeKAYA>n4SC(&Y4V9f`K-GZ1id zc*gHCpLKjk%Oz!7qz*o;7`P;9cFp8Xs`rZ`boFi1-C{o!%T`=zOaZS z#1D0|K!p+YOcRJB_wkJFTs`7$Wq9!=qQ^>WUdxWLJVWgHclD*Us zrlD@-mb|H>iLBRbIeW2z=N}{swMM!5!A;XIEVpZ;Rcv3h*=_}w-6q)K7`W(J>1n-~ zm3|z&G#!jr+C>wgzYs|XXLw>p~ z8o(=(j1H~h+%f;Lm6Epcw_Pnd(z@Us$tCIZee+-i6@t6ENn6+iv>|E|bf}%t&dN+X z`x%{drNr!cnH51Y!5)t}>d4gNMMly+)J&IKGIWC^ znG?+dchr?4?tsgKN4^ntvRQJ-U9gfqftu;^RN&6kWl|DNdbYiQ$FHm>T*RsDBTAiV zyPyx=C6m_CpDedtu@_L@PGcKcLoZ9M0CRErx?iyI+PdDYq0?s>XrQ*nS!b|rfBVzs zxxL`VrLkckut}gjZulSznp{7$VPL^HdN7&>bWca4K{HFhwzu7>pq;g5n$sG~XX(k+ z?iOlU$3BX3GhVi$R51HL+7qRqB4k7S;h?EYhowJ> 
zespDA;UJe4)f8BH3A{-Fa%e2~qSWT#0xN91e&l8cd0>34v9j91Gm(WNo{5%l6XCpzT|Kg0In1!O;UI{Td~Jy=#@DP5X2Pvf>W?C*x>}FUz-Qg( zbbYjciQ6t;YeOx;M2m&2`)GTI-+eHGP=3c-Od8{X(=(g0qvI|?cg_~!jdr-zu+;661FqFj zlB(;)8p zbhYc8AiC~x2e9pz@$q?VV=R@+4p$vYAI2j$6W@A({O3X059W`X7k+VU`L~Z4%*O$ z!+zqeoh|35_VDLwb!^gQ^mUJ8?2j_PK`9P58Rkw{5>vBu-vn9oF;Ibe4uwFgK{jY{S9$ z?sX!pfx3=5ir+9Im@g}%6{v!LMd!dJN5P7xoQwYgl-U53R0)rl&PKVx?Bivpoa#j~ z#l^__qzKE{dd}d#`n9FiH+Wi8>1tn)J4}`|>C>-^KWjIvESPLz5XXM((Dl)58y&3F zM?{jk^0pL{d%9PT5dSC1hfKPer`f?ENAY4AEQ=kB9kbhb=88H7e0x0l+1{7mK?@}% zWB9u}66<4&(bH#ga^Hb#rLbe}56i?XoYO%`%@y2nW64mzZ;hh&(BBGE50>D*AlG-% zZh~@#>6b9MM#RL0(G<|v3fIh5SV0>L`eGtKpJBNa(<-63f92Eow>hnNJGjka`8i0- z`VO}d?vCtdwKriG^i)fEes&`9`FEF(h_ap)t^~UtA?8v{JPTD2uCMM_HT4HAg=HKKA+Od1RH80$%PI$n+vD|p1KAINp)WgN1T)L6lC;RfG zvV2E2(0uYJlN}{1mzCdLH~6t`3A)I+zrixAT6Ju*6UxN-;2vvOAo`KWS=|c2qONl4 z*4h+u6{VRGwMeV!2QnYcri}KqtI>M>DiUiYbM=uvr^UmCx+SU+eeX8F=bqzqC#OsQ zksEFQNh|Dc3^g$0V^xFX8qdketts_A?vGmKGKQnk)0^pKauI*H2kh@SK6ipGkxBYX zbi{SDU+lQPi?=Ll`Ry;6p_`+1GFLMC_iT}Ej$YMMwpmt!*bCciRGD|Jd+cjg?>A|v z3-P*me?{pM=vzK_Qdh(wkaa-6s(gpwpO@Z!i7@IPNN51b06qOnicOcQ_WqL zwbn7ni9hDK29L$&zekr&MngNB?*QV@CWyP}#@cej&@lUaD3Z{>nrld67jFdWbF zE_TsUnH5Jq_GZk1=60hW&j&a5#oK-Y*7#c5k^y8ch9kR7UBy7R!cxMvl2qBUoc-%+ z(uDI+5tfiP{1n^zNJgU3K6V3GsVZ)`WYGmM^h9ht#NK*hCl{oR{SVb6CD8;PtZzyq zJ!^FE%6}`{E741*+jy+x zRo*rgxta9}j*@)Le44Bu5LGK%YWKBm1)n7KWHTuhH6uYI`8PN(f60!hq^&W|1zazy z{vG`jq@eQpBUvMy20Lp{QD#q)U3{LmNego^)V?M%B&=rD!aT3>K+DPbnB%?_wC(j-(>HrUQ?wzV&mo65CPT~J}zC~C($ z(}$_|qzln14c58~ub7CIJ&RISf6V#JdZnga3*4jXAqq&2+mQO#k;rb(9 zVw3c6^<{hfGkHy_lSQt;C$B+%H3gotkBjgp8!V5O(pENCuHD@s(0Df;E4kb*&Nnw+z=O}Y1Z|;7>;IDFWa4QX{l_)189{%G*spWTNcKXaaG3fxYVXSx%W3Q|716)QZ-fd;ae8shI8!{+;2jMZ;W0%dZ2e zWXr5oEuY&2CtYiA>38z84CMr+m%T8iGU%4Yv54ETY+)98%6^@-lUVv^)Y3GI?z_A? 
zOApzDpeBkHlZ@pzRE5>}v_IsQ#8G)WTPuJEo4du)f}mxDrV`}{=c853BU-GrZf=$v ziv52muj)d)*#=jIRZE&7c0f@mqCezgcDseOz{?KD9+!culctWfuvVx}Pos}?SL~ww z8^LklA_?-ccCh|tXjO`Hh9UCHg%dCw3vVSCqvUXGmYuu-cg6rDZjS9|# z%)5hSj)0Ah!zmld2zwh(mKr?yj>|;db5Gmq=Hc0f%VBU(QYB&>ekK(dHJP6r9DtL) z6%^G`*a=fG@BvlqZ%f6E1}o)HEv{YNc>RaC|HuwY9u%hMCYsx=lYvvFRs!`m>>xjv3uMy_4K%x(M6_*;H!J`bsm&82_pRk34)l+5(tq{2RA9~1`Tao$ za5K}+b*Oy}BkYBT;pV_#hc46W@apf~znUB6Hf7k~YRVJEr{T-H*)jbNqQ zE_joYQ8jH1&isS*zbH#&D6#skB@ijPg2DfF9}@Zdfb8~g3Oj`x$;b?f-lZ$&SajU2 zjq-p~)}Z-Mjegb#64P?v)Bl3yR?Fhx<1Z2A$I4vdQx{F*>BYd9CG7unLw-R#OJ(DHwX7CzO=OePsDZ?I8)oK{Em?)RbQgBnGMHJQ9VX5hqYYJ8`0vcap z7p;txa|hrlU34@)eYHKXeXfBk#zdo??9+4P9P2ix2`E^h?e z)55M3XK!9G+I>s)TxtCYpKuZ-uZBHwtw8{LTs?9S8^Nk;Ejjm8^axJl8B@Fe+%)ZH zpSsc3Q(9V8dm!hm8K0~e+`9xdbMNS(=mLAyU01mwRO8fgTjXbQrpwv2dG3=ajVuVJ zTPbrA*r;FwcC*q=Mqk?GMnvD(#9)OS)C|^I9_m)jf_LhJ?v=?i56LEy-Tg<(M#;1# zpLLOCWVJWpH7i+rIUgmnzsP5(Uay9W{-rtgjWo4?qFM1N z@Jy}A39hz}sbijKi}V7XJTLWrf<}@H48?6NI+^OQH|(&xW}*Ai()uudmlf8&+Bg`F z1~WpN;2%!osS|}MZ}Ph>vo%ESv6j1}5)XS1;y2nQUX=YGJdc7QF(6&0!ZFOB?2Tkb>{?B#O zR;VK-VNj+0V)#-Il(}QD@cXhvsnBt+#ICUJshMZ>KJ`4C=!tDWA9!O!zz7b_D!)7!3^wjmnK#0L(Q zShUeav7hj^i>NF(tK;Q0osJdWl$Bt#P|w=CT0;xb1Jgu1NPFq1$*iljMEz-^ORSO& z(G~cfJK9N#g0(hV2G_y%Gu(((muyoTYsT#cw`Bl0{xgvFK3U)%*L{Ilv_hV- zT79)V^9Q>{E#z1C#8pDyoy0RbkrTfgO_V-SN&H)r;A-%#_9rgxiH=wg>*r2mL0?Io zXaqYpP@fnjr+D)rMLBU zvXdUxk|1Y{+fhM;#hEEV2M1CA4xH>~xLg`e*hSgzPDV+SA*^csN@<}iPwDW<@vq50X5$Fgt zWuHEcJaMpsq;%!AA6W3FoUkkM*!^ku-8fByy7ha|Q=iG6U>_K~3z|wn_ID#O=B!(& z`8h>fu*}K$hqUy*ts-KMcRln1KDN5uN>cCbqRWm|2dP|f6AmLG3 z2dwduYahHD)T1k#jwtMUhHE2xbh!Q*90@*lv)u-}q|0@-nJYt|)hPQwcS$izx{)-s z-Lok?bG1~WYJG*Rwo=+2Z=aU9yNjq&5DYaj*i3C|b9+Ze$sH-`XTxJ=;t>kNy?zq! 
zdPTQ`^Z1A_-N%l}zwQTjGd>Sm`x4nGZ^$Fq8TSUc;Wmxi$C9+CGBBB0x*qnm9!&V6 zo|4TnI7*rodso)mVjXPrnK+u;mg&h@3z$j;siptN(Rn~gRV`hZJOs%(=bWR!^sQ== zN>&lcsDPjdq9{>Nau{;XNX}W}Oy8=O3=%{{Q2`YMMMN;5DC+ya`PW;%_k@}5d(Wxb zwQEb#eUHja4yrM#?vSQ<+` zOL-EO^$L-Dv(EN~+Mj5fdQR!C6tXo^k?D6;h<6pW61vPg*l;VIZHsj=*-~c_e9fV0#@S6t$JGD|i1)O*uzCGH9_$f`R zjs2m27!1{~q>i=Jl6DN9I9s1&(qaXDpk-{DOd?7=fV1XBm8VP z!cVk*Dn@I4-RnIhW7NaKo{P9J$M;D``-|w770+KuPQ8j)Qh{1hEbIdN$cW#Uas#Dn zh7ascFv!>H@o*QbJVoEu$~Ml%SW!NunD+?l(RcQ;owu_-1Y2pSiTb-d#iw_nk9$D; zS*^pVffGH| z_K}`j55luo*=eJLk|##})P~UnroN=A4V_q@2)@%!$4Nt2&Z1ZzF8~e~*2&aCLpoAk z)gExxU9ri!N;m5{nP^9S6&(;m>07xQPKPzUM(+MNy`cqy-|Y?Wt&4ev{@_gMu$MFK z0Y36b59qsk*yn0IR-I2ya%MGz8^MdYUYg0_Pr|NGqWtX9%sw1#s*z8Eq5MG3SHfr6 z%XsYy@fMa-y+Reu`HwX=5JigIT{S!R&_ zC42lS6uBC#U30MPNk0qbWb`Dgc0bScw5RHrWvl^;YaclB4)l$ZFpw=iUNXcR!eSdn zbKoO?hJ}2XEtQf|!*`)4u9q{u8{g_3&hW0>1vHHPTW!tI{Z|?GC9iuj=aW*1sxztU z?UQ;sS0BYRnO){xw)x&ERnEfcY8kiX4ZXA9DCE-FK`Ts*yX*H%o%%I4$I z9vdjPRDiyv@afsw2bCmswtsJ3L-Z*QIv?|r&V4swX-&_Kgy}84AJiGLL5HK<|Bh!> zpa$@|Pt}o@!+*9nt+JJ;DwulrRyq5Mxbho)XQhKt#Ea#YnL8z#@GL8^`g^Fexpg`x z#q;V)?1_8Vz?bZN+8#3THquY;!4IbK-Z|ot=;JcL&K&HaoI&twl0EX2e~0Swg1_Vq zQkgT{Zyr|AHuyzf=Vz9xfTnvB560RA%WR6@Va=3Pcuia3)6B1wRq`o+p5Up~;y<(m zSbanv4{wCIEy8K9vqY2TMCTStpRHV%Z-Zh!Ecqf9-QVWQR$tFEjiL+WzSm`s3dj4h za;hKmBv*-$oNBbmw#44YBc4K6IUdrhEDvKZ>-S*EATRFdl@xxD_Q)eG4#`-Swj}sjPpP_|yljpX+GLwJ)w02m>X)SIWV%hXC+1X>VL|=r7UGSQ4 zZImlbSE_gWz0HfxM}Zw)Mh3CM*(;B}1wmX+%%Q zb=xdCJhP1pqCCOH@FVo{ec}4K=N|THu-Qh# z@#q*J2YdtneGift*FidI$_e;=`Y_lix8xIjTzdzPQF;GX z9|g^9ft-?#pn7`gA)AT5lP&d*kepfr+!?2D>o4TG!?2p~z?MX!^<6ZotwASN<_^YdIr{C!c>xKnnvd}$D@Bw(EbYCO zzwcSCZ1}Hi*0J8tPx+T%WaUUvZLfzcj7*>|c$Kv%!@f20>iQd2!@Qob9tctZ*1yg6 zV1t9ftnSW?R7=&^i_*Ki(RZTA*TKiuXjS&7qgU1HWYU#1#WTTLhUh8#$ZZjUHl_iX3ON@p(F$iU>gyw7~d5>>D?7=oH!G;6?W5Z z!g6Z+*E*5?9LF1U(O!~zQ)^Al0-M|Bzt{x1>P7r_DWKQwpid^|RH4J|o_!PM@|C{P zpMYt!3es}U<7bUYdbp=)4|jIwR_p;01K}iiU4!^}IpJTy*D`7spXwXrKfA{1 zupIH4M2xJOUgG+VP6`rqCv~P*u+MIu2?jn1CQzAPS0;i5`+YffLx+?7gC`!tIw%1d^$!$g>R(jLNkPe0H 
zDIdY^)P|GIq@(Sy?g2li`n$SLTY3eovk!>3U6=S2NmWiig)z^EVa}8WBp}46QA7?&9bK$BK-or?(`(3sAoQBd{mYJ4 zutRI2zGPuWLe#G+r?TvrJ&9GFr^5TBPlGpR)-qIwsuKNHXlZEjYGIedz&!6LF_9j!Ev{$w@$eXDJiBA;Oq zX~Nmzhp{2p^I;-aRUNH=Yad_cZQ(dgy)xdi)VC1{PsP&krvv?}D5y+5MsI3Dv?A6r zlS*dZ=p@~yMPTd^tAfw{5zf*oeB(NLbN0$>sXNTzA-D%2Y6?F7cKOdejBU^>VhyrM4IJ&W3&1G=F~vPh`w?*K$0&aqJ+~#(PBbD$xp_DSn==qeb$VzM^X-&}qRDtDvcq zC35Q`-KnZu$(Bm_QtRO}r~;Msz1o3)-qD6ws_1t9|Nnv$cEd6UX@Ut_M)RTEebVPB=sQDzgXu4`WJ0Bg<|?g=a5x6e^kPwmR=XjQy|uJl{h+pE&ujCPTnm2;1D z_C}^@g8zU%`jHLB=dSq!tn&q|c&|N&;`ub(_!;(QLs&j>pDvNcnm$@b^7>_cLeF7E zd6ZK6{m>ucppSxM8!su{}ox^R$2CyKROx;_1&@Bf4U~mHs-# zUsRKg{v^5EN$E~ZJA;P)ubfFfVt?38X(*NaTW@I_;p^#`96c+TWz+EcTw2JM+shh> zykPWU>RcGq>&E0KOA4QXZy%B*P;8Mr1&2O~C3wP;!Gx!LGn`?KHTQ|~zNdH%*jr}qOiV9DwJqNDmy8Jx1uf)8_>q;-hf-e# z(06fFw%ZJQ$D7fQJPo^S4Tn$QQ=5}J{6q}d&y<;ete$oaPl6L|IL9!KRm;v~yjE09 zN@#g6E!@pV1f^3;3kF@C+kd3B&yAFntun(tv-?)rmrHhS>KCklt%pt2Cii;H8`)lB zdl|CX5_$}sdxq|!hjE5i4aJk-noax>IP{Ya@Ulel$uQdzc8jPQ+}lT`U1~*r&{O zmFIk`^}u_^rSg7V3t#VRp)@pSW&2nMuNG|Mzjym)?6R8f_KQAU_j@ME%ni0rgB2^V zzc{v0(SP+BLBV)ORP^_*UQT-q18+aE z|5%|tbHuzGv=y6%=d)Em+=%X_*G zB0{}j(}Vx0)-qc;%qOqt6P|j~BYkv-j|gHx7C&Ugp? z1RrJ0@xvZ>>PdX)3m-&9rm$xZpY`(LXng{e`IVpqmGs{Jk=*pNvXH*6-u`h=8GgN4 zbNgi(hZUsLdSpu6@&Q+RR!-wH-&uZ5z1d?bJhloJv5=F$sgqE0=vHDLy%%)2*6_o= zSgPPtMf@HAIFdcOz(kMhGW*xg$pWiHci6AKMRz)9>3k?GItLsp_3pXHsOGlSPr|g! 
z7#dHn;-T<)RFyAbl`FuhoVHpgX?Et9F0*gL&n-vfbA8oLM}~U--~jtR3teS1%zdu^ zroVZAZ{s_>GP^zvn|h3%)ttU2RyJDRM^gpaOU&maCWR5R_zo+*kG_CSd);HqkGklgXn@?9s`RHPzQbnwx16t2~O9!f*rFUTtbr$YR zh08n!&##9j_j$00^IoyAF;%+~u!$@-oEydl`B7H=dHqQ%>etjPU49R{vTx6WtS9Ux zeP0sD6JO91@YC9UO3}mciLIJ-;3F$F;oXgfD#|hqM;FATG>mu#=WzQ-Vy| zTN{&eKkJz+H*EW7S*caz9*pH5IY6#)N?w<`WL_VKFL)r8m45u-6{~^r)zJs5>o0zl z?>!^Sh)-iuas-dbEGgur;egMCFW3j<5x4NLzYP6^6EEBqhfX($=d)muEoU!J+UGjN z>!F*@AyXcuMT10eX`!dcdiy${f(VmJgxgf~+1AjuSnAxf{8&jLts4%3L!3h6I1pTn zRA6@GH}tF&L_cnh#|=cq8l`*fXY1%sat~KGD-?eumvtcTKU2?W1OHJ2qVmT+L1(5k zhO7KzJHkYAf}NHS8>5Y|g&Q6Ts4RsW!xS*?Nph|~HJJ{bI^5;D$!d`KQ~T4dTEjP6 zHlpSipjRI{5o+pOyl=kdwjDkZ%POJQw3U{P@0Kq)<$RL9#C)*%FW^GMd@1%b9>o05 z5~UyYqQ_xY8-ovIig%&P^qRJW?I-Jtnn$0q?tUSd?tzZgarob5ds`m&>^fPFYB6Sv zy?}<=+#cB9Qp?Xp>ND4O6zEI4;6fXUbUp2qf(#J1zB$QdRD>Tt>8? zLHv43>ktWxX=`G{S6U24I{^G#pu=^fI(9bPx_C}hiYt=a_Gl%`!adj-!MVfeNWG*x z{#J(Eb#lsJ`7xNAIM#-c>&(;(cw2^OW^3Z_$7WK~y(E`yjem&ZyxGP}Oa|+hmO;PA zCbL*Xzk%zfX2K=VsBijt5cENK$42-$J*t)BBgkQk`(rSqW!}LSOJnlnyQoS#{Fq#p zPl+Q*o-^12N`6ls{f^w!+2G*~n*&GNXfw$GPiicdD;$R|f8P$H*w)sxtfXM!F>>j} z;Q&4BQ}k9?op*T#y|xl4e%&|uZZAY`GF)D!1KA({p7uX;m@FuRcR?0VO#!UNj#Z`@Br zWGfgSBx@}rIfpv83u{SYQw&c0zJ9NnY?F=Gygm>fc32zwhU7R1G7WWoqV^zyF_Xkc zs`%^R(pLJ3r+BHz5@}<%$i0$?GK1lF2hec7pyHG!UO(VIB3nTGxgUN30(?#0`8F1@ zD`<>g6?O)x|V572lXsmc%jzRX7-A2)tO#gX9NdWv!blhBx|mFyrJju zlG-4p4>6>My(4XHlP-~`^#hn%KQ9rVMrTAV_H(#?mU2t8=_K&xF`uP1y$E}jNG9|+ zPwZM*^Mr$R1MiuZUGJ|iOIg<9ReuR)Hc$WY*Af>K5h{|eFQEQfRgCt^$9CA}N2{P<#i=0v7AxVe62*%94`E$U=tR(7UnWMRfbeDHG@0of8zQ^4 zJ6Ui#EV+w6sk>x89Z4NfEm|ulP<%UYnbq#25VX;);S61&f7=)6W`B8<`zc)WP`CbB z*64ldYtL$&XUYt3nQX^7>zp~-!`J90^!Oukg`K6-kXz^$>1a6`%mdpWf!VL=vx#fG z6N*S-;`VYYgDvz_=19r2Fyu$<+9q;?Ioe-(+2`<=fE;kFJ#L%%)0ce%xe67aP4z{rTyB)rthzF&2Ew(awm6uI>CW&ebiR(k=k`&!Mfys0xLbSH;UV7i zOXNBGPo@xg8iMzuv>oc*NutSAn;Yb?G(Lva|IH5hQTmT(%V-LOP zM_}Z&b$}M}6wA-&qF>pppe?!CBRlE|x?0hrwUJJcNqp+>)VD%QAfAP=3nt4l&vgoQ 
zwX<-JmcdW9gY0QOmhp>Rq~^+TG=lF1|X!r+$L70XVJIf4GUJgodyB2YCaFTfij_Y3P=$ezF(Hlr()gJmb`7%8A*J%?3*W%NdK?5Er9YS2oC`hP)De?_-Y zq4_mFTAMx z>df$JxG1Ot&ff@&p?g=2c%9sd4s1MpxJCuaLy~}NNuT)Mc=Z@zACYMq?uF)S)=lfQae$) zZOJF*67O!va82?Z$(gyGV45EGOTh{4Z?D^NS?qHpQBTWfFqMCTt6@IR0JF-BUH>3_ zvS#(=p3`p=8Q#NchY_#V#(LSI@PK8~LB86*^KaxSzNw)v2HU#nHh6hOzMGrjy^4JZ z_|!H&uJ0?pfJr%W%2W zvt{};EP>A1u(|EExo%!6H5&Viu*6)^yRy>rOAY@=_IbMS47Nh$AY4taTzdDBONCR+&ed*u)Ovf=+QKOt>8aGpGMwiZ{m`~)NBK0C zs&LP+?ct5sd*Q9fN!cv3*ulTNglF?h=%doq02vl3V7Fye12NqZ(1+)4SE zXn2EInu|Lha-%|J<}6A2_(!%VOr7IYM}L-?@YV`m$-mYarsSA+@sK-qSr&6nF3Bqr zeZNbouS4|@HORHESK3tgl%jo}as@d?g1bQ*9OA9|3_CT>~qDj)jr`c^3 zj57FQb(Dc5nC$|+B|XTqkHTd?gzH|=<$jHS8im)i6rarvu>L_L$(g^WIpYh1j*Nkiuffj8p7GB?#`4ixULjblM`Q#YPEDz( z4D{l`doa8UoOz$_7pT8IWJRkH^}h;gfG*8sjc=om?t9R`pnXRs@?q?FP?%enXX#dd zpA#6X@RbGrs+NxLw|#cqKlM-!L_U=-P(oS~H&-)PB|Q@{Pla1iOkYJMNkp5v70f^_ zUqP(ii}qMCUWItvN?!;@`tC?C{V5m=FSusawJ!amPdl9lz61tT&Zl}3dG@6+O|&?% z@=JJ5O)PjmF)=GXlflbzuWnl`{f-o2cUE{i9qj!$jX#Pk>KbRhst~Wf3I?E7Ox7jv zx(T7g&I-3l*>3D?eQ-k>^K+%7CFK^-ta%|RL}-nzNo{gVa&88_DL%GOIMz!fIgBgved75$sn80(@EIq``G7A z>}Ib0hi=-2ZkOElqHXt)N)LuU2#R2*g<)lnsJC{~OS&~RDsaZ4hz^Dqq_X+0o~T_UzYNkSYnV04%x_62 zyNJ)*=*SrWns2dAwjUk(8_5{Vl;JWT8!FA*{EVA$guWd$Di!R{D;g6Cnze zqk6cUPnhAg_1|DCJNvA^?R~&J(FD#X{U!rFFsjn>KhQPV=KH%awaG-l>Dc}ZuVy`A zfOo^bSXIMNWtrs#le>od=??A-Q&}y~co)85o=kMJrk)dK@xETr0dSV#VEqBV<`4Od z4>XzGnkZlDKw?%#tp2`@j&1gr{J1rfenh{1kraY^ry$ zBJ9sAtWkN*#~i_&wnSGR9D<(w66~-FcF+R8pN*PHC$KxKZiMF)W3Q@(oOj3ncVk0a ztrD^Kkj$ZWS{O|(1ubed+Qb^oNC#0ul$j6yj(j8|Vb^P|wqK1XYW!kp7kdZhnBDf< z4G^t}rh`@W<*6oNyB}#?a_^Qzjf=$NS>(gH;q}*GLRW)kXk&+B-}2m?Rk0pK4W?sr z@^86Mv}A1z@~sF`>qGrOfV!TZ=s8Sx+b`bR#}X5sBIa&p&0s$MoA0CM_+P-uMt_C> z`=XmOXs4jFsuWz_NqTulRtJHTIlW)&5v(Pi^gWP-Y(U z)}9P{XXEaOJg~t$_{g1b6uk1FRsv&Md)uIoGJ7>%%f8b6u`FJS%;_DvEnb107qRki z$SK-_s9Y6wVj(%}H~4ifGTlFQDZ1ye#xwL5iqjr+g+)Q4wYM*=to-N6a!LmHdY-Pk zPL;V5`XYGRY5y|iguE5Jg#PV5N*m!z=fL6qcvL@6OU~7ZX_sHwk8(%%>Uli4l>db? 
z^p3YTMPpg0yl{5NlkHo}&J9eQK++jg$ z<8_3t=buBt1T1!ltDSHm|Lh}q(E!eay^BD=Bly(_B1>UzSiSCDbvb^z!1hR49f$X} zlghGz=<@_oxGT6`jEFc!hT(hSN6~ase7s+fFJX{r$ouj}KPMa9AnKn;er)d;jJIC! zsbt_jkJg?E?zIgKFg3!A*6jL*z6eJhO2=n9*1raL^NtPUuB1iUH!@!a>up#Lb0oDh zf1B2(>Q!6fdBX9wl4zWZ*-9H_oBblc$Qo@&F4)|j)ZB`GgP&}}H)bmxm;M74jz-Dv zfNRIYgRsm*&!_Y7kE6C-Ch2oT^Q+XkhWh}%WrdvZx?~Io@wa*50%GTSZLX^=qJc*3 zf1W8a#|ol4l+nVv9$he}uJH?=>|E+g7>3-Lo6wzpUKw52lh0UmL{NDggEb?z-lNufg?wwV&H#woYiK#E5 zTF#VH{!O^YnS!s+`kVHF&$P|n%XZQ?pEI6YulZWb>ZhbUo>{`U*U1`Dr^@fkwL>sN zpYn9ke$>8ZfcXQ!;PUc3{`3x3^#QSPD~SGwEaQ{Q_(i);?p#aPYEf#M9j&{aw9uA& zGiw}j;!+M7cM?)J?#n5fBgE;kJ}Ahk%&_wjc>8`5R<0rB}rgEN|4W`VYdhB@W?KG=)3ui zr(lVbLvE_KL8&a1Xpo+pXj<9vAPxN`k;pf)UAri?L_GYF@A1xlmS|QR4_u?SIQ`dK zU-QXSG5h;C`aYfvdlQG71*ttIb@Vrj!BS7+zfSF;XMi%P>NURmO>(ArzQw;+rV;6*U>P_b2UcXhu7dks_2HoN2w4DYnB^Hj_Kze_ zoUT#MHPNqxzEK7ANvl-2!&NeeXd6LM{WM%EqbGm$b!iwfHjU7!E#d$0$Q|GXy(!uEau(i}GvM?FyGg-(v`oRCO4A|_u_8e^S zyiO#4ISUdbP=kL4beX3u!oo6DbJGt~ODpIcX{IYAg=$-?qDyNloniQApc;qX(-AfS3&sd)LCw^A{iXFigYjWqtHYWZT z;nvKZeg~G&PB$>Uo7!kNlN|e+zF@P1j}y1(@3sxbbtOo$LudB9dS#rJ|GNjt|%WfhJXS5OX*x;9W224JnB(Q7pbPQo(D(>E4%r z^q#MeWb{RPMUQ(Jxkk=#*bAKM4{!>po4cg@bLG--?vMl5(OimE zN8hL2JWZs47E}6OQ*z!vuz)QhoA?;}SQ=KTDqfh17yQ<+qn5CdK3F{gi!ViM}-GLC;!C@8Stq;;CRL8Raj&k_>XKwi%@OHKTz{Iq z`HxKC)3B+(6K;?y%=URv!^o35--m$DNyM|2))Y*CQOjzSpROTAvG`MVQr7dF)indL z;VRj4R~U6O9mGDC=Z>z=rM{Nn6wFpIPW_55h)hD8?In511#-o+So!d9Vh`P7Ikgu2 zsVr|hDah)RB}-VJ%;8Tf8q7`pPX6*n{tS%aG;utQEw`ESh#ar0Hbn6rtA%Vdo<9u) z4n4qIXV@=xDUvz`ElJ+>nc*<$<6rqHzayN2l2g`JcN#N-E#)T$~$e7ckP2cl^*xyb0-pct7t3cc< ziJI^Vo>g5p+j;E$JsGHN&|5NMQ|XyeS4}AG*0_*-uQh#W)sOL>AZ)Znso=e-a%*BADb{&FA2~)9tsQv`+GMWUCX|vG&r=&x0AoSdm%!kglvJv>gm|qD=CN zoA zu%$cMMp3*g6Q>_1#)|s3Ae7XZR(q|oRV5~swDGb3d<~gV4(Ij)n8lC0_blSV274Ge z9B$Au>RJ!2aKB89bwZ1H5LS;j_N-D0>{*XRB*38*n zi^;9(dR5-0lg7|Ow&*4)0N$2|m;CA7sP%s@eIhyF>?^&a{o(Dkj{m@#yy@J1Mqcth zxq3}{cDq?8l=ESYYr$>Hql#4V!LrS%w#94uHu;fEqO8o5T0RNXp5Ue9Q)P-b@HjEO 
zyI!$)a4&o=`n%mjb1H$3y+D6r=0$zK=et3l%;B4?GgA|T!(^fFx!D*HNEbrbppTk2rmZ?68W|Jo>h7Dhjsd;fa*yJ0WQ$Xe9)4JqyH zAY8JN{wrJLkv-uV=`^VBbzv--;xXT#tu+ljH|e=k_#&URPAlMbUlB2;+HQ7S{Vc2{ zQ+&J>_cy|mwo&rPTQ*)bq-TOCm5JX~y!Wi-u+@?u%edq_gB$P}YJ@sU|CX$dChmFS zbVx{PtaBwDVCB%pLi}vCq@#1-ls974mlD~s={cRF-SsEQh#zIa{>Qkc_5id!8vIObosYt?R>ymZ=vLH{8gf&I*dTw}+vzbr@g?jhFC6qTI;BP<28HW z{?~LU93`XD+l&t^VI}8lGU!zdR@R6P=yoYLgZtK5(s0u8VWfsnf^FQjMRb!DjZf04 zYWgkNa9^U%=CENjOFSp4+q3==(RPB)^{=$NCTec1AqRc-YseW(YA5UN734|k^JR&H z8|*sy{Cp{+tG$&@)0mYZ)|~*SA7d|aYKk|~pZT77_NRu4QNQWEjlL1Gz9r$Gy4f2M zK_6J!;9c8@pH$Gd!T))>l6ck z73EDd;4ih94-S5fa2s9lUAQi#8wglaAM?+2l^qM$p};0fQPB)x@01K}2)SAg;3Cs!1keI_~$12!o z_8>^&DHa9a@vR$)aW%D$w36@jS)HDEJ{%!0dBOMx{%NoXKOGdLc2+I6739>HZJH-w zTW|Zv;TbKUvy~e>!h4zx2LBw`a@Q9~Ti)C4!b(4pBY^SFxjQbWH)YuIWEc4MH9k{T9s&5!C%4p7xPvx z`(%9~JS{zaI0{4uO}&$C97^~K9U{v-Ax!k-pb56MEPP1x%z)a}QIq7Br7AmP(Lt7J zYuiMAk%2i8Q*|q~>(sve58-4j#bl4Z8BCJ2dfjVzuJ~}NATd2Ht+2ATnl4;}*gjfG+b8I%OrH7BCpC~4e zN1xO0t)J%#zO@_j8nY~3M+dxZTkx2R!9_6rYB_HbMg!ZXwPs z&T`exbXTNK(CRM@{6Z|Vyup0zo#5JH&h9Mo@w!(|*naRo(s+mDBx9K6Yhb7SeTa_@ zleCEU0f!Pi4;bFoSD|C{i=B$xuy-`|X4bRvJfDBqZc9E+dn}VN#P@x=f#@+?$C0bn zjb@H!lL}>{XycN_+dqF={x0Ze^cnM;axws$yjh+zmk}a z3gdh)K=e;`dWvKldeRlXWue#7Tb80m1?^4VY_R{>c(ivX>Tn0K{voc1LAyjel3SeO z-vx(kfIOkU`7})t)rw@7_h3Ouaz&)-Y_@+?q?cB< z;r1STRf_2P82w_Uh~)D*cb8Vv#os5#`rYb~RX+#vXV5SGLz(N_rJ;^zwFmh|cx_GL zbbF(dM2Rh76`ANceXqPFv+?6;dR*q&34fc8y)gkN?c&+xv^BC1buk^8+rX2!L_NE` zr#U!N)Ha$fx(Yrr)|O~n-zV?+d$8HzK~_~A4AY;XSEwGjx6v%bMQ*ANYWR9z4AL(2 z&05P!qQn#iM_z-Ka~hF;?JLsJ+e=UWyq-dB0*T{YBhb*`?}i^iFQ;F@%^Ytm1L zY6bhAOkf;OGr$*yC89OqWb>+hh(*3lx^&~e=v`9q5m zMI)BPdEn3F*W?4fBv#qqpra@&GX~PdOJeKYg3Tz$^{}~<^#6Z{((^5!(24VFbLDAy zO0xM1`1CUx#@>!3#_M`{p5q?4^`W=2Pvmp2>HAO1}g% zbc*iQ&HkZ3M8Bx!yYbm(VDlBQaxvN6?Vy@}777{4n{rf^2BopQTwc^#YtCpRPEs=c z3T3LGUa%LnRpJ4ui(-(?H~Ucenp>*Y$RsUp>EbIrvrmKz{h+V=Pju82jc*EG)7!ek zhlct6p0)6*zS^1T3(MN?U-N`NcmdB7Z78dVJB2*a=i@zA`pl&ghfUIKG>=I;eqn@Q3!H=d^+PJDBnf%E|w1 
zxx9pyKi7LOgZ~NY6&t;meJlylYpj2AWTY2}4!8gO0(f66dP3?@`REyGtHpzitmz_q z9n7p4U*Qpuh)FBxhGldFEa;}crHjLZayq=NOSPIkiy~Meep^f0GOKR4VyAt&WF>~T zk8*PfO6`5>8n1@qeYVxr3!a^|Y!BPm;60Snb76fQMm2wIFi2Bpp)T}qOg%k#c2qx+ zZz9j=UzVTyr6@X(YDAY_$Dt*_Hr=1Nxh>Ny3_^>^fS699_M`SG%M+!durdNdJT=}NRN9G zXJ~5*cd>%RuUS?&M>RN1z|6sE`ff5e_lEc7k06^aBJz#JPq|xDhG;&0oGf#U%nvts z;6+gF(OUf9h@%V@u;26>pA2W|AS*owRjNbe3GeA_`2TZ$mv5aBnc;`f)(h)YSj*G& z41}@Q<#ePi6}z{cDb?a%k$rsVEwll=GLL+$UA!xuF!g01dd|neY~P_jS}`xFyL1X& z6%*_tX#W7)tf_6tZiOl2#cc*SzFi*!xleOD=VUFYErLtvvE)VFk~6j%9?;L9k*eg?obs{#FpIrLw`w#A7QIvNg3uQ^-PS>`(|eOA`lSCR z4T!g;$j1+A9{=8wSaTSmzoPGG0*YK#TSS)ptDKfhcxGYP!Jpx38xOuE(GgLN4!Qw0 z#5&mDL5AQRO#@fGP5;E>;r3t@YrsvG*vVyhV5Vr`FY~m|Nzr)e@NxebY;EOJB8A!a zn2(4R^=b44R`d$q*>m%LJ3+zM$jj;FfTeY})-bPLK3n%mQSC-8W`tacoRNLHT_=W{ zym~kltQj99Tj^kycd+?bUV0y_GpUDd^7WpdS-6GdG47>qrL(Bq4%FWw-%4%I23{s$ z-;bn*9AnpZIkyXd*RRG_TK4!+cJH>gz@r`yyRw=C{UC@{kB+Czwo5wed3Jw;f3432 zizW3&y0qR)=kQ)BWG$cAI$^r0Ea#N)UUc7{SWiQ(s@>rUOTtQ2k^ix`e4T$qtup3k z(UP}Vckkp^yrB@2H1$UC_k&)VsAKgTrDqB}xGgRGIeHTQ;pdl7E1c@7_bT1juc)z< z_7n1X$b=!ymhz=Np&7aL;TsF^)XU^XGqCoZ?9F-j^={7#Q`uy%dOk2JIozzJHILK^ z_t;E*7Bs0Lli_fG>8qZ{^V5so)^k}^pX~$nm>siP+J^f4JbRPpJ>q;cb2pv*ql(mt(kTQ#-Ayc9fmhYj{CGRa!TQ*CgFyie9T z(9TgoJfRuz>LefFoA^d%ih3q&Cbu%@(PvqL=M9$IAbkN&dYWu@uGOcTI*YEB%=Dm4 z)iEJ`3SlkX9;DuHJVN94J$RqXxlhAqYF99Hua8h}HbJ>>4P(!u{fJa$f`fVhBwXnC zWul&tj=q|wsDSTwLtCD$p@NB8B1oN6k=EMD*O9rl$A`-k-jSHLT1t6s*za-}bVVZ6 z8BY!~aQ?QvjST;QEsXXx_IKD%3-DG|JUut{4wG%cbk7&vf`wg*Ow{V}b+(e-i1|WE zvh8#ww`DC0$fq_IoIB0$3`QwG<>gSma>ZY@mFy&E+Q7GT`U#);DxP;nTktct^CCs^c)6z!ab-vi1wSHt}JK*p=Qby?i&_FYJx_GmonzRHd z#>+3kF4+Z7&EV&3I+&Xr?v^akR3}en-%e0R)?&7H~yW-6ZX+(q?4bAEw-nd z^{O#PD>5xS>`__C8z0waV6H!gLz&u=MMqjGO~zjLhhKO{U5zGOFj|_-s;XDA8^nyC z$P%y8$@3+BAMIFxhqY~sz zgLNZ$F10++{X0E{3X%{XYqiKu_K|mHh}WcN?OWE|v{~>}v_!mzALVrSKvuP)zT-!9 zAUJkFC&+T67c@8>Ss@D|1@$3om?65?H`+Gd{R^!{zTH-r^Qp{r^ZVg*dY@f*72H?P z@9T*1Zxfd_{DXd&_q{pniTlAhHIPS(MoKEDgW@Z&(j!{MV|Ys+&#c@E5uCSaFt!}g 
z59DQSfmc3FXY5LJ=mR#AQ{eseBRi}O$iC|nRnkN^QQ517YS6&C`EDz#7s*PGX#Vhq z_a+Bt{;Jf`@?IR=nn9Q9I=wHOJTW#+%krHkf~LgP+{y_Sy(2BP9Q$&C%Ki|2Rc@fq z?BhvidJ)?IOR5+xWHsSOL%cS8wj7KqJM(|$YdXB>PyAr7Y^Da?A>P#+5?ymirCqT%e((*B|}1|M^KLG7=ZG&!uMx%JQR zNm$sH)Ms^0DA@)SyXx8~m`1PN?=m7*Sf9}ix-sly`810J`n){l|8Q5-6!3DIx6^vM z+=l8Z?Wu>XPHd*%lNY3or%n-Q$ULD{lFr5VfvmTf)SZSK-@di&oK)Qh=Rd8>^~+#f zc#Ax^f!(3r(1YLmgZ`h>!N2mRXT*2s>lVF>x?kI0C6C>%Nt(tBMi0R@%EIYh_Hy=Y zFv+i4*61NAhANg@QqQKH!UkG9cM7vN%x>{AobT$P)ye^CDywW2ar`Cb)jSW{r)M%yF1s)P+AM)n+IXMGr)B?K`@BXmanu4ZQAI6Y+dS{#`8+W@9tD9m|ujnnE zW*f9YJb$#Tt(F`%5!T;CbI6A>Me2ZnCxTq@k0lFHaV<}^TfRmets684rT&36EU+#x zh_spjqj-{>oEzQzNjU1GaHRhj;Vy>gYRLk6&h&r$0DN)1W(|^qH+cGI*pX?vS@MxP z_m^DJ+FH`zksiMP&}q7yj= zwJa~t)y0p&zARy=S5Is)_N|j4Q7KMUPPwBS*z%6 zo`0ghfzH%7oZ(-4QT+ZFjp(xtXS*XS&5?i;YSHmG>NSSBsw zYvd{%uBG>2FZ%@#bcSq(_088w=!Xl4scCd9b%Q@6<%qD%?jU0>jP){OpZu>NOr{Ok zu@P?`PruC_0YSY&yeTXCnN-z#1_J3v`hOjr3Al}A_r{IKEb}ZgnI$6jyVfQ{nL<(- zLPb##pHfLUremJxd7kIl+3#B0EMy)-iHbyXiKhQ=|F7@*zAriZ?Du`1HQe{T*0Y{{ z=oUW)IkckpX4%0~%#89Zx*I*Dvwb05h(AA~^=}Q2Nd+6h?P6skO=YWI4)ghgaIsGJ zky<|}Of3H`;D$@Qb~ftM8=kl)Z(uA}^mCN6>9Fj3(Okz6eXeOizP%NliJN7lo+idFwX*yDm$u}V z{aD@$u-8{)j~|uUL4^LN63hqU3^etO%6@YX_2b|L9Z5Fi{&@Dbh~|j@Pp8TuorTR- zk9|T8Y3dexRT`6%8%Zyp2#Wcc$oM5yn#G`jn(<~flOC;a{6E{td;g|!{U857RSFVM z#*#C(`k=@}zXl_Zg1ou~H$jD{NyNg2meq@Z4o2XmGoAT!vQZx+TV}FvgTC6-CV@Go z_@BO(?4|yRPK^B#&bH4btA0#8Q6hCUbZn;h|399}iOpQ~(x{DV?FZqW_{cyX5uRjH z$bESjJ7x#5$n8}5_Tr15Q^Q)J%WW=sxluR|zvE1h71!oqfpJ=mpS)nj>B`+AqhUT@ z$pV=KPPrH>D{=oyOH#LJBfkb8Vt@U;EUfx~JfWk)Q$80pDFZW`i#l}#PwNL`yA8e3 z@{l(=>LmZ29Js|-hXul)gO&A5@pt*-2fPkwd$C7!nlD7LXUeEv_J6@r`M~k-c%|q; zG>-q-qw#&jvR+yUo1TqYeF9WC*zfo)ko~jT9ql%g<`5)yo2{a9@3To8nK?A=Wl#t36*Ac{? 
zrrev91Ai~!zxW~Vz`ZU9yo>ITW}u6lnjN3JX89}y{dT=2PP3jc^Agcy5W&m-isz^5 zwNJ8Q!}+|PHxzE2K})a6{be2G;~>nusqD@Rbd*7!sGPBt(UFd&D-04ZR{Ud2BOL< zx=LS^PlH0R%T7e+r!)oo=`B@!o-fn|WXWR8D5#F6H8a?x6Z8uoqOH}f8#Rl+gOgx@ zv{Ee2?Hszyijxuc$^mQX_iR8g!-qJ$)w9yOzEN_6I`-fZP3dJS7M~vOBD#L#f!6Yp z+SBWME1%=#Ee*Yd`9WY2O_R8h0;OY!g;piB3MPoji?DbmS-UrA4tVqCs zd%U7=@~@1$a`gz-nhV<+Ku^q7vRcvLY5ZWR)0M+SxFR})zOL!o9S_?MCTpqNK;ZO! zc!j71pUVYsOD>~s?8h~o^do9d#%6Oxub~^K2|v+%-b|+i1MEC%eR{OILY^#_!D2DJ zLigU%u(xE_4EBLN0{O)y5f*%9P zyVTRO_8g!5CMbq?&a#r4Ianvl=!i(@?3=0P^g%6J?C*Qtc!2Nx#(rGWk*N#dW&>n< z@G6Sm2rcNVV}-asc%O`PrlI*PcI!=@%F1npg{IT$WWMLX0yR-ekFgr1S+U<@?^476 zMURtJ`dMY&NVFKH+{aCS#z&e%7sHgtOCu@>TcmL~n<`75=veZ^e7g;ode#f*L@6Sn zrlBra#GeahNO!qHWGuq|mWmDzPX~N&u-YH^8u>^L+EyQF^K=8z_M}WBpO*zA)DOx_ zet*mth8L(Sy{0w7vEG9^Mh7nq`uIlDgU~yX2?pAskb9u)0{TEn|6XUMzH5WzS9=c5 z*Hd=*I?u^G^rw|-r{Ay_f-xcY%lS>)1#0<>oVQ21`TsNpKbs+gEsAAaLT7yt_SZIG z!pwdp;2vo9x4eEuR(@ZX!(fW(Q&y8$_W>B85$Jk){Xl!`Mk zM1~PEMW^^}ONxG{y?j;bM62Zkcv|McQ0b`>PZK|3rNH8A_{=@;B7e(wvPlALzLEc< z8$jkmiGFFloK9rlOM5?!SOI5lnNJ}8yka@xFM)(UV?}D~4l4mm|Il6s0hS`twfCW# znH}4#lfl(L+XB>qWjf18=~iE*RIK4ZS1p4Tm7ndbl=IM+hI@GHxn3yVLAaqnw@6%a zSuPzNyJQukj)kUBKW+_r;%*6jAuQ}QQG(VHXAjvP9j2qCT6jS+!iLXUi7;7C`3gJh zY4myD8+O$%$t4@*d2;(R(P!f0KKBRkrd&+z#^+s`T?Rl+PUy zW)b*wa7i^`f|HhqWB-Cp9`}bfAAO`NH5E>af}lRIzS@d^o+2BFmaD+M>%F@@PMlt2 z4Z~d8$BWsnNV41udwagray~86Q!bM?7lOSKw@p;H4*2$XO$vU6>wbbgBx6OyZ4g2( z`_6B1TBxKx3|$71*=C^NukswM?iRU*8XmVkV1_Q9Hku=T7wx=4WV=5R|1_*>J^gvj zXYY~A>QQNKWozVvAa1|Hm`8?>sDlcs`vjW-!rldUsHJCPS-At?6&WnqqIJ9q&om15 zGd83LpN{fdK3d-K*Wk0C5~1S``{wQIqDcIY{(u~wAy_OIWTuzazsXfkdPO@gS0x_y z1P7i3FVtY?F3U1qXN@&0Q!V}?PxocD%Q$rdvOqEL_QpsD`Azbp^B$2r+JH>IjeSYg z>EO5W;I69d<4i32Qn(AmmqhFsFM_U(0Jvu;Xx_2E%>Lf03{3 z1NOOx?b3t%+ebLL+KIl8o%}f`Dq04RVM1fRLMCm5#jms%!>6q`tJ8$&|BF{kOuZo6 zM)Z~7u+X`J^7WqAlb&F<_qRwatvv)6Pr7D*4co)m6&mukKcSib9y% z&&X=qAP2yt9d&|j55M;Qt5i|4t(SP$o4_TejH? 
z;LC$B+#y;~_vm`b#reSJ^(Ho#CR*1XiN;wsPLN~Gx2=dj?s@cqULa$YjECgqv2eW~ zZG#@Bn=QL@5)(F=$(|<)wA7P91s#imFb~vw+XoS4`br-Z-xacj9PWZYrMUz)Mez&U*#2PrLWmgA0+8uBbRlgbL)$z%6QP}QNHPr z4zx=!g9gsrByYsG^bP5OlPW#~&kE&_$hVqa7x*;&G`z3tz_s~79&^KNdQL{`HrC;Q z_6Vwx`)>tpVR|d9kT=tZwi&N~L?`IIa4=8Vm*3X~4_$2kT7Q2=bNX<-E9W9*_^HLg z1kXlaQeAo79~EZf%7w6jm9Z6mO&9ysaBKaO9%J8g=vc3$3oWxv)VjWjCwkK2HW{Cu zZb?|$29VngucG^c?ZzI>iOcNoFd>nE|cR&v62^w z!S|(uKf!#}*NJRj#$J%L@xDZg_pBmn#n(XxqSZvuL^`Z8+5hI-7h+q<`ni0h7lI9X zOy=O(SMad*DBw#`_d4@?^II1pQ;z5nKLuYIr$1?WUlW@Q7CP)LVTY5ft(^86_;GWx z`zzS{9qUO=cn$l#FO2DaI>ff|Je7Q5{fF#xYpkWEXA8D6t2T$IGNm3*s|+^y!9ENg z*QcoOXYzxVLCa_vD@6zIUOf@cB7e_d^}e*nwI#@;E^1mC*l>USNKp0T?PO6{$tvnC zKjLrr`##Rn!G5LT#k(+urlF8P=Q5Jd3l4OD9_Qdp^1v*M=pzs#e-th4K zUfkRG0y``p5R)&+O1ng-{$Ui3d%;YdLUn{1X|NX@bkRnTg}-BBZhOxhpTu*VfyEJR zBu$)28F+kl8}7R$IlM*GFRZV5ddsf!ttnHG4qIIqRssR-L6iT<@`4RhZ7m4;FUv_} z7%Rhg=6;$f+RQ%J;>a+s`^7);<jf+Hks8*B@nl&@||zxg+(pljhU?yhAO2HkKKlR}p5ab(3@Yjc-3cCU$Gy_bGa()y^}5O(c5S1D67Y$^as`$(Qv1S0lc+p4v~u<(TEx$io-=6Nc55T$DgVZ% z+E|Nvrsy4e(Mv{8+7PYjDM8|XxUSgV$6h`BnRThHMdJ@Vmwqq#;`d-Rd#spN0m=4A zAbp`q?`iE_{m2LUs>mMuo;aA8xH}13yy1C?qZfk|EUm6Hy`21;F1lJCvNw}_gq*O! 
z;SS5_lfARnp--Wod`sqE9bO5>$`Z?}nWYWbcf7ao?_jwPVws7vMVa|bn!7LV2oZV>A?gFepT^MA|zU>To##bd>!ii}ZkuhU1S8GHO(ZC-SrX z3(q(N()u;n<9U6K4v&rDO*cq+O@qhA^^_0ug|b+OD^qcTf9$-T)bCQW5V=dik<$BG zSbe~rjqtJdz{}a&`hyL#DR@eH8?MPj+3nOB@Ax0mz&rXye_2|yMya+?rt7U(BhXwO zJmoOBcf0M7iA2|Gk_|N1O)hE+_`n)32)caIo}z}hjb4>EeV@EY&rY}aZM3TI!Ua^s zvV$1+rIvJ+4ptBz8N-U!@V7lZ+1uDUE$9DXtz%Js=6OSQ{HGvs)}nv7(^h$UklJEl zZY>(i`EU_Gmw`Q+gw>XW4^NY>V5UWpk@{!sN?28Yw@S)sL%VAuV>jg;eL<7rW4x&b zo|vQ-lVL=)s#=Wb*iUPF1rY69>_qL@C2JuMV%6avC(+7BgM_BAj)!cwtgvogiT8Y8 z#(73g)Wn#CtzXqjzRCXN+Z%gsCXp2NarQ1uragPp(m&#F zE9-~a!n4G3ld&d&c&pJ(l|=WK`v%)AjmVBsY~^Kma5|=%x3!v@24kRYRDn@PEO$L0_U{XPdr)M$~RIBypS_21&Ue<`b(o1 z$RcxfiIml^HO4AQSkrpThd!Cw0R3X*+Z^1k@RQAk`6T-UJS}ni`+0p4oIF_{3lI4w z$q-IK*FEexZ8CrRK0CaKzfH9!cF0ysq4+=HsK}$7{p{oaYGZiGn(z|`q4RyEbh|$c z3ZX9Dm!-Zx%%hXRqi5(c`Bhee9X{KA9<@=6- zBZ_+`G{DjPTnk8D;lI;NRf6fobCc_5jp&-rYEx75~zOhFYycc$H`p&$Y ze{P%fH(3!}4IXMc-ZGOG){fZ7a9>PJD-=H??}ioerYe#-n!yuyJ_JPc@WK=G|Mvnq5Oq?_mMgh^*wYPb@jrU!{1{!%S-Wav8?o*TH2zz9lqTu z>9VQh(PZ#uAe-$RQ z*D^*s`ri@mQPN^Cl$=(8)wp71{6FfudBf4<&x~hekGF7>O1?Of4%OnCJ*i#n4;u~snhxfy zYtKiP<0VCON!Uai5>Kv?W1r?mW=?-f6F)A8q=roL#LbN-bhCG~v69Z`68m_wSVhkl zzXDU3@4tpH6653tI_VbqocLNz@52RlSral-XX?xS70yq+{zt9O%&H`QVt=qkR%$k4 zds-i&b;C~HfZ5Ko`13h=JlaVQ1!cnLeS=T;Emlx(Ngmi&aaMf1j?^i_-UM~Mv`2Io zcr{69X*V6A`=k~+#dfa<=J-_kZuYmESM~2S;u-06=JxKeMEo|MoD!TrkEw&x3cRiFDUt_5`f8qGgIs(-&;EA0ejyXw&$K z{9a2w;^$K&Nekh_r@WUh``Q*EFFwD(Z@&Re1@B2HN zC(3k6%VN`IH7qnyad<`kvAUX__}5;F*f}Ch@29F+-q;i$M=qJ^4^u%?Rtu)FF!q|w zw|(Ik(i4VJ4tuQ|9~Wlg6mJHbWXJ7&BFG_~N^V<^E;@sM65Xc6>IVr1e@~mV- zDuVRM+_q!-A2@!CZt%WKv?9fws(5jQiBEAL8mjrgbfL8JkS+1tN z?yJB6uUhsfvsJWzu!+dQU4XV93|5~wy@KAiqS58d2Kn13Me2lSKwA%Ns}>KY>cP_vwi0wX({kY59WTB2Qn!e}XJf}aW z9V|6yLQF32pV=T^5vKQ#iSvnj?%%T!KEsRYe*rfRfuI_bJMK^cJtj>p=2O9>)#!Fv zM4bAaJlaCK%Pt?PN!D5a4QoYL$|j9V<>+NrvX@rkBtceR?aW9C(f;GJV=ws`%d4UH zp+7PQv249=lr7+{lRli2)Mu=)74>niD+|8Hu3BaZ7BY^ z3O}3={w|9bmx5!Bwok+AVEZpD6U=-;aNo*%A#z(LKT9WQYimblAFMM$Du197F2RrT 
zdBZ4Yb@f^H^!xCE&J2!))1{71kVfGY<(4v83|7gbG2(I-o9|3w(_vuJ`5xDPK9Mt) zf5=;O3{TLy#P8f%$9w25pJ%tply4IUTF6MxB;SFl{`0%RSk6hch^7N&3bpiC;paa{ z9=u>ZEaX$K6{e5=62$!r>1_!WqQ4gPG@y*ncvd_$@<7q_!GlZn533B8KMvX{L8Q3m zZRG`WZEL=JqRrFpHWY?-)}H{g^aA(fQzZhV*R;_&ptglFkNEnB4UrXo!Xsdmld)a& z&SoGB7KQh3_bTx@mf4fyU9~9t#EI>2A-)JR(8F@fGmzol)<@CjD%ewUC-%F2fKoTc zM)OSV$o9Wsy~cj@588v=e~2vuW6~E#hM^BI{D6F!q$hlf=Z<$HtJeqR9``cwnby@? zg0zm=X5l>6! zQI=@ka4tNuR5%q@nMYd@Gjj#Ay^6N>+1>^X>5*t{{T(h>%5svQU#H{ZELeK4CV`z& z!xnnUe-9S=_mak+m1bT|f0ogy%l*1_l|1NeqhUT}(YU*!U=UY==D~58)G&YAL#+_4 z0k_>Ke_Kwt+&@NNJ8Ip>!6X!hYViD2V&Ow_@j%NKoo}Zd>+&4&DbkSmaZD>}cOrgD zFhJAzc;Z~Eu(dx+UONUB-79s8(8Z!9h)`chvQ@FAUch&OwNL0!6pJchvD7_agQL>i zo1+4f;bA?A{-mO;0Nq8*JzwAjL7!RUCnIOzT}Q)b;U%|;w)Oemfy9G*k{!e~D6&xB z2zv3kZk~^vyh8^2qu|stXz^*|mC?VWkrcls4>;l3M^}0U?L+SVUO%$Tx>8e|^JnOj>l+t$gEoyuu_0ytJ#%3zFFLPvze*^wm1U&qzOQ6wZO|lmtKj zB-{{$SEu)HJ#i|cls*Rg+z@W_OvIG;(MLX^M*2P9RDlkGoi;;%pf5DpKO$D9jdj$` z;gR}fiA@i}vEdbeg!yFW@&Ak<;nvuBNqFCK-|w0D-irPO`LVh!g(ofutNJV9GNQ^? z{tZa%voL40O)wP}G+GOyX#jD#xR%s!)X0`3cFN#`G^sPV1^)r2G&{*Q0Vt1uk zI8-9|Vm46bI2cSxa?dl&`WS%S-;-oe@qKC!qa)qyrjG(8Bqk-~#hcrJy&qBTNY=!? z3>)nwDnTQH%+ZdFV z-M(EKSSmPf8L=#H{jHI?y2(BZzV}7iL7HoCv2Z>Wn)96Fnynq-K?A_IZ`zZRD|n6- z9fHoe3~V`(laoX2OR49F?B{SWh^oC6M$4+JmAnt#@H?>43#?{}rq}VdN>Xj7Wg)`0 z2dk*A#{-*tJL33F{S1Y2C7p>|(P75wx1h(9;Y{}SIsM1tSioP>p1#m^u~zY1tYWud zy00N7-?FpPQBK> z!<*ma=f1bvKFE`KPJuaVI)wjW!ORmyN7Tu4vb2H~USh6E*768+^%gfeact zJK`R0lnF4LH+={G_ZXdvHDw_T<}=-ZR#rM%EP4r@|1;Zb*?m`7gXmh+e>89qdo|c+ z`TfX!eEA)3%q`9{c*Zw%RIo4nRtkAz;_+3TD}B6gxJDB9D&`~B9%gM9(m(P8{nk6> zYg=Ih;awF$u+!}e{A3Rqasb$I7dBH`pVAIQixSL|=tp&;KHi>Hrek&Q2s*Cb2pjXf ziL(&0-c|7p#! 
zu9E%|eMULrDX}uzE!gd4c&Bc9UZ}dj8AsEv*E4mD^w&eNjnwH@u`4Hivd-b`-nHOV zB3*7;mHDzC-Q@I)K5 zMm*~G;It9@Iyh@5J{c4(pJk?I`qRXw*PEoL8^`b$=jnD&tl2Wi!Zm*2KCK|^{N z)7yHVZG-WVz!RqDxfL;3#U(u z^`zK!e)DMZ+@pThj%yz8ty8p~{z{&iCq?vW)De1qd~EonzwWzbM{p^4GgzSu^jrGY zCQyetYc+kjZm}AkjktABGJ{7hlJS3$CE&4SALj+Z&Bd{&gILmLzb;cOon{54Ir($1 zu9TyG6yCU$x4TJBYDX@qt7U>|Fu-D3SPq~-u3}d|awcTiqv*Kzf@b8xzd~mJk{w1n zQ~Buo>rc1iE8!?^vgrNP98C?Ic^~*yAQ}8Ko@BA_lmal|S-kHQdYHbk$$E}eZ0m8< z!9n_j{t&hadPy(+F4jRqTSrzXMBJ_(ZpYF-<}@drqVV|Jdf#$WaoC|NY@?Klp4Vi~ zLso~`57HNOt&H=vdc@n44f=;urM;sY>2)7sh3M`0+A45ntd%a42}+L#-_PWZ_!!F_ zUXhEw+Kc#;Qa`MN68i+W>o`*wI2ol&sB6<#W3yrAH9bj2NM6nDIb+F)Ckm*#JEBXD^$P}ah8YX!;IO>OZ#?rpfU$^Jb2ReH{*%|%;PyN|B zX=lsv8}ruO-FMmsTWRO^!60XRsRW{LzQHIans6u*C8(f@xtJe@UJsqMQiU;R_YOU%C7fdJ4z8p=-WjGg+fk*v)P@ zO%Ob)Pt!4Wnkbi67X&YR3C#ea`B9j{X(OyubS?aTAKvkz*U{lT^8_tO-|J(XjvExd zsm;7b*ijGKe0x2%RQE^T_3V*#WT^F8&OgVeuTZgi%df~5p8}uh63q7w`a_V}cS}Cm zV)cj$`$4cXyoA53&pJIF{+>_QKK`e~z`DKaRp*pmAGlI>PqO8nLs!A`o01*p+jufm zMSmxJLgv^M;>%GRhew0v!9Y2%&qeI&aw-v<;7>Q6mCp(D6$#~pBzaWpt zhKJ>Kmpu!!N*|r(Uz?Koqh17D(7;Zjhvt-2?cqIRgUJEgJ%d))IpLes#HYbq6B9bi zOWydbdY{0`?vZCcM*W;(|4}7d{nGlpfRuu4K%| zBc0?;pX6`xHk+w22Y+f?DMNHQZUy5*L3LD5tiBBNYnI=C@x6Wrb~+k8=Ny*2 zoAtYDw=|*P?*}h0vdLar6TRWRrJS77olM*t7ug|=VR8rXmT6wihI#gQ!2A4e+!rPP zX#wvcNj^@`Xa%s=-_!tefB+g`FZ5(lvp(!aHKWgqYzi*o`!jqb9{8z!!kYG@LYb57 zQ(L}^G5HR@IuN$1SRmaoHUph?GyU4>F>(AGlKi(VWR7BR=Iy!DofPtIndt{pjR?=HZ zYt2cv+hJwn4eU>=?)A`@-n5=Ds1%un4Gx8ctkTcCC3WvRa?7jhemUzeXbx}TN3=OT z1na0ErG{mR9>f?eNM~{#ICV1T$MR_r7vA_u_L>p;+9Rqu-rH>F5(g$nNt470YUeWhs&6vd31C_Zlk-8kHc9^dRuOGEE;bxn! 
z19i9U)AaFpI7=E?5{ToW^bAwzAX}-E$%t)&SXkEgk&T8@xht$cVhtl?vt1!mtnnH4 zP+J5erAaVa-}XZ=uT}5`I(k8=eZbb)b*xjtm$;VlYr4q#ke}j)sz;Tfhtww{O`t2F zLeNRJphYZ{cQ|o;L04KYr;>`d#L4;hrHN77(9vkb34%QKP_vwLALV3pBahl4Ip z%M871ONiZ7Z6GMHgqBC)Stm!NyKlB}`md$vUSis2ID8CcumMcCTD-S5L`B;tyNCj< z+-$R@i5JwHvM_c?vQdX!8XlKN<^Sh7_;!%{Y}x4FYMwBg?m=}w8)Rkv?Q-2LR6qSX z)hP9MIh(uOPWo|qi|qe1(VKgq{6O%nsOQw@@b3@6ny*+1uLcTyl^F3jd7!EPB{M-~ z1NE7D<$|@5IdHWTSmSBdotZ9ll;opZ@OkaWv(=PA_;gC_b+7A{qmxsMNEgXO>~7*+ zG;xY7om{r^vQoJ-f;CEGCuOP+_xFQWJ+svHVRl3+CF;k~lEGjm{A>;vcrnlFRo(QY zm5?99S?C%qKxCICy*@+M{f_*)2Y&b$Hr|M=lvCc)v9_6vJ)PRbc%PwZ!lt?n&YTbJ zU_Pi}EO92iUXNt;v0#9|VCapsDq2(?tpWB?83k5t#6;}H@(U_WiXZbBd15l}_a1er zlQO_}ho?Awls&8rR|i=Uy#EfG!w|io*F#R;Fm++N&f^J6dojr<6Y;EUmctf>&uT5AAyubn z;>Oh@;Tm{Cg&@#s#G9UaR*!1~%?@vC$W9a`v%c-kJhya05g0FH(9w$0R~NTA;SWK2 z-sPxO(m`5ISB05WOEIKZdtRvA9=_Jy7 znrz@{DG>e3?!xgWhgqXN-6BbFqC=Pn&pRLcnUfYMVfbt#q~(ut|PU`|$lG zLHJB#l}lb+|EB>~ni6~?1!bjVjArq?dOMt_*}xccwT`y(&VC>JzC!ja4W`T!?`IA5 zAKi+*y&rTTpLfBFdypR=(GoBvZt~Lq!Zon#)-pr(Q=Nfjv(7omd{4`-@=L57`#qc< zieuK^x@Z+YCS`c8B4n&ezSKsB{W;~Hq$Q#g`QK66HYkAx6xIFqfGk$TcYrzP`+s&H zzYoD}X~_pU>@O{f)mQS|cyW9EKr?%mXiIy<4tXPgKqlHuRk02o179jV9??nqseTgD zGixvD>)^My@p9&m8FOOvs9x1?K=2R3b|8Zr@)g)WujFF>VGE7ku&BibKZ!V)%~(H1D{sa zji`+{)2`asbHx|= zK^ZA`BX#_39nLwosbOiXIa{nKy7X$^ZmuupM8-LG*mbkj^2E%!AMKi4lm+ziF0(@Z zBo?y4R-pg%=Xp;B<*cw?vUxC_8$q?OLio8&#=jodw=9D!w!iH#=F4+B@N}V4N4V0?tinkwM_S_rZNbeV{)YHYduJ zK<79b(idxI?F<%NE`bAmoOSTT1luf}C#mfPz}rdQ(kEI*&BD}{`gCq@C6i~8v-Yzb zlsYBF3+#?Y_q?pxv6~JK=J*@$4UuLXXnBJM6jjg(ogHtPp1*DEtlz$T^Qcf z^Ex(?UYmPfeEs)egx?{noU~PNy+r5H8GF}n$@~wb!kE4R|9o1GdsA^w@zK10d+%Y)h?gJb`>?o9WYzaa_S+5m zF-GVGPXV#@rt&^3wkTYvS;H65&G%6MVj`s^=F$(Na=%8qYd&4;oqVaBiWKr8>|TZF zDcb>Jtq5zGuD=tz#_OD*upb9Ie*mUVFI%;!zDV}xbYA^(*w6%jlBZiuN6a;U=u4Hm zE6^}nvs&3Wp}9_5!nf!#hflXaXKd~VK=d0x2=`)l$X9t#dY+1o)kmbdcIM4~)sCJG zUrVvX?AbYf0?gdoM~6&>!Yh-A=j&vcR5PU}dj>?2MsJyD1MISwGGRyIp?cgmX%YU^ 
z0`Hs$_W#Fz0|kxro;pt58spRTz+y*1>aS=%8Kc~>E{WR!X30XjTPn=sL#o=KL=OCajIco&CK-WQ#LPb>aF96sL(-Ign5ll!8jTWHOmX&nSpvi zzL#m%%@bIp@-yuIs4vF@ zQss$gvfl7GHxG6WCgLZRH47Hj(BHCVau7DX*yG_IEy{hA71_nP@`tTU-9=73$j@D} zU%*65=-{{$UiLYX2PT-+Ps_j7OZM0;n<%f!F*z&sy)ALFf#wLmkb?32I#ja9TRRms z>!E)IX}nPML%E4(&iCvV=*Iz_-}Oq^f?$;HWbK=Kahpjdnh|7xMKvJ0UXwNS91Nq^ zdJhOGi!Okf_3%tKg_9`<_@>u2zihM}VM~9>A0hL(ObNJ;-HT!!OUeBI1&2bjET~Fz zK`N`gBWO6U4w38LPVd+Oe++wkm+Il;oHOH`s2t{}P8s)qTNUE#5zPfQD+uoFFRO$b zLfDIq;b{NC57u9cA3uh@UyO~^#65Fqxw&8ktiG5>ygzf+GI}W)Z4vK4mdcdqjkSHw z dict[str, float]: + sorted_values = sorted(values) + if not sorted_values: + return {"mean": 0.0, "p50": 0.0, "p90": 0.0, "p99": 0.0, "min": 0.0, "max": 0.0} + def pct(q: float) -> float: + idx = round((len(sorted_values) - 1) * q) + return float(sorted_values[min(idx, len(sorted_values) - 1)]) + return { + "mean": float(sum(sorted_values) / len(sorted_values)), + "p50": pct(0.50), + "p90": pct(0.90), + "p99": pct(0.99), + "min": float(sorted_values[0]), + "max": float(sorted_values[-1]), + } + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--draft-model-path", default="/home/hezhaozhao/models/Qwen3-4B-DFlash-b16") + parser.add_argument("--fixture-out", default="target/benchmarks/qwen3-dflash/forward-input.safetensors") + parser.add_argument("--openinfer-bin", type=Path, help="Path to qwen3_dflash_forward_bench") + parser.add_argument("--openinfer-draft-cache", action="store_true") + parser.add_argument("--openinfer-context-cache", action="store_true", help=argparse.SUPPRESS) + parser.add_argument("--out", default="target/benchmarks/qwen3-dflash/forward.json") + parser.add_argument("--device", type=int, default=0) + parser.add_argument("--ctx-len", type=int, default=2) + parser.add_argument("--q-len", type=int, default=16) + parser.add_argument("--warmup", type=int, default=5) + parser.add_argument("--iters", type=int, default=30) + parser.add_argument("--target-model-path", 
default="/home/hezhaozhao/models/Qwen3-4B") + args = parser.parse_args() + + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required for the DFlash forward benchmark") + + draft = AutoModel.from_pretrained( + args.draft_model_path, + dtype=torch.bfloat16, + device_map={"": f"cuda:{args.device}"}, + trust_remote_code=True, + ).eval() + device = next(draft.parameters()).device + + gen = torch.Generator(device=device).manual_seed(SEED) + hidden = draft.config.hidden_size + target_layer_count = len(draft.target_layer_ids) + noise_embedding = torch.randn((1, args.q_len, hidden), generator=gen, device=device, dtype=torch.bfloat16) + target_hidden = torch.randn( + (1, args.ctx_len, hidden * target_layer_count), + generator=gen, + device=device, + dtype=torch.bfloat16, + ) + position_ids = torch.arange(args.ctx_len + args.q_len, device=device, dtype=torch.int32).unsqueeze(0) + fixture_path = Path(args.fixture_out) + fixture_path.parent.mkdir(parents=True, exist_ok=True) + save_file( + { + "noise_embedding": noise_embedding.detach().to("cpu", dtype=torch.bfloat16).contiguous(), + "target_hidden": target_hidden.detach().to("cpu", dtype=torch.bfloat16).contiguous(), + "position_ids": position_ids.detach().to("cpu", dtype=torch.int32).contiguous(), + }, + str(fixture_path), + ) + + hf_latencies = [] + with torch.inference_mode(): + for _ in range(args.warmup): + _ = draft( + noise_embedding=noise_embedding, + target_hidden=target_hidden, + position_ids=position_ids, + use_cache=False, + is_causal=False, + ) + torch.cuda.synchronize(device) + for _ in range(args.iters): + start = time.perf_counter() + _ = draft( + noise_embedding=noise_embedding, + target_hidden=target_hidden, + position_ids=position_ids, + use_cache=False, + is_causal=False, + ) + torch.cuda.synchronize(device) + hf_latencies.append((time.perf_counter() - start) * 1000.0) + + openinfer_latencies = None + if args.openinfer_bin is not None: + cmd = [ + str(args.openinfer_bin), + "--model-path", 
+ args.draft_model_path, + "--fixture", + str(fixture_path), + "--device", + str(args.device), + "--warmup", + str(args.warmup), + "--iters", + str(args.iters), + ] + openinfer_draft_cache = args.openinfer_draft_cache or args.openinfer_context_cache + if openinfer_draft_cache: + cmd.append("--draft-cache") + raw = subprocess.run(cmd, check=True, capture_output=True, text=True).stdout + payload = json.loads(raw) + openinfer_latencies = payload["latency_ms"] + + report = { + "schema": 1, + "draft_model_path": args.draft_model_path, + "target_model_path": args.target_model_path, + "device": args.device, + "ctx_len": args.ctx_len, + "q_len": args.q_len, + "warmup": args.warmup, + "iters": args.iters, + "openinfer_draft_cache": args.openinfer_draft_cache or args.openinfer_context_cache, + "fixture_out": str(fixture_path), + "hf_remote_code": { + "engine": "transformers", + "latency_ms": stats(hf_latencies), + }, + "openinfer": openinfer_latencies, + } + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(report, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + print(f"wrote {out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py b/tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py new file mode 100644 index 00000000..3ea144d8 --- /dev/null +++ b/tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +"""Compare Qwen3-4B-DFlash HF drafter vs OpenInfer drafter in one target loop. + +This is an end-to-end drafter-substitution probe for the current +`openinfer-qwen3-4b-dflash` boundary. The target model, tokenizer, target KV +cache, target verification, target `lm_head`, and greedy sampler all come from +Transformers. 
The only variable is the drafter: + + * HF remote-code `DFlashDraftModel.forward` + * OpenInfer `qwen3_dflash_forward_fixture` + +The script intentionally uses a no-draft-cache loop on both sides because the +current OpenInfer crate implements standalone draft forward only, not DFlash's +Python `DynamicCache` path or an OpenInfer target/controller. + +Example: + + .venv/bin/python tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py \ + --target-model-path /home/hezhaozhao/models/Qwen3-4B \ + --draft-model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ + --openinfer-bin target/release/qwen3_dflash_forward_fixture \ + --out target/accuracy/qwen3-dflash/drafter-generation.json +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import torch +from safetensors.torch import load_file, save_file +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, DynamicCache + +DEFAULT_PROMPTS = [ + "Hello, my name is", + "The capital of France is", + "Qwen is a language model that", + "1, 1, 2, 3, 5,", +] + + +def sha256_u32_le(values: list[int]) -> str: + digest = hashlib.sha256() + for value in values: + digest.update(int(value).to_bytes(4, byteorder="little", signed=False)) + return digest.hexdigest() + + +def sha256_text(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def first_diff(left: list[int], right: list[int]) -> dict[str, Any] | None: + limit = min(len(left), len(right)) + for index in range(limit): + if left[index] != right[index]: + return { + "index": index, + "hf_drafter": left[index], + "openinfer_drafter": right[index], + "reason": "token_mismatch", + } + if len(left) != len(right): + return { + "index": limit, + "hf_drafter": left[limit] if len(left) > limit else None, + "openinfer_drafter": right[limit] if len(right) > limit else None, + 
"reason": "length_mismatch", + } + return None + + +def input_device(model: torch.nn.Module) -> torch.device: + return next(model.parameters()).device + + +def extract_context_feature(hidden_states: tuple[torch.Tensor, ...], layer_ids: list[int]) -> torch.Tensor: + # HF hidden_states includes the embedding output at index 0. + return torch.cat([hidden_states[layer_id + 1] for layer_id in layer_ids], dim=-1) + + +def greedy(logits: torch.Tensor) -> torch.Tensor: + return torch.argmax(logits, dim=-1) + + +def tensor_deltas(got: torch.Tensor, want: torch.Tensor) -> dict[str, float]: + deltas = (got.float() - want.float()).abs().flatten().detach().cpu() + if deltas.numel() == 0: + return {"mean": 0.0, "p99": 0.0, "max": 0.0, "n": 0} + sorted_deltas = torch.sort(deltas).values + p99_index = min(int(deltas.numel() * 0.99), deltas.numel() - 1) + return { + "mean": float(deltas.mean().item()), + "p99": float(sorted_deltas[p99_index].item()), + "max": float(sorted_deltas[-1].item()), + "n": int(deltas.numel()), + } + + +def merge_delta_stats(items: list[dict[str, float]]) -> dict[str, float] | None: + if not items: + return None + total_n = sum(int(item["n"]) for item in items) + if total_n == 0: + return {"mean": 0.0, "p99": 0.0, "max": 0.0, "n": 0} + # The exact aggregate p99 needs raw samples. For this report the per-block + # worst p99 is the conservative summary, and max is exact. 
+ return { + "mean": sum(item["mean"] * item["n"] for item in items) / total_n, + "p99": max(item["p99"] for item in items), + "max": max(item["max"] for item in items), + "n": total_n, + } + + +@dataclass +class Runtime: + target: torch.nn.Module + draft: torch.nn.Module + tokenizer: Any + target_layer_ids: list[int] + block_size: int + mask_token_id: int + stop_token_ids: list[int] + openinfer_bin: Path | None + draft_model_path: Path + repo_root: Path + device_ordinal: int + collect_hidden_delta: bool + + +def run_openinfer_draft( + runtime: Runtime, + *, + noise_embedding: torch.Tensor, + target_hidden: torch.Tensor, + position_ids: torch.Tensor, + temp_dir: Path, + step_index: int, +) -> torch.Tensor: + fixture = temp_dir / f"dflash-input-{step_index:03d}.safetensors" + out = temp_dir / f"dflash-output-{step_index:03d}.safetensors" + save_file( + { + "noise_embedding": noise_embedding.detach().to("cpu", dtype=torch.bfloat16).contiguous(), + "target_hidden": target_hidden.detach().to("cpu", dtype=torch.bfloat16).contiguous(), + "position_ids": position_ids.detach().to("cpu", dtype=torch.int32).contiguous(), + }, + str(fixture), + ) + if runtime.openinfer_bin is not None: + cmd = [ + str(runtime.openinfer_bin), + "--model-path", + str(runtime.draft_model_path), + "--fixture", + str(fixture), + "--out", + str(out), + "--device", + str(runtime.device_ordinal), + ] + else: + cmd = [ + "cargo", + "run", + "--release", + "-p", + "openinfer-qwen3-4b-dflash", + "--bin", + "qwen3_dflash_forward_fixture", + "--", + "--model-path", + str(runtime.draft_model_path), + "--fixture", + str(fixture), + "--out", + str(out), + "--device", + str(runtime.device_ordinal), + ] + subprocess.run(cmd, cwd=runtime.repo_root, check=True) + tensors = load_file(str(out)) + return tensors["openinfer_output"].to(input_device(runtime.target), dtype=torch.bfloat16) + + +def draft_hidden( + runtime: Runtime, + *, + kind: str, + noise_embedding: torch.Tensor, + target_hidden: torch.Tensor, + 
position_ids: torch.Tensor, + temp_dir: Path, + step_index: int, +) -> tuple[torch.Tensor, dict[str, float] | None]: + with torch.inference_mode(): + hf_hidden = runtime.draft( + target_hidden=target_hidden, + noise_embedding=noise_embedding, + position_ids=position_ids, + use_cache=False, + is_causal=False, + ) + if kind == "hf": + return hf_hidden, None + oi_hidden = run_openinfer_draft( + runtime, + noise_embedding=noise_embedding, + target_hidden=target_hidden, + position_ids=position_ids, + temp_dir=temp_dir, + step_index=step_index, + ) + delta = tensor_deltas(oi_hidden, hf_hidden) if runtime.collect_hidden_delta else None + return oi_hidden, delta + + +def generate_with_drafter( + runtime: Runtime, + *, + prompt: str, + max_new_tokens: int, + kind: str, + temp_dir: Path, +) -> dict[str, Any]: + dev = input_device(runtime.target) + encoded = runtime.tokenizer(prompt, return_tensors="pt") + input_ids = encoded.input_ids.to(dev) + num_input_tokens = input_ids.shape[1] + max_length = num_input_tokens + max_new_tokens + output_ids = torch.full( + (1, max_length + runtime.block_size), + runtime.mask_token_id, + dtype=torch.long, + device=dev, + ) + all_position_ids = torch.arange(output_ids.shape[1], device=dev).unsqueeze(0) + + target_cache = DynamicCache() + with torch.inference_mode(): + output = runtime.target( + input_ids, + position_ids=all_position_ids[:, :num_input_tokens], + past_key_values=target_cache, + use_cache=True, + logits_to_keep=1, + output_hidden_states=True, + ) + output_ids[:, :num_input_tokens] = input_ids + output_ids[:, num_input_tokens : num_input_tokens + 1] = greedy(output.logits) + target_hidden = extract_context_feature(output.hidden_states, runtime.target_layer_ids) + + start = num_input_tokens + accepted_plus_fallback_lengths: list[int] = [] + hidden_deltas: list[dict[str, float]] = [] + step_index = 0 + while start < max_length: + q_len = runtime.block_size + block_output_ids = output_ids[:, start : start + q_len].clone() + 
block_position_ids = all_position_ids[:, start : start + q_len] + noise_embedding = runtime.target.model.embed_tokens(block_output_ids) + + ctx_len = target_hidden.shape[1] + draft_position_ids = all_position_ids[:, start - ctx_len : start + q_len] + hidden, delta = draft_hidden( + runtime, + kind=kind, + noise_embedding=noise_embedding, + target_hidden=target_hidden, + position_ids=draft_position_ids, + temp_dir=temp_dir, + step_index=step_index, + ) + if delta is not None: + hidden_deltas.append(delta) + draft_logits = runtime.target.lm_head(hidden[:, -runtime.block_size + 1 :, :]) + block_output_ids[:, 1:] = greedy(draft_logits) + + with torch.inference_mode(): + output = runtime.target( + block_output_ids, + position_ids=block_position_ids, + past_key_values=target_cache, + use_cache=True, + output_hidden_states=True, + ) + posterior = greedy(output.logits) + matches = block_output_ids[:, 1:] == posterior[:, :-1] + acceptance_length = int(matches.cumprod(dim=1).sum(dim=1)[0].item()) + advanced = acceptance_length + 1 + output_ids[:, start : start + advanced] = block_output_ids[:, :advanced] + output_ids[:, start + advanced] = posterior[:, acceptance_length] + start += advanced + target_cache.crop(start) + target_hidden = extract_context_feature(output.hidden_states, runtime.target_layer_ids)[:, :advanced, :] + accepted_plus_fallback_lengths.append(advanced) + step_index += 1 + + generated_so_far = output_ids[0, num_input_tokens : min(start + 1, max_length)] + if runtime.stop_token_ids and torch.isin( + generated_so_far, + torch.tensor(runtime.stop_token_ids, device=generated_so_far.device), + ).any(): + break + + full_ids = output_ids[0, :max_length] + full_ids = full_ids[full_ids != runtime.mask_token_id] + if runtime.stop_token_ids: + generated = full_ids[num_input_tokens:] + stop_tensor = torch.tensor(runtime.stop_token_ids, device=generated.device) + stop_positions = torch.isin(generated, stop_tensor).nonzero(as_tuple=True)[0] + if stop_positions.numel() > 
0: + full_ids = full_ids[: num_input_tokens + int(stop_positions[0].item()) + 1] + + full_token_ids = [int(token) for token in full_ids.detach().cpu().tolist()] + generated_token_ids = full_token_ids[num_input_tokens:] + full_text = runtime.tokenizer.decode(full_token_ids, skip_special_tokens=False) + generated_text = runtime.tokenizer.decode(generated_token_ids, skip_special_tokens=False) + return { + "prompt_token_ids": [int(token) for token in input_ids[0].detach().cpu().tolist()], + "full_token_ids": full_token_ids, + "generated_token_ids": generated_token_ids, + "full_text": full_text, + "generated_text": generated_text, + "token_sha256": sha256_u32_le(generated_token_ids), + "text_sha256": sha256_text(generated_text), + "accepted_plus_fallback_lengths": accepted_plus_fallback_lengths, + "hidden_delta_vs_hf": merge_delta_stats(hidden_deltas), + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--target-model-path", required=True) + parser.add_argument("--draft-model-path", default="/home/hezhaozhao/models/Qwen3-4B-DFlash-b16") + parser.add_argument("--out", default="target/accuracy/qwen3-dflash/drafter-generation.json") + parser.add_argument("--prompt", action="append", help="Prompt to test; can be repeated.") + parser.add_argument("--max-new-tokens", type=int, default=12) + parser.add_argument("--openinfer-bin", type=Path, help="Path to a built qwen3_dflash_forward_fixture binary.") + parser.add_argument("--repo-root", type=Path, default=Path(__file__).resolve().parents[2]) + parser.add_argument("--device", type=int, default=0) + parser.add_argument("--skip-hidden-delta", action="store_true") + parser.add_argument("--stop-token-id", type=int, action="append", default=[]) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required for the DFlash drafter generation comparison") + + target = 
AutoModelForCausalLM.from_pretrained( + args.target_model_path, + dtype=torch.bfloat16, + device_map={"": f"cuda:{args.device}"}, + trust_remote_code=True, + ).eval() + draft = AutoModel.from_pretrained( + args.draft_model_path, + dtype=torch.bfloat16, + device_map={"": f"cuda:{args.device}"}, + trust_remote_code=True, + ).eval() + tokenizer = AutoTokenizer.from_pretrained(args.target_model_path, trust_remote_code=True) + + stop_token_ids = list(args.stop_token_id) + eos = getattr(target.config, "eos_token_id", None) + if isinstance(eos, int): + stop_token_ids.append(eos) + elif isinstance(eos, list): + stop_token_ids.extend(int(token) for token in eos) + stop_token_ids = sorted(set(stop_token_ids)) + + runtime = Runtime( + target=target, + draft=draft, + tokenizer=tokenizer, + target_layer_ids=[int(layer) for layer in draft.target_layer_ids], + block_size=int(draft.block_size), + mask_token_id=int(getattr(draft, "mask_token_id", None) or draft.config.dflash_config["mask_token_id"]), + stop_token_ids=stop_token_ids, + openinfer_bin=args.openinfer_bin, + draft_model_path=Path(args.draft_model_path), + repo_root=args.repo_root, + device_ordinal=args.device, + collect_hidden_delta=not args.skip_hidden_delta, + ) + + prompts = args.prompt or DEFAULT_PROMPTS + cases = [] + with tempfile.TemporaryDirectory(prefix="qwen3-dflash-parity-") as tmp: + temp_dir = Path(tmp) + for index, prompt in enumerate(prompts): + hf = generate_with_drafter( + runtime, + prompt=prompt, + max_new_tokens=args.max_new_tokens, + kind="hf", + temp_dir=temp_dir, + ) + openinfer = generate_with_drafter( + runtime, + prompt=prompt, + max_new_tokens=args.max_new_tokens, + kind="openinfer", + temp_dir=temp_dir, + ) + token_diff = first_diff(hf["generated_token_ids"], openinfer["generated_token_ids"]) + text_match = hf["generated_text"] == openinfer["generated_text"] + token_match = token_diff is None + classification = "all_token_text_exact" if token_match and text_match else 
"drafter_generation_mismatch" + cases.append( + { + "id": f"prompt_{index:03d}", + "prompt": prompt, + "max_new_tokens": args.max_new_tokens, + "prompt_token_ids": hf["prompt_token_ids"], + "hf_drafter": hf, + "openinfer_drafter": openinfer, + "token_match": token_match, + "text_match": text_match, + "classification": classification, + "first_diff": token_diff, + } + ) + print( + f"{classification}: {prompt!r}; " + f"hf_accept={hf['accepted_plus_fallback_lengths']} " + f"openinfer_accept={openinfer['accepted_plus_fallback_lengths']}" + ) + + result = { + "schema": 1, + "comparison": "qwen3_4b_dflash_drafter_generation", + "mode": "greedy_bs1_no_draft_cache_drafter_substitution", + "target_model_path": args.target_model_path, + "draft_model_path": args.draft_model_path, + "openinfer_bin": str(args.openinfer_bin) if args.openinfer_bin else None, + "block_size": runtime.block_size, + "target_layer_ids": runtime.target_layer_ids, + "mask_token_id": runtime.mask_token_id, + "stop_token_ids": runtime.stop_token_ids, + "torch_version": torch.__version__, + "transformers_version": __import__("transformers").__version__, + "case_count": len(cases), + "all_token_text_exact": all(case["classification"] == "all_token_text_exact" for case in cases), + "cases": cases, + } + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(result, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + print(f"wrote {out}") + if not result["all_token_text_exact"]: + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py b/tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py new file mode 100644 index 00000000..5efc34b0 --- /dev/null +++ b/tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +"""Generate a tiny HuggingFace remote-code golden for Qwen3-4B-DFlash-b16. 
+ +The DFlash crate compares its standalone draft forward against this fixture +without importing Python at Rust test time. The input tensors are synthetic but +seed-pinned, so the fixture exercises the exact `DFlashDraftModel.forward` +contract: selected target hidden states, noise embeddings, and absolute +position ids. + + .venv/bin/python tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py \ + --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \ + --out test_data/qwen3-4b-dflash-hf-golden.safetensors +""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +import torch +from safetensors.torch import save_file +from transformers import AutoModel + +SEED = 0xD4A5_4B16 +CTX_LEN = 2 +Q_LEN = 3 + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--model-path", default="/home/hezhaozhao/models/Qwen3-4B-DFlash-b16") + parser.add_argument("--out", default="test_data/qwen3-4b-dflash-hf-golden.safetensors") + args = parser.parse_args() + + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required to generate the DFlash bf16 golden") + + model = AutoModel.from_pretrained( + args.model_path, + dtype=torch.bfloat16, + device_map="cuda", + trust_remote_code=True, + ).eval() + + gen = torch.Generator(device="cuda").manual_seed(SEED) + hidden = model.config.hidden_size + target_layers = len(model.target_layer_ids) + noise_embedding = torch.randn( + (1, Q_LEN, hidden), + generator=gen, + device="cuda", + dtype=torch.bfloat16, + ) + target_hidden = torch.randn( + (1, CTX_LEN, hidden * target_layers), + generator=gen, + device="cuda", + dtype=torch.bfloat16, + ) + position_ids = torch.arange(CTX_LEN + Q_LEN, device="cuda", dtype=torch.int64).unsqueeze(0) + + with torch.inference_mode(): + output = model( + noise_embedding=noise_embedding, + target_hidden=target_hidden, + position_ids=position_ids, + use_cache=False, + is_causal=False, + ) + torch.cuda.synchronize() + + tensors = 
{ + "noise_embedding": noise_embedding.cpu(), + "target_hidden": target_hidden.cpu(), + "position_ids": position_ids.to(torch.int32).cpu(), + "output": output.cpu(), + } + meta = { + "model_path": args.model_path, + "seed": str(SEED), + "ctx_len": str(CTX_LEN), + "q_len": str(Q_LEN), + "hidden_size": str(hidden), + "target_layer_ids": ",".join(str(layer) for layer in model.target_layer_ids), + "block_size": str(model.block_size), + "mask_token_id": str(model.mask_token_id), + "torch_version": torch.__version__, + "transformers_version": __import__("transformers").__version__, + } + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + save_file(tensors, str(out), metadata=meta) + print(f"wrote {out}: ctx_len={CTX_LEN}, q_len={Q_LEN}, hidden={hidden}, seed={SEED}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 8011980c5ad8e02ba5e96e0a8b71f3e219ddaaa9 Mon Sep 17 00:00:00 2001 From: hezz Date: Mon, 22 Jun 2026 12:13:25 +0800 Subject: [PATCH 2/6] refactor(qwen3-dflash): collapse batch buffers into a single instance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aligns the DFlash executor with the rest of the project (Qwen3 BatchDecodeBuffers, Kimi/DeepSeek scratch): one allocation sized for the worst case, narrowed per forward via set_active_shape, instead of the unique-per-crate HashMap<(batch, q, ctx), buffer> cache that grew a fresh GPU buffer set for every unseen (q_len, ctx_len) combo. * DFlashExecutor now holds a single DFlashBatchBuffers, allocated once in load() for (max_batch_size, max_q_len, max_step_context_len). New DFlashExecutorOptions.max_q_len gates the q-axis capacity. * set_active_batch(bs) -> set_active_shape(bs, q_len, ctx_len); both forward paths derive the shape from the requests themselves so callers no longer pre-set it. 
* prepare_ragged_plan cache key now covers (batch_size, q_len, ctx_len) — with a single instance the shape can change between forwards, so keying only on batch_size would reuse a stale plan. * compact_host_inputs stitches all requests on the host and uploads noise/target with one H2D each (was one launch per request per tensor), matching Qwen3's sync_paged_meta upload pattern. * Compact NoCache paths materialize the owned output via a single clone_batch_output dtod instead of zeros + copy_hidden over the full span. Tests/benches keep working: create_batch_buffers keeps its 3-positional signature (now (max_batch, max_q, max_ctx)), and test struct literals gain the max_q_len field. --- .../src/batch_buffers.rs | 62 ++++++-- .../src/batch_forward.rs | 105 ++++++++++---- openinfer-qwen3-4b-dflash/src/executor.rs | 137 +++++++++--------- .../tests/hf_golden_gate.rs | 5 + 4 files changed, 198 insertions(+), 111 deletions(-) diff --git a/openinfer-qwen3-4b-dflash/src/batch_buffers.rs b/openinfer-qwen3-4b-dflash/src/batch_buffers.rs index 84739906..8d7cef74 100644 --- a/openinfer-qwen3-4b-dflash/src/batch_buffers.rs +++ b/openinfer-qwen3-4b-dflash/src/batch_buffers.rs @@ -7,6 +7,11 @@ use crate::weights::DFlashDraftModel; pub struct DFlashBatchBuffers { pub(crate) max_batch_size: usize, + pub(crate) max_q_len: usize, + pub(crate) max_ctx_len: usize, + /// Active shape for the current batch — set by `set_active_shape` before + /// each forward. `q_len`/`ctx_len` may shrink below `max_*`; the physical + /// buffers are sized for the max, so the active values only narrow the view. 
pub(crate) q_len: usize, pub(crate) ctx_len: usize, pub(crate) total_q_len: usize, @@ -38,32 +43,40 @@ pub struct DFlashBatchBuffers { pub(crate) struct CachedRaggedPlan { pub(crate) batch_size: usize, + pub(crate) q_len: usize, + pub(crate) ctx_len: usize, pub(crate) plan: RaggedPrefillPlan, } impl DFlashBatchBuffers { + /// Allocate a single-instance buffer sized for the worst case + /// (`max_batch_size × max_q_len` / `× max_ctx_len`). Each forward narrows + /// the active shape via `set_active_shape`, mirroring Qwen3's + /// `BatchDecodeBuffers` (one allocation, dynamic `set_batch_size`). pub(crate) fn new( model: &DFlashDraftModel, max_batch_size: usize, - q_len: usize, - ctx_len: usize, + max_q_len: usize, + max_ctx_len: usize, ) -> Result { anyhow::ensure!(max_batch_size > 0, "max_batch_size must be positive"); - anyhow::ensure!(q_len > 0, "q_len must be positive"); - anyhow::ensure!(ctx_len > 0, "ctx_len must be positive"); + anyhow::ensure!(max_q_len > 0, "max_q_len must be positive"); + anyhow::ensure!(max_ctx_len > 0, "max_ctx_len must be positive"); let config = model.config(); let ctx = model.device_context(); let hidden = config.hidden_size; let target_hidden_dim = config.hidden_size * config.target_layer_count(); let q_dim = config.q_dim(); let kv_dim = config.kv_dim(); - let total_q_len = max_batch_size * q_len; - let total_ctx_len = max_batch_size * ctx_len; - let total_kv_len = max_batch_size * (ctx_len + q_len); + let total_q_len = max_batch_size * max_q_len; + let total_ctx_len = max_batch_size * max_ctx_len; + let total_kv_len = max_batch_size * (max_ctx_len + max_q_len); Ok(Self { max_batch_size, - q_len, - ctx_len, + max_q_len, + max_ctx_len, + q_len: max_q_len, + ctx_len: max_ctx_len, total_q_len, total_ctx_len, total_kv_len, @@ -92,11 +105,18 @@ impl DFlashBatchBuffers { }) } - pub(crate) fn set_active_batch(&mut self, batch_size: usize) { + /// Narrow the active shape for this forward: sets `q_len`/`ctx_len` and + /// recomputes every 
buffer's `seq_len` to `batch_size × (q|ctx)`. Buffers + /// stay sized for the max, so callers can freely vary batch/q/ctx below it. + pub(crate) fn set_active_shape(&mut self, batch_size: usize, q_len: usize, ctx_len: usize) { debug_assert!(batch_size <= self.max_batch_size); - self.total_q_len = batch_size * self.q_len; - self.total_ctx_len = batch_size * self.ctx_len; - self.total_kv_len = batch_size * (self.ctx_len + self.q_len); + debug_assert!(q_len <= self.max_q_len); + debug_assert!(ctx_len <= self.max_ctx_len); + self.q_len = q_len; + self.ctx_len = ctx_len; + self.total_q_len = batch_size * q_len; + self.total_ctx_len = batch_size * ctx_len; + self.total_kv_len = batch_size * (ctx_len + q_len); self.noise.seq_len = self.total_q_len; self.target_hidden.seq_len = self.total_ctx_len; self.target_projected.seq_len = self.total_ctx_len; @@ -123,10 +143,17 @@ impl DFlashBatchBuffers { model: &DFlashDraftModel, batch_size: usize, ) -> Result<()> { + // The plan depends on (batch_size, q_len, ctx_len); with a single + // instance buffer any of them can change between forwards, so all three + // must be part of the cache key. 
let needs_rebuild = self .ragged_plan .as_ref() - .map(|cached| cached.batch_size != batch_size) + .map(|cached| { + cached.batch_size != batch_size + || cached.q_len != self.q_len + || cached.ctx_len != self.ctx_len + }) .unwrap_or(true); if needs_rebuild { let config = model.config(); @@ -138,7 +165,12 @@ impl DFlashBatchBuffers { &kv_lens, config.num_attention_heads / config.num_key_value_heads, )?; - self.ragged_plan = Some(CachedRaggedPlan { batch_size, plan }); + self.ragged_plan = Some(CachedRaggedPlan { + batch_size, + q_len: self.q_len, + ctx_len: self.ctx_len, + plan, + }); } Ok(()) } diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs index ba9ef95b..1ca49050 100644 --- a/openinfer-qwen3-4b-dflash/src/batch_forward.rs +++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs @@ -23,10 +23,10 @@ impl DFlashDraftModel { pub fn create_batch_buffers( &self, max_batch_size: usize, - q_len: usize, - ctx_len: usize, + max_q_len: usize, + max_ctx_len: usize, ) -> Result { - DFlashBatchBuffers::new(self, max_batch_size, q_len, ctx_len) + DFlashBatchBuffers::new(self, max_batch_size, max_q_len, max_ctx_len) } pub fn forward_batch<'a>( @@ -41,9 +41,22 @@ impl DFlashDraftModel { requests.len(), bufs.max_batch_size ); - let q_len = bufs.q_len; - let ctx_len = bufs.ctx_len; - for req in requests { + // All requests in an exact-shape batch share one (q_len, ctx_len); read + // it from the first, then narrow the buffer's active shape to match. + let (q_len, ctx_len) = self.validate_forward_inputs( + requests[0].noise_embedding, + &requests[0].target_hidden, + requests[0].position_ids, + )?; + anyhow::ensure!( + q_len <= bufs.max_q_len && ctx_len <= bufs.max_ctx_len, + "DFlash batch shape q_len={}, ctx_len={} exceeds buffer capacity q_len={}, ctx_len={}", + q_len, + ctx_len, + bufs.max_q_len, + bufs.max_ctx_len, + ); + for req in &requests[1..] 
{ let (actual_q, actual_ctx) = self.validate_forward_inputs( req.noise_embedding, &req.target_hidden, @@ -58,7 +71,7 @@ impl DFlashDraftModel { actual_ctx ); } - bufs.set_active_batch(requests.len()); + bufs.set_active_shape(requests.len(), q_len, ctx_len); compact_inputs(self.device_context(), requests, bufs)?; self.forward_compact_batch(requests.len(), bufs)?; Ok(&bufs.normed) @@ -77,10 +90,39 @@ impl DFlashDraftModel { bufs.max_batch_size ); let config = self.config(); - let noise_len = bufs.q_len * config.hidden_size; - let target_len = bufs.ctx_len * config.hidden_size * config.target_layer_count(); - let position_len = bufs.ctx_len + bufs.q_len; - for req in requests { + let hidden = config.hidden_size; + let target_hidden_dim = config.hidden_size * config.target_layer_count(); + // Derive the shared (q_len, ctx_len) from the first request, the same + // way forward_batch derives it from device tensors. + let first = &requests[0]; + anyhow::ensure!( + first.noise_embedding.len() % hidden == 0, + "noise_embedding len {} is not a multiple of hidden_size {}", + first.noise_embedding.len(), + hidden, + ); + let q_len = first.noise_embedding.len() / hidden; + anyhow::ensure!( + first.target_hidden.len() % target_hidden_dim == 0, + "target_hidden len {} is not a multiple of target_hidden_dim {}", + first.target_hidden.len(), + target_hidden_dim, + ); + let ctx_len = first.target_hidden.len() / target_hidden_dim; + anyhow::ensure!(q_len > 0, "DFlash host batch q_len must be positive"); + anyhow::ensure!(ctx_len > 0, "DFlash host batch ctx_len must be positive"); + anyhow::ensure!( + q_len <= bufs.max_q_len && ctx_len <= bufs.max_ctx_len, + "DFlash host batch shape q_len={}, ctx_len={} exceeds buffer capacity q_len={}, ctx_len={}", + q_len, + ctx_len, + bufs.max_q_len, + bufs.max_ctx_len, + ); + let noise_len = q_len * hidden; + let target_len = ctx_len * target_hidden_dim; + let position_len = ctx_len + q_len; + for req in &requests[1..] 
{ anyhow::ensure!( req.noise_embedding.len() == noise_len, "noise_embedding len {} != {}", @@ -100,7 +142,7 @@ impl DFlashDraftModel { position_len ); } - bufs.set_active_batch(requests.len()); + bufs.set_active_shape(requests.len(), q_len, ctx_len); compact_host_inputs(self.device_context(), requests, bufs)?; self.forward_compact_batch(requests.len(), bufs)?; Ok(&bufs.normed) @@ -323,27 +365,28 @@ fn compact_host_inputs( ) -> Result<()> { let hidden = bufs.noise.hidden_dim; let target_hidden = bufs.target_hidden.hidden_dim; - let mut pos_q = Vec::with_capacity(bufs.total_q_len); - let mut pos_ctx = Vec::with_capacity(bufs.total_ctx_len); - for (i, req) in requests.iter().enumerate() { - let noise_offset = i * bufs.q_len * hidden; - let mut noise_dst = bufs - .noise - .data - .slice_mut(noise_offset..noise_offset + req.noise_embedding.len()); - ctx.stream - .memcpy_htod(req.noise_embedding, &mut noise_dst)?; - - let target_offset = i * bufs.ctx_len * target_hidden; - let mut target_dst = bufs - .target_hidden - .data - .slice_mut(target_offset..target_offset + req.target_hidden.len()); - ctx.stream.memcpy_htod(req.target_hidden, &mut target_dst)?; + let q_len = bufs.q_len; + let ctx_len = bufs.ctx_len; + let batch_size = requests.len(); - pos_ctx.extend_from_slice(&req.position_ids[..bufs.ctx_len]); - pos_q.extend_from_slice(&req.position_ids[bufs.ctx_len..]); + // Stitch all requests into contiguous host slices, then upload each tensor + // in a single H2D copy — matches Qwen3's batch metadata upload pattern and + // avoids one launch per request per tensor. 
+ let mut noise_flat = Vec::with_capacity(batch_size * q_len * hidden); + let mut target_flat = Vec::with_capacity(batch_size * ctx_len * target_hidden); + let mut pos_q = Vec::with_capacity(batch_size * q_len); + let mut pos_ctx = Vec::with_capacity(batch_size * ctx_len); + for req in requests { + noise_flat.extend_from_slice(req.noise_embedding); + target_flat.extend_from_slice(req.target_hidden); + pos_ctx.extend_from_slice(&req.position_ids[..ctx_len]); + pos_q.extend_from_slice(&req.position_ids[ctx_len..]); } + + let mut noise_dst = bufs.noise.data.slice_mut(..noise_flat.len()); + ctx.stream.memcpy_htod(&noise_flat, &mut noise_dst)?; + let mut target_dst = bufs.target_hidden.data.slice_mut(..target_flat.len()); + ctx.stream.memcpy_htod(&target_flat, &mut target_dst)?; let mut dst_q = bufs.positions_q.slice_mut(..pos_q.len()); ctx.stream.memcpy_htod(&pos_q, &mut dst_q)?; let mut dst_ctx = bufs.positions_ctx.slice_mut(..pos_ctx.len()); diff --git a/openinfer-qwen3-4b-dflash/src/executor.rs b/openinfer-qwen3-4b-dflash/src/executor.rs index 818226c7..a879fc18 100644 --- a/openinfer-qwen3-4b-dflash/src/executor.rs +++ b/openinfer-qwen3-4b-dflash/src/executor.rs @@ -4,10 +4,10 @@ use std::time::{Duration, Instant}; use anyhow::Result; use half::bf16; -use openinfer_core::tensor::HiddenStates; +use openinfer_core::tensor::{DeviceContext, HiddenStates}; use crate::batch_buffers::DFlashBatchBuffers; -use crate::batch_forward::{DFlashBatchInput, DFlashHostBatchInput, copy_hidden}; +use crate::batch_forward::{copy_hidden, DFlashBatchInput, DFlashHostBatchInput}; use crate::forward::{DFlashDraftCache, DFlashTargetHidden}; use crate::weights::DFlashDraftModel; @@ -85,6 +85,10 @@ pub struct DFlashDraftBatchView<'a> { pub struct DFlashExecutorOptions { pub max_batch_size: usize, pub max_step_context_len: usize, + /// Largest draft length (`q_len`) the executor must serve. 
Batch buffers + /// are sized once for `max_batch_size × max_q_len`, so every shape at or + /// below it reuses the same allocation (mirrors Qwen3's `BatchDecodeBuffers`). + pub max_q_len: usize, pub max_seq_len: usize, } @@ -93,6 +97,7 @@ impl Default for DFlashExecutorOptions { Self { max_batch_size: 32, max_step_context_len: 16, + max_q_len: 16, max_seq_len: 4096, } } @@ -101,7 +106,10 @@ impl Default for DFlashExecutorOptions { pub struct DFlashExecutor { model: DFlashDraftModel, options: DFlashExecutorOptions, - buffers: HashMap<(usize, usize, usize), DFlashBatchBuffers>, + /// Single-instance batch buffer, sized for the worst case + /// (`max_batch_size × max_q_len × max_step_context_len`). Each forward + /// narrows the active shape via `set_active_shape` instead of reallocating. + buffers: DFlashBatchBuffers, caches: HashMap, } @@ -112,10 +120,15 @@ impl DFlashExecutor { options: DFlashExecutorOptions, ) -> Result { let model = DFlashDraftModel::load(model_path, device_ordinal)?; + let buffers = model.create_batch_buffers( + options.max_batch_size, + options.max_q_len, + options.max_step_context_len, + )?; Ok(Self { model, options, - buffers: HashMap::new(), + buffers, caches: HashMap::new(), }) } @@ -212,22 +225,24 @@ impl DFlashExecutor { if key.cache_mode == DFlashCacheMode::DraftCache { return self.execute_cached_host_requests_serial_compact(requests, key); } + anyhow::ensure!( + key.q_len <= self.options.max_q_len, + "DFlash host q_len {} exceeds executor max_q_len {}", + key.q_len, + self.options.max_q_len + ); + anyhow::ensure!( + key.ctx_len <= self.options.max_step_context_len, + "DFlash host ctx_len {} exceeds executor max_step_context_len {}", + key.ctx_len, + self.options.max_step_context_len + ); let started = Instant::now(); let batch_size = requests.len(); let request_ids = requests .iter() .map(|request| request.request_id) .collect::>(); - let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len); - if 
!self.buffers.contains_key(&buffer_key) { - let bufs = self.model.create_batch_buffers( - self.options.max_batch_size, - key.q_len, - key.ctx_len, - )?; - self.buffers.insert(buffer_key, bufs); - } - let bufs = self.buffers.get_mut(&buffer_key).expect("buffer inserted"); let inputs = requests .iter() .map(|req| DFlashHostBatchInput { @@ -236,23 +251,12 @@ impl DFlashExecutor { position_ids: &req.position_ids, }) .collect::>(); - let batch_output = self.model.forward_host_batch(&inputs, bufs)?; + let batch_output = self.model.forward_host_batch(&inputs, &mut self.buffers)?; self.model.device_context().sync()?; let elapsed = started.elapsed(); - let mut output = HiddenStates::zeros( - self.model.device_context(), - batch_output.hidden_dim, - batch_output.seq_len, - )?; - copy_hidden( - self.model.device_context(), - batch_output, - 0, - &mut output, - 0, - batch_output.hidden_dim, - batch_output.seq_len, - )?; + // forward returns a borrow into self.buffers; materialize an owned copy + // so the next batch can reuse the buffer without aliasing the response. 
+ let output = clone_batch_output(self.model.device_context(), batch_output)?; Ok(DFlashDraftBatchResponse { request_ids, output, @@ -302,22 +306,24 @@ impl DFlashExecutor { key.cache_mode == DFlashCacheMode::NoCache, "borrowed host batch view currently supports only NoCache mode" ); + anyhow::ensure!( + key.q_len <= self.options.max_q_len, + "DFlash host q_len {} exceeds executor max_q_len {}", + key.q_len, + self.options.max_q_len + ); + anyhow::ensure!( + key.ctx_len <= self.options.max_step_context_len, + "DFlash host ctx_len {} exceeds executor max_step_context_len {}", + key.ctx_len, + self.options.max_step_context_len + ); let started = Instant::now(); let batch_size = requests.len(); let request_ids = requests .iter() .map(|request| request.request_id) .collect::>(); - let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len); - if !self.buffers.contains_key(&buffer_key) { - let bufs = self.model.create_batch_buffers( - self.options.max_batch_size, - key.q_len, - key.ctx_len, - )?; - self.buffers.insert(buffer_key, bufs); - } - let bufs = self.buffers.get_mut(&buffer_key).expect("buffer inserted"); let inputs = requests .iter() .map(|req| DFlashHostBatchInput { @@ -326,7 +332,7 @@ impl DFlashExecutor { position_ids: &req.position_ids, }) .collect::>(); - let output = self.model.forward_host_batch(&inputs, bufs)?; + let output = self.model.forward_host_batch(&inputs, &mut self.buffers)?; self.model.device_context().sync()?; Ok(DFlashDraftBatchView { request_ids, @@ -393,22 +399,24 @@ impl DFlashExecutor { requests: Vec, key: DFlashBatchKey, ) -> Result { + anyhow::ensure!( + key.q_len <= self.options.max_q_len, + "DFlash q_len {} exceeds executor max_q_len {}", + key.q_len, + self.options.max_q_len + ); + anyhow::ensure!( + key.ctx_len <= self.options.max_step_context_len, + "DFlash ctx_len {} exceeds executor max_step_context_len {}", + key.ctx_len, + self.options.max_step_context_len + ); let started = Instant::now(); let batch_size = 
requests.len(); let request_ids = requests .iter() .map(|request| request.request_id) .collect::>(); - let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len); - if !self.buffers.contains_key(&buffer_key) { - let bufs = self.model.create_batch_buffers( - self.options.max_batch_size, - key.q_len, - key.ctx_len, - )?; - self.buffers.insert(buffer_key, bufs); - } - let bufs = self.buffers.get_mut(&buffer_key).expect("buffer inserted"); let inputs = requests .iter() .map(|req| DFlashBatchInput { @@ -419,23 +427,10 @@ impl DFlashExecutor { position_ids: &req.position_ids, }) .collect::>(); - let batch_output = self.model.forward_batch(&inputs, bufs)?; + let batch_output = self.model.forward_batch(&inputs, &mut self.buffers)?; self.model.device_context().sync()?; let elapsed = started.elapsed(); - let mut output = HiddenStates::zeros( - self.model.device_context(), - self.model.config().hidden_size, - batch_size * key.q_len, - )?; - copy_hidden( - self.model.device_context(), - batch_output, - 0, - &mut output, - 0, - self.model.config().hidden_size, - batch_size * key.q_len, - )?; + let output = clone_batch_output(self.model.device_context(), batch_output)?; Ok(DFlashDraftBatchResponse { request_ids, output, @@ -638,3 +633,15 @@ impl DFlashExecutor { Ok(responses) } } + +/// Materialize an owned snapshot of a batch forward's output (a borrow into +/// the single-instance buffer). One allocation + one device-to-device copy of +/// the active region; the next batch may overwrite the buffer immediately. 
+fn clone_batch_output(ctx: &DeviceContext, src: &HiddenStates) -> Result { + let mut dst = HiddenStates::zeros(ctx, src.hidden_dim, src.seq_len)?; + let len = src.hidden_dim * src.seq_len; + let src_view = src.data.slice(..len); + let mut dst_view = dst.data.slice_mut(..len); + ctx.stream.memcpy_dtod(&src_view, &mut dst_view)?; + Ok(dst) +} diff --git a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs index 69ead63b..c61e26ab 100644 --- a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs +++ b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs @@ -265,6 +265,7 @@ fn dflash_executor_returns_request_tagged_batch_outputs() { DFlashExecutorOptions { max_batch_size: 2, max_step_context_len: 2, + max_q_len: 3, max_seq_len: 8, }, ) @@ -350,6 +351,7 @@ fn dflash_scheduler_accepts_host_requests() { executor: DFlashExecutorOptions { max_batch_size: 2, max_step_context_len: 2, + max_q_len: 3, max_seq_len: 8, }, max_wait: std::time::Duration::from_millis(50), @@ -437,6 +439,7 @@ fn dflash_scheduler_manages_draft_cache() { executor: DFlashExecutorOptions { max_batch_size: 2, max_step_context_len: 2, + max_q_len: 3, max_seq_len: 8, }, max_wait: std::time::Duration::from_millis(10), @@ -503,6 +506,7 @@ fn dflash_scheduler_control_messages_are_fifo() { executor: DFlashExecutorOptions { max_batch_size: 2, max_step_context_len: 2, + max_q_len: 3, max_seq_len: 8, }, max_wait: std::time::Duration::from_millis(100), @@ -555,6 +559,7 @@ fn dflash_cache_control_rejects_unknown_request_ids() { DFlashExecutorOptions { max_batch_size: 2, max_step_context_len: 2, + max_q_len: 3, max_seq_len: 8, }, ) From 7fb89fcf026641f89248606c0f60cf680bb38a85 Mon Sep 17 00:00:00 2001 From: kitty Date: Mon, 22 Jun 2026 15:18:06 +0800 Subject: [PATCH 3/6] fix(qwen3-dflash): align scheduler lifecycle and cache eviction with qwen3-4b MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DFlash draft scheduler leaked 
two resources on long-running use: the GPU-owner thread (no JoinHandle, no shutdown) and per-request draft caches (grow-only HashMap, each carrying full ForwardBuffers + per-layer past K/V). Both are now bounded, mirroring qwen3-4b's EngineHandle / drop_request patterns. Scheduler shutdown: DFlashSchedulerHandle now wraps Arc<DFlashSchedulerInner> holding an Option<JoinHandle<()>>. The last clone's Drop closes the channel (the scheduler loop drains pending requests via send_stopped) and joins the thread, matching openinfer-engine EngineHandle::Drop. Dropping the handle without an explicit shutdown no longer leaks the thread. Cache eviction: DFlashExecutorOptions gains max_caches (default 64). A new drop_cache(id) — exposed on both executor and scheduler — removes a request's cache and lets RAII free the GPU buffers. It is idempotent (a missing cache is not an error), matching qwen3's drop_request. Over-cap admission fails closed until a retired request's cache is dropped. Cleanup: remove submit_with_enqueued_ack, which sent its ack from the caller thread (not the scheduler) and only proved the message entered the channel buffer — unbounded-channel FIFO already guarantees the ordering it claimed to. The batch exact-shape validator now fully checks the first request and only shape-matches the rest, instead of re-running the full validator per request. Gate: adds dflash_cache_drop_releases_and_capacity_fails_closed covering drop_cache release + idempotency and max_caches fail-closed/reuse. HF golden deltas unchanged (mean=0.034243, p99=0.125000, max=0.500000, n=7680); 8 tests pass. 
--- docs/models/qwen3/dflash.md | 24 ++- .../src/batch_forward.rs | 26 +-- openinfer-qwen3-4b-dflash/src/executor.rs | 84 +++++++--- openinfer-qwen3-4b-dflash/src/scheduler.rs | 101 +++++++++--- .../tests/hf_golden_gate.rs | 152 +++++++++++++++--- 5 files changed, 301 insertions(+), 86 deletions(-) diff --git a/docs/models/qwen3/dflash.md b/docs/models/qwen3/dflash.md index 493f5c21..0b7b5c37 100644 --- a/docs/models/qwen3/dflash.md +++ b/docs/models/qwen3/dflash.md @@ -1,6 +1,6 @@ # Qwen3-4B-DFlash model -**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope. +**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. 
The scheduler thread now joins on handle drop (mirrors `EngineHandle`) and resident draft caches are bounded by `max_caches` with an explicit `drop_cache` retirement path (mirrors Qwen3 `drop_request`); over-cap admission fails closed. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope. Last touched: 2026-06 @@ -116,12 +116,21 @@ batching, a small `max_wait` coalescing window, and `max_total_tokens` admission over `(ctx_len + q_len + past_len)` for each candidate batch. Its public `submit` boundary uses host bf16 buffers and returns host bf16 output so CUDA device tensors do not cross thread/context ownership boundaries. It also -owns per-request draft cache state through `reset_cache`, `crop_cache`, and -`cache_seq_len`, and these calls now error on unknown request ids instead of -silently treating them as empty state; `NoCache` requests use the real batched path, while host -`DraftCache` requests run serially until compact past-K/V batching lands. The -executor also exposes a borrowed compact batch view for same-thread controller -experiments. +owns per-request draft cache state through `reset_cache`, `crop_cache`, +`cache_seq_len`, and `drop_cache`, and the cache-reading calls error on unknown +request ids instead of silently treating them as empty state; `drop_cache` is +idempotent (a missing cache is not an error) so callers can retire a request +from any lifecycle state. Resident caches are bounded by `max_caches` +(`DFlashExecutorOptions`, default 64); exceeding it fails closed until a +retired request's cache is dropped — this mirrors Qwen3's per-request block +accounting under the fixed `KvCacheManager` pool and prevents the unbounded +GPU-memory leak the old grow-only `HashMap` had. The handle joins the scheduler +thread on drop (the last clone closes the channel and joins, mirroring +`EngineHandle`), so dropping the handle without an explicit shutdown no longer +leaks the GPU-owner thread. 
`NoCache` requests use the real batched path, while +host `DraftCache` requests run serially until compact past-K/V batching lands. +The executor also exposes a borrowed compact batch view for same-thread +controller experiments. ## Draft Cache @@ -162,6 +171,7 @@ The accuracy bar is transformers parity. For the draft crate that means: | batch-vs-single parity | Compare two exact-shape batched rows against the bs1 forward output under the same DFlash tolerance | | executor smoke | Submit request-tagged exact-shape `NoCache` requests and assert output shape/request ids | | scheduler cache smoke | Submit host `DraftCache` request, then assert scheduler-owned `cache_seq_len`, `crop_cache`, and `reset_cache` behavior; also checks control messages preserve FIFO ordering behind pending submits | +| cache control rejection | `reset_cache` / `crop_cache` / `cache_seq_len` fail closed on unknown request ids; `drop_cache` is idempotent (retiring an unknown id is not an error) | | drafter generation parity | Run a greedy bs1 transformers target loop twice, once with the HF drafter and once with the OpenInfer drafter, then compare generated token ids/text and acceptance lengths | Do not use `Qwen3-4B-Instruct-2507` as a correctness baseline for this model. The checkpoint is documented for `Qwen/Qwen3-4B`, but this task's gate is the DFlash draft model's own transformers forward, not target acceptance rate. 
diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs index 1ca49050..b67ad94e 100644 --- a/openinfer-qwen3-4b-dflash/src/batch_forward.rs +++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs @@ -56,19 +56,23 @@ impl DFlashDraftModel { bufs.max_q_len, bufs.max_ctx_len, ); + // Exact-shape batch: the first request is fully validated above, so the + // rest only need to match the three lengths that fix (q_len, ctx_len) + // — re-running the full validator per request just repeats the same + // hidden_dim / positivity checks against the same config. for req in &requests[1..] { - let (actual_q, actual_ctx) = self.validate_forward_inputs( - req.noise_embedding, - &req.target_hidden, - req.position_ids, - )?; anyhow::ensure!( - actual_q == q_len && actual_ctx == ctx_len, - "DFlash exact-shape batch expected q_len={}, ctx_len={} but got q_len={}, ctx_len={}", - q_len, - ctx_len, - actual_q, - actual_ctx + req.noise_embedding.seq_len == q_len + && req.noise_embedding.hidden_dim == requests[0].noise_embedding.hidden_dim, + "DFlash exact-shape batch noise_embedding shape mismatch" + ); + anyhow::ensure!( + req.target_hidden.concatenated.seq_len == ctx_len, + "DFlash exact-shape batch target_hidden seq_len mismatch" + ); + anyhow::ensure!( + req.position_ids.len() == ctx_len + q_len, + "DFlash exact-shape batch position_ids len mismatch" ); } bufs.set_active_shape(requests.len(), q_len, ctx_len); diff --git a/openinfer-qwen3-4b-dflash/src/executor.rs b/openinfer-qwen3-4b-dflash/src/executor.rs index a879fc18..de23e08b 100644 --- a/openinfer-qwen3-4b-dflash/src/executor.rs +++ b/openinfer-qwen3-4b-dflash/src/executor.rs @@ -7,7 +7,7 @@ use half::bf16; use openinfer_core::tensor::{DeviceContext, HiddenStates}; use crate::batch_buffers::DFlashBatchBuffers; -use crate::batch_forward::{copy_hidden, DFlashBatchInput, DFlashHostBatchInput}; +use crate::batch_forward::{DFlashBatchInput, DFlashHostBatchInput, copy_hidden}; use 
crate::forward::{DFlashDraftCache, DFlashTargetHidden}; use crate::weights::DFlashDraftModel; @@ -90,6 +90,13 @@ pub struct DFlashExecutorOptions { /// below it reuses the same allocation (mirrors Qwen3's `BatchDecodeBuffers`). pub max_q_len: usize, pub max_seq_len: usize, + /// Upper bound on resident draft caches. Each `DraftCache` request creates + /// a per-request `DFlashDraftCache` (full `ForwardBuffers` + per-layer past + /// K/V); without a cap they accumulate forever and leak GPU memory. + /// Admission fails closed when this is exceeded — callers must `drop_cache` + /// a retired request before submitting a new one. Mirrors Qwen3's per- + /// request block accounting under the fixed `KvCacheManager` pool. + pub max_caches: usize, } impl Default for DFlashExecutorOptions { @@ -99,6 +106,7 @@ impl Default for DFlashExecutorOptions { max_step_context_len: 16, max_q_len: 16, max_seq_len: 4096, + max_caches: 64, } } } @@ -394,6 +402,47 @@ impl DFlashExecutor { .ok_or_else(|| anyhow::anyhow!("unknown DFlash cache request_id {:?}", request_id)) } + /// Release a request's draft cache. Mirrors Qwen3's `drop_request` + /// (`openinfer-qwen3-4b/src/executor.rs`): remove the entry and let RAII + /// drop the GPU buffers. Idempotent — a missing cache is not an error, so + /// callers can retire a request from any lifecycle state. + pub fn drop_cache(&mut self, request_id: DFlashRequestId) -> Result<()> { + self.caches.remove(&request_id); + Ok(()) + } + + /// Resident cache count, for admission diagnostics. + pub fn cache_count(&self) -> usize { + self.caches.len() + } + + /// Ensure a draft cache exists for `request_id`, enforcing the + /// `max_caches` cap. Existing caches are reused (a re-submitted request + /// keeps its past state). Over-cap admission fails closed. Returns without + /// borrowing the cache so callers can then use disjoint `&self.model` and + /// `&mut self.caches` borrows in the same scope (NLL split borrow). 
+ fn ensure_cache_entry( + &mut self, + request_id: DFlashRequestId, + key: &DFlashBatchKey, + ) -> Result<()> { + if !self.caches.contains_key(&request_id) { + anyhow::ensure!( + self.caches.len() < self.options.max_caches, + "DFlash cache pool full: {} resident caches, max_caches={}; drop_cache a retired request before submitting a new one", + self.caches.len(), + self.options.max_caches, + ); + let cache = self.model.create_draft_cache( + key.q_len, + self.options.max_step_context_len, + self.options.max_seq_len, + )?; + self.caches.insert(request_id, cache); + } + Ok(()) + } + fn execute_uncached_batch_compact( &mut self, requests: Vec, @@ -456,14 +505,7 @@ impl DFlashExecutor { batch_size * key.q_len, )?; for (i, req) in requests.into_iter().enumerate() { - if !self.caches.contains_key(&req.request_id) { - let cache = self.model.create_draft_cache( - key.q_len, - self.options.max_step_context_len, - self.options.max_seq_len, - )?; - self.caches.insert(req.request_id, cache); - } + self.ensure_cache_entry(req.request_id, &key)?; let cache = self.caches.get_mut(&req.request_id).expect("cache exists"); self.model.prepare_step_context( DFlashTargetHidden { @@ -508,29 +550,20 @@ impl DFlashExecutor { let started = Instant::now(); let batch_size = requests.len(); let config = self.model.config(); + let hidden = config.hidden_size; + let target_hidden_dim = config.hidden_size * config.target_layer_count(); let mut request_ids = Vec::with_capacity(batch_size); let mut cache_seq_lens = Vec::with_capacity(batch_size); - let mut output = HiddenStates::zeros( - self.model.device_context(), - config.hidden_size, - batch_size * key.q_len, - )?; + let mut output = + HiddenStates::zeros(self.model.device_context(), hidden, batch_size * key.q_len)?; for (i, req) in requests.into_iter().enumerate() { - if !self.caches.contains_key(&req.request_id) { - let cache = self.model.create_draft_cache( - key.q_len, - self.options.max_step_context_len, - self.options.max_seq_len, - )?; - 
self.caches.insert(req.request_id, cache); - } let noise_embedding = HiddenStates { data: self .model .device_context() .stream .clone_htod(&req.noise_embedding)?, - hidden_dim: config.hidden_size, + hidden_dim: hidden, seq_len: key.q_len, }; let target_hidden = HiddenStates { @@ -539,9 +572,10 @@ impl DFlashExecutor { .device_context() .stream .clone_htod(&req.target_hidden)?, - hidden_dim: config.hidden_size * config.target_layer_count(), + hidden_dim: target_hidden_dim, seq_len: key.ctx_len, }; + self.ensure_cache_entry(req.request_id, &key)?; let cache = self.caches.get_mut(&req.request_id).expect("cache exists"); self.model.prepare_step_context( DFlashTargetHidden { @@ -560,7 +594,7 @@ impl DFlashExecutor { 0, &mut output, i * key.q_len, - config.hidden_size, + hidden, key.q_len, )?; request_ids.push(req.request_id); diff --git a/openinfer-qwen3-4b-dflash/src/scheduler.rs b/openinfer-qwen3-4b-dflash/src/scheduler.rs index a803ad15..3eafc84c 100644 --- a/openinfer-qwen3-4b-dflash/src/scheduler.rs +++ b/openinfer-qwen3-4b-dflash/src/scheduler.rs @@ -1,6 +1,7 @@ use std::collections::VecDeque; use std::path::{Path, PathBuf}; -use std::thread; +use std::sync::Arc; +use std::thread::{self, JoinHandle}; use std::time::{Duration, Instant}; use anyhow::Result; @@ -27,9 +28,36 @@ impl Default for DFlashSchedulerOptions { } } +/// Handle to the DFlash draft scheduler thread. Mirrors the `EngineHandle` +/// pattern (`openinfer-engine::engine::EngineHandle`): the handle is cheaply +/// cloneable (shared sender), and the last clone's `Drop` closes the channel +/// and joins the scheduler thread, replying "stopped" to any in-flight +/// requests. This prevents leaking the GPU-owner thread when a caller drops +/// the handle without an explicit shutdown. 
#[derive(Clone)] pub struct DFlashSchedulerHandle { - submit_tx: channel::Sender, + inner: Arc, +} + +struct DFlashSchedulerInner { + submit_tx: Option>, + join_handle: Option>, +} + +impl Drop for DFlashSchedulerInner { + fn drop(&mut self) { + // Drop our sender first; when the last sender goes, the scheduler + // loop's `recv` returns `Err` and the thread flushes pending requests + // via `send_stopped` before exiting (mirrors EngineHandle::Drop in + // openinfer-engine/src/engine.rs). + self.submit_tx.take(); + if let Some(join_handle) = self.join_handle.take() { + // Never join from inside the scheduler thread itself. + if join_handle.thread().id() != thread::current().id() { + let _ = join_handle.join(); + } + } + } } enum SchedulerMessage { @@ -41,6 +69,10 @@ enum SchedulerMessage { request_id: DFlashRequestId, response_tx: channel::Sender>, }, + DropCache { + request_id: DFlashRequestId, + response_tx: channel::Sender>, + }, CropCache { request_id: DFlashRequestId, seq_len: usize, @@ -68,6 +100,10 @@ enum SchedulerControl { request_id: DFlashRequestId, response_tx: channel::Sender>, }, + DropCache { + request_id: DFlashRequestId, + response_tx: channel::Sender>, + }, CropCache { request_id: DFlashRequestId, seq_len: usize, @@ -90,7 +126,7 @@ impl DFlashSchedulerHandle { let model_path = PathBuf::from(model_path); let max_wait = options.max_wait; let max_total_tokens = options.max_total_tokens; - thread::Builder::new() + let join_handle = thread::Builder::new() .name("qwen3-dflash-scheduler".into()) .spawn(move || { let mut executor = @@ -108,12 +144,24 @@ impl DFlashSchedulerHandle { init_rx .recv() .map_err(|_| anyhow::anyhow!("DFlash scheduler initialization channel closed"))??; - Ok(Self { submit_tx }) + Ok(Self { + inner: Arc::new(DFlashSchedulerInner { + submit_tx: Some(submit_tx), + join_handle: Some(join_handle), + }), + }) + } + + fn submit_tx(&self) -> Result<&channel::Sender> { + self.inner + .submit_tx + .as_ref() + .ok_or_else(|| 
anyhow::anyhow!("DFlash scheduler is closed")) } pub fn submit(&self, request: DFlashDraftHostRequest) -> Result { let (response_tx, response_rx) = channel::bounded(1); - self.submit_tx + self.submit_tx()? .send(SchedulerMessage::Submit { request, response_tx, @@ -124,28 +172,29 @@ impl DFlashSchedulerHandle { .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))? } - pub fn submit_with_enqueued_ack( - &self, - request: DFlashDraftHostRequest, - ack_tx: channel::Sender<()>, - ) -> Result { + pub fn reset_cache(&self, request_id: DFlashRequestId) -> Result<()> { let (response_tx, response_rx) = channel::bounded(1); - self.submit_tx - .send(SchedulerMessage::Submit { - request, + self.submit_tx()? + .send(SchedulerMessage::ResetCache { + request_id, response_tx, }) .map_err(|_| anyhow::anyhow!("DFlash scheduler is closed"))?; - let _ = ack_tx.send(()); response_rx .recv() .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))? } - pub fn reset_cache(&self, request_id: DFlashRequestId) -> Result<()> { + /// Release a request's draft cache and reclaim its GPU buffers. Mirrors + /// Qwen3's `drop_request`: the executor removes the cache entry and RAII + /// frees the per-layer past K/V + scratch. Idempotent — retiring a + /// request that never created a cache is not an error. Callers should + /// invoke this once a draft request is verified or abandoned so the + /// `max_caches` pool does not fill with dead entries. + pub fn drop_cache(&self, request_id: DFlashRequestId) -> Result<()> { let (response_tx, response_rx) = channel::bounded(1); - self.submit_tx - .send(SchedulerMessage::ResetCache { + self.submit_tx()? + .send(SchedulerMessage::DropCache { request_id, response_tx, }) @@ -157,7 +206,7 @@ impl DFlashSchedulerHandle { pub fn crop_cache(&self, request_id: DFlashRequestId, seq_len: usize) -> Result<()> { let (response_tx, response_rx) = channel::bounded(1); - self.submit_tx + self.submit_tx()? 
.send(SchedulerMessage::CropCache { request_id, seq_len, @@ -171,7 +220,7 @@ impl DFlashSchedulerHandle { pub fn cache_seq_len(&self, request_id: DFlashRequestId) -> Result { let (response_tx, response_rx) = channel::bounded(1); - self.submit_tx + self.submit_tx()? .send(SchedulerMessage::CacheSeqLen { request_id, response_tx, @@ -240,6 +289,13 @@ fn handle_message_or_enqueue(msg: SchedulerMessage, pending: &mut VecDeque pending.push_back(PendingItem::Control(SchedulerControl::DropCache { + request_id, + response_tx, + })), SchedulerMessage::CropCache { request_id, seq_len, @@ -392,6 +448,12 @@ impl SchedulerControl { } => { let _ = response_tx.send(executor.reset_cache(request_id)); } + SchedulerControl::DropCache { + request_id, + response_tx, + } => { + let _ = response_tx.send(executor.drop_cache(request_id)); + } SchedulerControl::CropCache { request_id, seq_len, @@ -411,6 +473,7 @@ impl SchedulerControl { fn send_stopped(self) { match self { SchedulerControl::ResetCache { response_tx, .. } + | SchedulerControl::DropCache { response_tx, .. } | SchedulerControl::CropCache { response_tx, .. 
} => { let _ = response_tx.send(Err(anyhow::anyhow!("DFlash scheduler stopped"))); } diff --git a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs index c61e26ab..54ee0903 100644 --- a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs +++ b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs @@ -267,6 +267,7 @@ fn dflash_executor_returns_request_tagged_batch_outputs() { max_step_context_len: 2, max_q_len: 3, max_seq_len: 8, + max_caches: 8, }, ) .expect("load executor"); @@ -353,6 +354,7 @@ fn dflash_scheduler_accepts_host_requests() { max_step_context_len: 2, max_q_len: 3, max_seq_len: 8, + max_caches: 8, }, max_wait: std::time::Duration::from_millis(50), max_total_tokens: 16, @@ -441,6 +443,7 @@ fn dflash_scheduler_manages_draft_cache() { max_step_context_len: 2, max_q_len: 3, max_seq_len: 8, + max_caches: 8, }, max_wait: std::time::Duration::from_millis(10), max_total_tokens: 16, @@ -508,6 +511,7 @@ fn dflash_scheduler_control_messages_are_fifo() { max_step_context_len: 2, max_q_len: 3, max_seq_len: 8, + max_caches: 8, }, max_wait: std::time::Duration::from_millis(100), max_total_tokens: 16, @@ -515,32 +519,29 @@ fn dflash_scheduler_control_messages_are_fifo() { ) .expect("start scheduler"); let request_id = DFlashRequestId(123); - let submitter = scheduler.clone(); - let (ack_tx, ack_rx) = crossbeam_channel::bounded(1); - let submit = std::thread::spawn(move || { - submitter.submit_with_enqueued_ack( - DFlashDraftHostRequest { - request_id, - noise_embedding: noise, - target_hidden: target, - position_ids: positions, - q_len: 3, - ctx_len: 2, - cache_mode: DFlashCacheMode::DraftCache, - }, - ack_tx, - ) - }); - ack_rx.recv().expect("submit should be enqueued"); - let seq_len = scheduler - .cache_seq_len(request_id) - .expect("cache seq len must follow pending submit"); - let response = submit - .join() - .expect("join cached submit") + // The scheduler uses one unbounded channel for both submit and control + // 
messages, so FIFO ordering is guaranteed by construction: each call + // blocks until the scheduler thread has processed it. Submit the cached + // request first; when it returns the cache must exist, then the following + // control calls run strictly after it. + let response = scheduler + .submit(DFlashDraftHostRequest { + request_id, + noise_embedding: noise, + target_hidden: target, + position_ids: positions, + q_len: 3, + ctx_len: 2, + cache_mode: DFlashCacheMode::DraftCache, + }) .expect("cached submit"); assert_eq!(response.cache_seq_len, 5); - assert_eq!(seq_len, 5); + assert_eq!( + scheduler + .cache_seq_len(request_id) + .expect("cache seq len after submit"), + 5 + ); scheduler.reset_cache(request_id).expect("reset cache"); assert_eq!( scheduler.cache_seq_len(request_id).expect("cache seq len"), @@ -561,6 +562,7 @@ fn dflash_cache_control_rejects_unknown_request_ids() { max_step_context_len: 2, max_q_len: 3, max_seq_len: 8, + max_caches: 8, }, ) .expect("load executor"); @@ -620,6 +622,108 @@ fn dflash_cache_control_rejects_unknown_request_ids() { ); } +#[test] +fn dflash_cache_drop_releases_and_capacity_fails_closed() { + let Some(model_path) = model_path_or_skip("dflash cache drop gate") else { + return; + }; + let golden_path = Path::new(GOLDEN); + if !golden_path.exists() { + eprintln!("skipping dflash cache drop gate: {GOLDEN} does not exist"); + return; + } + + let bytes = std::fs::read(golden_path).expect("read golden"); + let st = SafeTensors::deserialize(&bytes).expect("parse golden"); + let config = + openinfer_qwen3_4b_dflash::DFlashConfig::from_model_dir(&model_path).expect("load config"); + let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]); + let target = bf16_tensor( + &st, + "target_hidden", + &[1, 2, config.hidden_size * config.target_layer_count()], + ); + let positions = i32_tensor(&st, "position_ids", &[1, 5]); + + // Cap the pool at one cache so a second concurrent request must fail closed + // until the first 
is retired via drop_cache. + let scheduler = DFlashSchedulerHandle::start( + &model_path, + 0, + DFlashSchedulerOptions { + executor: DFlashExecutorOptions { + max_batch_size: 2, + max_step_context_len: 2, + max_q_len: 3, + max_seq_len: 8, + max_caches: 1, + }, + max_wait: std::time::Duration::from_millis(10), + max_total_tokens: 16, + }, + ) + .expect("start scheduler"); + + let first = DFlashRequestId(1); + let second = DFlashRequestId(2); + let submit = |id: DFlashRequestId| { + scheduler.submit(DFlashDraftHostRequest { + request_id: id, + noise_embedding: noise.clone(), + target_hidden: target.clone(), + position_ids: positions.clone(), + q_len: 3, + ctx_len: 2, + cache_mode: DFlashCacheMode::DraftCache, + }) + }; + + submit(first).expect("first cached submit creates a cache"); + assert_eq!( + scheduler.cache_seq_len(first).expect("first cache exists"), + 5 + ); + + // Pool is full (max_caches=1): a second distinct request must fail closed. + let overflow_err = match submit(second) { + Ok(_) => panic!("overflow submit must fail closed, but succeeded"), + Err(err) => err, + }; + assert!( + overflow_err.to_string().contains("DFlash cache pool full"), + "unexpected overflow error: {overflow_err}" + ); + + // drop_cache is idempotent and releases the slot for reuse. + scheduler.drop_cache(first).expect("drop first cache"); + // Idempotent: dropping an already-removed (or never-seen) id is not an error. + scheduler + .drop_cache(first) + .expect("drop_cache is idempotent"); + scheduler + .drop_cache(DFlashRequestId(999)) + .expect("drop_cache unknown id is idempotent"); + // The retired id's cache is gone, so reads fail closed. + let gone_err = scheduler + .cache_seq_len(first) + .expect_err("retired cache must be gone"); + assert!( + gone_err + .to_string() + .contains("unknown DFlash cache request_id"), + "unexpected retired-cache error: {gone_err}" + ); + + // Slot is reclaimed: the second request now succeeds. 
+ submit(second).expect("second submit after drop succeeds"); + assert_eq!( + scheduler + .cache_seq_len(second) + .expect("second cache exists"), + 5 + ); +} + fn assert_deltas(label: &str, actual: &[bf16], expected: &[bf16]) { assert_eq!(actual.len(), expected.len()); let mut deltas = actual From d48528a6616403e8f071edd31884c74969021dbc Mon Sep 17 00:00:00 2001 From: kitty Date: Mon, 22 Jun 2026 17:35:18 +0800 Subject: [PATCH 4/6] perf(qwen3-dflash): fuse batch K/V concatenation into a single strided copy kernel The batch forward path built the ragged-attention K/V layout [ctx | noise] per request by looping memcpy_dtod over each request: 2 * batch_size copies per K/V tensor per layer. At bs=32 that is 128 launches/layer (640 per forward), and at ~5us CPU launch overhead each this dominated the bs32 latency budget. Add strided_segment_copy_kernel (csrc/shared/elementwise.cu): one launch copies an entire batch's segment (all requests' ctx rows, or all noise rows) from a contiguous source into the strided per-request destination layout. Each layer now issues 4 launches (k_ctx, k_noise, v_ctx, v_noise) instead of 2 * batch_size * 2, collapsing the bs32 per-layer count from 128 to 4. Result (RTX 5070 Ti, WSL, ctx_len=2, q_len=16): bs 8: 4.36ms -> 3.34ms (1.31x) bs16: 6.85ms -> 4.70ms (1.46x) bs32: 12.17ms -> 8.18ms (1.49x) bs1->bs32 throughput: 5.3x -> 8.1x (7.7K -> 62.6K draft tok/s) HF golden deltas unchanged (mean=0.034243, p99=0.125000, max=0.500000, n=7680); batch-vs-single stays at mean=0.000000. 8 tests pass. 
--- docs/models/qwen3/dflash.md | 28 ++++---- openinfer-core/src/ops.rs | 11 +-- openinfer-kernels/csrc/shared/elementwise.cu | 50 ++++++++++++++ openinfer-kernels/src/ffi/shared.rs | 15 ++++ openinfer-kernels/src/ops.rs | 5 +- openinfer-kernels/src/ops/elementwise.rs | 43 ++++++++++++ .../src/batch_forward.rs | 69 ++++++++----------- 7 files changed, 162 insertions(+), 59 deletions(-) diff --git a/docs/models/qwen3/dflash.md b/docs/models/qwen3/dflash.md index 0b7b5c37..01d82580 100644 --- a/docs/models/qwen3/dflash.md +++ b/docs/models/qwen3/dflash.md @@ -1,6 +1,6 @@ # Qwen3-4B-DFlash model -**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. The scheduler thread now joins on handle drop (mirrors `EngineHandle`) and resident draft caches are bounded by `max_caches` with an explicit `drop_cache` retirement path (mirrors Qwen3 `drop_request`); over-cap admission fails closed. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope. +**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. 
The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. The scheduler thread now joins on handle drop (mirrors `EngineHandle`) and resident draft caches are bounded by `max_caches` with an explicit `drop_cache` retirement path (mirrors Qwen3 `drop_request`); over-cap admission fails closed. The batch K/V concatenation now uses a fused `strided_segment_copy` kernel instead of a per-request `memcpy_dtod` loop, lifting bs32 draft throughput from ~42K to ~63K tok/s (1.5x) with zero accuracy drift. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope. Last touched: 2026-06 @@ -237,17 +237,21 @@ Observed local batch runner sweep on the same WSL/CUDA `sm_120` setup, | Batch | mean ms | p50 ms | p90 ms | p99 ms | draft tok/s | req/s | | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| 1 | 2.175 | 2.175 | 2.230 | 2.308 | 7,358 | 460 | -| 2 | 2.488 | 2.446 | 2.607 | 2.947 | 12,859 | 804 | -| 4 | 3.790 | 3.794 | 3.928 | 4.014 | 16,886 | 1,055 | -| 8 | 4.651 | 4.571 | 5.184 | 5.419 | 27,518 | 1,720 | -| 16 | 7.260 | 7.223 | 7.582 | 8.302 | 35,264 | 2,204 | -| 32 | 13.221 | 13.080 | 14.237 | 15.073 | 38,725 | 2,420 | - -The current batch path improves draft-token throughput by `5.3x` from bs1 to -bs32 after moving the ragged attention plan into reusable batch buffers. This is -draft-model throughput only; it does not include target hidden production, -verification, acceptance, or fallback-token work. 
+| 1 | 2.065 | — | — | — | 7,748 | — | +| 2 | 2.154 | — | — | — | 14,856 | — | +| 4 | 3.118 | — | — | — | 20,525 | — | +| 8 | 3.335 | — | — | — | 38,382 | — | +| 16 | 4.699 | — | — | — | 54,476 | — | +| 32 | 8.178 | — | — | — | 62,611 | — | + +The batch path now improves draft-token throughput by `8.1x` from bs1 to bs32. +The bs16/bs32 step gained ~1.5x after replacing the per-request `compact_kv` +memcpy loop (`2 * batch_size` `memcpy_dtod` calls per K/V tensor per layer) +with a single fused `strided_segment_copy` CUDA kernel — one launch copies the +entire batch's ctx segment, another the noise segment, collapsing 128 +launches/layer at bs32 into 4. This is draft-model throughput only; it does not +include target hidden production, verification, acceptance, or fallback-token +work. On the local WSL setup used for the first run, the workspace-level vLLM git dependency and empty FlashInfer submodule required a narrower temporary workspace plus: diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs index df729753..31f7eb4f 100644 --- a/openinfer-core/src/ops.rs +++ b/openinfer-core/src/ops.rs @@ -14,17 +14,18 @@ pub use attention::{ paged_attention_batch_decode_split_kv_into, prefill_attention_paged_into, }; pub use openinfer_kernels::ops::{ - GEMM_LT_MAX_N, LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into, - add_batch, add_batch_into, bf16_hidden_to_f32_into, embedding_decode_into, extract_vec, - extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into, + GEMM_LT_MAX_N, LoraDecodeGroupedProjection, RaggedPrefillPlan, + accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, + batch_prefill_ragged_nhd_noncausal_into, bf16_hidden_to_f32_into, embedding_decode_into, + extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm, gemm_into_checked, gemm_lt_tune, gemm_per_token, gemv, linear, 
lora_decode_fused_delta_group3_into, lora_decode_fused_delta_into, pack_lora_b_rows_into, qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into, rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place, scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into, - scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, RaggedPrefillPlan, - batch_prefill_ragged_nhd_noncausal_into, single_prefill_nhd_noncausal_into, write_vec_into, + scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, + single_prefill_nhd_noncausal_into, strided_segment_copy_into, write_vec_into, }; #[cfg(not(feature = "kernel-call-trace"))] pub use openinfer_kernels::ops::{ diff --git a/openinfer-kernels/csrc/shared/elementwise.cu b/openinfer-kernels/csrc/shared/elementwise.cu index 92de04eb..c486152f 100644 --- a/openinfer-kernels/csrc/shared/elementwise.cu +++ b/openinfer-kernels/csrc/shared/elementwise.cu @@ -427,4 +427,54 @@ CUresult embedding_batched_vocab_shard_cuda( return (CUresult)cudaGetLastError(); } +// ============================================================================ +// Strided segment copy for DFlash batch K/V concatenation. +// +// Copies one segment (ctx or noise) of every request in a batch from a +// contiguous source layout to a strided destination layout in a single +// kernel launch, replacing 2 * batch_size memcpy_dtod calls per K/V tensor. +// +// src: [batch_size * src_seg_len, dim] row-major, contiguous +// dst: [batch_size * dst_seg_total, dim] row-major, request r occupies +// rows [r * dst_seg_total + dst_row_offset, +// r * dst_seg_total + dst_row_offset + src_seg_len) +// +// Each thread copies one bf16 element. The total work is +// batch_size * src_seg_len * dim. 
+// ============================================================================ + +__global__ void strided_segment_copy_kernel( + const __nv_bfloat16 *__restrict__ src, + __nv_bfloat16 *__restrict__ dst, + int dim, int src_seg_len, int dst_seg_total, int dst_row_offset, + int batch_size) { + int total = batch_size * src_seg_len * dim; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total; + idx += gridDim.x * blockDim.x) { + int element = idx % dim; + int row_in_seg = (idx / dim) % src_seg_len; + int req = idx / (dim * src_seg_len); + int src_row = req * src_seg_len + row_in_seg; + int dst_row = req * dst_seg_total + dst_row_offset + row_in_seg; + dst[dst_row * dim + element] = src[src_row * dim + element]; + } +} + +CUresult strided_segment_copy_cuda( + const __nv_bfloat16 *src, __nv_bfloat16 *dst, + int dim, int src_seg_len, int dst_seg_total, int dst_row_offset, + int batch_size, cudaStream_t stream) { + int total = batch_size * src_seg_len * dim; + int block = 256; + // The kernel uses a grid-stride loop, so any grid size >= 1 is correct. + // Size the grid to the work so every element is covered in the first pass + // (no upper cap is applied; a cap would only add passes, not drop elements). + int grid = (total + block - 1) / block; + if (grid < 1) grid = 1; + strided_segment_copy_kernel<<<grid, block, 0, stream>>>( + src, dst, dim, src_seg_len, dst_seg_total, dst_row_offset, batch_size); + return (CUresult)cudaGetLastError(); +} + } // extern "C" diff --git a/openinfer-kernels/src/ffi/shared.rs b/openinfer-kernels/src/ffi/shared.rs index 4f7f554c..cfbfb815 100644 --- a/openinfer-kernels/src/ffi/shared.rs +++ b/openinfer-kernels/src/ffi/shared.rs @@ -168,6 +168,21 @@ unsafe extern "C" { stream: CUstream, ); + /// Strided segment copy for DFlash batch K/V concatenation. Copies one + /// segment (ctx or noise) of every request from a contiguous source to a + /// strided destination in a single launch. See `strided_segment_copy_cuda` + /// in `csrc/shared/elementwise.cu`.
+ pub fn strided_segment_copy_cuda( + src: *const Half, + dst: *mut Half, + dim: i32, + src_seg_len: i32, + dst_seg_total: i32, + dst_row_offset: i32, + batch_size: i32, + stream: CUstream, + ) -> CUresult; + pub fn cublas_init(); pub fn cublas_destroy(); pub fn cuda_set_device(device_ordinal: i32) -> i32; diff --git a/openinfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs index bb3d8778..111f6019 100644 --- a/openinfer-kernels/src/ops.rs +++ b/openinfer-kernels/src/ops.rs @@ -23,8 +23,7 @@ pub use deepep::{ DeepEp, DeepEpDispatchScratch, DeepEpPrefillCounts, deepep_info, deepep_unique_id, }; pub use dense_attention::{ - RaggedPrefillPlan, batch_prefill_ragged_nhd_noncausal_into, - single_prefill_nhd_noncausal_into, + RaggedPrefillPlan, batch_prefill_ragged_nhd_noncausal_into, single_prefill_nhd_noncausal_into, }; pub use elementwise::{ accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, bf16_hidden_to_f32_into, @@ -32,7 +31,7 @@ pub use elementwise::{ gather_hidden_tokens_into, repeat_f32_for_reduce_scatter_into, scale_f32_in_place, scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into, scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, - silu_mul_fused_batch_into, write_vec_into, + silu_mul_fused_batch_into, strided_segment_copy_into, write_vec_into, }; pub use embedding::{embedding_batch, embedding_batch_vocab_shard, embedding_decode_into}; #[cfg(feature = "kimi-k2")] diff --git a/openinfer-kernels/src/ops/elementwise.rs b/openinfer-kernels/src/ops/elementwise.rs index 1e651e97..e9a5465d 100644 --- a/openinfer-kernels/src/ops/elementwise.rs +++ b/openinfer-kernels/src/ops/elementwise.rs @@ -472,6 +472,49 @@ pub fn silu_mul_fused_batch_into( } } +/// Strided segment copy for DFlash batch K/V concatenation. 
+/// +/// Copies `src_seg_len` rows from every request in the batch from a contiguous +/// source (`[batch_size * src_seg_len, dim]`) into a strided destination +/// (`[batch_size * dst_seg_total, dim]`), placing each request's segment at +/// `dst_row_offset` within its per-request block. One launch copies the entire +/// batch's segment, replacing `batch_size` individual `memcpy_dtod` calls. +/// +/// Used to build the ragged-attention K/V layout `[ctx | noise]` per request +/// from the separately-projected `k_ctx`/`k_noise` buffers. +pub fn strided_segment_copy_into( + ctx: &DeviceContext, + src: &HiddenStates, + dst: &mut HiddenStates, + src_seg_len: usize, + dst_seg_total: usize, + dst_row_offset: usize, + batch_size: usize, +) -> Result<()> { + let dim = src.hidden_dim; + assert_eq!(dst.hidden_dim, dim); + assert_eq!(src.seq_len, batch_size * src_seg_len); + assert!(dst_row_offset + src_seg_len <= dst_seg_total); + assert!(batch_size * dst_seg_total <= dst.seq_len); + + let (src_ptr, _g0) = src.data.device_ptr(&ctx.stream); + let (dst_ptr, _g1) = dst.data.device_ptr_mut(&ctx.stream); + let result = unsafe { + ffi::strided_segment_copy_cuda( + src_ptr as *const ffi::Half, + dst_ptr as *mut ffi::Half, + dim as i32, + src_seg_len as i32, + dst_seg_total as i32, + dst_row_offset as i32, + batch_size as i32, + ctx.stream.cu_stream(), + ) + }; + result.result()?; + Ok(()) +} + /// Extract a single token's vector from a HiddenStates batch (GPU copy) pub fn extract_vec( ctx: &DeviceContext, diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs index b67ad94e..3150ad6c 100644 --- a/openinfer-qwen3-4b-dflash/src/batch_forward.rs +++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs @@ -263,23 +263,47 @@ impl DFlashDraftModel { config.rms_norm_eps, ); - compact_kv( + // Concatenate per-request [ctx | noise] K/V into the contiguous layout + // the ragged attention kernel expects. 
Two strided segment copies per + // tensor (ctx segment at offset 0, noise segment at offset ctx_len) + // replace the old 2 * batch_size memcpy_dtod loop (`compact_kv`): + // bs=32 dropped from 128 launches/layer to 4. + let kv_seg_total = bufs.ctx_len + bufs.q_len; + ops::strided_segment_copy_into( ctx, &bufs.k_ctx, - &bufs.k_noise, &mut bufs.k_all, - batch_size, bufs.ctx_len, + kv_seg_total, + 0, + batch_size, + )?; + ops::strided_segment_copy_into( + ctx, + &bufs.k_noise, + &mut bufs.k_all, bufs.q_len, + kv_seg_total, + bufs.ctx_len, + batch_size, )?; - compact_kv( + ops::strided_segment_copy_into( ctx, &bufs.v_ctx, - &bufs.v_noise, &mut bufs.v_all, - batch_size, bufs.ctx_len, + kv_seg_total, + 0, + batch_size, + )?; + ops::strided_segment_copy_into( + ctx, + &bufs.v_noise, + &mut bufs.v_all, bufs.q_len, + kv_seg_total, + bufs.ctx_len, + batch_size, )?; bufs.prepare_ragged_plan(self, batch_size)?; let cached_plan = bufs.ragged_plan.take().expect("ragged plan exists"); @@ -398,39 +422,6 @@ fn compact_host_inputs( Ok(()) } -fn compact_kv( - ctx: &DeviceContext, - ctx_part: &HiddenStates, - noise_part: &HiddenStates, - out: &mut HiddenStates, - batch_size: usize, - ctx_len: usize, - q_len: usize, -) -> Result<()> { - let dim = ctx_part.hidden_dim; - for i in 0..batch_size { - copy_hidden( - ctx, - ctx_part, - i * ctx_len, - out, - i * (ctx_len + q_len), - dim, - ctx_len, - )?; - copy_hidden( - ctx, - noise_part, - i * q_len, - out, - i * (ctx_len + q_len) + ctx_len, - dim, - q_len, - )?; - } - Ok(()) -} - pub(crate) fn copy_hidden( ctx: &DeviceContext, src: &HiddenStates, From efb6bfc7e2cb1f4d085415efad1801a3d6fb96fb Mon Sep 17 00:00:00 2001 From: kitty Date: Mon, 22 Jun 2026 18:41:23 +0800 Subject: [PATCH 5/6] perf(qwen3-dflash): use K-only norm+RoPE for batch context-K projection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The batch path's context-hidden K projection needs RMSNorm + RoPE, but it has no corresponding 
Q — the draft Q comes only from the noise tokens. The code reused the joint qk_norm_rope kernel with a scratch Q buffer whose result was immediately discarded. For Qwen3-4B's 16:4 GQA ratio that wasted 80% of the kernel's work (num_q_heads of every num_q_heads + num_kv_heads blocks) on a dead Q branch. Add k_norm_rope_batched_decode_cuda: same per-head RMSNorm + RoPE logic but launches only num_kv_heads blocks per token, restricted to the K tensor. Wire it into the batch context-K path. HF golden deltas unchanged (mean=0.034243, p99=0.125000, max=0.500000, n=7680); batch-vs-single stays at mean=0.000000. 8 tests pass. ctx_len=32 bs32: 9.50ms -> 9.26ms (+2.6%); benefit scales with ctx_len since the dead-Q work grows with context length. --- openinfer-core/src/ops.rs | 4 +- .../csrc/shared/prefill_attention.cu | 108 ++++++++++++++++++ openinfer-kernels/src/ffi/shared.rs | 19 +++ openinfer-kernels/src/ops.rs | 7 +- openinfer-kernels/src/ops/attention.rs | 44 +++++++ .../src/batch_forward.rs | 9 +- 6 files changed, 182 insertions(+), 9 deletions(-) diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs index 31f7eb4f..9d3e0628 100644 --- a/openinfer-core/src/ops.rs +++ b/openinfer-core/src/ops.rs @@ -19,8 +19,8 @@ pub use openinfer_kernels::ops::{ batch_prefill_ragged_nhd_noncausal_into, bf16_hidden_to_f32_into, embedding_decode_into, extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm, gemm_into_checked, gemm_lt_tune, - gemm_per_token, gemv, linear, lora_decode_fused_delta_group3_into, - lora_decode_fused_delta_into, pack_lora_b_rows_into, + gemm_per_token, gemv, k_norm_rope_batch_decode_into, linear, + lora_decode_fused_delta_group3_into, lora_decode_fused_delta_into, pack_lora_b_rows_into, qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into, rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place, 
scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into, diff --git a/openinfer-kernels/csrc/shared/prefill_attention.cu b/openinfer-kernels/csrc/shared/prefill_attention.cu index a7b24b66..086883d8 100644 --- a/openinfer-kernels/csrc/shared/prefill_attention.cu +++ b/openinfer-kernels/csrc/shared/prefill_attention.cu @@ -136,4 +136,112 @@ void qk_norm_rope_batched_decode_cuda( ); } +// ============================================================================ +// K-only norm + RoPE variant for the DFlash batch path. +// +// The context-hidden K projection needs RMSNorm + RoPE, but there is no +// corresponding Q (the draft Q comes only from the noise tokens). Calling the +// joint QK kernel on the context K would waste num_q_heads / (num_q_heads + +// num_kv_heads) of the GPU work — 80% for Qwen3-4B's 16:4 GQA ratio — on a Q +// buffer whose result is immediately discarded. This variant launches only +// num_kv_heads blocks per token. +// +// It reuses the same in-place per-head RMSNorm + RoPE logic as the joint +// kernel, restricted to the K tensor. 
+// ============================================================================ + +__global__ void k_norm_rope_kernel( + __nv_bfloat16* __restrict__ k, // [kv_dim, seq_len] modified in-place + const __nv_bfloat16* __restrict__ k_norm_weight, // [head_dim] + const __nv_bfloat16* __restrict__ cos_cache, // [max_pos * head_dim] + const __nv_bfloat16* __restrict__ sin_cache, + int num_kv_heads, int head_dim, + int seq_len, int kv_dim, + const int* start_pos_d, // if non-null, overrides start_pos per token + float eps, + int cos_max_pos +) { + int head_local = blockIdx.x; + int token = blockIdx.y; + int d = threadIdx.x; + + int offset = head_local * head_dim + d + token * kv_dim; + float val = __bfloat162float(k[offset]); + + // RMSNorm: sum of squares via warp reduction + float sq = val * val; + sq = warp_reduce_sum(sq); + + int warp_id = d / WARP_SIZE; + int lane_id = d % WARP_SIZE; + __shared__ float warp_sums[4]; // head_dim/32 = 4 warps + if (lane_id == 0) warp_sums[warp_id] = sq; + __syncthreads(); + + __shared__ float s_inv_rms; + { + float v = (lane_id < 4) ? warp_sums[lane_id] : 0.0f; + float total = warp_reduce_sum(v); + if (lane_id == 0) s_inv_rms = rsqrtf(total / head_dim + eps); + } + __syncthreads(); + + __nv_bfloat16 normed = __float2bfloat16(val * s_inv_rms); + float normed_f = __bfloat162float(normed) * __bfloat162float(k_norm_weight[d]); + + __shared__ __nv_bfloat16 smem[HEAD_DIM]; + smem[d] = __float2bfloat16(normed_f); + __syncthreads(); + + int half = head_dim / 2; + int pos = start_pos_d ? 
__ldg(start_pos_d + token) : token; + if (pos < 0 || pos >= cos_max_pos) __trap(); + + __nv_bfloat16 result; + if (d < half) { + float lo = __bfloat162float(smem[d]); + float hi = __bfloat162float(smem[d + half]); + float c = __bfloat162float(cos_cache[pos * head_dim + d]); + float s = __bfloat162float(sin_cache[pos * head_dim + d]); + float lo_cos = __bfloat162float(__float2bfloat16(lo * c)); + float hi_sin = __bfloat162float(__float2bfloat16(hi * s)); + result = __float2bfloat16(lo_cos - hi_sin); + } else { + int pair_d = d - half; + float lo = __bfloat162float(smem[pair_d]); + float hi = __bfloat162float(smem[d]); + float c = __bfloat162float(cos_cache[pos * head_dim + pair_d]); + float s = __bfloat162float(sin_cache[pos * head_dim + pair_d]); + float lo_sin = __bfloat162float(__float2bfloat16(lo * s)); + float hi_cos = __bfloat162float(__float2bfloat16(hi * c)); + result = __float2bfloat16(lo_sin + hi_cos); + } + + k[offset] = result; +} + +void k_norm_rope_batched_decode_cuda( + __nv_bfloat16* k, // [kv_dim * batch_size] in-place + const __nv_bfloat16* k_norm_weight, + const __nv_bfloat16* cos_cache, + const __nv_bfloat16* sin_cache, + const int* positions, // [batch_size] per-request positions on GPU + int num_kv_heads, + int head_dim, + int batch_size, + float rms_eps, + int cos_max_pos, + cudaStream_t stream +) { + int kv_dim = num_kv_heads * head_dim; + dim3 grid(num_kv_heads, batch_size); + k_norm_rope_kernel<<<grid, head_dim, 0, stream>>>( + k, k_norm_weight, cos_cache, sin_cache, + num_kv_heads, head_dim, + /*seq_len=*/batch_size, kv_dim, + /*start_pos_d=*/positions, + rms_eps, cos_max_pos + ); +} + } // extern "C" diff --git a/openinfer-kernels/src/ffi/shared.rs b/openinfer-kernels/src/ffi/shared.rs index cfbfb815..66f118df 100644 --- a/openinfer-kernels/src/ffi/shared.rs +++ b/openinfer-kernels/src/ffi/shared.rs @@ -246,6 +246,25 @@ unsafe extern "C" { stream: CUstream, ); + /// K-only norm + RoPE for the DFlash batch context-K path.
Same per-head + /// RMSNorm + RoPE as `qk_norm_rope_batched_decode_cuda` but launches only + /// `num_kv_heads` blocks per token — the draft path has no context Q, so + /// the joint kernel wastes the Q work. See `k_norm_rope_batched_decode_cuda` + /// in `csrc/shared/prefill_attention.cu`. + pub fn k_norm_rope_batched_decode_cuda( + k: *mut Half, + k_norm_weight: *const Half, + cos_cache: *const Half, + sin_cache: *const Half, + positions: *const i32, + num_kv_heads: i32, + head_dim: i32, + batch_size: i32, + rms_eps: f32, + cos_max_pos: i32, + stream: CUstream, + ); + // Scatter contiguous KV → paged layout (one layer, FlashInfer prefill append). pub fn paged_kv_scatter_cuda( kv_data: *const Half, diff --git a/openinfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs index 111f6019..135afd42 100644 --- a/openinfer-kernels/src/ops.rs +++ b/openinfer-kernels/src/ops.rs @@ -14,9 +14,10 @@ mod norm; mod sampling; pub use attention::{ - PrefillPagedPlan, paged_attention_batch_decode_hd256_into, paged_attention_batch_decode_into, - paged_attention_batch_decode_split_kv_into, prefill_attention_paged_into, - qk_norm_partial_rope_batched_decode_hd256_into, qk_norm_rope_batch_decode_into, + PrefillPagedPlan, k_norm_rope_batch_decode_into, paged_attention_batch_decode_hd256_into, + paged_attention_batch_decode_into, paged_attention_batch_decode_split_kv_into, + prefill_attention_paged_into, qk_norm_partial_rope_batched_decode_hd256_into, + qk_norm_rope_batch_decode_into, }; #[cfg(feature = "kimi-k2")] pub use deepep::{ diff --git a/openinfer-kernels/src/ops/attention.rs b/openinfer-kernels/src/ops/attention.rs index e3ae955a..3351e8d1 100644 --- a/openinfer-kernels/src/ops/attention.rs +++ b/openinfer-kernels/src/ops/attention.rs @@ -497,6 +497,50 @@ pub fn qk_norm_rope_batch_decode_into( } } +/// K-only norm + RoPE for the DFlash batch context-K path. 
+/// +/// Applies in-place RMSNorm + RoPE to `k` only — the draft path's context K +/// projection has no corresponding Q, so the joint `qk_norm_rope` kernel would +/// waste `num_q_heads / (num_q_heads + num_kv_heads)` of its work on a Q buffer +/// whose result is discarded (80% for Qwen3-4B's 16:4 GQA). This variant +/// launches only `num_kv_heads` blocks per token. +#[allow(clippy::too_many_arguments)] +pub fn k_norm_rope_batch_decode_into( + ctx: &DeviceContext, + k: &mut HiddenStates, + k_norm_weight: &DeviceVec, + cos_cache: &DeviceVec, + sin_cache: &DeviceVec, + positions_d: &CudaSlice<i32>, + num_kv_heads: usize, + head_dim: usize, + rms_eps: f32, +) { + let batch_size = k.seq_len; + + let (k_ptr, _gk) = k.data.device_ptr_mut(&ctx.stream); + let (kn_ptr, _gkn) = k_norm_weight.data.device_ptr(&ctx.stream); + let (cos_ptr, _gc) = cos_cache.data.device_ptr(&ctx.stream); + let (sin_ptr, _gs) = sin_cache.data.device_ptr(&ctx.stream); + let (pos_ptr, _gp) = positions_d.device_ptr(&ctx.stream); + + unsafe { + ffi::k_norm_rope_batched_decode_cuda( + k_ptr as *mut ffi::Half, + kn_ptr as *const ffi::Half, + cos_ptr as *const ffi::Half, + sin_ptr as *const ffi::Half, + pos_ptr as *const i32, + num_kv_heads as i32, + head_dim as i32, + batch_size as i32, + rms_eps, + (cos_cache.data.len() / head_dim) as i32, + ctx.stream.cu_stream(), + ); + } +} + /// Batched QK RMSNorm + partial RoPE for Qwen3.5 HD256 decode. /// /// Reads Q from interleaved `q_full` ([q, gate] per head), writes prepared Q into `q`, diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs index 3150ad6c..3ec0d96f 100644 --- a/openinfer-qwen3-4b-dflash/src/batch_forward.rs +++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs @@ -248,16 +248,17 @@ impl DFlashDraftModel { &bufs.target_normed, &mut bufs.v_ctx, )?; - ops::qk_norm_rope_batch_decode_into( + // Context-K needs norm + RoPE but has no corresponding Q.
The K-only + // kernel launches num_kv_heads blocks per token instead of + // num_q_heads + num_kv_heads, dropping 80% of the joint kernel's work + // (the dead Q branch) for Qwen3-4B's 16:4 GQA ratio. + ops::k_norm_rope_batch_decode_into( ctx, - &mut bufs.q_ctx_scratch, &mut bufs.k_ctx, - &layer.attention.q_norm, &layer.attention.k_norm, &self.cos_cache, &self.sin_cache, &bufs.positions_ctx, - config.num_attention_heads, config.num_key_value_heads, config.head_dim, config.rms_norm_eps, From 3d98d55b1dfe5bbd40b90bbcb8215dfb7dc6b7d4 Mon Sep 17 00:00:00 2001 From: kitty Date: Tue, 23 Jun 2026 12:26:41 +0800 Subject: [PATCH 6/6] fix(qwen3-dflash): forward ctx-len/q-len to the forward bench binary The Python forward bench script generated its fixture with the caller's --ctx-len/--q-len but launched the Rust runner without forwarding them, so the runner kept its defaults (2/16) and rejected the fixture shape for any non-default dimension. Pass --ctx-len/--q-len through to the runner, and make the Rust fixture path derive ctx_len/q_len from the fixture's actual tensor shapes so the two sides agree regardless of which flags the caller repeats. 
--- .../src/bin/qwen3_dflash_forward_bench.rs | 23 ++++++++++++------- .../accuracy/bench_qwen3_4b_dflash_forward.py | 4 ++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs index cb5bd0c9..dac9f374 100644 --- a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs +++ b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs @@ -18,18 +18,25 @@ fn main() -> Result<()> { let bytes = std::fs::read(fixture) .with_context(|| format!("failed to read fixture {}", fixture.display()))?; let st = SafeTensors::deserialize(&bytes).context("parse fixture")?; - let noise = read_bf16(&st, "noise_embedding", &[1, args.q_len, config.hidden_size])?; + // Derive ctx_len/q_len from the fixture's actual tensor shapes so the + // bench works for any --ctx-len/--q-len the Python side used, rather + // than requiring the caller to repeat them on both sides. 
+ let noise_view = st + .tensor("noise_embedding") + .with_context(|| "missing tensor noise_embedding")?; + let q_len = noise_view.shape()[1]; + let target_view = st + .tensor("target_hidden") + .with_context(|| "missing tensor target_hidden")?; + let ctx_len = target_view.shape()[1]; + let noise = read_bf16(&st, "noise_embedding", &[1, q_len, config.hidden_size])?; let target_hidden = read_bf16( &st, "target_hidden", - &[ - 1, - args.ctx_len, - config.hidden_size * config.target_layer_count(), - ], + &[1, ctx_len, config.hidden_size * config.target_layer_count()], )?; - let positions = read_i32(&st, "position_ids", &[1, args.ctx_len + args.q_len])?; - (noise, target_hidden, positions, args.ctx_len, args.q_len) + let positions = read_i32(&st, "position_ids", &[1, ctx_len + q_len])?; + (noise, target_hidden, positions, ctx_len, q_len) } else { let noise = deterministic_bf16(args.q_len * config.hidden_size, 0xD4A5_4B16); let target_hidden = deterministic_bf16( diff --git a/tools/accuracy/bench_qwen3_4b_dflash_forward.py b/tools/accuracy/bench_qwen3_4b_dflash_forward.py index fb232e4c..34fe05b2 100644 --- a/tools/accuracy/bench_qwen3_4b_dflash_forward.py +++ b/tools/accuracy/bench_qwen3_4b_dflash_forward.py @@ -129,6 +129,10 @@ def main() -> int: str(fixture_path), "--device", str(args.device), + "--ctx-len", + str(args.ctx_len), + "--q-len", + str(args.q_len), "--warmup", str(args.warmup), "--iters",