From 8b5dc841b3c3c4eb2988640fd8d78ef8c04e2266 Mon Sep 17 00:00:00 2001
From: kitty <kitty.eu.org@gmail.com>
Date: Thu, 18 Jun 2026 19:30:53 +0800
Subject: [PATCH 1/6] feat(qwen3-dflash): add draft-only batch scheduler

---
 Cargo.lock                                    |  17 +
 Cargo.toml                                    |   2 +
 docs/index.md                                 |   1 +
 docs/models/qwen3/dflash.md                   | 434 +++++++++
 openinfer-core/src/ops.rs                     |   3 +-
 .../csrc/shared/paged_attention.cu            | 152 +++
 openinfer-kernels/src/ffi/shared.rs           |  36 +
 openinfer-kernels/src/ops.rs                  |   5 +
 openinfer-kernels/src/ops/dense_attention.rs  | 204 ++++
 openinfer-qwen3-4b-dflash/Cargo.toml          |  33 +
 .../src/batch_buffers.rs                      | 145 +++
 .../src/batch_forward.rs                      | 407 ++++++++
 .../src/bin/qwen3_dflash_batch_bench.rs       | 227 +++++
 .../src/bin/qwen3_dflash_forward_bench.rs     | 301 ++++++
 .../src/bin/qwen3_dflash_forward_fixture.rs   | 155 +++
 openinfer-qwen3-4b-dflash/src/config.rs       | 143 +++
 openinfer-qwen3-4b-dflash/src/executor.rs     | 640 +++++++++++++
 openinfer-qwen3-4b-dflash/src/forward.rs      | 886 ++++++++++++++++++
 openinfer-qwen3-4b-dflash/src/lib.rs          |  19 +
 openinfer-qwen3-4b-dflash/src/scheduler.rs    | 422 +++++++++
 openinfer-qwen3-4b-dflash/src/weights.rs      | 274 ++++++
 .../tests/hf_golden_gate.rs                   | 701 ++++++++++++++
 .../qwen3-4b-dflash-hf-golden.safetensors     | Bin 0 -> 82540 bytes
 .../accuracy/bench_qwen3_4b_dflash_forward.py | 169 ++++
 ...pare_qwen3_4b_dflash_drafter_generation.py | 466 +++++++++
 .../dump_qwen3_4b_dflash_hf_golden.py         |  98 ++
 26 files changed, 5939 insertions(+), 1 deletion(-)
 create mode 100644 docs/models/qwen3/dflash.md
 create mode 100644 openinfer-kernels/src/ops/dense_attention.rs
 create mode 100644 openinfer-qwen3-4b-dflash/Cargo.toml
 create mode 100644 openinfer-qwen3-4b-dflash/src/batch_buffers.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/batch_forward.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_batch_bench.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/config.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/executor.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/forward.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/lib.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/scheduler.rs
 create mode 100644 openinfer-qwen3-4b-dflash/src/weights.rs
 create mode 100644 openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
 create mode 100644 test_data/qwen3-4b-dflash-hf-golden.safetensors
 create mode 100644 tools/accuracy/bench_qwen3_4b_dflash_forward.py
 create mode 100644 tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py
 create mode 100644 tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py

diff --git a/Cargo.lock b/Cargo.lock
index 6a41fdaf..d1433b01 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3893,6 +3893,23 @@ dependencies = [
  "vllm-text",
 ]
 
+[[package]]
+name = "openinfer-qwen3-4b-dflash"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "crossbeam-channel",
+ "cudarc",
+ "half",
+ "log",
+ "memmap2",
+ "openinfer-core",
+ "openinfer-kernels",
+ "safetensors",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "openinfer-qwen35-4b"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 33876140..31d52f0e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ members = [
     "openinfer-deepseek-v2-lite",
     "openinfer-kimi-k2",
     "openinfer-qwen3-4b",
+    "openinfer-qwen3-4b-dflash",
     "openinfer-qwen35-4b",
     "openinfer-kv-cache",
     "openinfer-kv-offload",
@@ -128,6 +129,7 @@ openinfer-engine = { path = "openinfer-engine" }
 openinfer-kernels = { path = "openinfer-kernels" }
 openinfer-kimi-k2 = { path = "openinfer-kimi-k2" }
 openinfer-qwen3-4b = { path = "openinfer-qwen3-4b" }
+openinfer-qwen3-4b-dflash = { path = "openinfer-qwen3-4b-dflash" }
 openinfer-qwen35-4b = { path = "openinfer-qwen35-4b" }
 openinfer-deepseek-v2-lite = { path = "openinfer-deepseek-v2-lite" }
 openinfer-vllm-frontend = { path = "openinfer-vllm-frontend" }
diff --git a/docs/index.md b/docs/index.md
index a013b4fb..0c92240c 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -27,6 +27,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l
 | `models/qwen3/serving-perf-5090.md` | Qwen3-4B vs vLLM 0.22.1 tuning record: beats vLLM at **every measured QPS point** after unified-step attention fusion (decode rows as qo_len=1 prefill-plan entries + cta_tile_q dispatch fix). Also: batched step tail (#345), chunked prefill (default 1024), **cuBLAS 12.9 N=1025 cliff (build with CUDA ≥ 13)**, cublasLt per-shape tuning (buckets 8/16 restored), split-KV ≤bs32, two-stage argmax. |
 | `models/qwen3/roadmap.md` | Qwen3-4B roadmap (2026-06 review): line is the maturity bar; #220 RoPE OOB now fixed (sized cache + admission guard + kernel trap, gated by reject + in-window ITs); open set is per-row batch sampling, zero TP coverage, zero-adapter-only LoRA gate, dropped prefix-cache observability, stale docs, YaRN #8 follow-up. Sequenced Now/Next/Later + cleanup ledger. |
 | `models/qwen3/model-crate.md` | `openinfer-qwen3-4b` owns Qwen3 config/weights/executor/scheduler/tests/kernel plan; root sees generic `EngineHandle`; split-K retuned to `256/64`, with 4k/64 serving TPOT p50 at `6.46ms` on RTX 5090. |
+| `models/qwen3/dflash.md` | `openinfer-qwen3-4b-dflash` supports only `z-lab/Qwen3-4B-DFlash-b16`: standalone model config/weights/forward, a draft-only exact-shape batch executor/scheduler, and transformers remote-code parity, with no generic DFlash framework or Qwen3 server/controller changes in this task. |
 | `models/qwen3/prefix-cache.md` | Prefix caching on by default for Qwen3-4B: full-block kvbm radix matching at the executor, suffix-only prefill. Repeated ~1900-token prompt TTFT 141.8 → 16.3ms p50 (8.7×); warm TTFT ≈ TPOT + ~5ms setup. Includes the RoPE scalar-path corruption fix and the drain-the-stream TTFT measurement pitfall. |
 | `models/qwen3/accuracy-gate.md` | Qwen3-4B instance of the logits golden gate (`tests/hf_golden_gate.rs`): 48 teacher-forced sequences / 816 positions vs a stored HF bf16 golden, replayed over bs=1 / batched eager / CUDA-graph. Strict guards: regret check + mean ≤ 0.06 + p99 ≤ 0.20; absolute max printed but not asserted (coverage-unstable). Methodology in `subsystems/correctness/`. |
 | `models/qwen3/kernels-crate.md` | Phase 1 split implemented and 5090-verified: Qwen3-4B kernel surface lives in `openinfer-kernels`; release build, test-target compile, accuracy gate, and bench snapshot pass. |
diff --git a/docs/models/qwen3/dflash.md b/docs/models/qwen3/dflash.md
new file mode 100644
index 00000000..493f5c21
--- /dev/null
+++ b/docs/models/qwen3/dflash.md
@@ -0,0 +1,434 @@
+# Qwen3-4B-DFlash model
+
+**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope.
+
+Last touched: 2026-06
+
+## Boundary
+
+This task is model-specific. The boundary is:
+
+| Crate | Owns |
+| --- | --- |
+| `openinfer-qwen3-4b-dflash` | `Qwen3-4B-DFlash-b16` config, weights, draft forward, draft-only batch executor/scheduler, model-specific kernels/wrappers, and transformers parity tests |
+| `openinfer-qwen3-4b` | Unchanged existing Qwen3 target serving, scheduler, KV, LoRA/offload/TP policy, and HF logits gate |
+
+Out of scope for this task: generic speculative decoding, a generic DFlash abstraction, OpenAI/server flags, LoRA/TP/KV-offload interactions, target verification, acceptance-length calculation, fallback token selection, and target hidden extraction from Qwen3.
+
+## Reference Model
+
+The authoritative reference is the Hugging Face repo `z-lab/Qwen3-4B-DFlash-b16`, not an inferred architecture from the target Qwen3 crate. The model card uses:
+
+```python
+transformers==4.57.3
+AutoModel.from_pretrained(..., trust_remote_code=True)
+draft.spec_generate(target, input_ids, ...)
+```
+
+The local checkpoint at `/home/hezhaozhao/models/Qwen3-4B-DFlash-b16` contains the same remote-code shape:
+
+| Field | Value |
+| --- | --- |
+| `architectures` | `DFlashDraftModel` |
+| draft layers | `5` |
+| target layers | `36` |
+| hidden size | `2560` |
+| intermediate size | `9728` |
+| attention heads / KV heads | `32 / 8` |
+| head dim | `128` |
+| block size | `16` |
+| mask token | `151669` |
+| target hidden layers | `[1, 9, 17, 25, 33]` |
+| vocab size | `151936` |
+
+Checkpoint keys are unprefixed relative to a target `model.` namespace: `layers.*`, `fc.weight`, `hidden_norm.weight`, and `norm.weight`. `fc.weight` is `[2560, 12800]`, i.e. one hidden-sized projection from five concatenated target hidden states.
+
+## Draft Forward
+
+The draft forward is not target Qwen3 attention with a different checkpoint. Its attention is dense and non-causal:
+
+1. `target_hidden = hidden_norm(fc(concat(selected target hidden states)))`
+2. `hidden_states = noise_embedding`
+3. for each of the five draft layers:
+   - RMSNorm `hidden_states`
+   - Q comes from normalized noise hidden
+   - K/V come from `cat(target_hidden, hidden_states)`
+   - Q/K get Qwen3 head RMSNorm and RoPE
+   - attention is non-causal over the whole `target_hidden + noise_hidden` span
+   - residual add
+   - post-attention RMSNorm + Qwen3 MLP + residual add
+4. final `norm(hidden_states)`
+
+The crate should expose draft-model primitives, not speculative serving:
+
+```rust
+pub struct DFlashDraftModel { ... }
+
+impl DFlashDraftModel {
+    pub fn load(model_path: &Path, device_ordinal: usize) -> anyhow::Result<Self>;
+    pub fn config(&self) -> &DFlashConfig;
+    pub fn target_layer_ids(&self) -> &[usize];
+    pub fn forward(
+        &self,
+        noise_embedding: &HiddenStates,
+        selected_target_hidden: &DFlashTargetHidden,
+        position_ids: &[i32],
+    ) -> anyhow::Result<HiddenStates>;
+}
+```
+
+The first version takes already-selected target hidden states as input and returns the final draft hidden states. Extracting those hidden states from `openinfer-qwen3-4b`, target verification, acceptance length calculation, and KV cropping are not part of this model implementation.
+
+## Draft-Only Batch Runner
+
+The batch path is intentionally internal. It is not an OpenAI-compatible text
+generation surface because the DFlash draft model does not consume prompt token
+ids and does not own a language-model head. Callers must provide device
+`HiddenStates` for:
+
+| Input | Shape |
+| --- | --- |
+| `noise_embedding` | `[q_len, hidden_size]` |
+| `target_hidden` | `[ctx_len, target_layer_count * hidden_size]` |
+| `position_ids` | `ctx_len + q_len` host positions |
+
+The runner groups only exact-shape requests. The batch key is
+`(q_len, ctx_len, past_len, cache_mode)`. `NoCache` requests use the real
+batched path: compact D2D input staging, batched FC/context projection, batched
+per-layer Q/K/V and MLP GEMMs, and FlashInfer
+`BatchPrefillWithRaggedKVCache` in non-causal mode for attention. `DraftCache`
+requests keep the same `DFlashDraftCache` lifecycle and are executed serially
+inside the GPU owner thread for now; cross-request draft-cache batching
+needs a compact past-K/V layout and should be added with the target
+verification loop.
+
+The public Rust surface is crate-local serving infrastructure, not server API:
+
+```rust
+pub struct DFlashDraftHostRequest { ... }
+pub struct DFlashDraftHostResponse { ... }
+pub struct DFlashExecutor { ... }
+pub struct DFlashSchedulerHandle { ... }
+```
+
+`DFlashSchedulerHandle` is a single-thread GPU owner with FCFS exact-shape
+batching, a small `max_wait` coalescing window, and `max_total_tokens`
+admission over `(ctx_len + q_len + past_len)` for each candidate batch. Its
+public `submit` boundary uses host bf16 buffers and returns host bf16 output so
+CUDA device tensors do not cross thread/context ownership boundaries. It also
+owns per-request draft cache state through `reset_cache`, `crop_cache`, and
+`cache_seq_len`; these calls return an error for unknown request ids instead
+of silently treating them as empty state. `NoCache` requests use the real
+batched path, while host `DraftCache` requests run serially until compact
+past-K/V batching lands. The executor also exposes a borrowed compact batch
+view for same-thread controller experiments.
+
+## Draft Cache
+
+Do not maintain separate public cache concepts for this crate. The reference
+Python uses one `past_key_values_draft = DynamicCache()` in `spec_generate`,
+then calls the drafter with:
+
+```python
+position_ids=position_ids[:, past_key_values_draft.get_seq_length(): start + block_size]
+past_key_values=past_key_values_draft
+use_cache=True
+past_key_values_draft.crop(start)
+```
+
+OpenInfer mirrors that boundary with one `DFlashDraftCache`:
+
+| State | Meaning |
+| --- | --- |
+| `prepare_step_context(...)` | Projects the current selected target hidden states and prepares per-layer context `K/V`; this replaces the old standalone `prepare_context_cache(...)` wording. |
+| `forward_with_draft_cache(...)` | Runs one draft block, appends step context `K/V` and noise-token `K/V` to each layer's draft past state, and advances `seq_len`. |
+| `crop(seq_len)` / `reset()` | Matches the reference `DynamicCache.crop(start)` lifecycle after target verification decides how far the draft state remains valid. |
+
+The first-step cached path is numerically identical to the standalone HF
+remote-code forward because there is no existing past yet. Cross-step cached
+parity must be validated only after the target verification/controller is added;
+without the target loop, a second cached draft step is not the same numerical
+problem as the old no-draft-cache substitution probe.
+
+## Correctness Gate
+
+The accuracy bar is transformers parity. For the draft crate that means:
+
+| Gate | Purpose |
+| --- | --- |
+| config/loader shape test | Reject wrong checkpoint layout early: `target_layer_ids`, `block_size`, `mask_token_id`, `fc.weight`, layer count, and attention/MLP shapes |
+| draft-forward smoke | Load `/home/hezhaozhao/models/Qwen3-4B-DFlash-b16`, run a tiny GPU block with synthetic `noise_embedding`, selected target hidden states, and position ids, and catch shape/kernel failures |
+| transformers forward parity | Compare the standalone draft forward against the HF remote-code model for fixed synthetic `noise_embedding`, selected target hidden states, and position ids |
+| batch-vs-single parity | Compare two exact-shape batched rows against the bs1 forward output under the same DFlash tolerance |
+| executor smoke | Submit request-tagged exact-shape `NoCache` requests and assert output shape/request ids |
+| scheduler cache smoke | Submit host `DraftCache` request, then assert scheduler-owned `cache_seq_len`, `crop_cache`, and `reset_cache` behavior; also checks control messages preserve FIFO ordering behind pending submits |
+| drafter generation parity | Run a greedy bs1 transformers target loop twice, once with the HF drafter and once with the OpenInfer drafter, then compare generated token ids/text and acceptance lengths |
+
+Do not use `Qwen3-4B-Instruct-2507` as a correctness baseline for this model. The DFlash checkpoint is documented for the `Qwen/Qwen3-4B` target, but this task's gate is the DFlash draft model's own transformers forward, not target acceptance rate.
+
+## Kernel Notes
+
+Existing Qwen3 target attention is causal/paged and does not match `Qwen3-4B-DFlash-b16` draft attention. The draft kernel path should follow vLLM/FlashAttention semantics where possible: Q/K/V in head-major logical shape, GQA expansion by `q_head / (num_q_heads / num_kv_heads)`, RoPE on Q and K, softmax over all context+draft keys, and no causal mask.
+
+The reference implementation to mirror is vLLM's attention stack, especially `vllm.v1.attention.backends.flash_attn.FlashAttentionBackend` and `vllm.v1.attention.backends.flashinfer.FlashInferBackend`: both explicitly advertise non-causal support via `supports_non_causal()`, and their prefill/decode planners expose the causal flag and varlen context shape that DFlash needs.
+
+The batch runner uses FlashInfer `BatchPrefillWithRaggedKVCache` with
+`MaskMode::kNone` for compact non-causal attention. That keeps the DFlash batch
+path close to vLLM's varlen/non-causal attention semantics instead of looping
+over single-request prefill.
+
+## Accuracy Scripts
+
+The DFlash scripts intentionally mirror the rest of the repository:
+
+| Script | Output | Use |
+| --- | --- | --- |
+| `tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py` | `test_data/qwen3-4b-dflash-hf-golden.safetensors` | Offline transformers remote-code forward oracle for the Rust gate |
+| `openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs` | test pass/fail plus delta distribution | Release Rust gate that replays the stored oracle without Python |
+| `tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py` | `target/accuracy/qwen3-dflash/drafter-generation.json` | End-to-end drafter-substitution evidence: same transformers target loop, HF drafter vs OpenInfer drafter |
+| `tools/accuracy/bench_qwen3_4b_dflash_forward.py` + `qwen3_dflash_forward_bench` | `target/benchmarks/qwen3-dflash/forward.json` | Standalone forward latency comparison: transformers remote-code vs OpenInfer forward on the same synthetic fixture |
+| `qwen3_dflash_batch_bench` | stdout JSON / redirected benchmark artifact | Draft-only batch sweep over bs `1,2,4,8,16,32`, reporting req/s, draft tok/s, and latency percentiles |
+| `openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs` | safetensors with `openinfer_output` | Bridge used by the generation comparison script to call the Rust drafter from Python |
+
+The forward golden is generated by:
+
+```bash
+.venv/bin/python tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py \
+  --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+  --out test_data/qwen3-4b-dflash-hf-golden.safetensors
+```
+
+The Rust gate is:
+
+```bash
+OPENINFER_DFLASH_TEST_MODEL_PATH=/home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+cargo test --release -p openinfer-qwen3-4b-dflash --test hf_golden_gate -- --nocapture
+```
+
+The DFlash gate intentionally uses `OPENINFER_DFLASH_TEST_MODEL_PATH` rather
+than the generic `OPENINFER_TEST_MODEL_PATH`, because the latter usually points
+at the normal Qwen3 target checkpoint. The test also checks that
+`config.json.architectures` contains `DFlashDraftModel` before running.
+
+The batch throughput probe is:
+
+```bash
+cargo run --release -p openinfer-qwen3-4b-dflash --bin qwen3_dflash_batch_bench -- \
+  --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+  --ctx-len 2 \
+  --q-len 16 \
+  --batch-sizes 1,2,4,8,16,32 \
+  --warmup 5 \
+  --iters 30
+```
+
+Observed local batch runner sweep on the RTX 5070 Ti WSL/CUDA `sm_120` host,
+`ctx_len=2`, `q_len=16`, warmup `5`, iters `30`:
+
+| Batch | mean ms | p50 ms | p90 ms | p99 ms | draft tok/s | req/s |
+| ---: | ---: | ---: | ---: | ---: | ---: | ---: |
+| 1 | 2.175 | 2.175 | 2.230 | 2.308 | 7,358 | 460 |
+| 2 | 2.488 | 2.446 | 2.607 | 2.947 | 12,859 | 804 |
+| 4 | 3.790 | 3.794 | 3.928 | 4.014 | 16,886 | 1,055 |
+| 8 | 4.651 | 4.571 | 5.184 | 5.419 | 27,518 | 1,720 |
+| 16 | 7.260 | 7.223 | 7.582 | 8.302 | 35,264 | 2,204 |
+| 32 | 13.221 | 13.080 | 14.237 | 15.073 | 38,725 | 2,420 |
+
+The current batch path improves draft-token throughput by `5.3x` from bs1 to
+bs32 after moving the ragged attention plan into reusable batch buffers. This is
+draft-model throughput only; it does not include target hidden production,
+verification, acceptance, or fallback-token work.
+
+On the local WSL setup used for the first run, the workspace-level vLLM git dependency and empty FlashInfer submodule required a narrower temporary workspace plus:
+
+```bash
+LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib \
+OPENINFER_FLASHINFER_INCLUDE=/home/hezhaozhao/openinfer/.venv/lib/python3.12/site-packages/flashinfer/data/include \
+cargo test --release -p openinfer-qwen3-4b-dflash --test hf_golden_gate -- --nocapture
+```
+
+Observed result after the unified cache change:
+
+```text
+dflash HF golden deltas: mean=0.034243, p99=0.125000, max=0.500000, n=7680
+dflash unified-cache one-shot HF golden deltas: mean=0.034243, p99=0.125000, max=0.500000, n=7680
+dflash draft-cache HF golden deltas: mean=0.034243, p99=0.125000, max=0.500000, n=7680
+test dflash_forward_matches_hf_remote_code ... ok
+```
+
+The drafter-substitution generation probe is:
+
+```bash
+cargo build --release -p openinfer-qwen3-4b-dflash --bin qwen3_dflash_forward_fixture
+
+.venv/bin/python tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py \
+  --target-model-path /path/to/Qwen3-4B \
+  --draft-model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+  --openinfer-bin target/release/qwen3_dflash_forward_fixture \
+  --out target/accuracy/qwen3-dflash/drafter-generation.json
+```
+
+The JSON report records each prompt's generated token ids/text, token/text hashes,
+first mismatch if any, acceptance lengths, and optional OpenInfer-vs-HF draft
+hidden deltas. It exits non-zero unless every case is `all_token_text_exact`.
+This is the DFlash analogue of the DeepSeek-V2-Lite same-host generation
+comparison, but scoped to the current standalone drafter boundary.
+
+For performance, use the same synthetic fixture on both sides:
+
+```bash
+cargo build --release -p openinfer-qwen3-4b-dflash --bin qwen3_dflash_forward_bench
+
+.venv/bin/python tools/accuracy/bench_qwen3_4b_dflash_forward.py \
+  --draft-model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+  --openinfer-bin target/release/qwen3_dflash_forward_bench \
+  --out target/benchmarks/qwen3-dflash/forward.json
+```
+
+The benchmark report includes transformers latency stats and OpenInfer latency
+stats for the same bf16 fixture. It is a standalone draft-forward measurement,
+not a full speculative-decoding throughput claim.
+
+Observed local benchmark on RTX 5070 Ti, WSL, CUDA `sm_120`, `ctx_len=2`,
+`q_len=16`, warmup `5`, iters `30`, same generated bf16 fixture:
+
+| Engine | mean ms | p50 ms | p90 ms | p99 ms |
+| --- | ---: | ---: | ---: | ---: |
+| transformers remote-code | 4.294 | 3.612 | 5.067 | 15.360 |
+| OpenInfer DFlash | 2.285 | 2.195 | 2.659 | 2.895 |
+
+OpenInfer is `1.65x` faster at p50 and `1.88x` faster by mean for this
+standalone forward shape. The transformers p99 includes a single 15.36 ms tail
+in this short run, so p99 should not be over-interpreted without a longer sweep.
+The measured artifact is `target/benchmarks/qwen3-dflash/forward.json`.
+
+First optimization pass: `DFlashForwardScratch` reuses the forward buffer set
+across repeated calls. The HF forward gate stayed identical:
+`mean=0.034243`, `p99=0.125000`, `max=0.500000`, `n=7680`. The same forward
+benchmark wrote `target/benchmarks/qwen3-dflash/forward-final.json`:
+
+| OpenInfer path | mean ms | p50 ms | p90 ms | p99 ms |
+| --- | ---: | ---: | ---: | ---: |
+| allocate buffers per forward | 2.285 | 2.195 | 2.659 | 2.895 |
+| reuse `DFlashForwardScratch` | 2.125 | 2.035 | 2.410 | 2.936 |
+
+This pass improved OpenInfer p50 by `1.08x`. It is a necessary cleanup for the
+future decode loop, but not enough by itself to prove DFlash value.
+
+A follow-up attempt to move the cloned input hidden state into reusable scratch
+was not kept: the current fused residual+RMSNorm op mutates the residual hidden
+state in place, so separating input/output ping-pong buffers correctly requires
+reworking that layer boundary rather than a local buffer-only patch.
+
+Second optimization pass: `DFlashForwardScratch` gained an explicit draft-side
+target-hidden context K/V cache. `prepare_context_cache(...)` computes
+`target_normed` plus each layer's context `K/V` and K norm+RoPE once; repeated
+`forward_with_context_cache(...)` calls then only compute the noise-token K/V and
+concat cached context with the current draft block. The HF gate now checks both
+uncached and cached paths, and both stayed identical:
+`mean=0.034243`, `p99=0.125000`, `max=0.500000`, `n=7680`.
+
+Cached benchmark artifact: `target/benchmarks/qwen3-dflash/forward-context-cache.json`.
+The reported latency excludes the one-time `prepare_context_cache(...)` call,
+matching the intended loop shape where context cache is updated explicitly when
+target hidden changes.
+
+| OpenInfer path | mean ms | p50 ms | p90 ms | p99 ms |
+| --- | ---: | ---: | ---: | ---: |
+| allocate buffers per forward | 2.285 | 2.195 | 2.659 | 2.895 |
+| reuse `DFlashForwardScratch` | 2.125 | 2.035 | 2.410 | 2.936 |
+| reuse scratch + context K/V cache | 1.863 | 1.831 | 2.001 | 2.301 |
+
+The context cache improves p50 by `1.11x` over scratch-only and `1.20x` over the
+initial implementation for this small `ctx_len=2`, `q_len=16` fixture.
+
+Third pass: the public cache shape was unified as `DFlashDraftCache`. The old
+"context cache" is now just the step-context part of the same object, and the
+cache also owns per-layer draft past K/V buffers plus `seq_len`, `crop`, and
+`reset` state. The HF gate checks uncached, unified-cache one-shot, and first-step
+draft-cache paths; all three retain the same delta distribution:
+`mean=0.034243`, `p99=0.125000`, `max=0.500000`, `n=7680`.
+
+The cache internals now follow the `openinfer-kv-cache` separation more closely
+without directly adopting its paged block manager: `DFlashDraftState` owns the
+long-lived draft past K/V and sequence length, `DFlashStepContext` owns the
+current target-hidden context K/V, and `ForwardBuffers` remains transient
+scratch. The public object is still a single `DFlashDraftCache`, but a prepared
+step is consumed by `forward_with_draft_cache(...)`; callers must prepare the
+next step explicitly after `crop(start)`, mirroring the reference `DynamicCache`
+lifecycle.
+
+The corresponding benchmark artifact is
+`target/benchmarks/qwen3-dflash/forward-draft-cache.json`. This benchmark uses
+the more honest `prepare_step_context + forward_with_draft_cache` timing inside
+each measured iteration, so it should not be compared directly against the
+previous context-cache number that excluded prepare time:
+
+| Engine/path | mean ms | p50 ms | p90 ms | p99 ms |
+| --- | ---: | ---: | ---: | ---: |
+| transformers remote-code | 5.564 | 4.429 | 9.078 | 18.713 |
+| OpenInfer `DFlashDraftCache` first-step path | 2.311 | 2.209 | 2.479 | 3.519 |
+
+After the internal state/step/scratch refactor, the same benchmark wrote
+`target/benchmarks/qwen3-dflash/forward-draft-cache-refactor.json` with no
+accuracy change and no performance regression:
+
+| Engine/path | mean ms | p50 ms | p90 ms | p99 ms |
+| --- | ---: | ---: | ---: | ---: |
+| transformers remote-code | 4.242 | 3.861 | 5.616 | 6.922 |
+| OpenInfer `DFlashDraftCache` refactor path | 2.228 | 2.155 | 2.454 | 2.541 |
+
+## Current Implementation
+
+The crate now exists as a standalone model implementation with config parsing, exact-key safetensors loading, a block draft forward, unified draft cache state, a tiny local GPU smoke test, and an HF remote-code golden gate. The bs1 attention path uses the existing Qwen3 Q/K RMSNorm+RoPE kernel and a FlashInfer single-prefill wrapper with `MaskMode::kNone`; context K currently reuses the Q/K kernel with a throwaway Q scratch buffer, so a future cleanup can split a K-only norm+RoPE helper without changing semantics.
+
+The local `.venv` uses `torch==2.9.0+cu129`, `transformers==4.57.3`, `safetensors`, `accelerate`, and `datasets` because the HF remote code imports `datasets` via `utils.py`. The generated fixture stores seed-pinned synthetic `noise_embedding`, selected `target_hidden`, `position_ids`, and HF final `output`; `openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs` replays those tensors through the Rust forward and compares deltas.
+
+An additional end-to-end generation probe used the same transformers target
+model for verification and swapped only the drafter:
+
+| Prompt | Result |
+| --- | --- |
+| `Hello, my name is` | identical token ids/text; acceptance `[1, 2, 1, 2, 1, 1]` |
+| `The capital of France is` | identical token ids/text; acceptance `[2, 1, 2, 2, 2]` |
+| `Qwen is a language model that` | identical token ids/text; acceptance `[2, 2, 1, 1, 1, 1]` |
+| `1, 1, 2, 3, 5,` | identical token ids/text; acceptance `[4, 1, 2, 2]` |
+
+The probe intentionally used a no-draft-cache loop on both sides because it
+predates `DFlashDraftCache` and because `openinfer-qwen3-4b-dflash` still does
+not own the target verification/controller. Within that older boundary,
+OpenInfer DFlash produces the same greedy generation tokens as the transformers
+DFlash drafter when the target/verification path is held fixed. The next
+meaningful generation probe should use the real target loop and exercise
+`DFlashDraftCache.crop(start)` after acceptance calculation.
+
+## 2026-06-18 Batch Bench
+
+The current Codex runner needed an explicit runtime library path to see the WSL
+CUDA driver:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 \
+LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib \
+OPENINFER_FLASHINFER_INCLUDE=/home/hezhaozhao/openinfer/.venv/lib/python3.12/site-packages/flashinfer/data/include \
+cargo run --release -p openinfer-qwen3-4b-dflash --bin qwen3_dflash_batch_bench -- \
+  --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+  --ctx-len 2 \
+  --q-len 16 \
+  --batch-sizes 1,2,4,8 \
+  --warmup 2 \
+  --iters 5
+```
+
+Observed result on the RTX 5070 Ti host:
+
+| Batch | mean ms | draft tok/s | req/s |
+| ---: | ---: | ---: | ---: |
+| 1 | 2.052 | 7,796 | 487 |
+| 2 | 2.303 | 13,893 | 868 |
+| 4 | 3.532 | 18,121 | 1,133 |
+| 8 | 4.364 | 29,333 | 1,833 |
+
+This confirms the draft-only batch path still scales after the fail-closed
+cache fix. It is draft throughput only; it does not include target hidden
+production, verification, acceptance, or fallback-token work.
diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs
index efb544cb..df729753 100644
--- a/openinfer-core/src/ops.rs
+++ b/openinfer-core/src/ops.rs
@@ -23,7 +23,8 @@ pub use openinfer_kernels::ops::{
     qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into,
     rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place,
     scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
-    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, write_vec_into,
+    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, RaggedPrefillPlan,
+    batch_prefill_ragged_nhd_noncausal_into, single_prefill_nhd_noncausal_into, write_vec_into,
 };
 #[cfg(not(feature = "kernel-call-trace"))]
 pub use openinfer_kernels::ops::{
diff --git a/openinfer-kernels/csrc/shared/paged_attention.cu b/openinfer-kernels/csrc/shared/paged_attention.cu
index 4506a60f..f21181ed 100644
--- a/openinfer-kernels/csrc/shared/paged_attention.cu
+++ b/openinfer-kernels/csrc/shared/paged_attention.cu
@@ -22,6 +22,7 @@ using namespace flashinfer;
 using DType  = __nv_bfloat16;
 using IdType = int32_t;
 using ParamsT = BatchDecodeParams<DType, DType, DType, IdType>;
+using BatchPrefillRaggedParamsT = BatchPrefillRaggedParams<DType, DType, DType, IdType>;
 using Variant = DefaultAttention</*custom_mask=*/false,
                                  /*sliding_window=*/false,
                                  /*logits_soft_cap=*/false,
@@ -607,6 +608,157 @@ int single_prefill_cuda(
             reinterpret_cast<cudaStream_t>(stream)));
 }
 
+// ---------------------------------------------------------------------------
+// Single-request non-causal prefill over contiguous NHD K/V.
+//
+// DFlash draft attention materializes K/V as token-major HiddenStates:
+//   q: [q_len, num_qo_heads, head_dim]
+//   k/v: [kv_len, num_kv_heads, head_dim]
+// This wrapper mirrors vLLM's non-causal FlashAttention/FlashInfer semantics:
+// no causal mask, no sliding window, and GQA handled by FlashInfer.
+// ---------------------------------------------------------------------------
+int single_prefill_nhd_noncausal_cuda(
+    void*    q,
+    void*    output,
+    void*    k,
+    void*    v,
+    int32_t  num_qo_heads,
+    int32_t  num_kv_heads,
+    int32_t  head_dim,
+    int32_t  q_len,
+    int32_t  kv_len,
+    float    sm_scale,
+    void*    stream)
+{
+    // The dispatch below is compiled for HEAD_DIM=128 only, and GQA needs num_qo_heads to be
+    // a positive multiple of num_kv_heads; report anything else instead of launching.
+    if (head_dim != 128 || num_kv_heads <= 0 || num_qo_heads % num_kv_heads != 0) {
+        return static_cast<int>(cudaErrorInvalidValue);
+    }
+    uint32_t q_stride_n = num_qo_heads * head_dim;
+    uint32_t q_stride_h = head_dim;
+    uint32_t kv_stride_n = num_kv_heads * head_dim;
+    uint32_t kv_stride_h = head_dim;
+
+    PrefillParamsT params(
+        reinterpret_cast<DType*>(q),
+        reinterpret_cast<DType*>(k),
+        reinterpret_cast<DType*>(v),
+        /*maybe_custom_mask=*/nullptr,
+        reinterpret_cast<DType*>(output),
+        /*lse=*/nullptr,
+        /*maybe_alibi_slopes=*/nullptr,
+        num_qo_heads, num_kv_heads,
+        static_cast<uint32_t>(q_len), static_cast<uint32_t>(kv_len),
+        q_stride_n, q_stride_h,
+        kv_stride_n, kv_stride_h,
+        static_cast<uint32_t>(head_dim),
+        /*window_left=*/-1,
+        /*logits_soft_cap=*/0.0f,
+        sm_scale,
+        /*rope_scale=*/1.0f, /*rope_theta=*/1e6f);
+
+    return static_cast<int>(
+        SinglePrefillWithKVCacheDispatched<
+            /*HEAD_DIM_QK=*/128,
+            /*HEAD_DIM_VO=*/128,
+            PosEncodingMode::kNone,
+            /*USE_FP16_QK_REDUCTION=*/false,
+            MaskMode::kNone,
+            Variant,
+            PrefillParamsT>(
+            params,
+            /*tmp=*/nullptr,
+            reinterpret_cast<cudaStream_t>(stream)));
+}
+
+// ---------------------------------------------------------------------------
+// Batched non-causal prefill over compact ragged NHD K/V.
+//
+// DFlash groups exact-shape draft requests into compact token-major tensors:
+//   q: [sum(q_len), num_qo_heads, head_dim]
+//   k/v: [sum(kv_len), num_kv_heads, head_dim]
+// with q_indptr/kv_indptr separating requests. This maps directly to
+// FlashInfer BatchPrefillWithRaggedKVCache with MaskMode::kNone.
+// ---------------------------------------------------------------------------
+int batch_prefill_ragged_nhd_noncausal_cuda(
+    void*    q,
+    void*    output,
+    void*    k,
+    void*    v,
+    int32_t* q_indptr,
+    int32_t* kv_indptr,
+    int32_t* request_indices,
+    int32_t* qo_tile_indices,
+    int32_t* kv_tile_indices,
+    int32_t* kv_chunk_size_ptr,
+    uint32_t* total_num_rows,
+    int32_t  num_qo_heads,
+    int32_t  num_kv_heads,
+    int32_t  head_dim,
+    int32_t  total_q_len,
+    int32_t  batch_size,  // not read by this wrapper; kept for the FFI signature
+    int32_t  padded_batch_size,
+    float    sm_scale,
+    void*    stream)
+{
+    // The dispatch below is compiled for HEAD_DIM=128 only, and GQA needs num_qo_heads to be
+    // a positive multiple of num_kv_heads; report anything else instead of launching.
+    if (head_dim != 128 || num_kv_heads <= 0 || num_qo_heads % num_kv_heads != 0) {
+        return static_cast<int>(cudaErrorInvalidValue);
+    }
+    uint32_t q_stride_n = num_qo_heads * head_dim;
+    uint32_t q_stride_h = head_dim;
+    uint32_t kv_stride_n = num_kv_heads * head_dim;
+    uint32_t kv_stride_h = head_dim;
+
+    BatchPrefillRaggedParamsT params(
+        reinterpret_cast<DType*>(q),
+        reinterpret_cast<DType*>(k),
+        reinterpret_cast<DType*>(v),
+        /*maybe_custom_mask=*/nullptr,
+        q_indptr,
+        kv_indptr,
+        /*maybe_mask_indptr=*/nullptr,
+        /*maybe_q_rope_offset=*/nullptr, /*maybe_k_rope_offset=*/nullptr,
+        reinterpret_cast<DType*>(output),
+        /*lse=*/nullptr,
+        /*maybe_alibi_slopes=*/nullptr,
+        num_qo_heads, num_kv_heads,
+        q_stride_n, q_stride_h,
+        kv_stride_n, kv_stride_h,
+        /*window_left=*/-1,
+        /*logits_soft_cap=*/0.0f,
+        sm_scale,
+        /*rope_scale=*/1.0f, /*rope_theta=*/1e6f);
+
+    params.request_indices = request_indices;
+    params.qo_tile_indices = qo_tile_indices;
+    params.kv_tile_indices = kv_tile_indices;
+    params.o_indptr = q_indptr;
+    params.kv_chunk_size_ptr = kv_chunk_size_ptr;
+    params.total_num_rows = total_num_rows;
+    params.max_total_num_rows = static_cast<uint32_t>(total_q_len);
+    params.padded_batch_size = static_cast<uint32_t>(padded_batch_size);
+    params.partition_kv = false;
+
+    return static_cast<int>(
+        BatchPrefillWithRaggedKVCacheDispatched<
+            /*CTA_TILE_Q=*/16,
+            /*HEAD_DIM_QK=*/128,
+            /*HEAD_DIM_VO=*/128,
+            PosEncodingMode::kNone,
+            /*USE_FP16_QK_REDUCTION=*/false,
+            MaskMode::kNone,
+            Variant,
+            BatchPrefillRaggedParamsT>(
+            params,
+            /*tmp_v=*/nullptr,
+            /*tmp_s=*/nullptr,
+            /*enable_pdl=*/false,
+            reinterpret_cast<cudaStream_t>(stream)));
+}
+
 // ---------------------------------------------------------------------------
 // Single-request prefill for HEAD_DIM=256 — wraps FlashInfer SinglePrefillWithKVCache.
 //
diff --git a/openinfer-kernels/src/ffi/shared.rs b/openinfer-kernels/src/ffi/shared.rs
index 46fcba96..4f7f554c 100644
--- a/openinfer-kernels/src/ffi/shared.rs
+++ b/openinfer-kernels/src/ffi/shared.rs
@@ -478,6 +478,42 @@ unsafe extern "C" {
         stream: CUstream,
     ) -> i32;
 
+    pub fn single_prefill_nhd_noncausal_cuda(
+        q: *const Half, // [q_len, num_qo_heads, head_dim], token-major
+        output: *mut Half,
+        k: *const Half, // [kv_len, num_kv_heads, head_dim], token-major
+        v: *const Half, // same layout as `k`
+        num_qo_heads: i32,
+        num_kv_heads: i32,
+        head_dim: i32, // the CUDA wrapper dispatches a HEAD_DIM=128 kernel only
+        q_len: i32,
+        kv_len: i32,
+        sm_scale: f32,
+        stream: CUstream,
+    ) -> i32; // status code; the Rust wrapper treats any non-zero value as failure
+
+    pub fn batch_prefill_ragged_nhd_noncausal_cuda(
+        q: *const Half, // [sum(q_len), num_qo_heads, head_dim], requests back to back
+        output: *mut Half,
+        k: *const Half, // [sum(kv_len), num_kv_heads, head_dim], requests back to back
+        v: *const Half, // same layout as `k`
+        q_indptr: *const i32, // request boundaries into `q`; also used as the output indptr
+        kv_indptr: *const i32, // request boundaries into `k`/`v`
+        request_indices: *const i32,
+        qo_tile_indices: *const i32,
+        kv_tile_indices: *const i32,
+        kv_chunk_size_ptr: *const i32,
+        total_num_rows: *const u32,
+        num_qo_heads: i32,
+        num_kv_heads: i32,
+        head_dim: i32, // the CUDA wrapper dispatches a HEAD_DIM=128 kernel only
+        total_q_len: i32, // forwarded as `max_total_num_rows`
+        batch_size: i32, // accepted but not read by the CUDA wrapper
+        padded_batch_size: i32,
+        sm_scale: f32,
+        stream: CUstream,
+    ) -> i32; // status code; the Rust wrapper treats any non-zero value as failure
+
     pub fn repeat_f32_for_reduce_scatter_cuda(
         local: *const f32,
         repeated: *mut f32,
diff --git a/openinfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs
index 983eb817..bb3d8778 100644
--- a/openinfer-kernels/src/ops.rs
+++ b/openinfer-kernels/src/ops.rs
@@ -3,6 +3,7 @@
 mod attention;
 #[cfg(feature = "kimi-k2")]
 mod deepep;
+mod dense_attention;
 mod elementwise;
 mod embedding;
 #[cfg(feature = "kimi-k2")]
@@ -21,6 +22,10 @@ pub use attention::{
 pub use deepep::{
     DeepEp, DeepEpDispatchScratch, DeepEpPrefillCounts, deepep_info, deepep_unique_id,
 };
+pub use dense_attention::{
+    RaggedPrefillPlan, batch_prefill_ragged_nhd_noncausal_into,
+    single_prefill_nhd_noncausal_into,
+};
 pub use elementwise::{
     accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, bf16_hidden_to_f32_into,
     extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into,
diff --git a/openinfer-kernels/src/ops/dense_attention.rs b/openinfer-kernels/src/ops/dense_attention.rs
new file mode 100644
index 00000000..bf438707
--- /dev/null
+++ b/openinfer-kernels/src/ops/dense_attention.rs
@@ -0,0 +1,204 @@
+use anyhow::Result;
+use cudarc::driver::{CudaSlice, DevicePtr, DevicePtrMut};
+
+use crate::ffi;
+use crate::tensor::{DeviceContext, HiddenStates};
+
+/// Non-causal attention for one request over contiguous NHD `q`/`k`/`v`, written into `out`.
+#[allow(clippy::too_many_arguments)]
+pub fn single_prefill_nhd_noncausal_into(
+    ctx: &DeviceContext,
+    q: &HiddenStates,
+    k: &HiddenStates,
+    v: &HiddenStates,
+    out: &mut HiddenStates,
+    num_qo_heads: usize,
+    num_kv_heads: usize,
+    head_dim: usize,
+) -> Result<()> {
+    assert_eq!(q.hidden_dim, num_qo_heads * head_dim);
+    assert_eq!(k.hidden_dim, num_kv_heads * head_dim);
+    assert_eq!(v.hidden_dim, num_kv_heads * head_dim);
+    assert_eq!(v.seq_len, k.seq_len);
+    assert_eq!(out.hidden_dim, num_qo_heads * head_dim);
+    assert_eq!(out.seq_len, q.seq_len);
+    assert_eq!(
+        head_dim, 128,
+        "FlashInfer wrapper is instantiated for head_dim=128"
+    );
+
+    let sm_scale = 1.0f32 / (head_dim as f32).sqrt();
+    let (q_dev, _q_guard) = q.data.device_ptr(&ctx.stream);
+    let (k_dev, _k_guard) = k.data.device_ptr(&ctx.stream);
+    let (v_dev, _v_guard) = v.data.device_ptr(&ctx.stream);
+    let (out_dev, _out_guard) = out.data.device_ptr_mut(&ctx.stream);
+    // SAFETY: pointers come from live buffers whose guards are held; dims were checked above.
+    let status = unsafe {
+        ffi::single_prefill_nhd_noncausal_cuda(
+            q_dev as *const ffi::Half,
+            out_dev as *mut ffi::Half,
+            k_dev as *const ffi::Half,
+            v_dev as *const ffi::Half,
+            num_qo_heads as i32,
+            num_kv_heads as i32,
+            head_dim as i32,
+            q.seq_len as i32,
+            k.seq_len as i32,
+            sm_scale,
+            ctx.stream.cu_stream(),
+        )
+    };
+    if status == 0 {
+        return Ok(());
+    }
+    anyhow::bail!(
+        "single_prefill_nhd_noncausal_cuda failed: status={}, q_len={}, kv_len={}, q_heads={}, kv_heads={}, head_dim={}",
+        status,
+        q.seq_len,
+        k.seq_len,
+        num_qo_heads,
+        num_kv_heads,
+        head_dim
+    )
+}
+
+pub struct RaggedPrefillPlan {
+    q_indptr: CudaSlice<i32>, // running sum of per-request q lengths, starting at 0
+    kv_indptr: CudaSlice<i32>, // running sum of per-request kv lengths, starting at 0
+    request_indices: CudaSlice<i32>, // owning request of each query tile
+    qo_tile_indices: CudaSlice<i32>, // index of each tile within its request
+    kv_tile_indices: CudaSlice<i32>, // always 0: one kv tile per query tile
+    kv_chunk_size: CudaSlice<i32>, // derived from `kv_lens` in `new`
+    total_num_rows: CudaSlice<u32>, // single element holding `total_q_len`
+    batch_size: usize, // number of requests the plan was built for
+    total_q_len: usize, // sum of all per-request q lengths
+}
+
+impl RaggedPrefillPlan {
+    /// Uploads indptr and tile metadata for a ragged batch (`group_size` = q heads per kv head).
+    pub fn new(
+        ctx: &DeviceContext,
+        q_lens: &[usize],
+        kv_lens: &[usize],
+        group_size: usize,
+    ) -> Result<Self> {
+        anyhow::ensure!(!q_lens.is_empty(), "ragged prefill batch is empty");
+        anyhow::ensure!(
+            q_lens.len() == kv_lens.len(),
+            "q_lens len {} != kv_lens len {}",
+            q_lens.len(),
+            kv_lens.len()
+        );
+        anyhow::ensure!(group_size > 0, "group_size must be positive");
+        const CTA_TILE_Q: usize = 16;
+        let mut q_indptr = vec![0i32];
+        let mut kv_indptr = vec![0i32];
+        let mut request_indices = Vec::new();
+        let mut qo_tile_indices = Vec::new();
+        let (mut total_q_len, mut total_kv_len) = (0usize, 0usize);
+        for (req_idx, (&q_len, &kv_len)) in q_lens.iter().zip(kv_lens).enumerate() {
+            anyhow::ensure!(q_len > 0, "ragged prefill q_len must be positive");
+            anyhow::ensure!(kv_len > 0, "ragged prefill kv_len must be positive");
+            total_q_len += q_len;
+            total_kv_len += kv_len;
+            // Offsets and indices are i32 on the device: fail loudly instead of wrapping.
+            q_indptr.push(i32::try_from(total_q_len)?);
+            kv_indptr.push(i32::try_from(total_kv_len)?);
+            for tile in 0..(q_len * group_size).div_ceil(CTA_TILE_Q) {
+                request_indices.push(i32::try_from(req_idx)?);
+                qo_tile_indices.push(i32::try_from(tile)?);
+            }
+        }
+        let kv_tile_indices = vec![0i32; request_indices.len()];
+        // NOTE(review): FlashInfer appears to read `kv_chunk_size_ptr` as one scalar (confirm);
+        // the batch-wide maximum covers every request whichever way the kernel indexes it.
+        let max_kv_len = kv_lens.iter().copied().max().unwrap_or(0);
+        let kv_chunk_size = vec![i32::try_from(max_kv_len)?; kv_lens.len()];
+        Ok(Self {
+            q_indptr: ctx.stream.clone_htod(&q_indptr)?,
+            kv_indptr: ctx.stream.clone_htod(&kv_indptr)?,
+            request_indices: ctx.stream.clone_htod(&request_indices)?,
+            qo_tile_indices: ctx.stream.clone_htod(&qo_tile_indices)?,
+            kv_tile_indices: ctx.stream.clone_htod(&kv_tile_indices)?,
+            kv_chunk_size: ctx.stream.clone_htod(&kv_chunk_size)?,
+            total_num_rows: ctx.stream.clone_htod(&[total_q_len as u32])?,
+            batch_size: q_lens.len(),
+            total_q_len,
+        })
+    }
+}
+
+/// Runs non-causal attention for every request described by `plan` over ragged NHD `q`/`k`/`v`.
+#[allow(clippy::too_many_arguments)]
+pub fn batch_prefill_ragged_nhd_noncausal_into(
+    ctx: &DeviceContext,
+    q: &HiddenStates,
+    k: &HiddenStates,
+    v: &HiddenStates,
+    out: &mut HiddenStates,
+    plan: &RaggedPrefillPlan,
+    num_qo_heads: usize,
+    num_kv_heads: usize,
+    head_dim: usize,
+) -> Result<()> {
+    assert_eq!(q.hidden_dim, num_qo_heads * head_dim);
+    assert_eq!(k.hidden_dim, num_kv_heads * head_dim);
+    assert_eq!(v.hidden_dim, num_kv_heads * head_dim);
+    assert_eq!(v.seq_len, k.seq_len);
+    assert_eq!(out.hidden_dim, num_qo_heads * head_dim);
+    assert_eq!(out.seq_len, q.seq_len);
+    assert_eq!(q.seq_len, plan.total_q_len);
+    assert_eq!(
+        head_dim, 128,
+        "FlashInfer ragged wrapper is instantiated for head_dim=128"
+    );
+
+    let (q_ptr, _gq) = q.data.device_ptr(&ctx.stream);
+    let (k_ptr, _gk) = k.data.device_ptr(&ctx.stream);
+    let (v_ptr, _gv) = v.data.device_ptr(&ctx.stream);
+    let (out_ptr, _go) = out.data.device_ptr_mut(&ctx.stream);
+    let (q_indptr, _gqi) = plan.q_indptr.device_ptr(&ctx.stream);
+    let (kv_indptr, _gkvi) = plan.kv_indptr.device_ptr(&ctx.stream);
+    let (request_indices, _gri) = plan.request_indices.device_ptr(&ctx.stream);
+    let (qo_tile_indices, _gqt) = plan.qo_tile_indices.device_ptr(&ctx.stream);
+    let (kv_tile_indices, _gkt) = plan.kv_tile_indices.device_ptr(&ctx.stream);
+    let (kv_chunk_size, _gkc) = plan.kv_chunk_size.device_ptr(&ctx.stream);
+    let (total_num_rows, _gnr) = plan.total_num_rows.device_ptr(&ctx.stream);
+    let sm_scale = 1.0f32 / (head_dim as f32).sqrt();
+    // SAFETY: pointers come from live buffers; the named guards stay alive across the call.
+    let status = unsafe {
+        ffi::batch_prefill_ragged_nhd_noncausal_cuda(
+            q_ptr as *const ffi::Half,
+            out_ptr as *mut ffi::Half,
+            k_ptr as *const ffi::Half,
+            v_ptr as *const ffi::Half,
+            q_indptr as *const i32,
+            kv_indptr as *const i32,
+            request_indices as *const i32,
+            qo_tile_indices as *const i32,
+            kv_tile_indices as *const i32,
+            kv_chunk_size as *const i32,
+            total_num_rows as *const u32,
+            num_qo_heads as i32,
+            num_kv_heads as i32,
+            head_dim as i32,
+            q.seq_len as i32,
+            plan.batch_size as i32,
+            plan.request_indices.len() as i32,
+            sm_scale,
+            ctx.stream.cu_stream(),
+        )
+    };
+    if status != 0 {
+        anyhow::bail!(
+            "batch_prefill_ragged_nhd_noncausal_cuda failed: status={}, total_q_len={}, batch_size={}, q_heads={}, kv_heads={}, head_dim={}",
+            status,
+            q.seq_len,
+            plan.batch_size,
+            num_qo_heads,
+            num_kv_heads,
+            head_dim
+        );
+    }
+    Ok(())
+}
diff --git a/openinfer-qwen3-4b-dflash/Cargo.toml b/openinfer-qwen3-4b-dflash/Cargo.toml
new file mode 100644
index 00000000..47b11a25
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/Cargo.toml
@@ -0,0 +1,33 @@
+[package]
+name = "openinfer-qwen3-4b-dflash"
+license = "Apache-2.0"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+anyhow = { workspace = true }
+crossbeam-channel = { workspace = true }
+cudarc = { workspace = true }
+half = { workspace = true }
+log = { workspace = true }
+memmap2 = { workspace = true }
+openinfer-core = { workspace = true }
+openinfer-kernels = { workspace = true }
+safetensors = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+
+[[bin]]
+name = "qwen3_dflash_forward_fixture"
+path = "src/bin/qwen3_dflash_forward_fixture.rs"
+
+[[bin]]
+name = "qwen3_dflash_forward_bench"
+path = "src/bin/qwen3_dflash_forward_bench.rs"
+
+[[bin]]
+name = "qwen3_dflash_batch_bench"
+path = "src/bin/qwen3_dflash_batch_bench.rs"
+
+[lints]
+workspace = true
diff --git a/openinfer-qwen3-4b-dflash/src/batch_buffers.rs b/openinfer-qwen3-4b-dflash/src/batch_buffers.rs
new file mode 100644
index 00000000..84739906
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/batch_buffers.rs
@@ -0,0 +1,145 @@
+use anyhow::Result;
+use cudarc::driver::CudaSlice;
+use openinfer_core::ops::RaggedPrefillPlan;
+use openinfer_core::tensor::HiddenStates;
+
+use crate::weights::DFlashDraftModel;
+
+pub struct DFlashBatchBuffers {
+    pub(crate) max_batch_size: usize,
+    pub(crate) q_len: usize,
+    pub(crate) ctx_len: usize,
+    pub(crate) total_q_len: usize,
+    pub(crate) total_ctx_len: usize,
+    pub(crate) total_kv_len: usize,
+    pub(crate) noise: HiddenStates,
+    pub(crate) target_hidden: HiddenStates,
+    pub(crate) target_projected: HiddenStates,
+    pub(crate) target_normed: HiddenStates,
+    pub(crate) hidden: HiddenStates,
+    pub(crate) hidden_out: HiddenStates,
+    pub(crate) normed: HiddenStates,
+    pub(crate) q: HiddenStates,
+    pub(crate) q_ctx_scratch: HiddenStates,
+    pub(crate) k_ctx: HiddenStates,
+    pub(crate) k_noise: HiddenStates,
+    pub(crate) v_ctx: HiddenStates,
+    pub(crate) v_noise: HiddenStates,
+    pub(crate) k_all: HiddenStates,
+    pub(crate) v_all: HiddenStates,
+    pub(crate) attn_out: HiddenStates,
+    pub(crate) o_buf: HiddenStates,
+    pub(crate) gate_up: HiddenStates,
+    pub(crate) act_out: HiddenStates,
+    pub(crate) positions_q: CudaSlice<i32>,
+    pub(crate) positions_ctx: CudaSlice<i32>,
+    pub(crate) ragged_plan: Option<CachedRaggedPlan>,
+}
+
+pub(crate) struct CachedRaggedPlan {
+    pub(crate) batch_size: usize, // batch size `plan` was built for; the cache key
+    pub(crate) plan: RaggedPrefillPlan,
+}
+
+impl DFlashBatchBuffers {
+    pub(crate) fn new(
+        model: &DFlashDraftModel,
+        max_batch_size: usize,
+        q_len: usize,
+        ctx_len: usize,
+    ) -> Result<Self> {
+        anyhow::ensure!(max_batch_size > 0, "max_batch_size must be positive");
+        anyhow::ensure!(q_len > 0, "q_len must be positive");
+        anyhow::ensure!(ctx_len > 0, "ctx_len must be positive");
+        let config = model.config();
+        let ctx = model.device_context();
+        let hidden = config.hidden_size;
+        let target_hidden_dim = config.hidden_size * config.target_layer_count();
+        let q_dim = config.q_dim();
+        let kv_dim = config.kv_dim();
+        let total_q_len = max_batch_size * q_len;
+        let total_ctx_len = max_batch_size * ctx_len;
+        let total_kv_len = max_batch_size * (ctx_len + q_len);
+        Ok(Self {
+            max_batch_size,
+            q_len,
+            ctx_len,
+            total_q_len,
+            total_ctx_len,
+            total_kv_len,
+            noise: HiddenStates::zeros(ctx, hidden, total_q_len)?,
+            target_hidden: HiddenStates::zeros(ctx, target_hidden_dim, total_ctx_len)?,
+            target_projected: HiddenStates::zeros(ctx, hidden, total_ctx_len)?,
+            target_normed: HiddenStates::zeros(ctx, hidden, total_ctx_len)?,
+            hidden: HiddenStates::zeros(ctx, hidden, total_q_len)?,
+            hidden_out: HiddenStates::zeros(ctx, hidden, total_q_len)?,
+            normed: HiddenStates::zeros(ctx, hidden, total_q_len)?,
+            q: HiddenStates::zeros(ctx, q_dim, total_q_len)?,
+            q_ctx_scratch: HiddenStates::zeros(ctx, q_dim, total_ctx_len)?,
+            k_ctx: HiddenStates::zeros(ctx, kv_dim, total_ctx_len)?,
+            k_noise: HiddenStates::zeros(ctx, kv_dim, total_q_len)?,
+            v_ctx: HiddenStates::zeros(ctx, kv_dim, total_ctx_len)?,
+            v_noise: HiddenStates::zeros(ctx, kv_dim, total_q_len)?,
+            k_all: HiddenStates::zeros(ctx, kv_dim, total_kv_len)?,
+            v_all: HiddenStates::zeros(ctx, kv_dim, total_kv_len)?,
+            attn_out: HiddenStates::zeros(ctx, q_dim, total_q_len)?,
+            o_buf: HiddenStates::zeros(ctx, hidden, total_q_len)?,
+            gate_up: HiddenStates::zeros(ctx, 2 * config.intermediate_size, total_q_len)?,
+            act_out: HiddenStates::zeros(ctx, config.intermediate_size, total_q_len)?,
+            positions_q: ctx.stream.alloc_zeros(total_q_len)?,
+            positions_ctx: ctx.stream.alloc_zeros(total_ctx_len)?,
+            ragged_plan: None,
+        })
+    }
+
+    pub(crate) fn set_active_batch(&mut self, batch_size: usize) {
+        assert!(batch_size <= self.max_batch_size); // enforced in release builds too
+        self.total_q_len = batch_size * self.q_len;
+        self.total_ctx_len = batch_size * self.ctx_len;
+        self.total_kv_len = batch_size * (self.ctx_len + self.q_len);
+        self.noise.seq_len = self.total_q_len; // views shrink; allocations stay max-sized
+        self.target_hidden.seq_len = self.total_ctx_len;
+        self.target_projected.seq_len = self.total_ctx_len;
+        self.target_normed.seq_len = self.total_ctx_len;
+        self.hidden.seq_len = self.total_q_len;
+        self.hidden_out.seq_len = self.total_q_len;
+        self.normed.seq_len = self.total_q_len;
+        self.q.seq_len = self.total_q_len;
+        self.q_ctx_scratch.seq_len = self.total_ctx_len;
+        self.k_ctx.seq_len = self.total_ctx_len;
+        self.k_noise.seq_len = self.total_q_len;
+        self.v_ctx.seq_len = self.total_ctx_len;
+        self.v_noise.seq_len = self.total_q_len;
+        self.k_all.seq_len = self.total_kv_len;
+        self.v_all.seq_len = self.total_kv_len;
+        self.attn_out.seq_len = self.total_q_len;
+        self.o_buf.seq_len = self.total_q_len;
+        self.gate_up.seq_len = self.total_q_len;
+        self.act_out.seq_len = self.total_q_len;
+    }
+
+    /// Makes sure `ragged_plan` holds launch metadata for `batch_size` requests.
+    ///
+    /// A cached plan is reused when its batch size matches; otherwise a new one is
+    /// built with `batch_size` requests of `q_len` queries and `ctx_len + q_len` keys each.
+    pub(crate) fn prepare_ragged_plan(
+        &mut self,
+        model: &DFlashDraftModel,
+        batch_size: usize,
+    ) -> Result<()> {
+        let cached_matches = self
+            .ragged_plan
+            .as_ref()
+            .is_some_and(|cached| cached.batch_size == batch_size);
+        if cached_matches {
+            return Ok(());
+        }
+        let config = model.config();
+        let group_size = config.num_attention_heads / config.num_key_value_heads;
+        let q_lens = vec![self.q_len; batch_size];
+        let kv_lens = vec![self.ctx_len + self.q_len; batch_size];
+        let plan = RaggedPrefillPlan::new(model.device_context(), &q_lens, &kv_lens, group_size)?;
+        self.ragged_plan = Some(CachedRaggedPlan { batch_size, plan });
+        Ok(())
+    }
+}
diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
new file mode 100644
index 00000000..ba9ef95b
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
@@ -0,0 +1,407 @@
+use anyhow::Result;
+use half::bf16;
+use openinfer_core::ops;
+use openinfer_core::tensor::{DeviceContext, HiddenStates};
+
+use crate::batch_buffers::DFlashBatchBuffers;
+use crate::forward::DFlashTargetHidden;
+use crate::weights::{DFlashDraftModel, DFlashLayer};
+
+pub struct DFlashBatchInput<'a> {
+    pub noise_embedding: &'a HiddenStates, // must hold exactly the buffers' `q_len` rows
+    pub target_hidden: DFlashTargetHidden<'a>, // must hold exactly the buffers' `ctx_len` rows
+    pub position_ids: &'a [i32], // `ctx_len` context positions, then the query positions
+}
+
+pub struct DFlashHostBatchInput<'a> {
+    pub noise_embedding: &'a [bf16], // len must be q_len * hidden_size
+    pub target_hidden: &'a [bf16], // len must be ctx_len * hidden_size * target_layer_count
+    pub position_ids: &'a [i32], // len must be ctx_len + q_len
+}
+
+impl DFlashDraftModel {
+    pub fn create_batch_buffers(
+        &self,
+        max_batch_size: usize,
+        q_len: usize,
+        ctx_len: usize,
+    ) -> Result<DFlashBatchBuffers> {
+        DFlashBatchBuffers::new(self, max_batch_size, q_len, ctx_len)
+    }
+
+    pub fn forward_batch<'a>(
+        &self,
+        requests: &[DFlashBatchInput<'_>],
+        bufs: &'a mut DFlashBatchBuffers,
+    ) -> Result<&'a HiddenStates> {
+        anyhow::ensure!(!requests.is_empty(), "DFlash batch is empty");
+        anyhow::ensure!(
+            requests.len() <= bufs.max_batch_size,
+            "DFlash batch size {} exceeds buffer capacity {}",
+            requests.len(),
+            bufs.max_batch_size
+        );
+        let q_len = bufs.q_len;
+        let ctx_len = bufs.ctx_len;
+        for req in requests {
+            let (actual_q, actual_ctx) = self.validate_forward_inputs(
+                req.noise_embedding,
+                &req.target_hidden,
+                req.position_ids,
+            )?;
+            anyhow::ensure!(
+                actual_q == q_len && actual_ctx == ctx_len,
+                "DFlash exact-shape batch expected q_len={}, ctx_len={} but got q_len={}, ctx_len={}",
+                q_len,
+                ctx_len,
+                actual_q,
+                actual_ctx
+            );
+        }
+        bufs.set_active_batch(requests.len());
+        compact_inputs(self.device_context(), requests, bufs)?;
+        self.forward_compact_batch(requests.len(), bufs)?;
+        Ok(&bufs.normed)
+    }
+
+    pub fn forward_host_batch<'a>(
+        &self,
+        requests: &[DFlashHostBatchInput<'_>],
+        bufs: &'a mut DFlashBatchBuffers,
+    ) -> Result<&'a HiddenStates> {
+        anyhow::ensure!(!requests.is_empty(), "DFlash host batch is empty");
+        anyhow::ensure!(
+            requests.len() <= bufs.max_batch_size,
+            "DFlash host batch size {} exceeds buffer capacity {}",
+            requests.len(),
+            bufs.max_batch_size
+        );
+        let config = self.config();
+        let noise_len = bufs.q_len * config.hidden_size;
+        let target_len = bufs.ctx_len * config.hidden_size * config.target_layer_count();
+        let position_len = bufs.ctx_len + bufs.q_len;
+        for req in requests {
+            anyhow::ensure!(
+                req.noise_embedding.len() == noise_len,
+                "noise_embedding len {} != {}",
+                req.noise_embedding.len(),
+                noise_len
+            );
+            anyhow::ensure!(
+                req.target_hidden.len() == target_len,
+                "target_hidden len {} != {}",
+                req.target_hidden.len(),
+                target_len
+            );
+            anyhow::ensure!(
+                req.position_ids.len() == position_len,
+                "position_ids len {} != {}",
+                req.position_ids.len(),
+                position_len
+            );
+        }
+        bufs.set_active_batch(requests.len());
+        compact_host_inputs(self.device_context(), requests, bufs)?;
+        self.forward_compact_batch(requests.len(), bufs)?;
+        Ok(&bufs.normed)
+    }
+
+    fn forward_compact_batch(
+        &self,
+        batch_size: usize,
+        bufs: &mut DFlashBatchBuffers,
+    ) -> Result<()> {
+        let config = self.config();
+        ops::gemm_into_checked(
+            self.device_context(),
+            &self.fc,
+            &bufs.target_hidden,
+            &mut bufs.target_projected,
+        )?;
+        ops::rms_norm_batch_into(
+            self.device_context(),
+            &bufs.target_projected,
+            &self.hidden_norm,
+            config.rms_norm_eps,
+            &mut bufs.target_normed,
+        );
+        copy_hidden(
+            self.device_context(),
+            &bufs.noise,
+            0,
+            &mut bufs.hidden,
+            0,
+            config.hidden_size,
+            bufs.total_q_len,
+        )?;
+        for layer in &self.layers {
+            self.forward_compact_batch_layer(layer, batch_size, bufs)?;
+        }
+        ops::rms_norm_batch_into(
+            self.device_context(),
+            &bufs.hidden,
+            &self.norm,
+            config.rms_norm_eps,
+            &mut bufs.normed,
+        );
+        Ok(())
+    }
+
+    fn forward_compact_batch_layer(
+        &self,
+        layer: &DFlashLayer,
+        batch_size: usize,
+        bufs: &mut DFlashBatchBuffers,
+    ) -> Result<()> {
+        let config = self.config();
+        let ctx = self.device_context();
+        ops::rms_norm_batch_into(
+            ctx,
+            &bufs.hidden,
+            &layer.input_layernorm,
+            config.rms_norm_eps,
+            &mut bufs.normed,
+        );
+        ops::gemm_into_checked(ctx, &layer.attention.q_proj, &bufs.normed, &mut bufs.q)?;
+        ops::gemm_into_checked(
+            ctx,
+            &layer.attention.k_proj,
+            &bufs.normed,
+            &mut bufs.k_noise,
+        )?;
+        ops::gemm_into_checked(
+            ctx,
+            &layer.attention.v_proj,
+            &bufs.normed,
+            &mut bufs.v_noise,
+        )?;
+        ops::qk_norm_rope_batch_decode_into(
+            ctx,
+            &mut bufs.q,
+            &mut bufs.k_noise,
+            &layer.attention.q_norm,
+            &layer.attention.k_norm,
+            &self.cos_cache,
+            &self.sin_cache,
+            &bufs.positions_q,
+            config.num_attention_heads,
+            config.num_key_value_heads,
+            config.head_dim,
+            config.rms_norm_eps,
+        );
+
+        ops::gemm_into_checked(
+            ctx,
+            &layer.attention.k_proj,
+            &bufs.target_normed,
+            &mut bufs.k_ctx,
+        )?;
+        ops::gemm_into_checked(
+            ctx,
+            &layer.attention.v_proj,
+            &bufs.target_normed,
+            &mut bufs.v_ctx,
+        )?;
+        ops::qk_norm_rope_batch_decode_into(
+            ctx,
+            &mut bufs.q_ctx_scratch, // scratch Q operand; its result is not read in this function
+            &mut bufs.k_ctx,
+            &layer.attention.q_norm,
+            &layer.attention.k_norm,
+            &self.cos_cache,
+            &self.sin_cache,
+            &bufs.positions_ctx,
+            config.num_attention_heads,
+            config.num_key_value_heads,
+            config.head_dim,
+            config.rms_norm_eps,
+        );
+
+        compact_kv(
+            ctx,
+            &bufs.k_ctx,
+            &bufs.k_noise,
+            &mut bufs.k_all,
+            batch_size,
+            bufs.ctx_len,
+            bufs.q_len,
+        )?;
+        compact_kv(
+            ctx,
+            &bufs.v_ctx,
+            &bufs.v_noise,
+            &mut bufs.v_all,
+            batch_size,
+            bufs.ctx_len,
+            bufs.q_len,
+        )?;
+        bufs.prepare_ragged_plan(self, batch_size)?;
+        // Borrow the cached plan in place: it is a separate field from `attn_out`, so the
+        // shared and mutable borrows below do not conflict and nothing has to be moved out.
+        let cached_plan = bufs.ragged_plan.as_ref().expect("ragged plan exists");
+        ops::batch_prefill_ragged_nhd_noncausal_into(
+            ctx,
+            &bufs.q,
+            &bufs.k_all,
+            &bufs.v_all,
+            &mut bufs.attn_out,
+            &cached_plan.plan,
+            config.num_attention_heads,
+            config.num_key_value_heads,
+            config.head_dim,
+        )?;
+        ops::gemm_into_checked(
+            ctx,
+            &layer.attention.o_proj,
+            &bufs.attn_out,
+            &mut bufs.o_buf,
+        )?;
+        openinfer_kernels::ops::fused_add_rms_norm_round_batch_into(
+            ctx,
+            &mut bufs.hidden,
+            &bufs.o_buf,
+            &layer.post_attention_layernorm,
+            config.rms_norm_eps,
+            &mut bufs.normed,
+        )?;
+        ops::gemm_into_checked(
+            ctx,
+            &layer.mlp.gate_up_proj,
+            &bufs.normed,
+            &mut bufs.gate_up,
+        )?;
+        ops::silu_mul_fused_batch_into(ctx, &bufs.gate_up, &mut bufs.act_out);
+        ops::gemm_into_checked(ctx, &layer.mlp.down_proj, &bufs.act_out, &mut bufs.o_buf)?;
+        ops::add_batch_into(ctx, &bufs.hidden, &bufs.o_buf, &mut bufs.hidden_out)?;
+        std::mem::swap(&mut bufs.hidden, &mut bufs.hidden_out);
+        Ok(())
+    }
+}
+
+fn compact_inputs(
+    ctx: &DeviceContext,
+    requests: &[DFlashBatchInput<'_>],
+    bufs: &mut DFlashBatchBuffers,
+) -> Result<()> {
+    let hidden = bufs.noise.hidden_dim; // packs each request's tensors and position ids into bufs
+    let target_hidden = bufs.target_hidden.hidden_dim;
+    let mut pos_q = Vec::with_capacity(bufs.total_q_len); // host staging for query position ids
+    let mut pos_ctx = Vec::with_capacity(bufs.total_ctx_len); // host staging for context ids
+    for (i, req) in requests.iter().enumerate() {
+        copy_hidden(
+            ctx,
+            req.noise_embedding,
+            0,
+            &mut bufs.noise,
+            i * bufs.q_len, // request i owns token rows [i * q_len, (i + 1) * q_len)
+            hidden,
+            bufs.q_len,
+        )?;
+        copy_hidden(
+            ctx,
+            req.target_hidden.concatenated,
+            0,
+            &mut bufs.target_hidden,
+            i * bufs.ctx_len, // request i owns token rows [i * ctx_len, (i + 1) * ctx_len)
+            target_hidden,
+            bufs.ctx_len,
+        )?;
+        pos_ctx.extend_from_slice(&req.position_ids[..bufs.ctx_len]); // first ctx_len ids
+        pos_q.extend_from_slice(&req.position_ids[bufs.ctx_len..]); // rest of ids -> queries
+    }
+    let mut dst_q = bufs.positions_q.slice_mut(..pos_q.len());
+    ctx.stream.memcpy_htod(&pos_q, &mut dst_q)?; // upload the gathered ids in one copy each
+    let mut dst_ctx = bufs.positions_ctx.slice_mut(..pos_ctx.len());
+    ctx.stream.memcpy_htod(&pos_ctx, &mut dst_ctx)?;
+    Ok(())
+}
+
+fn compact_host_inputs(
+    ctx: &DeviceContext,
+    requests: &[DFlashHostBatchInput<'_>],
+    bufs: &mut DFlashBatchBuffers,
+) -> Result<()> {
+    let hidden = bufs.noise.hidden_dim; // uploads each request's host-side inputs into bufs
+    let target_hidden = bufs.target_hidden.hidden_dim;
+    let mut pos_q = Vec::with_capacity(bufs.total_q_len);
+    let mut pos_ctx = Vec::with_capacity(bufs.total_ctx_len);
+    for (i, req) in requests.iter().enumerate() {
+        let noise_offset = i * bufs.q_len * hidden; // element offset of request i's noise slot
+        let mut noise_dst = bufs
+            .noise
+            .data
+            .slice_mut(noise_offset..noise_offset + req.noise_embedding.len());
+        ctx.stream
+            .memcpy_htod(req.noise_embedding, &mut noise_dst)?; // length taken from the request
+
+        let target_offset = i * bufs.ctx_len * target_hidden; // element offset of context slot i
+        let mut target_dst = bufs
+            .target_hidden
+            .data
+            .slice_mut(target_offset..target_offset + req.target_hidden.len());
+        ctx.stream.memcpy_htod(req.target_hidden, &mut target_dst)?; // NOTE(review): len unchecked
+
+        pos_ctx.extend_from_slice(&req.position_ids[..bufs.ctx_len]); // first ctx_len ids
+        pos_q.extend_from_slice(&req.position_ids[bufs.ctx_len..]); // rest of ids -> queries
+    }
+    let mut dst_q = bufs.positions_q.slice_mut(..pos_q.len());
+    ctx.stream.memcpy_htod(&pos_q, &mut dst_q)?;
+    let mut dst_ctx = bufs.positions_ctx.slice_mut(..pos_ctx.len());
+    ctx.stream.memcpy_htod(&pos_ctx, &mut dst_ctx)?;
+    Ok(())
+}
+
+fn compact_kv(
+    ctx: &DeviceContext,
+    ctx_part: &HiddenStates,
+    noise_part: &HiddenStates,
+    out: &mut HiddenStates,
+    batch_size: usize,
+    ctx_len: usize,
+    q_len: usize,
+) -> Result<()> {
+    let dim = ctx_part.hidden_dim; // interleaves per request: context rows, then noise rows
+    for i in 0..batch_size {
+        copy_hidden(
+            ctx,
+            ctx_part,
+            i * ctx_len,
+            out,
+            i * (ctx_len + q_len), // start of request i's (ctx_len + q_len)-row span in `out`
+            dim,
+            ctx_len,
+        )?;
+        copy_hidden(
+            ctx,
+            noise_part,
+            i * q_len,
+            out,
+            i * (ctx_len + q_len) + ctx_len, // noise rows follow directly after the context rows
+            dim,
+            q_len,
+        )?;
+    }
+    Ok(())
+}
+
+pub(crate) fn copy_hidden(
+    ctx: &DeviceContext,
+    src: &HiddenStates,
+    src_token_offset: usize,
+    dst: &mut HiddenStates,
+    dst_token_offset: usize,
+    hidden_dim: usize,
+    token_count: usize,
+) -> Result<()> {
+    debug_assert_eq!(src.hidden_dim, hidden_dim); // shape/bounds checks run in debug builds only
+    debug_assert_eq!(dst.hidden_dim, hidden_dim);
+    debug_assert!(src_token_offset + token_count <= src.seq_len);
+    debug_assert!(dst_token_offset + token_count <= dst.seq_len);
+    let len = hidden_dim * token_count; // token counts/offsets scaled to element units
+    let src_offset = hidden_dim * src_token_offset;
+    let dst_offset = hidden_dim * dst_token_offset;
+    let src_view = src.data.slice(src_offset..src_offset + len);
+    let mut dst_view = dst.data.slice_mut(dst_offset..dst_offset + len);
+    ctx.stream.memcpy_dtod(&src_view, &mut dst_view)?; // copies token_count rows src -> dst
+    Ok(())
+}
diff --git a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_batch_bench.rs b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_batch_bench.rs
new file mode 100644
index 00000000..9ba748ae
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_batch_bench.rs
@@ -0,0 +1,227 @@
+use std::path::PathBuf;
+use std::time::Instant;
+
+use anyhow::{Context, Result, bail};
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+use openinfer_qwen3_4b_dflash::{DFlashBatchInput, DFlashDraftModel, DFlashTargetHidden};
+use serde::Serialize;
+
+fn main() -> Result<()> {
+    let args = Args::parse()?;
+    let model = DFlashDraftModel::load(&args.model_path, args.device)?;
+    let config = model.config();
+    let ctx = model.device_context();
+    let mut reports = Vec::new(); // one BatchReport per requested batch size
+
+    for &batch_size in &args.batch_sizes {
+        let mut noises = Vec::with_capacity(batch_size);
+        let mut targets = Vec::with_capacity(batch_size);
+        let mut positions = Vec::with_capacity(batch_size);
+        for i in 0..batch_size {
+            let noise = deterministic_bf16(args.q_len * config.hidden_size, 0xD4A5_0000 + i as u64);
+            let target = deterministic_bf16(
+                args.ctx_len * config.hidden_size * config.target_layer_count(),
+                0xC0DE_0000 + i as u64, // per-request seed so every request gets distinct data
+            );
+            noises.push(HiddenStates {
+                data: ctx.stream.clone_htod(&noise).context("noise h2d")?,
+                hidden_dim: config.hidden_size,
+                seq_len: args.q_len,
+            });
+            targets.push(HiddenStates {
+                data: ctx.stream.clone_htod(&target).context("target h2d")?,
+                hidden_dim: config.hidden_size * config.target_layer_count(),
+                seq_len: args.ctx_len,
+            });
+            positions.push(
+                (0..(args.ctx_len + args.q_len))
+                    .map(|pos| pos as i32)
+                    .collect::<Vec<_>>(),
+            );
+        }
+        let mut bufs = model.create_batch_buffers(batch_size, args.q_len, args.ctx_len)?;
+        let inputs = build_inputs(&noises, &targets, &positions);
+        for _ in 0..args.warmup {
+            let _ = model.forward_batch(&inputs, &mut bufs)?;
+            ctx.sync()?; // warmup passes are executed but never timed
+        }
+        let mut latencies_ms = Vec::with_capacity(args.iters);
+        for _ in 0..args.iters {
+            ctx.sync()?; // presumably drains queued device work before the clock starts
+            let started = Instant::now();
+            let _ = model.forward_batch(&inputs, &mut bufs)?;
+            ctx.sync()?; // presumably waits for the forward pass so elapsed() covers it
+            latencies_ms.push(started.elapsed().as_secs_f64() * 1000.0);
+        }
+        let stats = Stats::from(&latencies_ms);
+        let mean_s = stats.mean / 1000.0; // mean latency back in seconds for throughput math
+        reports.push(BatchReport {
+            batch_size,
+            ctx_len: args.ctx_len,
+            q_len: args.q_len,
+            warmup: args.warmup,
+            iters: args.iters,
+            draft_tokens_per_s: (batch_size * args.q_len) as f64 / mean_s,
+            requests_per_s: batch_size as f64 / mean_s,
+            latency_ms: stats,
+        });
+    }
+
+    let report = Report {
+        schema: 1,
+        engine: "openinfer-qwen3-4b-dflash-batch",
+        model_path: args.model_path.to_string_lossy().to_string(),
+        device: args.device,
+        hidden_size: config.hidden_size,
+        target_layer_count: config.target_layer_count(),
+        reports,
+    };
+    println!("{}", serde_json::to_string_pretty(&report)?); // JSON report goes to stdout
+    Ok(())
+}
+
+fn build_inputs<'a>(
+    noises: &'a [HiddenStates],
+    targets: &'a [HiddenStates],
+    positions: &'a [Vec<i32>],
+) -> Vec<DFlashBatchInput<'a>> {
+    // Pair the i-th noise, target and position list into one borrowed batch input.
+    // Zipping stops at the shortest of the three slices.
+    let triples = noises.iter().zip(targets).zip(positions);
+    let mut inputs = Vec::with_capacity(noises.len());
+    for ((noise_embedding, concatenated), position_ids) in triples {
+        inputs.push(DFlashBatchInput {
+            noise_embedding,
+            target_hidden: DFlashTargetHidden { concatenated },
+            position_ids,
+        });
+    }
+    inputs
+}
+
+#[derive(Clone)]
+struct Args {
+    model_path: PathBuf, // model directory handed to DFlashDraftModel::load
+    device: usize, // device index handed to DFlashDraftModel::load
+    ctx_len: usize, // context rows per request (seq_len of the target-hidden tensor)
+    q_len: usize, // noise rows per request (seq_len of the noise tensor)
+    warmup: usize, // untimed forward passes per batch size
+    iters: usize, // timed forward passes per batch size
+    batch_sizes: Vec<usize>, // batch sizes benchmarked, in the given order
+}
+
+impl Args {
+    fn parse() -> Result<Self> {
+        let mut model_path = PathBuf::from("/home/hezhaozhao/models/Qwen3-4B-DFlash-b16");
+        let mut device = 0usize; // these initial values are the defaults for absent flags
+        let mut ctx_len = 2usize;
+        let mut q_len = 16usize;
+        let mut warmup = 5usize;
+        let mut iters = 30usize;
+        let mut batch_sizes = vec![1, 2, 4, 8, 16, 32];
+        let mut args = std::env::args().skip(1); // skip the program name
+        while let Some(arg) = args.next() {
+            match arg.as_str() {
+                "--model-path" => model_path = PathBuf::from(next_value(&mut args, &arg)?),
+                "--device" => device = next_value(&mut args, &arg)?.parse()?,
+                "--ctx-len" => ctx_len = next_value(&mut args, &arg)?.parse()?,
+                "--q-len" => q_len = next_value(&mut args, &arg)?.parse()?,
+                "--warmup" => warmup = next_value(&mut args, &arg)?.parse()?,
+                "--iters" => iters = next_value(&mut args, &arg)?.parse()?,
+                "--batch-sizes" => {
+                    batch_sizes = next_value(&mut args, &arg)?
+                        .split(',')
+                        .map(|size| size.trim().parse()) // tolerate spaces, e.g. "1, 2, 4"
+                        .collect::<std::result::Result<Vec<_>, _>>()?;
+                }
+                _ => bail!("unknown argument {arg}"),
+            }
+        }
+        if ctx_len == 0 || q_len == 0 || iters == 0 {
+            bail!("--ctx-len, --q-len, and --iters must be greater than zero"); // warmup may be 0
+        }
+        if batch_sizes.is_empty() || batch_sizes.contains(&0) {
+            bail!("--batch-sizes must contain positive batch sizes");
+        }
+        Ok(Self {
+            model_path,
+            device,
+            ctx_len,
+            q_len,
+            warmup,
+            iters,
+            batch_sizes,
+        })
+    }
+}
+
+fn next_value(args: &mut impl Iterator<Item = String>, flag: &str) -> Result<String> {
+    let value = args.next(); // `None` means `flag` was the last argument
+    value.with_context(|| format!("{flag} requires a value"))
+}
+
+fn deterministic_bf16(len: usize, seed: u64) -> Vec<bf16> {
+    // LCG stream: the high 32 bits of each state map to a value in [-0.125, 0.125].
+    let mut lcg = seed;
+    let samples = std::iter::repeat_with(move || {
+        lcg = lcg.wrapping_mul(6364136223846793005).wrapping_add(1);
+        let unit = ((lcg >> 32) as u32) as f32 / (u32::MAX as f32);
+        bf16::from_f32((unit * 2.0 - 1.0) * 0.125)
+    });
+    samples.take(len).collect()
+}
+
+#[derive(Serialize)]
+struct Report {
+    schema: u32, // report format version; main() writes 1
+    engine: &'static str, // fixed engine label written by main()
+    model_path: String, // lossy UTF-8 rendering of the --model-path value
+    device: usize,
+    hidden_size: usize, // copied from the loaded model config
+    target_layer_count: usize, // copied from the loaded model config
+    reports: Vec<BatchReport>, // one entry per benchmarked batch size
+}
+
+#[derive(Serialize)]
+struct BatchReport {
+    batch_size: usize,
+    ctx_len: usize,
+    q_len: usize,
+    warmup: usize, // untimed passes that preceded the measurement
+    iters: usize, // number of timed passes behind `latency_ms`
+    draft_tokens_per_s: f64, // batch_size * q_len divided by the mean latency in seconds
+    requests_per_s: f64, // batch_size divided by the mean latency in seconds
+    latency_ms: Stats, // per-pass wall-clock latency summary, in milliseconds
+}
+
+#[derive(Serialize)]
+struct Stats {
+    mean: f64, // arithmetic mean of the samples
+    p50: f64, // p50/p90/p99 are picked from the sorted samples by percentile()
+    p90: f64,
+    p99: f64,
+    min: f64, // smallest sample
+    max: f64, // largest sample
+}
+
+impl Stats {
+    /// Summarizes latency samples; `values` must be non-empty (indexing panics otherwise).
+    fn from(values: &[f64]) -> Self {
+        let mut sorted = values.to_vec();
+        sorted.sort_by(f64::total_cmp); // total order, so a NaN sample cannot panic the sort
+        Self {
+            mean: sorted.iter().sum::<f64>() / sorted.len() as f64,
+            p50: percentile(&sorted, 0.50),
+            p90: percentile(&sorted, 0.90),
+            p99: percentile(&sorted, 0.99),
+            min: sorted[0],
+            max: sorted[sorted.len() - 1],
+        }
+    }
+}
+
+fn percentile(sorted: &[f64], q: f64) -> f64 {
+    let last = sorted.len() - 1; // highest valid index; underflows when `sorted` is empty
+    sorted[((last as f64 * q).round() as usize).min(last)]
+}
diff --git a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs
new file mode 100644
index 00000000..cb5bd0c9
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs
@@ -0,0 +1,301 @@
+use std::path::PathBuf;
+use std::time::Instant;
+
+use anyhow::{Context, Result, bail};
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+use openinfer_qwen3_4b_dflash::{DFlashDraftModel, DFlashTargetHidden};
+use safetensors::{Dtype, SafeTensors};
+use serde::Serialize;
+
+fn main() -> Result<()> {
+    let args = Args::parse()?;
+    let model = DFlashDraftModel::load(&args.model_path, args.device)?;
+    let config = model.config();
+    let ctx = model.device_context();
+
+    let (noise, target_hidden, positions, ctx_len, q_len) = if let Some(fixture) = &args.fixture {
+        let bytes = std::fs::read(fixture)
+            .with_context(|| format!("failed to read fixture {}", fixture.display()))?;
+        let st = SafeTensors::deserialize(&bytes).context("parse fixture")?;
+        let noise = read_bf16(&st, "noise_embedding", &[1, args.q_len, config.hidden_size])?;
+        let target_hidden = read_bf16(
+            &st,
+            "target_hidden",
+            &[
+                1,
+                args.ctx_len, // fixture shapes must agree with --ctx-len / --q-len
+                config.hidden_size * config.target_layer_count(),
+            ],
+        )?;
+        let positions = read_i32(&st, "position_ids", &[1, args.ctx_len + args.q_len])?;
+        (noise, target_hidden, positions, args.ctx_len, args.q_len)
+    } else {
+        let noise = deterministic_bf16(args.q_len * config.hidden_size, 0xD4A5_4B16);
+        let target_hidden = deterministic_bf16(
+            args.ctx_len * config.hidden_size * config.target_layer_count(),
+            0xD4A5_C0DE, // fixed seeds: synthetic inputs are identical on every run
+        );
+        let positions = (0..(args.ctx_len + args.q_len))
+            .map(|pos| pos as i32)
+            .collect::<Vec<_>>();
+        (noise, target_hidden, positions, args.ctx_len, args.q_len)
+    };
+
+    let noise = HiddenStates {
+        data: ctx.stream.clone_htod(&noise).context("noise h2d")?,
+        hidden_dim: config.hidden_size,
+        seq_len: q_len,
+    };
+    let target_hidden = HiddenStates {
+        data: ctx
+            .stream
+            .clone_htod(&target_hidden)
+            .context("target hidden h2d")?,
+        hidden_dim: config.hidden_size * config.target_layer_count(),
+        seq_len: ctx_len,
+    };
+    ctx.sync()?; // presumably waits for the uploads before any timing starts
+
+    let mut cache = model.create_draft_cache(q_len, ctx_len, ctx_len + q_len)?;
+    if args.draft_cache {
+        model.prepare_step_context(
+            DFlashTargetHidden {
+                concatenated: &target_hidden,
+            },
+            &positions,
+            &mut cache,
+        )?;
+        ctx.sync()?;
+    }
+    for _ in 0..args.warmup {
+        if args.draft_cache {
+            cache.reset(); // draft-cache mode redoes reset + context preparation every pass
+            model.prepare_step_context(
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &positions,
+                &mut cache,
+            )?;
+            let _out = model.forward_with_draft_cache(&noise, &positions, &mut cache)?;
+        } else {
+            let _out = model.forward_with_cache(
+                &noise,
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &positions,
+                &mut cache,
+            )?;
+        }
+        ctx.sync()?; // warmup passes are executed but never timed
+    }
+
+    let mut latencies_ms = Vec::with_capacity(args.iters);
+    for _ in 0..args.iters {
+        ctx.sync()?; // presumably drains queued device work before the clock starts
+        let started = Instant::now();
+        if args.draft_cache {
+            cache.reset(); // the timed region includes reset + context preparation
+            model.prepare_step_context(
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &positions,
+                &mut cache,
+            )?;
+            let _out = model.forward_with_draft_cache(&noise, &positions, &mut cache)?;
+        } else {
+            let _out = model.forward_with_cache(
+                &noise,
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &positions,
+                &mut cache,
+            )?;
+        }
+        ctx.sync()?; // presumably waits for the forward pass so elapsed() covers it
+        latencies_ms.push(started.elapsed().as_secs_f64() * 1000.0);
+    }
+
+    let report = Report {
+        schema: 1,
+        engine: "openinfer-qwen3-4b-dflash",
+        model_path: args.model_path.to_string_lossy().to_string(),
+        device: args.device,
+        ctx_len: args.ctx_len,
+        q_len: args.q_len,
+        hidden_size: config.hidden_size,
+        target_layer_count: config.target_layer_count(),
+        draft_cache: args.draft_cache,
+        warmup: args.warmup,
+        iters: args.iters,
+        latency_ms: Stats::from(&latencies_ms),
+    };
+    println!("{}", serde_json::to_string_pretty(&report)?); // JSON report goes to stdout
+    Ok(())
+}
+
+#[derive(Clone)]
+struct Args {
+    model_path: PathBuf, // model directory handed to DFlashDraftModel::load
+    fixture: Option<PathBuf>, // optional safetensors inputs; synthetic data when None
+    device: usize, // device index handed to DFlashDraftModel::load
+    ctx_len: usize, // context rows (seq_len of the target-hidden tensor)
+    q_len: usize, // noise rows (seq_len of the noise tensor)
+    warmup: usize, // untimed passes
+    iters: usize, // timed passes
+    draft_cache: bool, // selects the prepare_step_context + forward_with_draft_cache path
+}
+
+impl Args {
+    fn parse() -> Result<Self> {
+        let mut model_path = PathBuf::from("/home/hezhaozhao/models/Qwen3-4B-DFlash-b16");
+        let mut fixture = None;
+        let mut device = 0usize; // these initial values are the defaults for absent flags
+        let mut ctx_len = 2usize;
+        let mut q_len = 16usize;
+        let mut warmup = 5usize;
+        let mut iters = 30usize;
+        let mut draft_cache = false;
+        let mut args = std::env::args().skip(1); // skip the program name
+        while let Some(flag) = args.next() {
+            match flag.as_str() {
+                "--model-path" => model_path = PathBuf::from(next_value(&mut args, &flag)?),
+                "--fixture" => fixture = Some(PathBuf::from(next_value(&mut args, &flag)?)),
+                "--device" => device = next_value(&mut args, &flag)?.parse()?,
+                "--ctx-len" => ctx_len = next_value(&mut args, &flag)?.parse()?,
+                "--q-len" => q_len = next_value(&mut args, &flag)?.parse()?,
+                "--warmup" => warmup = next_value(&mut args, &flag)?.parse()?,
+                "--iters" => iters = next_value(&mut args, &flag)?.parse()?,
+                "--draft-cache" | "--context-cache" => draft_cache = true,
+                _ => bail!("unknown argument {flag}"),
+            }
+        }
+        // Report the first zero-valued setting, checked in the order ctx-len, q-len, iters.
+        let required = [
+            ("--ctx-len", ctx_len),
+            ("--q-len", q_len),
+            ("--iters", iters),
+        ];
+        if let Some((name, _)) = required.iter().find(|(_, value)| *value == 0) {
+            bail!("{name} must be greater than zero");
+        }
+        Ok(Self {
+            model_path,
+            fixture,
+            device,
+            ctx_len,
+            q_len,
+            warmup,
+            iters,
+            draft_cache,
+        })
+    }
+}
+
+fn next_value(args: &mut impl Iterator<Item = String>, flag: &str) -> Result<String> {
+    let operand = args.next(); // the token right after `flag`, if any
+    operand.with_context(|| format!("{flag} requires a value"))
+}
+
+fn deterministic_bf16(len: usize, seed: u64) -> Vec<bf16> {
+    // Seeded LCG; the top 32 bits of every state are rescaled into [-0.125, 0.125].
+    let mut lcg = seed;
+    let draw = move || {
+        lcg = lcg.wrapping_mul(6364136223846793005).wrapping_add(1);
+        let unit = ((lcg >> 32) as u32) as f32 / (u32::MAX as f32);
+        bf16::from_f32((unit * 2.0 - 1.0) * 0.125)
+    };
+    // Draw exactly `len` samples, in generation order.
+    std::iter::repeat_with(draw).take(len).collect()
+}
+
+#[derive(Serialize)]
+struct Report {
+    schema: u32, // report format version; main() writes 1
+    engine: &'static str, // fixed engine label written by main()
+    model_path: String, // lossy UTF-8 rendering of the --model-path value
+    device: usize,
+    ctx_len: usize,
+    q_len: usize,
+    hidden_size: usize, // copied from the loaded model config
+    target_layer_count: usize, // copied from the loaded model config
+    draft_cache: bool, // whether the draft-cache code path was benchmarked
+    warmup: usize, // untimed passes that preceded the measurement
+    iters: usize, // number of timed passes behind `latency_ms`
+    latency_ms: Stats, // per-pass wall-clock latency summary, in milliseconds
+}
+
+#[derive(Serialize)]
+struct Stats {
+    mean: f64, // arithmetic mean of the samples
+    p50: f64, // p50/p90/p99 are picked from the sorted samples by percentile()
+    p90: f64,
+    p99: f64,
+    min: f64, // smallest sample
+    max: f64, // largest sample
+}
+
+impl Stats {
+    /// Summarizes latency samples; `values` must be non-empty (indexing panics otherwise).
+    fn from(values: &[f64]) -> Self {
+        let mut sorted = values.to_vec();
+        sorted.sort_by(f64::total_cmp); // total order, so a NaN sample cannot panic the sort
+        Self {
+            mean: sorted.iter().sum::<f64>() / sorted.len() as f64,
+            p50: percentile(&sorted, 0.50),
+            p90: percentile(&sorted, 0.90),
+            p99: percentile(&sorted, 0.99),
+            min: sorted[0],
+            max: sorted[sorted.len() - 1],
+        }
+    }
+}
+
+fn read_bf16(st: &SafeTensors<'_>, name: &str, shape: &[usize]) -> Result<Vec<bf16>> {
+    // Look up `name`, insist on BF16 storage and the exact `shape`, then decode the payload
+    // as little-endian 16-bit patterns.
+    let tensor = st
+        .tensor(name)
+        .with_context(|| format!("missing tensor {name}"))?;
+    let (dtype, dims) = (tensor.dtype(), tensor.shape());
+    if dtype != Dtype::BF16 {
+        bail!("{name} must be BF16, got {dtype:?}");
+    }
+    if dims != shape {
+        bail!("{name} shape mismatch: expected {shape:?}, got {dims:?}");
+    }
+    let mut values = Vec::with_capacity(tensor.data().len() / 2);
+    for pair in tensor.data().chunks_exact(2) {
+        values.push(bf16::from_bits(u16::from_le_bytes([pair[0], pair[1]])));
+    }
+    Ok(values)
+}
+
+fn read_i32(st: &SafeTensors<'_>, name: &str, shape: &[usize]) -> Result<Vec<i32>> {
+    // Look up `name`, insist on I32 storage and the exact `shape`, then decode the payload
+    // as little-endian 32-bit integers.
+    let tensor = st
+        .tensor(name)
+        .with_context(|| format!("missing tensor {name}"))?;
+    let (dtype, dims) = (tensor.dtype(), tensor.shape());
+    if dtype != Dtype::I32 {
+        bail!("{name} must be I32, got {dtype:?}");
+    }
+    if dims != shape {
+        bail!("{name} shape mismatch: expected {shape:?}, got {dims:?}");
+    }
+    let mut values = Vec::with_capacity(tensor.data().len() / 4);
+    for quad in tensor.data().chunks_exact(4) {
+        values.push(i32::from_le_bytes([quad[0], quad[1], quad[2], quad[3]]));
+    }
+    Ok(values)
+}
+
+fn percentile(sorted: &[f64], q: f64) -> f64 {
+    let top = sorted.len() - 1; // nearest-rank index is clamped to this last slot
+    sorted[((top as f64 * q).round() as usize).min(top)]
+}
diff --git a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs
new file mode 100644
index 00000000..08ff66a3
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_fixture.rs
@@ -0,0 +1,155 @@
+use std::collections::HashMap;
+use std::path::PathBuf;
+
+use anyhow::{Context, Result, bail};
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+use openinfer_qwen3_4b_dflash::{DFlashDraftModel, DFlashTargetHidden};
+use safetensors::{Dtype, SafeTensors, tensor::TensorView};
+
+fn main() -> Result<()> {
+    let args = Args::parse()?;
+    let fixture_bytes = std::fs::read(&args.fixture).with_context(|| {
+        format!(
+            "failed to read input fixture {}",
+            args.fixture.to_string_lossy()
+        )
+    })?;
+    let st = SafeTensors::deserialize(&fixture_bytes).context("parse input fixture")?;
+    let model = DFlashDraftModel::load(&args.model_path, args.device)?;
+    let config = model.config();
+    let ctx = model.device_context();
+
+    let noise = bf16_tensor(&st, "noise_embedding")?; // each helper returns (values, shape)
+    let target_hidden = bf16_tensor(&st, "target_hidden")?;
+    let positions = i32_tensor(&st, "position_ids")?;
+
+    if noise.1.len() != 3 || noise.1[0] != 1 || noise.1[2] != config.hidden_size {
+        bail!(
+            "noise_embedding shape mismatch: expected [1, q_len, {}], got {:?}",
+            config.hidden_size,
+            noise.1
+        );
+    }
+    if target_hidden.1.len() != 3
+        || target_hidden.1[0] != 1
+        || target_hidden.1[2] != config.hidden_size * config.target_layer_count()
+    {
+        bail!(
+            "target_hidden shape mismatch: expected [1, ctx_len, {}], got {:?}",
+            config.hidden_size * config.target_layer_count(),
+            target_hidden.1
+        );
+    }
+    let q_len = noise.1[1]; // sequence lengths come from the fixture, not from flags
+    let ctx_len = target_hidden.1[1];
+    ensure_shape("position_ids", &positions.1, &[1, ctx_len + q_len])?;
+
+    let noise_embedding = HiddenStates {
+        data: ctx.stream.clone_htod(&noise.0)?,
+        hidden_dim: config.hidden_size,
+        seq_len: q_len,
+    };
+    let target_hidden = HiddenStates {
+        data: ctx.stream.clone_htod(&target_hidden.0)?,
+        hidden_dim: config.hidden_size * config.target_layer_count(),
+        seq_len: ctx_len,
+    };
+    let out = model.forward(
+        &noise_embedding,
+        DFlashTargetHidden {
+            concatenated: &target_hidden,
+        },
+        &positions.0,
+    )?;
+    ctx.sync()?;
+    let out = ctx.stream.clone_dtoh(&out.data)?; // copy the result back to host memory
+    ctx.sync()?;
+
+    let out_bytes = bf16_bytes(&out);
+    let tensors = HashMap::from([(
+        "openinfer_output".to_string(),
+        TensorView::new(Dtype::BF16, vec![1, q_len, config.hidden_size], &out_bytes)?,
+    )]);
+    safetensors::serialize_to_file(tensors, None, &args.out)?; // single-tensor output file
+    Ok(())
+}
+
+struct Args {
+    model_path: PathBuf, // optional flag; parse() falls back to a built-in default path
+    fixture: PathBuf, // required input safetensors file
+    out: PathBuf, // required output safetensors path
+    device: usize, // device index handed to DFlashDraftModel::load (default 0)
+}
+
+impl Args {
+    fn parse() -> Result<Self> {
+        let mut model_path = None; // None until the matching flag is seen
+        let mut fixture = None;
+        let mut out = None;
+        let mut device = 0usize;
+        let mut args = std::env::args().skip(1); // skip the program name
+        while let Some(arg) = args.next() {
+            match arg.as_str() {
+                "--model-path" => model_path = Some(PathBuf::from(next_value(&mut args, &arg)?)),
+                "--fixture" => fixture = Some(PathBuf::from(next_value(&mut args, &arg)?)),
+                "--out" => out = Some(PathBuf::from(next_value(&mut args, &arg)?)),
+                "--device" => device = next_value(&mut args, &arg)?.parse()?,
+                _ => bail!("unknown argument {arg}"),
+            }
+        }
+        Ok(Self {
+            model_path: model_path
+                .unwrap_or_else(|| PathBuf::from("/home/hezhaozhao/models/Qwen3-4B-DFlash-b16")),
+            fixture: fixture.context("--fixture is required")?, // a missing --fixture is an error
+            out: out.context("--out is required")?, // a missing --out is an error
+            device,
+        })
+    }
+}
+
+fn next_value(args: &mut impl Iterator<Item = String>, flag: &str) -> Result<String> {
+    let following = args.next(); // absent when `flag` ends the command line
+    following.with_context(|| format!("{flag} requires a value"))
+}
+
+fn ensure_shape(name: &str, got: &[usize], expected: &[usize]) -> Result<()> {
+    if got == expected {
+        return Ok(()); // same rank and identical extents
+    }
+    bail!("{name} shape mismatch: expected {expected:?}, got {got:?}")
+}
+
+fn bf16_tensor(st: &SafeTensors<'_>, name: &str) -> Result<(Vec<bf16>, Vec<usize>)> {
+    // Returns the decoded little-endian values together with the tensor's stored shape.
+    let tensor = st.tensor(name)?;
+    if tensor.dtype() != Dtype::BF16 {
+        bail!("{name} must be BF16, got {:?}", tensor.dtype());
+    }
+    let mut values = Vec::with_capacity(tensor.data().len() / 2);
+    for pair in tensor.data().chunks_exact(2) {
+        values.push(bf16::from_bits(u16::from_le_bytes([pair[0], pair[1]])));
+    }
+    Ok((values, tensor.shape().to_vec()))
+}
+
+fn i32_tensor(st: &SafeTensors<'_>, name: &str) -> Result<(Vec<i32>, Vec<usize>)> {
+    // Returns the decoded little-endian integers together with the tensor's stored shape.
+    let tensor = st.tensor(name)?;
+    if tensor.dtype() != Dtype::I32 {
+        bail!("{name} must be I32, got {:?}", tensor.dtype());
+    }
+    let mut values = Vec::with_capacity(tensor.data().len() / 4);
+    for quad in tensor.data().chunks_exact(4) {
+        values.push(i32::from_le_bytes([quad[0], quad[1], quad[2], quad[3]]));
+    }
+    Ok((values, tensor.shape().to_vec()))
+}
+
+fn bf16_bytes(values: &[bf16]) -> Vec<u8> {
+    // Each element contributes its 16-bit pattern as two little-endian bytes.
+    let raw = values.iter().flat_map(|v| v.to_bits().to_le_bytes());
+    let mut bytes = Vec::with_capacity(values.len() * 2);
+    bytes.extend(raw);
+    bytes
+}
diff --git a/openinfer-qwen3-4b-dflash/src/config.rs b/openinfer-qwen3-4b-dflash/src/config.rs
new file mode 100644
index 00000000..bebd1657
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/config.rs
@@ -0,0 +1,143 @@
+use anyhow::{Result, bail};
+use serde::Deserialize;
+use std::fs;
+use std::path::Path;
+
+#[derive(Clone, Debug, Deserialize)]
+pub struct DFlashInnerConfig {
+    pub mask_token_id: u32, // DFlashConfig::validate() accepts only 151669
+    pub target_layer_ids: Vec<usize>, // DFlashConfig::validate() accepts only [1, 9, 17, 25, 33]
+}
+
+#[derive(Clone, Debug, Deserialize)]
+pub struct DFlashConfig {
+    pub architectures: Vec<String>, // must contain "DFlashDraftModel" to pass validate()
+    pub attention_bias: bool, // validate() rejects true
+    pub attention_dropout: f32, // validate() requires exactly 0.0
+    pub block_size: usize, // validate() requires 16
+    pub dflash_config: DFlashInnerConfig,
+    pub hidden_size: usize, // must be positive
+    pub intermediate_size: usize,
+    pub num_attention_heads: usize, // positive and divisible by num_key_value_heads
+    pub num_hidden_layers: usize, // draft layer count; validate() requires 5
+    pub num_key_value_heads: usize, // must be positive
+    pub num_target_layers: usize, // exclusive upper bound for every target layer id
+    pub head_dim: usize, // per-head width used by q_dim() and kv_dim(); must be positive
+    pub max_position_embeddings: usize,
+    pub rms_norm_eps: f32,
+    pub rope_theta: f32,
+    pub tie_word_embeddings: bool,
+    pub vocab_size: usize,
+}
+
+impl DFlashConfig {
+    pub fn from_model_dir(model_path: &Path) -> Result<Self> {
+        let content = fs::read_to_string(model_path.join("config.json"))?; // <dir>/config.json
+        let config: Self = serde_json::from_str(&content)?;
+        config.validate()?; // unsupported configs are rejected before being returned
+        Ok(config)
+    }
+
+    pub fn validate(&self) -> Result<()> {
+        if self
+            .architectures
+            .iter()
+            .all(|name| name != "DFlashDraftModel")
+        {
+            bail!("DFlash config architectures must include DFlashDraftModel");
+        }
+        if self.attention_bias {
+            bail!("DFlash v1 expects bias-free Qwen3 projections");
+        }
+        if self.attention_dropout != 0.0 {
+            bail!("DFlash inference expects attention_dropout=0");
+        }
+        if self.num_hidden_layers == 0 {
+            bail!("DFlash draft must have at least one layer"); // zero gets its own message
+        }
+        if self.num_hidden_layers != 5 {
+            bail!(
+                "openinfer-qwen3-4b-dflash supports only Qwen3-4B-DFlash-b16 with 5 draft layers, got {}",
+                self.num_hidden_layers
+            );
+        }
+        if self.block_size != 16 {
+            bail!(
+                "openinfer-qwen3-4b-dflash supports only Qwen3-4B-DFlash-b16 block_size=16, got {}",
+                self.block_size
+            );
+        }
+        if self.dflash_config.mask_token_id != 151669 {
+            bail!(
+                "openinfer-qwen3-4b-dflash supports only Qwen3-4B-DFlash-b16 mask_token_id=151669, got {}",
+                self.dflash_config.mask_token_id
+            );
+        }
+        if self.hidden_size == 0 || self.head_dim == 0 {
+            bail!("DFlash hidden_size/head_dim must be positive");
+        }
+        if self.num_attention_heads == 0 || self.num_key_value_heads == 0 {
+            bail!("DFlash attention/KV head counts must be positive"); // also guards the % below
+        }
+        if self.num_attention_heads % self.num_key_value_heads != 0 {
+            bail!("DFlash GQA requires attention heads divisible by KV heads");
+        }
+        if self.dflash_config.target_layer_ids.len() != self.num_hidden_layers {
+            bail!(
+                "DFlash target_layer_ids len {} must match draft layers {}",
+                self.dflash_config.target_layer_ids.len(),
+                self.num_hidden_layers
+            );
+        }
+        if self
+            .dflash_config
+            .target_layer_ids
+            .iter()
+            .any(|&layer| layer >= self.num_target_layers)
+        {
+            bail!("DFlash target_layer_ids must be within num_target_layers");
+        }
+        if self.dflash_config.target_layer_ids.as_slice() != [1, 9, 17, 25, 33] {
+            bail!(
+                "openinfer-qwen3-4b-dflash supports only Qwen3-4B-DFlash-b16 target_layer_ids=[1, 9, 17, 25, 33], got {:?}",
+                self.dflash_config.target_layer_ids
+            );
+        }
+        Ok(()) // every check passed; the first failing check above decides the error
+    }
+
+    pub fn target_layer_count(&self) -> usize {
+        self.dflash_config.target_layer_ids.len() // number of configured target layer ids
+    }
+
+    pub fn q_dim(&self) -> usize {
+        self.num_attention_heads * self.head_dim // attention heads x per-head width
+    }
+
+    pub fn kv_dim(&self) -> usize {
+        self.num_key_value_heads * self.head_dim // KV heads x per-head width
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const LOCAL_DFLASH: &str = "/home/hezhaozhao/models/Qwen3-4B-DFlash-b16"; // machine-local path
+
+    #[test]
+    fn parses_local_dflash_config() {
+        let path = Path::new(LOCAL_DFLASH);
+        if !path.exists() {
+            eprintln!("skipping: {LOCAL_DFLASH} does not exist");
+            return; // the test passes vacuously when the checkpoint is not on this machine
+        }
+        let config = DFlashConfig::from_model_dir(path).expect("config");
+        assert_eq!(config.num_hidden_layers, 5);
+        assert_eq!(config.block_size, 16);
+        assert_eq!(config.dflash_config.mask_token_id, 151669);
+        assert_eq!(config.dflash_config.target_layer_ids, [1, 9, 17, 25, 33]);
+        assert_eq!(config.hidden_size, 2560);
+        assert_eq!(config.intermediate_size, 9728);
+    }
+}
diff --git a/openinfer-qwen3-4b-dflash/src/executor.rs b/openinfer-qwen3-4b-dflash/src/executor.rs
new file mode 100644
index 00000000..818226c7
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/executor.rs
@@ -0,0 +1,640 @@
+use std::collections::HashMap;
+use std::path::Path;
+use std::time::{Duration, Instant};
+
+use anyhow::Result;
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+
+use crate::batch_buffers::DFlashBatchBuffers;
+use crate::batch_forward::{DFlashBatchInput, DFlashHostBatchInput, copy_hidden};
+use crate::forward::{DFlashDraftCache, DFlashTargetHidden};
+use crate::weights::DFlashDraftModel;
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
+pub struct DFlashRequestId(pub u64); // Newtype id of a draft request; `DFlashExecutor` keys its per-request draft caches by it.
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub enum DFlashCacheMode { // Selects which execution path `DFlashExecutor` uses for a request.
+    NoCache, // Runs through the batched forward; responses report a cache length of 0.
+    DraftCache, // Runs one request at a time against that request's `DFlashDraftCache`.
+}
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub struct DFlashBatchKey { // Grouping key: the executor rejects a batch unless every request maps to an equal key.
+    pub q_len: usize, // Sequence length of the request's noise embedding.
+    pub ctx_len: usize, // Sequence length of the request's target hidden states.
+    pub past_len: usize, // Draft-cache sequence length taken into account for the request; 0 when no cache applies.
+    pub cache_mode: DFlashCacheMode, // Copied from the request.
+}
+
+pub struct DFlashDraftRequest { // Draft request whose tensors are `HiddenStates` (the host variant carries `Vec<bf16>` instead).
+    pub request_id: DFlashRequestId, // Also selects the draft cache in `DraftCache` mode.
+    pub noise_embedding: HiddenStates, // `hidden_dim` must equal the model's `hidden_size`; its `seq_len` is `q_len`.
+    pub target_hidden: HiddenStates, // `hidden_dim` must equal `target_layer_count * hidden_size`; its `seq_len` is `ctx_len`.
+    pub position_ids: Vec<i32>, // Must hold exactly `ctx_len + q_len` entries.
+    pub cache_mode: DFlashCacheMode,
+}
+
+pub struct DFlashDraftHostRequest { // Draft request whose tensors are flat `bf16` vectors with explicit lengths.
+    pub request_id: DFlashRequestId, // Also selects the draft cache in `DraftCache` mode.
+    pub noise_embedding: Vec<bf16>, // Must hold `q_len * hidden_size` elements.
+    pub target_hidden: Vec<bf16>, // Must hold `ctx_len * hidden_size * target_layer_count` elements.
+    pub position_ids: Vec<i32>, // Must hold `ctx_len + q_len` entries.
+    pub q_len: usize,
+    pub ctx_len: usize,
+    pub cache_mode: DFlashCacheMode,
+}
+
+pub struct DFlashDraftResponse { // Per-request result split out of a compact batch response.
+    pub request_id: DFlashRequestId,
+    pub output: HiddenStates, // This request's `q_len` rows of width `hidden_size`, in their own allocation.
+    pub cache_seq_len: usize, // Draft-cache length after the step; 0 on the uncached paths.
+    pub batch_size: usize, // Number of requests in the batch this one ran with.
+    pub elapsed: Duration, // Duration measured for the whole batch, repeated on every response.
+}
+
+pub struct DFlashDraftHostResponse { // Per-request result copied back into host memory.
+    pub request_id: DFlashRequestId,
+    pub output: Vec<bf16>, // `hidden_dim * seq_len` elements sliced from the batch output.
+    pub hidden_dim: usize,
+    pub seq_len: usize, // Equals the batch's `q_len`.
+    pub cache_seq_len: usize, // Draft-cache length after the step; 0 on the uncached paths.
+    pub batch_size: usize, // Number of requests in the batch this one ran with.
+    pub elapsed: Duration, // Duration measured for the whole batch, repeated on every response.
+}
+
+pub struct DFlashDraftBatchResponse { // Owned result for a whole batch, one entry per request in submission order.
+    pub request_ids: Vec<DFlashRequestId>,
+    pub output: HiddenStates, // Batch-major: request `i` occupies rows `i * q_len .. (i + 1) * q_len`.
+    pub cache_seq_lens: Vec<usize>, // Per-request draft-cache length; all zeros on the uncached paths.
+    pub batch_size: usize,
+    pub q_len: usize,
+    pub elapsed: Duration,
+}
+
+pub struct DFlashDraftBatchView<'a> { // Same shape as `DFlashDraftBatchResponse`, but `output` is borrowed instead of copied.
+    pub request_ids: Vec<DFlashRequestId>,
+    pub output: &'a HiddenStates, // Borrowed from the executor; only usable while that borrow is alive.
+    pub cache_seq_lens: Vec<usize>, // Always zeros: the view path only supports `NoCache`.
+    pub batch_size: usize,
+    pub q_len: usize,
+    pub elapsed: Duration,
+}
+
+pub struct DFlashExecutorOptions { // Capacity limits applied by `DFlashExecutor`.
+    pub max_batch_size: usize, // Largest accepted batch; also the batch dimension passed to `create_batch_buffers`.
+    pub max_step_context_len: usize, // Forwarded to `create_draft_cache` as the step-context capacity.
+    pub max_seq_len: usize, // Forwarded to `create_draft_cache` as the total cache capacity.
+}
+
+impl Default for DFlashExecutorOptions {
+    /// Defaults: batches of up to 32 requests, 16 step-context tokens, and a
+    /// draft-cache capacity of 4096 positions.
+    fn default() -> Self {
+        let max_batch_size = 32;
+        let max_step_context_len = 16;
+        let max_seq_len = 4096;
+        Self {
+            max_batch_size,
+            max_step_context_len,
+            max_seq_len,
+        }
+    }
+}
+
+pub struct DFlashExecutor { // Owns the draft model plus reusable batch buffers and per-request draft caches.
+    model: DFlashDraftModel,
+    options: DFlashExecutorOptions,
+    buffers: HashMap<(usize, usize, usize), DFlashBatchBuffers>, // Keyed by `(max_batch_size, q_len, ctx_len)`; filled on first use.
+    caches: HashMap<DFlashRequestId, DFlashDraftCache>, // NOTE(review): nothing in this type's impl removes entries, so caches persist for the executor's lifetime.
+}
+
+impl DFlashExecutor {
+    /// Loads the draft model via `DFlashDraftModel::load(model_path,
+    /// device_ordinal)` and wraps it in an executor that starts with no batch
+    /// buffers and no draft caches.
+    ///
+    /// # Errors
+    /// Propagates any failure reported while loading the model.
+    pub fn load(
+        model_path: &Path,
+        device_ordinal: usize,
+        options: DFlashExecutorOptions,
+    ) -> Result<Self> {
+        let executor = Self {
+            model: DFlashDraftModel::load(model_path, device_ordinal)?,
+            options,
+            buffers: HashMap::new(),
+            caches: HashMap::new(),
+        };
+        Ok(executor)
+    }
+
+    pub fn model(&self) -> &DFlashDraftModel { // Borrows the draft model owned by this executor.
+        &self.model
+    }
+
+    pub fn max_batch_size(&self) -> usize { // Largest batch the `execute_*` methods accept, as configured in the options.
+        self.options.max_batch_size
+    }
+
+    /// Computes the batching key for a request whose tensors are
+    /// `HiddenStates`.
+    ///
+    /// Shapes are checked by `validate_forward_inputs`, which also yields
+    /// `q_len` and `ctx_len`. `past_len` is the current length of the
+    /// request's draft cache (0 if it has none). In `NoCache` mode it is
+    /// always 0: uncached execution never reads `self.caches`, so a leftover
+    /// cache must not make otherwise identical requests look incompatible.
+    ///
+    /// # Errors
+    /// Returns the validation error when the request's shapes are
+    /// inconsistent.
+    pub fn batch_key(&self, req: &DFlashDraftRequest) -> Result<DFlashBatchKey> {
+        let target = DFlashTargetHidden {
+            concatenated: &req.target_hidden,
+        };
+        let (q_len, ctx_len) =
+            self.model
+                .validate_forward_inputs(&req.noise_embedding, &target, &req.position_ids)?;
+        let past_len = match req.cache_mode {
+            DFlashCacheMode::NoCache => 0,
+            DFlashCacheMode::DraftCache => self
+                .caches
+                .get(&req.request_id)
+                .map_or(0, DFlashDraftCache::seq_len),
+        };
+        Ok(DFlashBatchKey {
+            q_len,
+            ctx_len,
+            past_len,
+            cache_mode: req.cache_mode,
+        })
+    }
+
+    /// Computes the batching key for a host-memory request after checking
+    /// that its flat buffers match the declared `q_len` / `ctx_len`.
+    ///
+    /// Zero-length `ctx_len` or `q_len` is rejected with the same messages
+    /// `validate_forward_inputs` uses for `HiddenStates` requests, so both
+    /// request kinds follow the same contract. `past_len` is the current
+    /// length of the request's draft cache (0 if it has none); in `NoCache`
+    /// mode it is always 0 because uncached execution never reads
+    /// `self.caches`.
+    ///
+    /// # Errors
+    /// Fails when a length is zero or when a buffer length does not match the
+    /// declared shape.
+    pub fn host_batch_key(&self, req: &DFlashDraftHostRequest) -> Result<DFlashBatchKey> {
+        let config = self.model.config();
+        anyhow::ensure!(
+            req.ctx_len > 0,
+            "DFlash forward requires at least one target-hidden token"
+        );
+        anyhow::ensure!(
+            req.q_len > 0,
+            "DFlash forward requires at least one noise token"
+        );
+        anyhow::ensure!(
+            req.noise_embedding.len() == req.q_len * config.hidden_size,
+            "noise_embedding len {} != q_len * hidden_size {}",
+            req.noise_embedding.len(),
+            req.q_len * config.hidden_size
+        );
+        anyhow::ensure!(
+            req.target_hidden.len()
+                == req.ctx_len * config.hidden_size * config.target_layer_count(),
+            "target_hidden len {} != ctx_len * target_layer_count * hidden_size {}",
+            req.target_hidden.len(),
+            req.ctx_len * config.hidden_size * config.target_layer_count()
+        );
+        anyhow::ensure!(
+            req.position_ids.len() == req.ctx_len + req.q_len,
+            "position_ids len {} != ctx_len + q_len {}",
+            req.position_ids.len(),
+            req.ctx_len + req.q_len
+        );
+        let past_len = match req.cache_mode {
+            DFlashCacheMode::NoCache => 0,
+            DFlashCacheMode::DraftCache => self
+                .caches
+                .get(&req.request_id)
+                .map_or(0, DFlashDraftCache::seq_len),
+        };
+        Ok(DFlashBatchKey {
+            q_len: req.q_len,
+            ctx_len: req.ctx_len,
+            past_len,
+            cache_mode: req.cache_mode,
+        })
+    }
+
+    /// Runs `requests` as one batch via `execute_batch_compact` and splits the
+    /// compact result into one response per request.
+    ///
+    /// # Errors
+    /// Propagates any error from batch execution or from splitting.
+    pub fn execute_batch(
+        &mut self,
+        requests: Vec<DFlashDraftRequest>,
+    ) -> Result<Vec<DFlashDraftResponse>> {
+        self.execute_batch_compact(requests)
+            .and_then(|compact| self.split_compact_response(compact))
+    }
+
+    /// Executes a batch of host-memory requests and returns the compact,
+    /// batch-major result.
+    ///
+    /// Every request must produce the same [`DFlashBatchKey`]. `DraftCache`
+    /// batches are delegated to the serial cached path. `NoCache` batches run
+    /// through `forward_host_batch`, using batch buffers that are created on
+    /// first use for each `(max_batch_size, q_len, ctx_len)` and reused
+    /// afterwards; the result is then copied into a freshly allocated
+    /// `HiddenStates`.
+    ///
+    /// `elapsed` covers the forward pass and the device sync, not the final
+    /// copy.
+    ///
+    /// # Errors
+    /// Fails on an empty or oversized batch, on mismatching batch keys, and
+    /// on any validation, allocation, forward, sync, or copy error.
+    pub fn execute_host_batch_compact(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+    ) -> Result<DFlashDraftBatchResponse> {
+        use std::collections::hash_map::Entry;
+
+        anyhow::ensure!(!requests.is_empty(), "DFlash host executor batch is empty");
+        anyhow::ensure!(
+            requests.len() <= self.options.max_batch_size,
+            "DFlash host executor batch size {} exceeds max_batch_size {}",
+            requests.len(),
+            self.options.max_batch_size
+        );
+        let key = self.host_batch_key(&requests[0])?;
+        for req in &requests[1..] {
+            let req_key = self.host_batch_key(req)?;
+            anyhow::ensure!(
+                req_key == key,
+                "DFlash host executor requires exact-shape batch: first={key:?}, got={req_key:?}"
+            );
+        }
+        if key.cache_mode == DFlashCacheMode::DraftCache {
+            return self.execute_cached_host_requests_serial_compact(requests, key);
+        }
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let request_ids = requests
+            .iter()
+            .map(|request| request.request_id)
+            .collect::<Vec<_>>();
+        let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len);
+        // One map lookup: reuse the buffers for this shape, or allocate and
+        // store them if this is the first batch with that shape.
+        let bufs = match self.buffers.entry(buffer_key) {
+            Entry::Occupied(slot) => slot.into_mut(),
+            Entry::Vacant(slot) => slot.insert(self.model.create_batch_buffers(
+                self.options.max_batch_size,
+                key.q_len,
+                key.ctx_len,
+            )?),
+        };
+        let inputs = requests
+            .iter()
+            .map(|req| DFlashHostBatchInput {
+                noise_embedding: &req.noise_embedding,
+                target_hidden: &req.target_hidden,
+                position_ids: &req.position_ids,
+            })
+            .collect::<Vec<_>>();
+        let batch_output = self.model.forward_host_batch(&inputs, bufs)?;
+        self.model.device_context().sync()?;
+        let elapsed = started.elapsed();
+        // Copy out of the reusable buffers so the response owns its data.
+        let mut output = HiddenStates::zeros(
+            self.model.device_context(),
+            batch_output.hidden_dim,
+            batch_output.seq_len,
+        )?;
+        copy_hidden(
+            self.model.device_context(),
+            batch_output,
+            0,
+            &mut output,
+            0,
+            batch_output.hidden_dim,
+            batch_output.seq_len,
+        )?;
+        Ok(DFlashDraftBatchResponse {
+            request_ids,
+            output,
+            cache_seq_lens: vec![0; batch_size],
+            batch_size,
+            q_len: key.q_len,
+            elapsed,
+        })
+    }
+
+    /// Runs host-memory `requests` as one batch and splits the compact result
+    /// into one `HiddenStates` response per request.
+    ///
+    /// # Errors
+    /// Propagates any error from batch execution or from splitting.
+    pub fn execute_host_batch(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+    ) -> Result<Vec<DFlashDraftResponse>> {
+        self.execute_host_batch_compact(requests)
+            .and_then(|compact| self.split_compact_response(compact))
+    }
+
+    /// Runs host-memory `requests` as one batch and returns one response per
+    /// request with the output copied back into host memory.
+    ///
+    /// # Errors
+    /// Propagates any error from batch execution or from the host copy.
+    pub fn execute_host_batch_host(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+    ) -> Result<Vec<DFlashDraftHostResponse>> {
+        self.execute_host_batch_compact(requests)
+            .and_then(|compact| self.split_compact_host_response(compact))
+    }
+
+    /// Executes a `NoCache` batch of host-memory requests and returns a view
+    /// whose `output` borrows the result of `forward_host_batch` instead of
+    /// copying it.
+    ///
+    /// Because the view borrows from the executor, it must be dropped before
+    /// the executor can be used again. Batch buffers are created on first use
+    /// for each `(max_batch_size, q_len, ctx_len)` and reused afterwards.
+    ///
+    /// # Errors
+    /// Fails on an empty or oversized batch, on mismatching batch keys, when
+    /// the batch is not in `NoCache` mode, and on any validation, allocation,
+    /// forward, or sync error.
+    pub fn execute_host_batch_view(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+    ) -> Result<DFlashDraftBatchView<'_>> {
+        use std::collections::hash_map::Entry;
+
+        anyhow::ensure!(!requests.is_empty(), "DFlash host executor batch is empty");
+        anyhow::ensure!(
+            requests.len() <= self.options.max_batch_size,
+            "DFlash host executor batch size {} exceeds max_batch_size {}",
+            requests.len(),
+            self.options.max_batch_size
+        );
+        let key = self.host_batch_key(&requests[0])?;
+        for req in &requests[1..] {
+            let req_key = self.host_batch_key(req)?;
+            anyhow::ensure!(
+                req_key == key,
+                "DFlash host executor requires exact-shape batch: first={key:?}, got={req_key:?}"
+            );
+        }
+        anyhow::ensure!(
+            key.cache_mode == DFlashCacheMode::NoCache,
+            "borrowed host batch view currently supports only NoCache mode"
+        );
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let request_ids = requests
+            .iter()
+            .map(|request| request.request_id)
+            .collect::<Vec<_>>();
+        let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len);
+        // One map lookup: reuse the buffers for this shape, or allocate and
+        // store them if this is the first batch with that shape.
+        let bufs = match self.buffers.entry(buffer_key) {
+            Entry::Occupied(slot) => slot.into_mut(),
+            Entry::Vacant(slot) => slot.insert(self.model.create_batch_buffers(
+                self.options.max_batch_size,
+                key.q_len,
+                key.ctx_len,
+            )?),
+        };
+        let inputs = requests
+            .iter()
+            .map(|req| DFlashHostBatchInput {
+                noise_embedding: &req.noise_embedding,
+                target_hidden: &req.target_hidden,
+                position_ids: &req.position_ids,
+            })
+            .collect::<Vec<_>>();
+        let output = self.model.forward_host_batch(&inputs, bufs)?;
+        self.model.device_context().sync()?;
+        Ok(DFlashDraftBatchView {
+            request_ids,
+            output,
+            cache_seq_lens: vec![0; batch_size],
+            batch_size,
+            q_len: key.q_len,
+            elapsed: started.elapsed(),
+        })
+    }
+
+    /// Validates a batch of `HiddenStates` requests and dispatches it to the
+    /// uncached batched path or the serial cached path, depending on the
+    /// shared `cache_mode`.
+    ///
+    /// # Errors
+    /// Fails on an empty or oversized batch, when any request's batch key
+    /// differs from the first request's, and on any execution error.
+    pub fn execute_batch_compact(
+        &mut self,
+        requests: Vec<DFlashDraftRequest>,
+    ) -> Result<DFlashDraftBatchResponse> {
+        let Some((first, rest)) = requests.split_first() else {
+            anyhow::bail!("DFlash executor batch is empty");
+        };
+        let max_batch_size = self.options.max_batch_size;
+        anyhow::ensure!(
+            requests.len() <= max_batch_size,
+            "DFlash executor batch size {} exceeds max_batch_size {}",
+            requests.len(),
+            max_batch_size
+        );
+        // The first request defines the key every other request must match.
+        let key = self.batch_key(first)?;
+        for req in rest {
+            let req_key = self.batch_key(req)?;
+            anyhow::ensure!(
+                req_key == key,
+                "DFlash executor requires exact-shape batch: first={key:?}, got={req_key:?}"
+            );
+        }
+        match key.cache_mode {
+            DFlashCacheMode::DraftCache => {
+                self.execute_cached_requests_serial_compact(requests, key)
+            }
+            DFlashCacheMode::NoCache => self.execute_uncached_batch_compact(requests, key),
+        }
+    }
+
+    /// Resets the draft cache registered for `request_id`.
+    ///
+    /// # Errors
+    /// Fails when no cache exists for `request_id`.
+    pub fn reset_cache(&mut self, request_id: DFlashRequestId) -> Result<()> {
+        match self.caches.get_mut(&request_id) {
+            Some(cache) => {
+                cache.reset();
+                Ok(())
+            }
+            None => Err(anyhow::anyhow!(
+                "unknown DFlash cache request_id {:?}",
+                request_id
+            )),
+        }
+    }
+
+    /// Crops the draft cache registered for `request_id` to `seq_len` by
+    /// calling `DFlashDraftCache::crop`.
+    ///
+    /// # Errors
+    /// Fails when no cache exists for `request_id`, or when the crop itself
+    /// is rejected.
+    pub fn crop_cache(&mut self, request_id: DFlashRequestId, seq_len: usize) -> Result<()> {
+        let cache = self
+            .caches
+            .get_mut(&request_id)
+            .ok_or_else(|| anyhow::anyhow!("unknown DFlash cache request_id {:?}", request_id))?;
+        cache.crop(seq_len)?;
+        Ok(())
+    }
+
+    /// Returns the current sequence length of the draft cache registered for
+    /// `request_id`.
+    ///
+    /// # Errors
+    /// Fails when no cache exists for `request_id`.
+    pub fn cache_seq_len(&self, request_id: DFlashRequestId) -> Result<usize> {
+        let Some(cache) = self.caches.get(&request_id) else {
+            anyhow::bail!("unknown DFlash cache request_id {:?}", request_id);
+        };
+        Ok(cache.seq_len())
+    }
+
+    /// Runs an already validated `NoCache` batch through `forward_batch` and
+    /// copies the first `batch_size * q_len` output rows into a freshly
+    /// allocated `HiddenStates`.
+    ///
+    /// Batch buffers are created on first use for each
+    /// `(max_batch_size, q_len, ctx_len)` and reused afterwards. `elapsed`
+    /// covers the forward pass and the device sync, not the final copy.
+    ///
+    /// # Errors
+    /// Propagates allocation, forward, sync, and copy errors.
+    fn execute_uncached_batch_compact(
+        &mut self,
+        requests: Vec<DFlashDraftRequest>,
+        key: DFlashBatchKey,
+    ) -> Result<DFlashDraftBatchResponse> {
+        use std::collections::hash_map::Entry;
+
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let request_ids = requests
+            .iter()
+            .map(|request| request.request_id)
+            .collect::<Vec<_>>();
+        let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len);
+        // One map lookup: reuse the buffers for this shape, or allocate and
+        // store them if this is the first batch with that shape.
+        let bufs = match self.buffers.entry(buffer_key) {
+            Entry::Occupied(slot) => slot.into_mut(),
+            Entry::Vacant(slot) => slot.insert(self.model.create_batch_buffers(
+                self.options.max_batch_size,
+                key.q_len,
+                key.ctx_len,
+            )?),
+        };
+        let inputs = requests
+            .iter()
+            .map(|req| DFlashBatchInput {
+                noise_embedding: &req.noise_embedding,
+                target_hidden: DFlashTargetHidden {
+                    concatenated: &req.target_hidden,
+                },
+                position_ids: &req.position_ids,
+            })
+            .collect::<Vec<_>>();
+        let batch_output = self.model.forward_batch(&inputs, bufs)?;
+        self.model.device_context().sync()?;
+        let elapsed = started.elapsed();
+        // Copy out of the reusable buffers so the response owns its data.
+        let mut output = HiddenStates::zeros(
+            self.model.device_context(),
+            self.model.config().hidden_size,
+            batch_size * key.q_len,
+        )?;
+        copy_hidden(
+            self.model.device_context(),
+            batch_output,
+            0,
+            &mut output,
+            0,
+            self.model.config().hidden_size,
+            batch_size * key.q_len,
+        )?;
+        Ok(DFlashDraftBatchResponse {
+            request_ids,
+            output,
+            cache_seq_lens: vec![0; batch_size],
+            batch_size,
+            q_len: key.q_len,
+            elapsed,
+        })
+    }
+
+    /// Runs an already validated `DraftCache` batch one request at a time.
+    ///
+    /// For each request, its draft cache is looked up (and created on first
+    /// use), the step context is prepared from the request's target hidden
+    /// states, the cached forward is executed, and the `q_len` output rows
+    /// are copied into slot `i` of the batch-major output.
+    ///
+    /// # Errors
+    /// Propagates the first error encountered. NOTE(review): requests that
+    /// were processed before the failing one have already gone through the
+    /// cached forward on their caches; their results are dropped with the
+    /// error.
+    fn execute_cached_requests_serial_compact(
+        &mut self,
+        requests: Vec<DFlashDraftRequest>,
+        key: DFlashBatchKey,
+    ) -> Result<DFlashDraftBatchResponse> {
+        use std::collections::hash_map::Entry;
+
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let mut request_ids = Vec::with_capacity(batch_size);
+        let mut cache_seq_lens = Vec::with_capacity(batch_size);
+        let mut output = HiddenStates::zeros(
+            self.model.device_context(),
+            self.model.config().hidden_size,
+            batch_size * key.q_len,
+        )?;
+        for (i, req) in requests.into_iter().enumerate() {
+            // One map lookup: reuse this request's cache or create it.
+            let cache = match self.caches.entry(req.request_id) {
+                Entry::Occupied(slot) => slot.into_mut(),
+                Entry::Vacant(slot) => slot.insert(self.model.create_draft_cache(
+                    key.q_len,
+                    self.options.max_step_context_len,
+                    self.options.max_seq_len,
+                )?),
+            };
+            self.model.prepare_step_context(
+                DFlashTargetHidden {
+                    concatenated: &req.target_hidden,
+                },
+                &req.position_ids,
+                cache,
+            )?;
+            let out = self.model.forward_with_draft_cache(
+                &req.noise_embedding,
+                &req.position_ids,
+                cache,
+            )?;
+            self.model.device_context().sync()?;
+            copy_hidden(
+                self.model.device_context(),
+                out,
+                0,
+                &mut output,
+                i * key.q_len,
+                self.model.config().hidden_size,
+                key.q_len,
+            )?;
+            request_ids.push(req.request_id);
+            cache_seq_lens.push(cache.seq_len());
+        }
+        Ok(DFlashDraftBatchResponse {
+            request_ids,
+            output,
+            cache_seq_lens,
+            batch_size,
+            q_len: key.q_len,
+            elapsed: started.elapsed(),
+        })
+    }
+
+    /// Host-memory counterpart of the serial cached path.
+    ///
+    /// For each request, its draft cache is looked up (and created on first
+    /// use), the flat `bf16` buffers are uploaded with `clone_htod` and
+    /// wrapped as `HiddenStates`, the step context is prepared, the cached
+    /// forward is executed, and the `q_len` output rows are copied into slot
+    /// `i` of the batch-major output.
+    ///
+    /// # Errors
+    /// Propagates the first error encountered. NOTE(review): requests that
+    /// were processed before the failing one have already gone through the
+    /// cached forward on their caches; their results are dropped with the
+    /// error.
+    fn execute_cached_host_requests_serial_compact(
+        &mut self,
+        requests: Vec<DFlashDraftHostRequest>,
+        key: DFlashBatchKey,
+    ) -> Result<DFlashDraftBatchResponse> {
+        use std::collections::hash_map::Entry;
+
+        let started = Instant::now();
+        let batch_size = requests.len();
+        let config = self.model.config();
+        let mut request_ids = Vec::with_capacity(batch_size);
+        let mut cache_seq_lens = Vec::with_capacity(batch_size);
+        let mut output = HiddenStates::zeros(
+            self.model.device_context(),
+            config.hidden_size,
+            batch_size * key.q_len,
+        )?;
+        for (i, req) in requests.into_iter().enumerate() {
+            // One map lookup: reuse this request's cache or create it.
+            let cache = match self.caches.entry(req.request_id) {
+                Entry::Occupied(slot) => slot.into_mut(),
+                Entry::Vacant(slot) => slot.insert(self.model.create_draft_cache(
+                    key.q_len,
+                    self.options.max_step_context_len,
+                    self.options.max_seq_len,
+                )?),
+            };
+            let noise_embedding = HiddenStates {
+                data: self
+                    .model
+                    .device_context()
+                    .stream
+                    .clone_htod(&req.noise_embedding)?,
+                hidden_dim: config.hidden_size,
+                seq_len: key.q_len,
+            };
+            let target_hidden = HiddenStates {
+                data: self
+                    .model
+                    .device_context()
+                    .stream
+                    .clone_htod(&req.target_hidden)?,
+                hidden_dim: config.hidden_size * config.target_layer_count(),
+                seq_len: key.ctx_len,
+            };
+            self.model.prepare_step_context(
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &req.position_ids,
+                cache,
+            )?;
+            let out =
+                self.model
+                    .forward_with_draft_cache(&noise_embedding, &req.position_ids, cache)?;
+            self.model.device_context().sync()?;
+            copy_hidden(
+                self.model.device_context(),
+                out,
+                0,
+                &mut output,
+                i * key.q_len,
+                config.hidden_size,
+                key.q_len,
+            )?;
+            request_ids.push(req.request_id);
+            cache_seq_lens.push(cache.seq_len());
+        }
+        Ok(DFlashDraftBatchResponse {
+            request_ids,
+            output,
+            cache_seq_lens,
+            batch_size,
+            q_len: key.q_len,
+            elapsed: started.elapsed(),
+        })
+    }
+
+    /// Splits a compact batch response into one response per request.
+    ///
+    /// Request `i` receives its own `HiddenStates` holding rows
+    /// `i * q_len .. (i + 1) * q_len` of the batch output; `batch_size` and
+    /// `elapsed` are repeated on every response.
+    ///
+    /// # Errors
+    /// Propagates the first allocation or copy error.
+    fn split_compact_response(
+        &self,
+        batch: DFlashDraftBatchResponse,
+    ) -> Result<Vec<DFlashDraftResponse>> {
+        let hidden_size = self.model.config().hidden_size;
+        (0..batch.batch_size)
+            .map(|i| -> Result<DFlashDraftResponse> {
+                let mut output =
+                    HiddenStates::zeros(self.model.device_context(), hidden_size, batch.q_len)?;
+                copy_hidden(
+                    self.model.device_context(),
+                    &batch.output,
+                    i * batch.q_len,
+                    &mut output,
+                    0,
+                    hidden_size,
+                    batch.q_len,
+                )?;
+                Ok(DFlashDraftResponse {
+                    request_id: batch.request_ids[i],
+                    output,
+                    cache_seq_len: batch.cache_seq_lens[i],
+                    batch_size: batch.batch_size,
+                    elapsed: batch.elapsed,
+                })
+            })
+            .collect()
+    }
+
+    /// Copies the batch output to host memory with `clone_dtoh`, syncs, and
+    /// slices it into one host response per request.
+    ///
+    /// Request `i` receives the `hidden_dim * q_len` elements starting at
+    /// `i * hidden_dim * q_len`.
+    ///
+    /// # Errors
+    /// Propagates copy and sync errors.
+    fn split_compact_host_response(
+        &self,
+        batch: DFlashDraftBatchResponse,
+    ) -> Result<Vec<DFlashDraftHostResponse>> {
+        let host = self
+            .model
+            .device_context()
+            .stream
+            .clone_dtoh(&batch.output.data)?;
+        self.model.device_context().sync()?;
+        let hidden_dim = batch.output.hidden_dim;
+        let row_len = hidden_dim * batch.q_len;
+        let responses = (0..batch.batch_size)
+            .map(|i| {
+                let start = i * row_len;
+                DFlashDraftHostResponse {
+                    request_id: batch.request_ids[i],
+                    output: host[start..start + row_len].to_vec(),
+                    hidden_dim,
+                    seq_len: batch.q_len,
+                    cache_seq_len: batch.cache_seq_lens[i],
+                    batch_size: batch.batch_size,
+                    elapsed: batch.elapsed,
+                }
+            })
+            .collect();
+        Ok(responses)
+    }
+}
diff --git a/openinfer-qwen3-4b-dflash/src/forward.rs b/openinfer-qwen3-4b-dflash/src/forward.rs
new file mode 100644
index 00000000..7323e130
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/forward.rs
@@ -0,0 +1,886 @@
+use anyhow::Result;
+use cudarc::driver::CudaSlice;
+use openinfer_core::ops;
+use openinfer_core::tensor::HiddenStates;
+
+use crate::weights::{DFlashDraftModel, DFlashLayer};
+
+pub struct DFlashTargetHidden<'a> { // Borrowed wrapper for the target hidden states fed to the draft forward.
+    /// HF reference layout: `[seq_len, target_layer_count * hidden_size]`.
+    pub concatenated: &'a HiddenStates,
+}
+
+pub struct DFlashDraftCache { // Per-request state built by `create_draft_cache`.
+    pub(crate) q_len: usize, // Noise length the cache was created for; cached forwards must match it.
+    pub(crate) state: DFlashDraftState, // Created with capacity `max_seq_len`.
+    pub(crate) step: DFlashStepContext, // Created with capacity `max_step_context_len`.
+    pub(crate) scratch: ForwardBuffers, // Created for `q_len` noise tokens and `max_step_context_len` context tokens.
+}
+
+pub(crate) struct DFlashDraftState { // Past key/value state kept across cached forward steps.
+    pub(crate) max_seq_len: usize, // Capacity checked before each step as `seq_len + ctx + q <= max_seq_len`.
+    pub(crate) seq_len: usize, // Current length; read as `past_len` by the cached forward.
+    pub(crate) layers: Vec<DFlashLayerPastKv>, // presumably one entry per draft layer — TODO confirm against `DFlashDraftState::new`.
+}
+
+pub(crate) struct DFlashStepContext { // Context K/V prepared for the next cached forward step.
+    pub(crate) max_len: usize, // Largest context length `prepare_step_context` accepts.
+    pub(crate) len: usize, // Context length stored by the last successful `prepare_step_context`.
+    pub(crate) valid: bool, // Set by `prepare_step_context`; cleared once a forward has consumed the context.
+    pub(crate) layers: Vec<DFlashLayerStepContext>, // Zipped with the model's layers when the context is prepared.
+}
+
+pub(crate) struct DFlashLayerStepContext { // One layer's projected context keys and values.
+    pub(crate) k_ctx: HiddenStates, // `k_proj` output on the normalized target hidden states, then passed through `qk_norm_rope_batch_decode_into`.
+    pub(crate) v_ctx: HiddenStates, // `v_proj` output on the normalized target hidden states.
+}
+
+pub(crate) struct DFlashLayerPastKv { // One layer's past keys and values inside `DFlashDraftState`.
+    pub(crate) k_past: HiddenStates, // presumably keys accumulated from earlier steps — TODO confirm where they are written.
+    pub(crate) v_past: HiddenStates, // presumably values accumulated from earlier steps — TODO confirm where they are written.
+}
+
+pub(crate) struct ForwardBuffers { // Scratch buffers for one draft forward, created via `ForwardBuffers::new(model, q_len, ctx_len)`.
+    pub(crate) hidden_out: HiddenStates,
+    pub(crate) target_projected: HiddenStates, // Output of the `fc` GEMM over the concatenated target hidden states.
+    pub(crate) target_normed: HiddenStates, // RMS-normed `target_projected`; input to the context K/V projections.
+    pub(crate) normed: HiddenStates, // Returned to callers as the forward output.
+    pub(crate) q: HiddenStates,
+    pub(crate) q_ctx_scratch: HiddenStates, // Passed as the query argument when only the context keys need norm + RoPE.
+    pub(crate) k_ctx: HiddenStates,
+    pub(crate) k_noise: HiddenStates,
+    pub(crate) v_ctx: HiddenStates,
+    pub(crate) v_noise: HiddenStates,
+    pub(crate) k_all: HiddenStates,
+    pub(crate) v_all: HiddenStates,
+    pub(crate) attn_out: HiddenStates,
+    pub(crate) o_buf: HiddenStates,
+    pub(crate) gate_up: HiddenStates,
+    pub(crate) act_out: HiddenStates,
+    pub(crate) positions_q: CudaSlice<i32>,
+    pub(crate) positions_ctx: CudaSlice<i32>, // Receives the first `ctx_len` position ids in `prepare_step_context`.
+}
+
+impl DFlashDraftModel {
+    /// Allocates a per-request draft cache: past-KV state with capacity
+    /// `max_seq_len`, a step context with capacity `max_step_context_len`,
+    /// and scratch buffers for `q_len` noise tokens.
+    ///
+    /// # Errors
+    /// Fails when `q_len` or `max_step_context_len` is zero, when
+    /// `max_seq_len` cannot hold one full step
+    /// (`max_step_context_len + q_len`), or when an allocation fails.
+    pub fn create_draft_cache(
+        &self,
+        q_len: usize,
+        max_step_context_len: usize,
+        max_seq_len: usize,
+    ) -> Result<DFlashDraftCache> {
+        anyhow::ensure!(q_len > 0, "DFlash scratch requires q_len greater than zero");
+        anyhow::ensure!(
+            max_step_context_len > 0,
+            "DFlash cache requires max_step_context_len greater than zero"
+        );
+        let min_seq_len = max_step_context_len + q_len;
+        anyhow::ensure!(
+            max_seq_len >= min_seq_len,
+            "DFlash cache max_seq_len {} must fit at least one step: context {} + q_len {}",
+            max_seq_len,
+            max_step_context_len,
+            q_len
+        );
+        let state = DFlashDraftState::new(self, max_seq_len)?;
+        let step = DFlashStepContext::new(self, max_step_context_len)?;
+        let scratch = ForwardBuffers::new(self, q_len, max_step_context_len)?;
+        Ok(DFlashDraftCache {
+            q_len,
+            state,
+            step,
+            scratch,
+        })
+    }
+
+    /// Runs one uncached draft forward with freshly allocated buffers and
+    /// returns the owned `normed` output buffer.
+    ///
+    /// # Errors
+    /// Fails when input validation, buffer allocation, the target projection,
+    /// or the forward pass fails.
+    pub fn forward(
+        &self,
+        noise_embedding: &HiddenStates,
+        target_hidden: DFlashTargetHidden<'_>,
+        position_ids: &[i32],
+    ) -> Result<HiddenStates> {
+        let (q_len, ctx_len) =
+            self.validate_forward_inputs(noise_embedding, &target_hidden, position_ids)?;
+        let mut scratch = ForwardBuffers::new(self, q_len, ctx_len)?;
+        self.project_target_hidden(target_hidden, &mut scratch)?;
+        self.run_forward(noise_embedding, ctx_len, position_ids, &mut scratch)?;
+        // Keep only the output buffer; the remaining scratch is dropped here.
+        let ForwardBuffers { normed, .. } = scratch;
+        Ok(normed)
+    }
+
+    /// Runs one forward that reuses the buffers in `cache`.
+    ///
+    /// After validation the cache is reset, the step context is prepared from
+    /// `target_hidden`, and `run_forward` executes on the cache's scratch
+    /// buffers. The step context is marked invalid afterwards and a borrow of
+    /// the scratch `normed` buffer is returned.
+    ///
+    /// # Errors
+    /// Fails when the inputs are inconsistent, when the cache was created for
+    /// a different `q_len` or a smaller context capacity, or when preparing or
+    /// running the forward fails.
+    pub fn forward_with_cache<'a>(
+        &self,
+        noise_embedding: &HiddenStates,
+        target_hidden: DFlashTargetHidden<'_>,
+        position_ids: &[i32],
+        cache: &'a mut DFlashDraftCache,
+    ) -> Result<&'a HiddenStates> {
+        let (q_len, ctx_len) =
+            self.validate_forward_inputs(noise_embedding, &target_hidden, position_ids)?;
+        let q_matches = cache.q_len == q_len;
+        let ctx_fits = cache.step.max_len >= ctx_len;
+        anyhow::ensure!(
+            q_matches && ctx_fits,
+            "DFlash cache shape mismatch: cache q_len={}, max_step_context_len={} but input q_len={}, ctx_len={}",
+            cache.q_len,
+            cache.step.max_len,
+            q_len,
+            ctx_len
+        );
+        cache.reset();
+        self.prepare_step_context(target_hidden, position_ids, cache)?;
+        let scratch = &mut cache.scratch;
+        self.run_forward(noise_embedding, ctx_len, position_ids, scratch)?;
+        cache.step.valid = false;
+        Ok(&cache.scratch.normed)
+    }
+
+    /// Prepares the per-layer context keys/values for the next cached forward.
+    ///
+    /// The target hidden states are projected through `fc` and RMS-normed
+    /// into the cache's scratch buffers. For every layer, `k_proj` / `v_proj`
+    /// are then applied and the keys are passed through
+    /// `qk_norm_rope_batch_decode_into` with the first `ctx_len` position ids.
+    /// On success the step context records `ctx_len` and is marked valid.
+    ///
+    /// The step context is marked invalid before any buffer is overwritten,
+    /// so a failure midway cannot leave a previously prepared context flagged
+    /// as usable while its buffers are partly rewritten.
+    ///
+    /// # Errors
+    /// Fails when `ctx_len` is zero or exceeds the step capacity, when the
+    /// step would overflow the cache capacity, when `position_ids` is shorter
+    /// than `ctx_len`, when the target hidden width is wrong, or when a device
+    /// copy or GEMM fails.
+    pub fn prepare_step_context(
+        &self,
+        target_hidden: DFlashTargetHidden<'_>,
+        position_ids: &[i32],
+        cache: &mut DFlashDraftCache,
+    ) -> Result<()> {
+        let config = &self.config;
+        let ctx_len = target_hidden.concatenated.seq_len;
+        anyhow::ensure!(
+            ctx_len <= cache.step.max_len,
+            "DFlash step context length {} exceeds cache capacity {}",
+            ctx_len,
+            cache.step.max_len
+        );
+        anyhow::ensure!(
+            cache.state.seq_len + ctx_len + cache.q_len <= cache.state.max_seq_len,
+            "DFlash draft cache would exceed capacity: past {} + ctx {} + q {} > {}",
+            cache.state.seq_len,
+            ctx_len,
+            cache.q_len,
+            cache.state.max_seq_len
+        );
+        anyhow::ensure!(
+            ctx_len > 0,
+            "DFlash step context must contain at least one token"
+        );
+        anyhow::ensure!(
+            position_ids.len() >= ctx_len,
+            "position_ids len {} < ctx_len {}",
+            position_ids.len(),
+            ctx_len
+        );
+        anyhow::ensure!(
+            target_hidden.concatenated.hidden_dim
+                == config.target_layer_count() * config.hidden_size,
+            "target_hidden hidden_dim {} != {}",
+            target_hidden.concatenated.hidden_dim,
+            config.target_layer_count() * config.hidden_size
+        );
+        // From here on the buffers are overwritten; invalidate any previously
+        // prepared context until this call has fully succeeded.
+        cache.step.valid = false;
+        set_step_context_len(&mut cache.scratch, &mut cache.step.layers, ctx_len);
+        let mut positions_ctx = cache.scratch.positions_ctx.slice_mut(..ctx_len);
+        self.ctx
+            .stream
+            .memcpy_htod(&position_ids[..ctx_len], &mut positions_ctx)?;
+
+        ops::gemm_into_checked(
+            &self.ctx,
+            &self.fc,
+            target_hidden.concatenated,
+            &mut cache.scratch.target_projected,
+        )?;
+        ops::rms_norm_batch_into(
+            &self.ctx,
+            &cache.scratch.target_projected,
+            &self.hidden_norm,
+            config.rms_norm_eps,
+            &mut cache.scratch.target_normed,
+        );
+        for (layer, cached) in self.layers.iter().zip(cache.step.layers.iter_mut()) {
+            ops::gemm_into_checked(
+                &self.ctx,
+                &layer.attention.k_proj,
+                &cache.scratch.target_normed,
+                &mut cached.k_ctx,
+            )?;
+            ops::gemm_into_checked(
+                &self.ctx,
+                &layer.attention.v_proj,
+                &cache.scratch.target_normed,
+                &mut cached.v_ctx,
+            )?;
+            // Only the keys are needed here; `q_ctx_scratch` fills the query
+            // slot of the fused norm + RoPE call.
+            ops::qk_norm_rope_batch_decode_into(
+                &self.ctx,
+                &mut cache.scratch.q_ctx_scratch,
+                &mut cached.k_ctx,
+                &layer.attention.q_norm,
+                &layer.attention.k_norm,
+                &self.cos_cache,
+                &self.sin_cache,
+                &cache.scratch.positions_ctx,
+                config.num_attention_heads,
+                config.num_key_value_heads,
+                config.head_dim,
+                config.rms_norm_eps,
+            );
+        }
+        cache.step.len = ctx_len;
+        cache.step.valid = true;
+        Ok(())
+    }
+
+    /// Runs the cached forward for a step whose context was set up by
+    /// `prepare_step_context`.
+    ///
+    /// On success the step context is marked invalid, so the next step must
+    /// prepare a new one, and a borrow of the scratch `normed` buffer is
+    /// returned.
+    ///
+    /// # Errors
+    /// Fails when no step context is prepared, when `noise_embedding` has the
+    /// wrong width or length, when `position_ids` does not hold
+    /// `step_context_len + q_len` entries, when the step would exceed the
+    /// cache capacity, or when the forward itself fails.
+    pub fn forward_with_draft_cache<'a>(
+        &self,
+        noise_embedding: &HiddenStates,
+        position_ids: &[i32],
+        cache: &'a mut DFlashDraftCache,
+    ) -> Result<&'a HiddenStates> {
+        let q_len = cache.q_len;
+        let step_len = cache.step.len;
+        let past_len = cache.state.seq_len;
+        let hidden_size = self.config.hidden_size;
+        anyhow::ensure!(cache.step.valid, "DFlash step context is not prepared");
+        anyhow::ensure!(
+            noise_embedding.hidden_dim == hidden_size,
+            "noise_embedding hidden_dim {} != {}",
+            noise_embedding.hidden_dim,
+            hidden_size
+        );
+        anyhow::ensure!(
+            noise_embedding.seq_len == q_len,
+            "noise_embedding q_len {} != scratch q_len {}",
+            noise_embedding.seq_len,
+            q_len
+        );
+        anyhow::ensure!(
+            position_ids.len() == step_len + q_len,
+            "position_ids len {} != step_context_len + q_len {}",
+            position_ids.len(),
+            step_len + q_len
+        );
+        anyhow::ensure!(
+            past_len + step_len + q_len <= cache.state.max_seq_len,
+            "DFlash draft cache would exceed capacity: past {} + ctx {} + q {} > {}",
+            past_len,
+            step_len,
+            q_len,
+            cache.state.max_seq_len
+        );
+        self.run_forward_with_draft_cache(noise_embedding, past_len, position_ids, cache)?;
+        cache.step.valid = false;
+        Ok(&cache.scratch.normed)
+    }
+
+    /// Checks the shapes of one forward call and returns `(q_len, ctx_len)`.
+    ///
+    /// `q_len` is the noise embedding's sequence length and `ctx_len` the
+    /// target hidden states' sequence length.
+    ///
+    /// # Errors
+    /// Fails when the noise width differs from `hidden_size`, when either
+    /// length is zero, when the target width differs from
+    /// `target_layer_count * hidden_size`, or when `position_ids` does not
+    /// hold exactly `ctx_len + q_len` entries.
+    pub(crate) fn validate_forward_inputs(
+        &self,
+        noise_embedding: &HiddenStates,
+        target_hidden: &DFlashTargetHidden<'_>,
+        position_ids: &[i32],
+    ) -> Result<(usize, usize)> {
+        let config = &self.config;
+        let target = target_hidden.concatenated;
+        anyhow::ensure!(
+            noise_embedding.hidden_dim == config.hidden_size,
+            "noise_embedding hidden_dim {} != {}",
+            noise_embedding.hidden_dim,
+            config.hidden_size
+        );
+        let (q_len, ctx_len) = (noise_embedding.seq_len, target.seq_len);
+        anyhow::ensure!(
+            ctx_len > 0,
+            "DFlash forward requires at least one target-hidden token"
+        );
+        anyhow::ensure!(
+            q_len > 0,
+            "DFlash forward requires at least one noise token"
+        );
+        let expected_target_dim = config.target_layer_count() * config.hidden_size;
+        anyhow::ensure!(
+            target.hidden_dim == expected_target_dim,
+            "target_hidden hidden_dim {} != {}",
+            target.hidden_dim,
+            expected_target_dim
+        );
+        let expected_positions = ctx_len + q_len;
+        anyhow::ensure!(
+            position_ids.len() == expected_positions,
+            "position_ids len {} != ctx_len + q_len {}",
+            position_ids.len(),
+            expected_positions
+        );
+        Ok((q_len, ctx_len))
+    }
+
+    /// Projects the concatenated target hidden states through `fc` into
+    /// `bufs.target_projected`, then RMS-normalizes the projection with
+    /// `hidden_norm` into `bufs.target_normed`.
+    fn project_target_hidden(
+        &self,
+        target_hidden: DFlashTargetHidden<'_>,
+        bufs: &mut ForwardBuffers,
+    ) -> Result<()> {
+        let input = target_hidden.concatenated;
+        let eps = self.config.rms_norm_eps;
+
+        ops::gemm_into_checked(&self.ctx, &self.fc, input, &mut bufs.target_projected)?;
+        ops::rms_norm_batch_into(
+            &self.ctx,
+            &bufs.target_projected,
+            &self.hidden_norm,
+            eps,
+            &mut bufs.target_normed,
+        );
+        Ok(())
+    }
+
+    /// Runs the uncached draft forward pass over all layers.
+    ///
+    /// `position_ids` holds `ctx_len` context positions followed by the noise
+    /// positions; both halves are uploaded to the device position buffers.
+    /// The noise embedding is copied so the caller's tensor is not modified,
+    /// and the final-normed result is written to `bufs.normed`.
+    pub(crate) fn run_forward(
+        &self,
+        noise_embedding: &HiddenStates,
+        ctx_len: usize,
+        position_ids: &[i32],
+        bufs: &mut ForwardBuffers,
+    ) -> Result<()> {
+        let stream = &self.ctx.stream;
+        let q_len = noise_embedding.seq_len;
+
+        // Noise positions are the tail of `position_ids`, context the head.
+        let mut q_positions_dev = bufs.positions_q.slice_mut(..q_len);
+        stream.memcpy_htod(&position_ids[ctx_len..], &mut q_positions_dev)?;
+        let mut ctx_positions_dev = bufs.positions_ctx.slice_mut(..ctx_len);
+        stream.memcpy_htod(&position_ids[..ctx_len], &mut ctx_positions_dev)?;
+
+        let mut hidden = clone_hidden(&self.ctx, noise_embedding)?;
+        self.layers
+            .iter()
+            .try_for_each(|layer| self.forward_layer(layer, &mut hidden, bufs))?;
+
+        ops::rms_norm_batch_into(
+            &self.ctx,
+            &hidden,
+            &self.norm,
+            self.config.rms_norm_eps,
+            &mut bufs.normed,
+        );
+        Ok(())
+    }
+
+    /// Runs the cached draft forward pass over all layers.
+    ///
+    /// Uploads the noise positions (the entries of `position_ids` after the
+    /// first `cache.step.len`), runs every layer against its step context and
+    /// past K/V, writes the final-normed output to `cache.scratch.normed`, and
+    /// advances the cached sequence length to `past_len + ctx_len + q_len`.
+    fn run_forward_with_draft_cache(
+        &self,
+        noise_embedding: &HiddenStates,
+        past_len: usize,
+        position_ids: &[i32],
+        cache: &mut DFlashDraftCache,
+    ) -> Result<()> {
+        let ctx_len = cache.step.len;
+        let q_len = noise_embedding.seq_len;
+        let total_len = past_len + ctx_len + q_len;
+
+        let mut q_positions_dev = cache.scratch.positions_q.slice_mut(..q_len);
+        self.ctx
+            .stream
+            .memcpy_htod(&position_ids[ctx_len..], &mut q_positions_dev)?;
+
+        let mut hidden = clone_hidden(&self.ctx, noise_embedding)?;
+        for (layer_idx, layer) in self.layers.iter().enumerate() {
+            self.forward_layer_with_draft_cache(
+                layer,
+                past_len,
+                total_len,
+                &cache.step.layers[layer_idx],
+                &mut cache.state.layers[layer_idx],
+                &mut hidden,
+                &mut cache.scratch,
+            )?;
+        }
+
+        ops::rms_norm_batch_into(
+            &self.ctx,
+            &hidden,
+            &self.norm,
+            self.config.rms_norm_eps,
+            &mut cache.scratch.normed,
+        );
+
+        // Commit the new length only after every layer has succeeded.
+        cache.state.seq_len = total_len;
+        set_past_seq_len(&mut cache.state.layers, total_len);
+        Ok(())
+    }
+
+    /// Runs one decoder layer of the uncached draft forward pass, updating
+    /// `hidden` in place.
+    ///
+    /// Queries come from the noise tokens only. Keys and values are built from
+    /// the projected target context (`bufs.target_normed`) followed by the
+    /// noise tokens. On return `hidden` holds the layer output and the
+    /// previous `hidden` storage has been swapped into `bufs.hidden_out`.
+    pub(crate) fn forward_layer(
+        &self,
+        layer: &DFlashLayer,
+        hidden: &mut HiddenStates,
+        bufs: &mut ForwardBuffers,
+    ) -> Result<()> {
+        let config = &self.config;
+        let q_len = hidden.seq_len;
+        let ctx_len = bufs.target_normed.seq_len;
+
+        // Pre-attention RMS norm of the noise-token hidden states.
+        ops::rms_norm_batch_into(
+            &self.ctx,
+            hidden,
+            &layer.input_layernorm,
+            config.rms_norm_eps,
+            &mut bufs.normed,
+        );
+
+        // Q/K/V projections of the normalized noise tokens.
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.q_proj,
+            &bufs.normed,
+            &mut bufs.q,
+        )?;
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.k_proj,
+            &bufs.normed,
+            &mut bufs.k_noise,
+        )?;
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.v_proj,
+            &bufs.normed,
+            &mut bufs.v_noise,
+        )?;
+
+        // Q/K norm and rotary embedding at the noise-token positions.
+        ops::qk_norm_rope_batch_decode_into(
+            &self.ctx,
+            &mut bufs.q,
+            &mut bufs.k_noise,
+            &layer.attention.q_norm,
+            &layer.attention.k_norm,
+            &self.cos_cache,
+            &self.sin_cache,
+            &bufs.positions_q,
+            config.num_attention_heads,
+            config.num_key_value_heads,
+            config.head_dim,
+            config.rms_norm_eps,
+        );
+        // K/V projections of the context tokens reuse the same layer weights.
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.k_proj,
+            &bufs.target_normed,
+            &mut bufs.k_ctx,
+        )?;
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.v_proj,
+            &bufs.target_normed,
+            &mut bufs.v_ctx,
+        )?;
+        // Normalize and rotate context K with its own positions. Q has already
+        // been prepared above; q_ctx_scratch only reuses the shared Q/K kernel.
+        ops::qk_norm_rope_batch_decode_into(
+            &self.ctx,
+            &mut bufs.q_ctx_scratch,
+            &mut bufs.k_ctx,
+            &layer.attention.q_norm,
+            &layer.attention.k_norm,
+            &self.cos_cache,
+            &self.sin_cache,
+            &bufs.positions_ctx,
+            config.num_attention_heads,
+            config.num_key_value_heads,
+            config.head_dim,
+            config.rms_norm_eps,
+        );
+        // Attention K/V layout: context rows first, then noise rows.
+        concat_kv(
+            &self.ctx,
+            &bufs.k_ctx,
+            &bufs.k_noise,
+            ctx_len,
+            q_len,
+            &mut bufs.k_all,
+        )?;
+        concat_kv(
+            &self.ctx,
+            &bufs.v_ctx,
+            &bufs.v_noise,
+            ctx_len,
+            q_len,
+            &mut bufs.v_all,
+        )?;
+
+        // Noise queries attend over the full context + noise K/V (the kernel
+        // name indicates no causal mask is applied).
+        ops::single_prefill_nhd_noncausal_into(
+            &self.ctx,
+            &bufs.q,
+            &bufs.k_all,
+            &bufs.v_all,
+            &mut bufs.attn_out,
+            config.num_attention_heads,
+            config.num_key_value_heads,
+            config.head_dim,
+        )?;
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.o_proj,
+            &bufs.attn_out,
+            &mut bufs.o_buf,
+        )?;
+        // NOTE(review): presumably folds the attention output into the residual
+        // held in `hidden` and writes the post-attention norm to `bufs.normed`
+        // — confirm against the kernel.
+        openinfer_kernels::ops::fused_add_rms_norm_round_batch_into(
+            &self.ctx,
+            hidden,
+            &bufs.o_buf,
+            &layer.post_attention_layernorm,
+            config.rms_norm_eps,
+            &mut bufs.normed,
+        )?;
+
+        // MLP: fused gate/up projection, SiLU-mul activation, down projection.
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.mlp.gate_up_proj,
+            &bufs.normed,
+            &mut bufs.gate_up,
+        )?;
+        ops::silu_mul_fused_batch_into(&self.ctx, &bufs.gate_up, &mut bufs.act_out);
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.mlp.down_proj,
+            &bufs.act_out,
+            &mut bufs.o_buf,
+        )?;
+        // Residual add into `hidden_out`, then swap so `hidden` holds the
+        // layer output and the old storage is reused on the next layer.
+        ops::add_batch_into(&self.ctx, hidden, &bufs.o_buf, &mut bufs.hidden_out)?;
+        std::mem::swap(hidden, &mut bufs.hidden_out);
+        Ok(())
+    }
+
+    /// Runs one decoder layer of the cached draft forward pass, updating
+    /// `hidden` in place.
+    ///
+    /// The step's precomputed context K/V (`step_context`) and the freshly
+    /// computed noise K/V are appended to `past` after the first `past_len`
+    /// rows. The noise queries then attend over the first `total_len` rows of
+    /// the past K/V. On return `hidden` holds the layer output.
+    fn forward_layer_with_draft_cache(
+        &self,
+        layer: &DFlashLayer,
+        past_len: usize,
+        total_len: usize,
+        step_context: &DFlashLayerStepContext,
+        past: &mut DFlashLayerPastKv,
+        hidden: &mut HiddenStates,
+        bufs: &mut ForwardBuffers,
+    ) -> Result<()> {
+        let config = &self.config;
+        let q_len = hidden.seq_len;
+        // NOTE(review): the context length is read from the scratch buffers;
+        // this assumes it matches `step_context.k_ctx.seq_len` (both are set
+        // together by `set_step_context_len`) — confirm at the call sites.
+        let ctx_len = bufs.target_normed.seq_len;
+
+        // Pre-attention RMS norm of the noise-token hidden states.
+        ops::rms_norm_batch_into(
+            &self.ctx,
+            hidden,
+            &layer.input_layernorm,
+            config.rms_norm_eps,
+            &mut bufs.normed,
+        );
+
+        // Q/K/V projections of the normalized noise tokens.
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.q_proj,
+            &bufs.normed,
+            &mut bufs.q,
+        )?;
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.k_proj,
+            &bufs.normed,
+            &mut bufs.k_noise,
+        )?;
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.v_proj,
+            &bufs.normed,
+            &mut bufs.v_noise,
+        )?;
+
+        // Q/K norm and rotary embedding at the noise-token positions.
+        ops::qk_norm_rope_batch_decode_into(
+            &self.ctx,
+            &mut bufs.q,
+            &mut bufs.k_noise,
+            &layer.attention.q_norm,
+            &layer.attention.k_norm,
+            &self.cos_cache,
+            &self.sin_cache,
+            &bufs.positions_q,
+            config.num_attention_heads,
+            config.num_key_value_heads,
+            config.head_dim,
+            config.rms_norm_eps,
+        );
+
+        // Append this step's context K/V, then the noise K/V, after the first
+        // `past_len` cached rows.
+        append_kv(
+            &self.ctx,
+            &step_context.k_ctx,
+            &bufs.k_noise,
+            past_len,
+            ctx_len,
+            q_len,
+            &mut past.k_past,
+        )?;
+        append_kv(
+            &self.ctx,
+            &step_context.v_ctx,
+            &bufs.v_noise,
+            past_len,
+            ctx_len,
+            q_len,
+            &mut past.v_past,
+        )?;
+        // Expose the appended rows to the attention call below.
+        past.k_past.seq_len = total_len;
+        past.v_past.seq_len = total_len;
+
+        // Noise queries attend over the whole cached K/V (the kernel name
+        // indicates no causal mask is applied).
+        ops::single_prefill_nhd_noncausal_into(
+            &self.ctx,
+            &bufs.q,
+            &past.k_past,
+            &past.v_past,
+            &mut bufs.attn_out,
+            config.num_attention_heads,
+            config.num_key_value_heads,
+            config.head_dim,
+        )?;
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.attention.o_proj,
+            &bufs.attn_out,
+            &mut bufs.o_buf,
+        )?;
+        // NOTE(review): presumably folds the attention output into the residual
+        // held in `hidden` and writes the post-attention norm to `bufs.normed`
+        // — confirm against the kernel.
+        openinfer_kernels::ops::fused_add_rms_norm_round_batch_into(
+            &self.ctx,
+            hidden,
+            &bufs.o_buf,
+            &layer.post_attention_layernorm,
+            config.rms_norm_eps,
+            &mut bufs.normed,
+        )?;
+
+        // MLP: fused gate/up projection, SiLU-mul activation, down projection.
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.mlp.gate_up_proj,
+            &bufs.normed,
+            &mut bufs.gate_up,
+        )?;
+        ops::silu_mul_fused_batch_into(&self.ctx, &bufs.gate_up, &mut bufs.act_out);
+        ops::gemm_into_checked(
+            &self.ctx,
+            &layer.mlp.down_proj,
+            &bufs.act_out,
+            &mut bufs.o_buf,
+        )?;
+        // Residual add into `hidden_out`, then swap so `hidden` holds the
+        // layer output.
+        ops::add_batch_into(&self.ctx, hidden, &bufs.o_buf, &mut bufs.hidden_out)?;
+        std::mem::swap(hidden, &mut bufs.hidden_out);
+        Ok(())
+    }
+}
+
+impl DFlashDraftCache {
+    /// Number of tokens currently stored in the persistent draft K/V cache.
+    pub fn seq_len(&self) -> usize {
+        self.state.seq_len
+    }
+
+    /// Empties the cache and discards any prepared step context.
+    pub fn reset(&mut self) {
+        self.step.valid = false;
+        self.step.len = 0;
+        self.state.seq_len = 0;
+        set_past_seq_len(&mut self.state.layers, 0);
+    }
+
+    /// Truncates the cache to its first `seq_len` tokens and discards any
+    /// prepared step context.
+    ///
+    /// # Errors
+    /// Fails if `seq_len` is larger than the current cached length.
+    pub fn crop(&mut self, seq_len: usize) -> Result<()> {
+        let current = self.state.seq_len;
+        anyhow::ensure!(
+            seq_len <= current,
+            "cannot crop DFlash draft cache from {} to larger length {}",
+            current,
+            seq_len
+        );
+        self.step.valid = false;
+        self.step.len = 0;
+        self.state.seq_len = seq_len;
+        set_past_seq_len(&mut self.state.layers, seq_len);
+        Ok(())
+    }
+}
+
+impl DFlashDraftState {
+    /// Allocates zeroed per-layer K/V storage for up to `max_seq_len` cached
+    /// tokens; the cache starts out empty.
+    fn new(model: &DFlashDraftModel, max_seq_len: usize) -> Result<Self> {
+        let kv_dim = model.config.kv_dim();
+        let layers = (0..model.config.num_hidden_layers)
+            .map(|_| -> Result<DFlashLayerPastKv> {
+                Ok(DFlashLayerPastKv {
+                    k_past: HiddenStates::zeros(&model.ctx, kv_dim, max_seq_len)?,
+                    v_past: HiddenStates::zeros(&model.ctx, kv_dim, max_seq_len)?,
+                })
+            })
+            .collect::<Result<Vec<_>>>()?;
+        Ok(Self {
+            max_seq_len,
+            seq_len: 0,
+            layers,
+        })
+    }
+}
+
+impl DFlashStepContext {
+    /// Allocates zeroed per-layer context K/V storage for up to `max_len`
+    /// context tokens; the context starts out empty and not prepared.
+    fn new(model: &DFlashDraftModel, max_len: usize) -> Result<Self> {
+        let kv_dim = model.config.kv_dim();
+        let layers = (0..model.config.num_hidden_layers)
+            .map(|_| -> Result<DFlashLayerStepContext> {
+                Ok(DFlashLayerStepContext {
+                    k_ctx: HiddenStates::zeros(&model.ctx, kv_dim, max_len)?,
+                    v_ctx: HiddenStates::zeros(&model.ctx, kv_dim, max_len)?,
+                })
+            })
+            .collect::<Result<Vec<_>>>()?;
+        Ok(Self {
+            max_len,
+            len: 0,
+            valid: false,
+            layers,
+        })
+    }
+}
+
+impl ForwardBuffers {
+    /// Allocates every scratch tensor needed by a forward pass over `q_len`
+    /// noise tokens and `ctx_len` context tokens.
+    ///
+    /// Noise-side buffers are sized by `q_len`, context-side buffers by
+    /// `ctx_len`, and the concatenated K/V buffers by `ctx_len + q_len`.
+    pub(crate) fn new(model: &DFlashDraftModel, q_len: usize, ctx_len: usize) -> Result<Self> {
+        let config = &model.config;
+        let ctx = &model.ctx;
+        let hidden = config.hidden_size;
+        let q_dim = config.q_dim();
+        let kv_dim = config.kv_dim();
+        let kv_len = ctx_len + q_len;
+        let zeros = |dim: usize, len: usize| HiddenStates::zeros(ctx, dim, len);
+        Ok(Self {
+            hidden_out: zeros(hidden, q_len)?,
+            target_projected: zeros(hidden, ctx_len)?,
+            target_normed: zeros(hidden, ctx_len)?,
+            normed: zeros(hidden, q_len)?,
+            q: zeros(q_dim, q_len)?,
+            q_ctx_scratch: zeros(q_dim, ctx_len)?,
+            k_ctx: zeros(kv_dim, ctx_len)?,
+            k_noise: zeros(kv_dim, q_len)?,
+            v_ctx: zeros(kv_dim, ctx_len)?,
+            v_noise: zeros(kv_dim, q_len)?,
+            k_all: zeros(kv_dim, kv_len)?,
+            v_all: zeros(kv_dim, kv_len)?,
+            attn_out: zeros(q_dim, q_len)?,
+            o_buf: zeros(hidden, q_len)?,
+            gate_up: zeros(2 * config.intermediate_size, q_len)?,
+            act_out: zeros(config.intermediate_size, q_len)?,
+            positions_q: ctx.stream.alloc_zeros(q_len)?,
+            positions_ctx: ctx.stream.alloc_zeros(ctx_len)?,
+        })
+    }
+}
+
+/// Returns a freshly allocated copy of `input` with the same `hidden_dim` and
+/// `seq_len`; only the first `hidden_dim * seq_len` elements are copied.
+pub(crate) fn clone_hidden(
+    ctx: &openinfer_core::tensor::DeviceContext,
+    input: &HiddenStates,
+) -> Result<HiddenStates> {
+    let mut copy = HiddenStates::zeros(ctx, input.hidden_dim, input.seq_len)?;
+    let elems = input.hidden_dim * input.seq_len;
+    let from = input.data.slice(..elems);
+    let mut to = copy.data.slice_mut(..elems);
+    ctx.stream.memcpy_dtod(&from, &mut to)?;
+    Ok(copy)
+}
+
+/// Writes `ctx_len` rows of `ctx_part` followed by `q_len` rows of
+/// `noise_part` into the start of `out`.
+///
+/// Shape agreement between the three tensors is only checked in debug builds.
+pub(crate) fn concat_kv(
+    ctx: &openinfer_core::tensor::DeviceContext,
+    ctx_part: &HiddenStates,
+    noise_part: &HiddenStates,
+    ctx_len: usize,
+    q_len: usize,
+    out: &mut HiddenStates,
+) -> Result<()> {
+    debug_assert_eq!(ctx_part.seq_len, ctx_len);
+    debug_assert_eq!(noise_part.seq_len, q_len);
+    debug_assert_eq!(ctx_part.hidden_dim, noise_part.hidden_dim);
+    debug_assert_eq!(out.hidden_dim, ctx_part.hidden_dim);
+    debug_assert_eq!(out.seq_len, ctx_len + q_len);
+
+    let ctx_elems = ctx_part.hidden_dim * ctx_len;
+    let noise_elems = noise_part.hidden_dim * q_len;
+
+    // Context rows occupy the front of `out`.
+    let ctx_src = ctx_part.data.slice(..ctx_elems);
+    let mut ctx_dst = out.data.slice_mut(..ctx_elems);
+    ctx.stream.memcpy_dtod(&ctx_src, &mut ctx_dst)?;
+
+    // Noise rows follow immediately after the context rows.
+    let noise_src = noise_part.data.slice(..noise_elems);
+    let mut noise_dst = out.data.slice_mut(ctx_elems..ctx_elems + noise_elems);
+    ctx.stream.memcpy_dtod(&noise_src, &mut noise_dst)?;
+    Ok(())
+}
+
+/// Appends `ctx_len` rows of `ctx_part` and then `q_len` rows of `noise_part`
+/// to `out`, starting at row `past_len`.
+///
+/// `out.seq_len` is not modified; the caller updates it after the copy. Shape
+/// agreement and capacity are only checked in debug builds.
+pub(crate) fn append_kv(
+    ctx: &openinfer_core::tensor::DeviceContext,
+    ctx_part: &HiddenStates,
+    noise_part: &HiddenStates,
+    past_len: usize,
+    ctx_len: usize,
+    q_len: usize,
+    out: &mut HiddenStates,
+) -> Result<()> {
+    debug_assert_eq!(ctx_part.seq_len, ctx_len);
+    debug_assert_eq!(noise_part.seq_len, q_len);
+    debug_assert_eq!(ctx_part.hidden_dim, noise_part.hidden_dim);
+    debug_assert_eq!(out.hidden_dim, ctx_part.hidden_dim);
+    // Capacity is checked in elements: `out.data` is sliced below in units of
+    // `hidden_dim * rows`, so the row count must be scaled by the row width
+    // before it is compared with the buffer length.
+    debug_assert!((past_len + ctx_len + q_len) * out.hidden_dim <= out.data.len());
+
+    let width = ctx_part.hidden_dim;
+
+    // Context rows land right after the existing `past_len` rows.
+    let ctx_src = ctx_part.data.slice(..width * ctx_len);
+    let ctx_offset = width * past_len;
+    let mut ctx_dst = out
+        .data
+        .slice_mut(ctx_offset..ctx_offset + width * ctx_len);
+    ctx.stream.memcpy_dtod(&ctx_src, &mut ctx_dst)?;
+
+    // Noise rows follow the context rows.
+    let noise_src = noise_part.data.slice(..noise_part.hidden_dim * q_len);
+    let noise_offset = width * (past_len + ctx_len);
+    let mut noise_dst = out
+        .data
+        .slice_mut(noise_offset..noise_offset + noise_part.hidden_dim * q_len);
+    ctx.stream.memcpy_dtod(&noise_src, &mut noise_dst)?;
+    Ok(())
+}
+
+/// Sets the logical sequence length of every context-sized scratch buffer and
+/// of every layer's step-context K/V to `ctx_len`.
+pub(crate) fn set_step_context_len(
+    bufs: &mut ForwardBuffers,
+    layers: &mut [DFlashLayerStepContext],
+    ctx_len: usize,
+) {
+    let context_buffers = [
+        &mut bufs.target_projected,
+        &mut bufs.target_normed,
+        &mut bufs.q_ctx_scratch,
+        &mut bufs.k_ctx,
+        &mut bufs.v_ctx,
+    ];
+    for buffer in context_buffers {
+        buffer.seq_len = ctx_len;
+    }
+    for layer in layers.iter_mut() {
+        layer.k_ctx.seq_len = ctx_len;
+        layer.v_ctx.seq_len = ctx_len;
+    }
+}
+
+/// Sets the logical sequence length of every layer's past K and V to `seq_len`.
+pub(crate) fn set_past_seq_len(layers: &mut [DFlashLayerPastKv], seq_len: usize) {
+    layers.iter_mut().for_each(|layer| {
+        layer.k_past.seq_len = seq_len;
+        layer.v_past.seq_len = seq_len;
+    });
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use half::bf16;
+    use std::path::Path;
+
+    // NOTE(review): machine-specific absolute path; the smoke test silently
+    // skips on any host where this directory does not exist.
+    const LOCAL_DFLASH: &str = "/home/hezhaozhao/models/Qwen3-4B-DFlash-b16";
+
+    /// Smoke test: loads the local draft model and runs one uncached forward
+    /// pass with a single all-zero context token and a single all-zero noise
+    /// token, checking only the output shape.
+    #[test]
+    fn draft_forward_smoke_local_model() {
+        let path = Path::new(LOCAL_DFLASH);
+        if !path.exists() {
+            eprintln!("skipping: {LOCAL_DFLASH} does not exist");
+            return;
+        }
+
+        let model = DFlashDraftModel::load(path, 0).expect("load model");
+        let config = model.config();
+        let ctx_len = 1;
+        let q_len = 1;
+        let noise_host = vec![bf16::ZERO; config.hidden_size * q_len];
+        // The target input is the concatenation of `target_layer_count()`
+        // hidden states per context token.
+        let target_host =
+            vec![bf16::ZERO; config.hidden_size * config.target_layer_count() * ctx_len];
+        let noise_embedding = HiddenStates {
+            data: model.ctx.stream.clone_htod(&noise_host).expect("noise h2d"),
+            hidden_dim: config.hidden_size,
+            seq_len: q_len,
+        };
+        let target_hidden = HiddenStates {
+            data: model
+                .ctx
+                .stream
+                .clone_htod(&target_host)
+                .expect("target h2d"),
+            hidden_dim: config.hidden_size * config.target_layer_count(),
+            seq_len: ctx_len,
+        };
+
+        // Position 0 belongs to the context token, position 1 to the noise token.
+        let out = model
+            .forward(
+                &noise_embedding,
+                DFlashTargetHidden {
+                    concatenated: &target_hidden,
+                },
+                &[0, 1],
+            )
+            .expect("forward");
+        model.ctx.sync().expect("sync");
+        assert_eq!(out.hidden_dim, config.hidden_size);
+        assert_eq!(out.seq_len, q_len);
+    }
+}
diff --git a/openinfer-qwen3-4b-dflash/src/lib.rs b/openinfer-qwen3-4b-dflash/src/lib.rs
new file mode 100644
index 00000000..4b64dcfc
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/lib.rs
@@ -0,0 +1,19 @@
+mod batch_buffers;
+mod batch_forward;
+mod config;
+mod executor;
+mod forward;
+mod scheduler;
+mod weights;
+
+pub use batch_buffers::DFlashBatchBuffers;
+pub use batch_forward::DFlashBatchInput;
+pub use config::{DFlashConfig, DFlashInnerConfig};
+pub use executor::{
+    DFlashBatchKey, DFlashCacheMode, DFlashDraftBatchResponse, DFlashDraftHostRequest,
+    DFlashDraftHostResponse, DFlashDraftRequest, DFlashDraftResponse, DFlashExecutor,
+    DFlashExecutorOptions, DFlashRequestId,
+};
+pub use forward::{DFlashDraftCache, DFlashTargetHidden};
+pub use scheduler::{DFlashSchedulerHandle, DFlashSchedulerOptions};
+pub use weights::DFlashDraftModel;
diff --git a/openinfer-qwen3-4b-dflash/src/scheduler.rs b/openinfer-qwen3-4b-dflash/src/scheduler.rs
new file mode 100644
index 00000000..a803ad15
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/scheduler.rs
@@ -0,0 +1,422 @@
+use std::collections::VecDeque;
+use std::path::{Path, PathBuf};
+use std::thread;
+use std::time::{Duration, Instant};
+
+use anyhow::Result;
+use crossbeam_channel as channel;
+
+use crate::executor::{
+    DFlashBatchKey, DFlashDraftHostRequest, DFlashDraftHostResponse, DFlashExecutor,
+    DFlashExecutorOptions, DFlashRequestId,
+};
+
+/// Configuration for the DFlash draft scheduler thread.
+pub struct DFlashSchedulerOptions {
+    /// Options forwarded to the `DFlashExecutor` the scheduler owns.
+    pub executor: DFlashExecutorOptions,
+    /// Longest time a lone queued submit request waits for more requests to
+    /// arrive before it is executed.
+    pub max_wait: Duration,
+    /// Upper bound on the summed `q_len + ctx_len + past_len` of all requests
+    /// in one batch; a single request above this bound is rejected.
+    pub max_total_tokens: usize,
+}
+
+impl Default for DFlashSchedulerOptions {
+    /// Default executor options, a 200 µs batching window and a 512-token
+    /// batch budget.
+    fn default() -> Self {
+        Self {
+            executor: DFlashExecutorOptions::default(),
+            max_wait: Duration::from_micros(200),
+            max_total_tokens: 512,
+        }
+    }
+}
+
+/// Cloneable handle used to talk to the scheduler thread; every clone sends
+/// into the same message channel.
+#[derive(Clone)]
+pub struct DFlashSchedulerHandle {
+    /// Sending side of the scheduler's message channel.
+    submit_tx: channel::Sender<SchedulerMessage>,
+}
+
+/// Message sent from a `DFlashSchedulerHandle` to the scheduler thread. Each
+/// variant carries the channel on which the scheduler sends its reply.
+enum SchedulerMessage {
+    /// Run a draft request; it may be batched with compatible requests.
+    Submit {
+        request: DFlashDraftHostRequest,
+        response_tx: channel::Sender<Result<DFlashDraftHostResponse>>,
+    },
+    /// Reset the executor's cache for `request_id`.
+    ResetCache {
+        request_id: DFlashRequestId,
+        response_tx: channel::Sender<Result<()>>,
+    },
+    /// Crop the executor's cache for `request_id` to `seq_len`.
+    CropCache {
+        request_id: DFlashRequestId,
+        seq_len: usize,
+        response_tx: channel::Sender<Result<()>>,
+    },
+    /// Query the executor's cached sequence length for `request_id`.
+    CacheSeqLen {
+        request_id: DFlashRequestId,
+        response_tx: channel::Sender<Result<usize>>,
+    },
+}
+
+/// A submit request waiting in the scheduler queue.
+struct PendingRequest {
+    request: DFlashDraftHostRequest,
+    /// Channel on which the result is sent back to the submitter.
+    response_tx: channel::Sender<Result<DFlashDraftHostResponse>>,
+    /// When the scheduler thread enqueued the request; drives the `max_wait`
+    /// batching window.
+    queued_at: Instant,
+}
+
+/// One entry of the scheduler queue, kept in arrival order.
+enum PendingItem {
+    /// A draft request that may be batched with later compatible requests.
+    Submit(PendingRequest),
+    /// A cache-control operation; batching never looks past one of these.
+    Control(SchedulerControl),
+}
+
+/// Queued cache-control operation; mirrors the non-submit variants of
+/// `SchedulerMessage`.
+enum SchedulerControl {
+    /// Reset the executor's cache for `request_id`.
+    ResetCache {
+        request_id: DFlashRequestId,
+        response_tx: channel::Sender<Result<()>>,
+    },
+    /// Crop the executor's cache for `request_id` to `seq_len`.
+    CropCache {
+        request_id: DFlashRequestId,
+        seq_len: usize,
+        response_tx: channel::Sender<Result<()>>,
+    },
+    /// Query the executor's cached sequence length for `request_id`.
+    CacheSeqLen {
+        request_id: DFlashRequestId,
+        response_tx: channel::Sender<Result<usize>>,
+    },
+}
+
+impl DFlashSchedulerHandle {
+    /// Spawns the scheduler thread, loads the executor on it and returns a
+    /// handle once loading has finished.
+    ///
+    /// # Errors
+    /// Fails if the thread cannot be spawned, if the executor fails to load,
+    /// or if the thread exits before reporting its initialization result.
+    pub fn start(
+        model_path: &Path,
+        device_ordinal: usize,
+        options: DFlashSchedulerOptions,
+    ) -> Result<Self> {
+        let (submit_tx, submit_rx) = channel::unbounded();
+        let (init_tx, init_rx) = channel::bounded(1);
+        let model_path = PathBuf::from(model_path);
+        let max_wait = options.max_wait;
+        let max_total_tokens = options.max_total_tokens;
+        // Spawn failure is an OS-level resource error, not a programming bug,
+        // so it is reported through the returned `Result` instead of a panic.
+        thread::Builder::new()
+            .name("qwen3-dflash-scheduler".into())
+            .spawn(move || {
+                let mut executor =
+                    match DFlashExecutor::load(&model_path, device_ordinal, options.executor) {
+                        Ok(executor) => executor,
+                        Err(err) => {
+                            let _ = init_tx.send(Err(err));
+                            return;
+                        }
+                    };
+                let _ = init_tx.send(Ok(()));
+                scheduler_loop(&mut executor, submit_rx, max_wait, max_total_tokens);
+            })
+            .map_err(|err| anyhow::anyhow!("failed to spawn DFlash scheduler thread: {err}"))?;
+        init_rx
+            .recv()
+            .map_err(|_| anyhow::anyhow!("DFlash scheduler initialization channel closed"))??;
+        Ok(Self { submit_tx })
+    }
+
+    /// Submits a draft request and blocks until the scheduler replies.
+    pub fn submit(&self, request: DFlashDraftHostRequest) -> Result<DFlashDraftHostResponse> {
+        let response_rx = self.enqueue(|response_tx| SchedulerMessage::Submit {
+            request,
+            response_tx,
+        })?;
+        Self::wait(response_rx)
+    }
+
+    /// Like [`Self::submit`], but signals `ack_tx` as soon as the request has
+    /// been placed on the scheduler channel, before waiting for the reply.
+    /// A closed `ack_tx` is ignored.
+    pub fn submit_with_enqueued_ack(
+        &self,
+        request: DFlashDraftHostRequest,
+        ack_tx: channel::Sender<()>,
+    ) -> Result<DFlashDraftHostResponse> {
+        let response_rx = self.enqueue(|response_tx| SchedulerMessage::Submit {
+            request,
+            response_tx,
+        })?;
+        let _ = ack_tx.send(());
+        Self::wait(response_rx)
+    }
+
+    /// Resets the cache of `request_id` on the scheduler thread.
+    pub fn reset_cache(&self, request_id: DFlashRequestId) -> Result<()> {
+        let response_rx = self.enqueue(|response_tx| SchedulerMessage::ResetCache {
+            request_id,
+            response_tx,
+        })?;
+        Self::wait(response_rx)
+    }
+
+    /// Crops the cache of `request_id` to `seq_len` on the scheduler thread.
+    pub fn crop_cache(&self, request_id: DFlashRequestId, seq_len: usize) -> Result<()> {
+        let response_rx = self.enqueue(|response_tx| SchedulerMessage::CropCache {
+            request_id,
+            seq_len,
+            response_tx,
+        })?;
+        Self::wait(response_rx)
+    }
+
+    /// Returns the cached sequence length of `request_id`.
+    pub fn cache_seq_len(&self, request_id: DFlashRequestId) -> Result<usize> {
+        let response_rx = self.enqueue(|response_tx| SchedulerMessage::CacheSeqLen {
+            request_id,
+            response_tx,
+        })?;
+        Self::wait(response_rx)
+    }
+
+    /// Creates a one-shot reply channel, sends the message built around its
+    /// sender to the scheduler and returns the receiving end.
+    fn enqueue<T>(
+        &self,
+        build: impl FnOnce(channel::Sender<Result<T>>) -> SchedulerMessage,
+    ) -> Result<channel::Receiver<Result<T>>> {
+        let (response_tx, response_rx) = channel::bounded(1);
+        self.submit_tx
+            .send(build(response_tx))
+            .map_err(|_| anyhow::anyhow!("DFlash scheduler is closed"))?;
+        Ok(response_rx)
+    }
+
+    /// Blocks until the scheduler replies on `response_rx`.
+    fn wait<T>(response_rx: channel::Receiver<Result<T>>) -> Result<T> {
+        response_rx
+            .recv()
+            .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))?
+    }
+}
+
+/// Main loop of the scheduler thread.
+///
+/// Blocks for a message while the queue is empty, drains everything already
+/// waiting on the channel, optionally waits out the rest of the `max_wait`
+/// window when only a single submit request is queued, then processes one
+/// batch (or one control item). The loop ends once the queue is empty and
+/// every handle has been dropped.
+fn scheduler_loop(
+    executor: &mut DFlashExecutor,
+    submit_rx: channel::Receiver<SchedulerMessage>,
+    max_wait: Duration,
+    max_total_tokens: usize,
+) {
+    let mut queue: VecDeque<PendingItem> = VecDeque::new();
+    loop {
+        if queue.is_empty() {
+            let Ok(msg) = submit_rx.recv() else {
+                break;
+            };
+            handle_message_or_enqueue(msg, &mut queue);
+        }
+        while let Ok(msg) = submit_rx.try_recv() {
+            handle_message_or_enqueue(msg, &mut queue);
+        }
+        if queue.is_empty() {
+            continue;
+        }
+
+        // Only a lone submit request that is younger than `max_wait` is held
+        // back; a control item at the head reports no age and never waits.
+        let head_age = queue.front().and_then(PendingItem::queued_elapsed);
+        let remaining = match head_age {
+            Some(age) if queue.len() == 1 && age < max_wait => Some(max_wait - age),
+            _ => None,
+        };
+        if let Some(timeout) = remaining {
+            if let Ok(msg) = submit_rx.recv_timeout(timeout) {
+                handle_message_or_enqueue(msg, &mut queue);
+                while let Ok(msg) = submit_rx.try_recv() {
+                    handle_message_or_enqueue(msg, &mut queue);
+                }
+            }
+        }
+
+        drain_one_batch(executor, &mut queue, max_total_tokens);
+    }
+    queue.into_iter().for_each(PendingItem::send_stopped);
+}
+
+/// Converts an incoming channel message into a queue item and appends it to
+/// `pending`. Submit requests are timestamped at this point.
+fn handle_message_or_enqueue(msg: SchedulerMessage, pending: &mut VecDeque<PendingItem>) {
+    let item = match msg {
+        SchedulerMessage::Submit {
+            request,
+            response_tx,
+        } => PendingItem::Submit(PendingRequest {
+            request,
+            response_tx,
+            queued_at: Instant::now(),
+        }),
+        SchedulerMessage::ResetCache {
+            request_id,
+            response_tx,
+        } => PendingItem::Control(SchedulerControl::ResetCache {
+            request_id,
+            response_tx,
+        }),
+        SchedulerMessage::CropCache {
+            request_id,
+            seq_len,
+            response_tx,
+        } => PendingItem::Control(SchedulerControl::CropCache {
+            request_id,
+            seq_len,
+            response_tx,
+        }),
+        SchedulerMessage::CacheSeqLen {
+            request_id,
+            response_tx,
+        } => PendingItem::Control(SchedulerControl::CacheSeqLen {
+            request_id,
+            response_tx,
+        }),
+    };
+    pending.push_back(item);
+}
+
+/// Pops the head of `pending` and processes it.
+///
+/// A control item is executed on its own. A submit request becomes the seed
+/// of a batch: later submit requests with the same batch key are pulled out
+/// of the queue and executed with it, as long as the executor's maximum batch
+/// size and `max_total_tokens` are respected. The scan stops at the first
+/// control item, so requests queued after a control item are never executed
+/// before it. Submit requests with a different key are skipped and stay
+/// queued in order.
+///
+/// Every request taken from the queue receives exactly one reply. If the
+/// batch fails, each caller gets the full error chain as text.
+fn drain_one_batch(
+    executor: &mut DFlashExecutor,
+    pending: &mut VecDeque<PendingItem>,
+    max_total_tokens: usize,
+) {
+    let first = match pending.pop_front() {
+        Some(PendingItem::Submit(first)) => first,
+        Some(PendingItem::Control(control)) => {
+            control.execute(executor);
+            return;
+        }
+        None => return,
+    };
+    let key = match executor.host_batch_key(&first.request) {
+        Ok(key) => key,
+        Err(err) => {
+            let _ = first.response_tx.send(Err(err));
+            return;
+        }
+    };
+    let max_batch_size = executor_max_batch_size(executor);
+    // Every request in a batch shares `key`, so each one costs the same.
+    let tokens_per_request = key.q_len + key.ctx_len + key.past_len;
+    if tokens_per_request > max_total_tokens {
+        let _ = first.response_tx.send(Err(anyhow::anyhow!(
+            "DFlash scheduler request total tokens {} exceeds max_total_tokens {}",
+            tokens_per_request,
+            max_total_tokens
+        )));
+        return;
+    }
+
+    let mut batch = vec![first];
+    let mut total_tokens = tokens_per_request;
+    let mut i = 0;
+    while batch.len() < max_batch_size {
+        // Stop at the end of the queue or at the first control item.
+        let Some(PendingItem::Submit(candidate)) = pending.get(i) else {
+            break;
+        };
+        if !request_matches_key(
+            executor,
+            &candidate.request,
+            key,
+            total_tokens,
+            max_total_tokens,
+        ) {
+            i += 1;
+            continue;
+        }
+        let Some(PendingItem::Submit(request)) = pending.remove(i) else {
+            unreachable!("pending index was just matched as a submit request");
+        };
+        total_tokens += tokens_per_request;
+        batch.push(request);
+    }
+
+    // Split the batch into reply channels and requests without cloning.
+    let mut response_txs = Vec::with_capacity(batch.len());
+    let requests = batch
+        .into_iter()
+        .map(|pending| {
+            response_txs.push(pending.response_tx);
+            pending.request
+        })
+        .collect();
+    match executor.execute_host_batch_host(requests) {
+        Ok(responses) => {
+            let mut responses = responses.into_iter();
+            for response_tx in response_txs {
+                // A short response list is reported explicitly instead of
+                // leaving the caller with a closed channel.
+                let reply = responses.next().ok_or_else(|| {
+                    anyhow::anyhow!(
+                        "DFlash executor returned fewer responses than requests in the batch"
+                    )
+                });
+                let _ = response_tx.send(reply);
+            }
+        }
+        Err(err) => {
+            // `{:#}` keeps the whole anyhow cause chain; the error itself
+            // cannot be cloned for each caller.
+            let message = format!("{err:#}");
+            for response_tx in response_txs {
+                let _ = response_tx.send(Err(anyhow::anyhow!(message.clone())));
+            }
+        }
+    }
+}
+
+/// Returns `true` when `request` has the batch key `key` and adding its
+/// tokens to `current_total_tokens` stays within `max_total_tokens`.
+///
+/// A request whose batch key cannot be computed never matches.
+fn request_matches_key(
+    executor: &DFlashExecutor,
+    request: &DFlashDraftHostRequest,
+    key: DFlashBatchKey,
+    current_total_tokens: usize,
+    max_total_tokens: usize,
+) -> bool {
+    match executor.host_batch_key(request) {
+        Ok(candidate) => {
+            let candidate_tokens = candidate.q_len + candidate.ctx_len + candidate.past_len;
+            candidate == key && current_total_tokens + candidate_tokens <= max_total_tokens
+        }
+        Err(_) => false,
+    }
+}
+
+/// Returns the executor's maximum batch size (thin wrapper around
+/// `DFlashExecutor::max_batch_size`).
+fn executor_max_batch_size(executor: &DFlashExecutor) -> usize {
+    executor.max_batch_size()
+}
+
+impl PendingItem {
+    /// Time since a submit request was enqueued; `None` for control items.
+    fn queued_elapsed(&self) -> Option<Duration> {
+        if let PendingItem::Submit(request) = self {
+            Some(request.queued_at.elapsed())
+        } else {
+            None
+        }
+    }
+
+    /// Replies to the waiting caller with a "scheduler stopped" error.
+    fn send_stopped(self) {
+        match self {
+            PendingItem::Control(control) => control.send_stopped(),
+            PendingItem::Submit(request) => {
+                let stopped = Err(anyhow::anyhow!("DFlash scheduler stopped"));
+                let _ = request.response_tx.send(stopped);
+            }
+        }
+    }
+}
+
+impl SchedulerControl {
+    /// Runs the control operation on `executor` and sends the result back to
+    /// the caller; a caller that has already gone away is ignored.
+    fn execute(self, executor: &mut DFlashExecutor) {
+        match self {
+            SchedulerControl::ResetCache {
+                request_id,
+                response_tx,
+            } => {
+                let outcome = executor.reset_cache(request_id);
+                let _ = response_tx.send(outcome);
+            }
+            SchedulerControl::CropCache {
+                request_id,
+                seq_len,
+                response_tx,
+            } => {
+                let outcome = executor.crop_cache(request_id, seq_len);
+                let _ = response_tx.send(outcome);
+            }
+            SchedulerControl::CacheSeqLen {
+                request_id,
+                response_tx,
+            } => {
+                let outcome = executor.cache_seq_len(request_id);
+                let _ = response_tx.send(outcome);
+            }
+        }
+    }
+
+    /// Replies to the waiting caller with a "scheduler stopped" error.
+    fn send_stopped(self) {
+        /// Sends the stop error on a reply channel of any payload type.
+        fn notify<T>(response_tx: channel::Sender<Result<T>>) {
+            let _ = response_tx.send(Err(anyhow::anyhow!("DFlash scheduler stopped")));
+        }
+
+        match self {
+            SchedulerControl::ResetCache { response_tx, .. } => notify(response_tx),
+            SchedulerControl::CropCache { response_tx, .. } => notify(response_tx),
+            SchedulerControl::CacheSeqLen { response_tx, .. } => notify(response_tx),
+        }
+    }
+}
diff --git a/openinfer-qwen3-4b-dflash/src/weights.rs b/openinfer-qwen3-4b-dflash/src/weights.rs
new file mode 100644
index 00000000..c0f4f7aa
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/src/weights.rs
@@ -0,0 +1,274 @@
+use anyhow::{Context, Result, bail};
+use log::info;
+use openinfer_core::tensor::{DeviceContext, DeviceMatrix, DeviceVec};
+use openinfer_core::weight_loader::{
+    deserialize_shards, load_shard_info, load_tensor_1d, load_tensor_2d, mmap_shards,
+    precompute_rope,
+};
+use std::collections::HashMap;
+use std::path::Path;
+
+use crate::config::DFlashConfig;
+
+pub(crate) struct DFlashAttention {
+    pub(crate) q_proj: DeviceMatrix, // `load_layer` checks [q_dim, hidden_size]
+    pub(crate) k_proj: DeviceMatrix, // `load_layer` checks [kv_dim, hidden_size]
+    pub(crate) v_proj: DeviceMatrix, // `load_layer` checks [kv_dim, hidden_size]
+    pub(crate) o_proj: DeviceMatrix, // `load_layer` checks [hidden_size, q_dim]
+    pub(crate) q_norm: DeviceVec, // `load_layer` checks length head_dim
+    pub(crate) k_norm: DeviceVec, // `load_layer` checks length head_dim
+}
+
+pub(crate) struct DFlashMlp {
+    pub(crate) gate_up_proj: DeviceMatrix, // gate_proj over up_proj, 2 * intermediate_size rows
+    pub(crate) down_proj: DeviceMatrix, // `load_layer` checks [hidden_size, intermediate_size]
+}
+
+pub(crate) struct DFlashLayer {
+    pub(crate) input_layernorm: DeviceVec, // `load_layer` checks length hidden_size
+    pub(crate) attention: DFlashAttention,
+    pub(crate) post_attention_layernorm: DeviceVec, // `load_layer` checks length hidden_size
+    pub(crate) mlp: DFlashMlp,
+}
+
+pub struct DFlashDraftModel {
+    pub(crate) ctx: DeviceContext, // created in `load` for the requested device ordinal
+    pub(crate) config: DFlashConfig, // read from the model directory in `load`
+    pub(crate) layers: Vec<DFlashLayer>, // one entry per `config.num_hidden_layers`
+    pub(crate) fc: DeviceMatrix, // `load` checks [hidden_size, hidden_size * target_layer_count]
+    pub(crate) hidden_norm: DeviceVec, // `load` checks length hidden_size
+    pub(crate) norm: DeviceVec, // `load` checks length hidden_size
+    pub(crate) cos_cache: DeviceVec, // first value returned by `precompute_rope` in `load`
+    pub(crate) sin_cache: DeviceVec, // second value returned by `precompute_rope` in `load`
+}
+
+// SAFETY: The model owns one CUDA context/stream and is intended to run on one worker thread
+// at a time, matching other OpenInfer model structs. NOTE(review): confirm `Sync` is sound.
+unsafe impl Send for DFlashDraftModel {}
+unsafe impl Sync for DFlashDraftModel {}
+
+impl DFlashDraftModel {
+    /// Loads the draft checkpoint in `model_path` onto CUDA device `device_ordinal`.
+    /// Fails if the config or any tensor cannot be loaded or has an unexpected shape; errors
+    /// raised while loading a layer are tagged with that layer's index.
+    pub fn load(model_path: &Path, device_ordinal: usize) -> Result<Self> {
+        let display_path = model_path.display();
+        info!("Loading Qwen3-4B DFlash draft model from {display_path}");
+        let ctx = DeviceContext::new_with_device(device_ordinal)?;
+        let config = DFlashConfig::from_model_dir(model_path)?;
+        let model_path_str = model_path
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("DFlash model path must be valid UTF-8"))?;
+        let (shard_paths, weight_map) = load_shard_info(model_path_str)?;
+        let mmaps = mmap_shards(&shard_paths)?;
+        let shards = deserialize_shards(&mmaps)?;
+
+        let fc = load_tensor_2d(&ctx, &shards, &weight_map, "fc.weight")
+            .context("load DFlash fc.weight")?;
+        let fc_cols = config.hidden_size * config.target_layer_count();
+        ensure_matrix_shape("fc.weight", &fc, config.hidden_size, fc_cols)?;
+        let hidden_norm = load_tensor_1d(&ctx, &shards, &weight_map, "hidden_norm.weight")?;
+        let norm = load_tensor_1d(&ctx, &shards, &weight_map, "norm.weight")?;
+        ensure_vec_len("hidden_norm.weight", &hidden_norm, config.hidden_size)?;
+        ensure_vec_len("norm.weight", &norm, config.hidden_size)?;
+
+        let mut layers = Vec::with_capacity(config.num_hidden_layers);
+        for layer_idx in 0..config.num_hidden_layers {
+            let layer = load_layer(&ctx, &shards, &weight_map, &config, layer_idx)
+                .with_context(|| format!("load DFlash layer {layer_idx}"))?;
+            layers.push(layer);
+        }
+        let max_positions = config.max_position_embeddings;
+        let (cos_cache, sin_cache) =
+            precompute_rope(&ctx, config.head_dim, max_positions, config.rope_theta)?;
+
+        Ok(Self {
+            ctx,
+            config,
+            layers,
+            fc,
+            hidden_norm,
+            norm,
+            cos_cache,
+            sin_cache,
+        })
+    }
+
+    /// Returns the configuration parsed from the model directory.
+    pub fn config(&self) -> &DFlashConfig {
+        &self.config
+    }
+
+    /// Returns `dflash_config.target_layer_ids` from the model configuration.
+    pub fn target_layer_ids(&self) -> &[usize] {
+        &self.config.dflash_config.target_layer_ids
+    }
+
+    /// Returns `dflash_config.mask_token_id` from the model configuration.
+    pub fn mask_token_id(&self) -> u32 {
+        self.config.dflash_config.mask_token_id
+    }
+
+    /// Returns the device context this model was loaded on.
+    pub fn device_context(&self) -> &DeviceContext {
+        &self.ctx
+    }
+}
+
+/// Loads the weights of draft layer `layer_idx` (tensors named `layers.{layer_idx}.*`) and
+/// checks every tensor's shape against `config`.
+///
+/// `gate_proj` and `up_proj` are stacked row-wise into the single `gate_up_proj` matrix.
+/// Returns an error if a tensor fails to load, stacking fails, or any shape mismatches.
+fn load_layer(
+    ctx: &DeviceContext,
+    shards: &[safetensors::SafeTensors<'_>],
+    weight_map: &HashMap<String, usize>,
+    config: &DFlashConfig,
+    layer_idx: usize,
+) -> Result<DFlashLayer> {
+    let prefix = format!("layers.{layer_idx}");
+    let (hidden, intermediate) = (config.hidden_size, config.intermediate_size);
+    let q_proj = load_tensor_2d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.self_attn.q_proj.weight"),
+    )?;
+    let k_proj = load_tensor_2d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.self_attn.k_proj.weight"),
+    )?;
+    let v_proj = load_tensor_2d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.self_attn.v_proj.weight"),
+    )?;
+    let o_proj = load_tensor_2d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.self_attn.o_proj.weight"),
+    )?;
+    ensure_matrix_shape("q_proj", &q_proj, config.q_dim(), hidden)?;
+    ensure_matrix_shape("k_proj", &k_proj, config.kv_dim(), hidden)?;
+    ensure_matrix_shape("v_proj", &v_proj, config.kv_dim(), hidden)?;
+    ensure_matrix_shape("o_proj", &o_proj, hidden, config.q_dim())?;
+
+    let gate_proj = load_tensor_2d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.mlp.gate_proj.weight"),
+    )?;
+    let up_proj = load_tensor_2d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.mlp.up_proj.weight"),
+    )?;
+    // Validate each half before fusing: checking only the stacked matrix would accept
+    // compensating row-count errors (e.g. gate one row short, up one row long).
+    ensure_matrix_shape("gate_proj", &gate_proj, intermediate, hidden)?;
+    ensure_matrix_shape("up_proj", &up_proj, intermediate, hidden)?;
+    let gate_up_proj = DeviceMatrix::vstack(ctx, &[&gate_proj, &up_proj])?;
+    let down_proj = load_tensor_2d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.mlp.down_proj.weight"),
+    )?;
+    ensure_matrix_shape("gate_up_proj", &gate_up_proj, 2 * intermediate, hidden)?;
+    ensure_matrix_shape("down_proj", &down_proj, hidden, intermediate)?;
+
+    let input_layernorm = load_tensor_1d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.input_layernorm.weight"),
+    )?;
+    let post_attention_layernorm = load_tensor_1d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.post_attention_layernorm.weight"),
+    )?;
+    let q_norm = load_tensor_1d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.self_attn.q_norm.weight"),
+    )?;
+    let k_norm = load_tensor_1d(
+        ctx,
+        shards,
+        weight_map,
+        &format!("{prefix}.self_attn.k_norm.weight"),
+    )?;
+    ensure_vec_len("input_layernorm", &input_layernorm, hidden)?;
+    ensure_vec_len(
+        "post_attention_layernorm",
+        &post_attention_layernorm,
+        hidden,
+    )?;
+    ensure_vec_len("q_norm", &q_norm, config.head_dim)?;
+    ensure_vec_len("k_norm", &k_norm, config.head_dim)?;
+
+    Ok(DFlashLayer {
+        input_layernorm,
+        attention: DFlashAttention {
+            q_proj,
+            k_proj,
+            v_proj,
+            o_proj,
+            q_norm,
+            k_norm,
+        },
+        post_attention_layernorm,
+        mlp: DFlashMlp {
+            gate_up_proj,
+            down_proj,
+        },
+    })
+}
+
+/// Verifies that `matrix` has exactly `rows` rows and `cols` columns.
+///
+/// On mismatch, returns an error starting with `name` that lists both shapes.
+fn ensure_matrix_shape(name: &str, matrix: &DeviceMatrix, rows: usize, cols: usize) -> Result<()> {
+    let (got_rows, got_cols) = (matrix.rows, matrix.cols);
+    if (got_rows, got_cols) == (rows, cols) {
+        return Ok(());
+    }
+    bail!("{name} shape mismatch: expected [{rows}, {cols}], got [{got_rows}, {got_cols}]")
+}
+
+fn ensure_vec_len(name: &str, vector: &DeviceVec, len: usize) -> Result<()> {
+    if vector.len != len {
+        bail!("{name} length mismatch: expected {len}, got {}", vector.len);
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const LOCAL_DFLASH: &str = "/home/hezhaozhao/models/Qwen3-4B-DFlash-b16";
+
+    #[test]
+    fn loads_local_dflash_weights() {
+        let path = std::env::var("OPENINFER_DFLASH_TEST_MODEL_PATH"); // optional override
+        let path = std::path::PathBuf::from(path.unwrap_or_else(|_| LOCAL_DFLASH.to_owned()));
+        if !path.exists() {
+            eprintln!("skipping: {} does not exist", path.display());
+            return;
+        }
+        let model = DFlashDraftModel::load(&path, 0).expect("load model");
+        let shape = (model.layers.len(), model.fc.rows, model.fc.cols);
+        assert_eq!(shape, (5, 2560, 12800)); // dimensions of the b16 checkpoint
+    }
+}
diff --git a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
new file mode 100644
index 00000000..69ead63b
--- /dev/null
+++ b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
@@ -0,0 +1,701 @@
+//! HuggingFace remote-code golden gate for the standalone Qwen3-4B-DFlash draft.
+//!
+//! The fixture is generated by:
+//!
+//! ```ignore
+//! .venv/bin/python tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py \
+//!   --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+//!   --out test_data/qwen3-4b-dflash-hf-golden.safetensors
+//! ```
+
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, Barrier};
+
+use half::bf16;
+use openinfer_core::tensor::HiddenStates;
+use openinfer_qwen3_4b_dflash::{
+    DFlashBatchInput, DFlashCacheMode, DFlashDraftHostRequest, DFlashDraftModel,
+    DFlashDraftRequest, DFlashExecutor, DFlashExecutorOptions, DFlashRequestId,
+    DFlashSchedulerHandle, DFlashSchedulerOptions, DFlashTargetHidden,
+};
+use safetensors::{Dtype, SafeTensors};
+
+const LOCAL_DFLASH: &str = "/home/hezhaozhao/models/Qwen3-4B-DFlash-b16";
+const GOLDEN: &str = concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/../test_data/qwen3-4b-dflash-hf-golden.safetensors"
+);
+// Absolute-delta limits checked by `assert_deltas` (mean and 99th-percentile respectively).
+const MEAN_TOL: f32 = 0.12;
+const P99_TOL: f32 = 0.35;
+
+#[test]
+fn dflash_forward_matches_hf_remote_code() {
+    let Some(model_path) = model_path_or_skip("dflash golden gate") else {
+        return;
+    };
+    let golden_path = Path::new(GOLDEN);
+    if !golden_path.exists() {
+        eprintln!("skipping dflash golden gate: {GOLDEN} does not exist");
+        return;
+    }
+    // Parse the golden fixture, then load the draft model on device 0.
+    let bytes = std::fs::read(golden_path).expect("read golden");
+    let st = SafeTensors::deserialize(&bytes).expect("parse golden");
+    let model = DFlashDraftModel::load(&model_path, 0).expect("load dflash");
+    let config = model.config();
+    let ctx = model.device_context();
+    // Golden tensors; `bf16_tensor` / `i32_tensor` assert each dtype and shape.
+    let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]);
+    let target_hidden = bf16_tensor(
+        &st,
+        "target_hidden",
+        &[1, 2, config.hidden_size * config.target_layer_count()],
+    );
+    let expected = bf16_tensor(&st, "output", &[1, 3, config.hidden_size]);
+    let positions = i32_tensor(&st, "position_ids", &[1, 5]);
+    // Upload the inputs and compare the plain `forward` output with the HF reference.
+    let noise_embedding = HiddenStates {
+        data: ctx.stream.clone_htod(&noise).expect("noise h2d"),
+        hidden_dim: config.hidden_size,
+        seq_len: 3,
+    };
+    let target_hidden = HiddenStates {
+        data: ctx.stream.clone_htod(&target_hidden).expect("target h2d"),
+        hidden_dim: config.hidden_size * config.target_layer_count(),
+        seq_len: 2,
+    };
+    let uncached = model
+        .forward(
+            &noise_embedding,
+            DFlashTargetHidden {
+                concatenated: &target_hidden,
+            },
+            &positions,
+        )
+        .expect("forward");
+    ctx.sync().expect("sync");
+    let uncached = ctx.stream.clone_dtoh(&uncached.data).expect("output d2h");
+    ctx.sync().expect("sync");
+    assert_deltas("dflash HF golden deltas", &uncached, &expected);
+    // Repeat the comparison through `forward_with_cache` on a freshly created cache.
+    let mut cache = model
+        .create_draft_cache(3, 2, 8)
+        .expect("create draft cache");
+    let cached_one_shot = model
+        .forward_with_cache(
+            &noise_embedding,
+            DFlashTargetHidden {
+                concatenated: &target_hidden,
+            },
+            &positions,
+            &mut cache,
+        )
+        .expect("cached one-shot forward");
+    ctx.sync().expect("sync");
+    let cached_one_shot = ctx
+        .stream
+        .clone_dtoh(&cached_one_shot.data)
+        .expect("output d2h");
+    ctx.sync().expect("sync");
+    assert_deltas(
+        "dflash unified-cache one-shot HF golden deltas",
+        &cached_one_shot,
+        &expected,
+    );
+    // After a reset, split the work: `prepare_step_context`, then `forward_with_draft_cache`.
+    cache.reset();
+    model
+        .prepare_step_context(
+            DFlashTargetHidden {
+                concatenated: &target_hidden,
+            },
+            &positions,
+            &mut cache,
+        )
+        .expect("prepare step context");
+    let cached = model
+        .forward_with_draft_cache(&noise_embedding, &positions, &mut cache)
+        .expect("cached forward");
+    ctx.sync().expect("sync");
+    let cached = ctx.stream.clone_dtoh(&cached.data).expect("output d2h");
+    ctx.sync().expect("sync");
+    assert_deltas("dflash draft-cache HF golden deltas", &cached, &expected);
+    assert_eq!(cache.seq_len(), 5); // presumably 2 target rows + 3 noise rows
+    cache.crop(2).expect("crop draft cache");
+    assert_eq!(cache.seq_len(), 2);
+}
+
+#[test]
+fn dflash_batched_forward_matches_single_forward() {
+    let Some(model_path) = model_path_or_skip("dflash batch gate") else {
+        return;
+    };
+    let golden_path = Path::new(GOLDEN);
+    if !golden_path.exists() {
+        eprintln!("skipping dflash batch gate: {GOLDEN} does not exist");
+        return;
+    }
+    // Parse the golden fixture, then load the draft model on device 0.
+    let bytes = std::fs::read(golden_path).expect("read golden");
+    let st = SafeTensors::deserialize(&bytes).expect("parse golden");
+    let model = DFlashDraftModel::load(&model_path, 0).expect("load dflash");
+    let config = model.config();
+    let ctx = model.device_context();
+    // Row 0 is the golden input; row 1 is a perturbed copy with position ids shifted by 2.
+    let noise0 = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]);
+    let target0 = bf16_tensor(
+        &st,
+        "target_hidden",
+        &[1, 2, config.hidden_size * config.target_layer_count()],
+    );
+    let positions0 = i32_tensor(&st, "position_ids", &[1, 5]);
+    let mut noise1 = noise0.clone();
+    for (i, value) in noise1.iter_mut().enumerate() {
+        if i % 13 == 0 {
+            *value = bf16::from_f32(value.to_f32() + 0.015625);
+        }
+    }
+    let mut target1 = target0.clone();
+    for (i, value) in target1.iter_mut().enumerate() {
+        if i % 31 == 0 {
+            *value = bf16::from_f32(value.to_f32() - 0.03125);
+        }
+    }
+    let mut positions1 = positions0.clone();
+    for value in &mut positions1 {
+        *value += 2;
+    }
+    let noise_a = HiddenStates {
+        data: ctx.stream.clone_htod(&noise0).expect("noise h2d"),
+        hidden_dim: config.hidden_size,
+        seq_len: 3,
+    };
+    let target_a = HiddenStates {
+        data: ctx.stream.clone_htod(&target0).expect("target h2d"),
+        hidden_dim: config.hidden_size * config.target_layer_count(),
+        seq_len: 2,
+    };
+    let noise_b = HiddenStates {
+        data: ctx.stream.clone_htod(&noise1).expect("noise h2d"),
+        hidden_dim: config.hidden_size,
+        seq_len: 3,
+    };
+    let target_b = HiddenStates {
+        data: ctx.stream.clone_htod(&target1).expect("target h2d"),
+        hidden_dim: config.hidden_size * config.target_layer_count(),
+        seq_len: 2,
+    };
+    // Reference outputs: run each request through the unbatched `forward`.
+    let single = model
+        .forward(
+            &noise_a,
+            DFlashTargetHidden {
+                concatenated: &target_a,
+            },
+            &positions0,
+        )
+        .expect("single forward");
+    ctx.sync().expect("sync");
+    let single = ctx.stream.clone_dtoh(&single.data).expect("single d2h");
+    let single_row1 = model
+        .forward(
+            &noise_b,
+            DFlashTargetHidden {
+                concatenated: &target_b,
+            },
+            &positions1,
+        )
+        .expect("single row1 forward");
+    ctx.sync().expect("sync");
+    let single_row1 = ctx
+        .stream
+        .clone_dtoh(&single_row1.data)
+        .expect("single row1 d2h");
+    // Batched run over both requests; each output row must match its unbatched reference.
+    let mut bufs = model.create_batch_buffers(2, 3, 2).expect("batch buffers");
+    let batch = model
+        .forward_batch(
+            &[
+                DFlashBatchInput {
+                    noise_embedding: &noise_a,
+                    target_hidden: DFlashTargetHidden {
+                        concatenated: &target_a,
+                    },
+                    position_ids: &positions0,
+                },
+                DFlashBatchInput {
+                    noise_embedding: &noise_b,
+                    target_hidden: DFlashTargetHidden {
+                        concatenated: &target_b,
+                    },
+                    position_ids: &positions1,
+                },
+            ],
+            &mut bufs,
+        )
+        .expect("batch forward");
+    ctx.sync().expect("sync");
+    let batch = ctx.stream.clone_dtoh(&batch.data).expect("batch d2h");
+    let row_len = config.hidden_size * 3;
+    assert_deltas("dflash batch row0 vs single", &batch[..row_len], &single);
+    assert_deltas(
+        "dflash batch row1 vs single",
+        &batch[row_len..2 * row_len],
+        &single_row1,
+    );
+}
+
+#[test]
+fn dflash_executor_returns_request_tagged_batch_outputs() {
+    let Some(model_path) = model_path_or_skip("dflash executor gate") else {
+        return;
+    };
+    let golden_path = Path::new(GOLDEN);
+    if !golden_path.exists() {
+        eprintln!("skipping dflash executor gate: {GOLDEN} does not exist");
+        return;
+    }
+    // Submit two identical NoCache requests in one `execute_batch` call (max_batch_size = 2).
+    let bytes = std::fs::read(golden_path).expect("read golden");
+    let st = SafeTensors::deserialize(&bytes).expect("parse golden");
+    let mut executor = DFlashExecutor::load(
+        &model_path,
+        0,
+        DFlashExecutorOptions {
+            max_batch_size: 2,
+            max_step_context_len: 2,
+            max_seq_len: 8,
+        },
+    )
+    .expect("load executor");
+    let hidden_size = executor.model().config().hidden_size;
+    let target_layer_count = executor.model().config().target_layer_count();
+    let ctx = executor.model().device_context();
+    let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, hidden_size]);
+    let target = bf16_tensor(
+        &st,
+        "target_hidden",
+        &[1, 2, hidden_size * target_layer_count],
+    );
+    let positions = i32_tensor(&st, "position_ids", &[1, 5]);
+    let mk_req = |request_id| DFlashDraftRequest {
+        request_id: DFlashRequestId(request_id),
+        noise_embedding: HiddenStates {
+            data: ctx.stream.clone_htod(&noise).expect("noise h2d"),
+            hidden_dim: hidden_size,
+            seq_len: 3,
+        },
+        target_hidden: HiddenStates {
+            data: ctx.stream.clone_htod(&target).expect("target h2d"),
+            hidden_dim: hidden_size * target_layer_count,
+            seq_len: 2,
+        },
+        position_ids: positions.clone(),
+        cache_mode: DFlashCacheMode::NoCache,
+    };
+    let responses = executor
+        .execute_batch(vec![mk_req(7), mk_req(8)])
+        .expect("execute batch");
+    assert_eq!(responses.len(), 2);
+    assert_eq!(responses[0].request_id, DFlashRequestId(7)); // responses keep submission order
+    assert_eq!(responses[1].request_id, DFlashRequestId(8));
+    assert_eq!(responses[0].output.hidden_dim, hidden_size);
+    assert_eq!(responses[0].output.seq_len, 3);
+    assert_eq!(responses[0].batch_size, 2);
+}
+
+#[test]
+fn dflash_scheduler_accepts_host_requests() {
+    let Some(model_path) = model_path_or_skip("dflash scheduler gate") else {
+        return;
+    };
+    let golden_path = Path::new(GOLDEN);
+    if !golden_path.exists() {
+        eprintln!("skipping dflash scheduler gate: {GOLDEN} does not exist");
+        return;
+    }
+    // Two threads each submit a NoCache host request; the barrier lines the submissions up.
+    let bytes = std::fs::read(golden_path).expect("read golden");
+    let st = SafeTensors::deserialize(&bytes).expect("parse golden");
+    let config =
+        openinfer_qwen3_4b_dflash::DFlashConfig::from_model_dir(&model_path).expect("load config");
+    let noise0 = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]);
+    let target0 = bf16_tensor(
+        &st,
+        "target_hidden",
+        &[1, 2, config.hidden_size * config.target_layer_count()],
+    );
+    let positions0 = i32_tensor(&st, "position_ids", &[1, 5]);
+    let mut noise1 = noise0.clone();
+    for (i, value) in noise1.iter_mut().enumerate() {
+        if i % 13 == 0 {
+            *value = bf16::from_f32(value.to_f32() + 0.015625);
+        }
+    }
+    let mut target1 = target0.clone();
+    for (i, value) in target1.iter_mut().enumerate() {
+        if i % 31 == 0 {
+            *value = bf16::from_f32(value.to_f32() - 0.03125);
+        }
+    }
+    let mut positions1 = positions0.clone();
+    for value in &mut positions1 {
+        *value += 2;
+    }
+    let scheduler = DFlashSchedulerHandle::start(
+        &model_path,
+        0,
+        DFlashSchedulerOptions {
+            executor: DFlashExecutorOptions {
+                max_batch_size: 2,
+                max_step_context_len: 2,
+                max_seq_len: 8,
+            },
+            max_wait: std::time::Duration::from_millis(50),
+            max_total_tokens: 16,
+        },
+    )
+    .expect("start scheduler");
+    let barrier = Arc::new(Barrier::new(3)); // two submitter threads plus this thread
+    let scheduler0 = scheduler.clone();
+    let barrier0 = Arc::clone(&barrier);
+    let t0 = std::thread::spawn(move || {
+        barrier0.wait();
+        scheduler0.submit(DFlashDraftHostRequest {
+            request_id: DFlashRequestId(42),
+            noise_embedding: noise0,
+            target_hidden: target0,
+            position_ids: positions0,
+            q_len: 3,
+            ctx_len: 2,
+            cache_mode: DFlashCacheMode::NoCache,
+        })
+    });
+    let barrier1 = Arc::clone(&barrier);
+    let t1 = std::thread::spawn(move || {
+        barrier1.wait();
+        scheduler.submit(DFlashDraftHostRequest {
+            request_id: DFlashRequestId(43),
+            noise_embedding: noise1,
+            target_hidden: target1,
+            position_ids: positions1,
+            q_len: 3,
+            ctx_len: 2,
+            cache_mode: DFlashCacheMode::NoCache,
+        })
+    });
+    barrier.wait();
+    let response0 = t0
+        .join()
+        .expect("join scheduler request 0")
+        .expect("submit 0");
+    let response1 = t1
+        .join()
+        .expect("join scheduler request 1")
+        .expect("submit 1");
+    assert_eq!(response0.request_id, DFlashRequestId(42));
+    assert_eq!(response1.request_id, DFlashRequestId(43));
+    assert_eq!(response0.hidden_dim, config.hidden_size);
+    assert_eq!(response1.hidden_dim, config.hidden_size);
+    assert_eq!(response0.seq_len, 3);
+    assert_eq!(response1.seq_len, 3);
+    assert_eq!(response0.output.len(), config.hidden_size * 3);
+    assert_eq!(response1.output.len(), config.hidden_size * 3);
+    assert_eq!(response0.batch_size, 2); // NOTE(review): relies on both landing within max_wait
+    assert_eq!(response1.batch_size, 2);
+    assert_eq!(response0.cache_seq_len, 0);
+    assert_eq!(response1.cache_seq_len, 0);
+}
+
+#[test]
+fn dflash_scheduler_manages_draft_cache() {
+    let Some(model_path) = model_path_or_skip("dflash scheduler cache gate") else {
+        return;
+    };
+    let golden_path = Path::new(GOLDEN);
+    if !golden_path.exists() {
+        eprintln!("skipping dflash scheduler cache gate: {GOLDEN} does not exist");
+        return;
+    }
+    // Submit one DraftCache request, then drive cache_seq_len / crop_cache / reset_cache for it.
+    let bytes = std::fs::read(golden_path).expect("read golden");
+    let st = SafeTensors::deserialize(&bytes).expect("parse golden");
+    let config =
+        openinfer_qwen3_4b_dflash::DFlashConfig::from_model_dir(&model_path).expect("load config");
+    let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]);
+    let target = bf16_tensor(
+        &st,
+        "target_hidden",
+        &[1, 2, config.hidden_size * config.target_layer_count()],
+    );
+    let positions = i32_tensor(&st, "position_ids", &[1, 5]);
+    let scheduler = DFlashSchedulerHandle::start(
+        &model_path,
+        0,
+        DFlashSchedulerOptions {
+            executor: DFlashExecutorOptions {
+                max_batch_size: 2,
+                max_step_context_len: 2,
+                max_seq_len: 8,
+            },
+            max_wait: std::time::Duration::from_millis(10),
+            max_total_tokens: 16,
+        },
+    )
+    .expect("start scheduler");
+    let request_id = DFlashRequestId(99);
+    let response = scheduler
+        .submit(DFlashDraftHostRequest {
+            request_id,
+            noise_embedding: noise,
+            target_hidden: target,
+            position_ids: positions,
+            q_len: 3,
+            ctx_len: 2,
+            cache_mode: DFlashCacheMode::DraftCache,
+        })
+        .expect("submit cached request");
+    assert_eq!(response.request_id, request_id);
+    assert_eq!(response.cache_seq_len, 5);
+    assert_eq!(
+        scheduler.cache_seq_len(request_id).expect("cache seq len"),
+        5
+    );
+    scheduler.crop_cache(request_id, 2).expect("crop cache");
+    assert_eq!(
+        scheduler.cache_seq_len(request_id).expect("cache seq len"),
+        2
+    );
+    scheduler.reset_cache(request_id).expect("reset cache");
+    assert_eq!(
+        scheduler.cache_seq_len(request_id).expect("cache seq len"),
+        0
+    );
+}
+
+#[test]
+fn dflash_scheduler_control_messages_are_fifo() {
+    let Some(model_path) = model_path_or_skip("dflash scheduler fifo gate") else {
+        return;
+    };
+    let golden_path = Path::new(GOLDEN);
+    if !golden_path.exists() {
+        eprintln!("skipping dflash scheduler fifo gate: {GOLDEN} does not exist");
+        return;
+    }
+    // A cache_seq_len query sent after the submit is enqueued must observe the submit's effect.
+    let bytes = std::fs::read(golden_path).expect("read golden");
+    let st = SafeTensors::deserialize(&bytes).expect("parse golden");
+    let config =
+        openinfer_qwen3_4b_dflash::DFlashConfig::from_model_dir(&model_path).expect("load config");
+    let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]);
+    let target = bf16_tensor(
+        &st,
+        "target_hidden",
+        &[1, 2, config.hidden_size * config.target_layer_count()],
+    );
+    let positions = i32_tensor(&st, "position_ids", &[1, 5]);
+    let scheduler = DFlashSchedulerHandle::start(
+        &model_path,
+        0,
+        DFlashSchedulerOptions {
+            executor: DFlashExecutorOptions {
+                max_batch_size: 2,
+                max_step_context_len: 2,
+                max_seq_len: 8,
+            },
+            max_wait: std::time::Duration::from_millis(100),
+            max_total_tokens: 16,
+        },
+    )
+    .expect("start scheduler");
+    let request_id = DFlashRequestId(123);
+    let submitter = scheduler.clone();
+    let (ack_tx, ack_rx) = crossbeam_channel::bounded(1);
+    let submit = std::thread::spawn(move || {
+        submitter.submit_with_enqueued_ack(
+            DFlashDraftHostRequest {
+                request_id,
+                noise_embedding: noise,
+                target_hidden: target,
+                position_ids: positions,
+                q_len: 3,
+                ctx_len: 2,
+                cache_mode: DFlashCacheMode::DraftCache,
+            },
+            ack_tx,
+        )
+    });
+    ack_rx.recv().expect("submit should be enqueued"); // wait before sending the query
+    let seq_len = scheduler
+        .cache_seq_len(request_id)
+        .expect("cache seq len must follow pending submit");
+    let response = submit
+        .join()
+        .expect("join cached submit")
+        .expect("cached submit");
+    assert_eq!(response.cache_seq_len, 5);
+    assert_eq!(seq_len, 5);
+    scheduler.reset_cache(request_id).expect("reset cache");
+    assert_eq!(
+        scheduler.cache_seq_len(request_id).expect("cache seq len"),
+        0
+    );
+}
+
+#[test]
+fn dflash_cache_control_rejects_unknown_request_ids() {
+    let Some(model_path) = model_path_or_skip("dflash cache rejection gate") else {
+        return;
+    };
+    let mut executor = DFlashExecutor::load(
+        &model_path,
+        0,
+        DFlashExecutorOptions {
+            max_batch_size: 2,
+            max_step_context_len: 2,
+            max_seq_len: 8,
+        },
+    )
+    .expect("load executor");
+    let unknown = DFlashRequestId(777); // no request with this id is submitted in this test
+    let reset_err = executor.reset_cache(unknown).expect_err("reset must fail");
+    assert!(
+        reset_err
+            .to_string()
+            .contains("unknown DFlash cache request_id"),
+        "unexpected reset error: {reset_err}"
+    );
+    let crop_err = executor.crop_cache(unknown, 1).expect_err("crop must fail");
+    assert!(
+        crop_err
+            .to_string()
+            .contains("unknown DFlash cache request_id"),
+        "unexpected crop error: {crop_err}"
+    );
+    let seq_err = executor
+        .cache_seq_len(unknown)
+        .expect_err("cache seq len must fail");
+    assert!(
+        seq_err
+            .to_string()
+            .contains("unknown DFlash cache request_id"),
+        "unexpected seq len error: {seq_err}"
+    );
+    // The scheduler handle must report the same error text for all three control calls.
+    let scheduler = DFlashSchedulerHandle::start(&model_path, 0, DFlashSchedulerOptions::default())
+        .expect("start scheduler");
+    let reset_err = scheduler
+        .reset_cache(unknown)
+        .expect_err("scheduler reset must fail");
+    assert!(
+        reset_err
+            .to_string()
+            .contains("unknown DFlash cache request_id"),
+        "unexpected scheduler reset error: {reset_err}"
+    );
+    let crop_err = scheduler
+        .crop_cache(unknown, 1)
+        .expect_err("scheduler crop must fail");
+    assert!(
+        crop_err
+            .to_string()
+            .contains("unknown DFlash cache request_id"),
+        "unexpected scheduler crop error: {crop_err}"
+    );
+    let seq_err = scheduler
+        .cache_seq_len(unknown)
+        .expect_err("scheduler cache seq len must fail");
+    assert!(
+        seq_err
+            .to_string()
+            .contains("unknown DFlash cache request_id"),
+        "unexpected scheduler seq len error: {seq_err}"
+    );
+}
+
+/// Asserts the mean and p99 of |actual - expected| stay within `MEAN_TOL` / `P99_TOL`.
+/// Empty input and non-finite deltas (e.g. NaN outputs) fail with an explicit message.
+fn assert_deltas(label: &str, actual: &[bf16], expected: &[bf16]) {
+    assert_eq!(actual.len(), expected.len(), "{label}: length mismatch");
+    assert!(!actual.is_empty(), "{label}: nothing to compare");
+    let abs_delta = |(got, want): (&bf16, &bf16)| (got.to_f32() - want.to_f32()).abs();
+    let mut deltas: Vec<f32> = actual.iter().zip(expected).map(abs_delta).collect();
+    let non_finite = deltas.iter().filter(|delta| !delta.is_finite()).count();
+    assert_eq!(non_finite, 0, "{label}: {non_finite} non-finite deltas");
+    deltas.sort_unstable_by(f32::total_cmp);
+    let n = deltas.len();
+    let mean = deltas.iter().sum::<f32>() / n as f32;
+    let p99 = deltas[((n as f32 * 0.99).floor() as usize).min(n - 1)];
+    let max = deltas[n - 1];
+    eprintln!("{label}: mean={mean:.6}, p99={p99:.6}, max={max:.6}, n={n}");
+    assert!(mean <= MEAN_TOL, "mean delta {mean} > {MEAN_TOL}");
+    assert!(p99 <= P99_TOL, "p99 delta {p99} > {P99_TOL}; max={max}");
+}
+
+/// Raw bytes of golden tensor `name`; panics if it is missing or has another dtype/shape.
+fn tensor<'a>(st: &'a SafeTensors<'_>, name: &str, dtype: Dtype, shape: &[usize]) -> &'a [u8] {
+    let lookup = st.tensor(name);
+    let golden = lookup.unwrap_or_else(|err| panic!("golden missing {name}: {err}"));
+    assert_eq!(golden.dtype(), dtype, "{name} dtype mismatch");
+    assert_eq!(golden.shape(), shape, "{name} shape mismatch");
+    golden.data()
+}
+
+/// Reads golden tensor `name` (asserted BF16 with `shape`) as little-endian `bf16` values.
+fn bf16_tensor(st: &SafeTensors<'_>, name: &str, shape: &[usize]) -> Vec<bf16> {
+    let pairs = tensor(st, name, Dtype::BF16, shape).chunks_exact(2);
+    pairs
+        .map(|pair| bf16::from_bits(u16::from_le_bytes([pair[0], pair[1]])))
+        .collect()
+}
+
+/// Reads golden tensor `name` (asserted I32 with `shape`) as little-endian `i32` values.
+fn i32_tensor(st: &SafeTensors<'_>, name: &str, shape: &[usize]) -> Vec<i32> {
+    let words = tensor(st, name, Dtype::I32, shape).chunks_exact(4);
+    words
+        .map(|word| i32::from_le_bytes([word[0], word[1], word[2], word[3]]))
+        .collect()
+}
+
+/// Resolves the checkpoint directory for test `label`: `OPENINFER_DFLASH_TEST_MODEL_PATH` if
+/// set, else `LOCAL_DFLASH`. Logs and returns `None` when `config.json` is missing or does not
+/// list `DFlashDraftModel`; panics if the config exists but cannot be read or parsed.
+fn model_path_or_skip(label: &str) -> Option<PathBuf> {
+    let path = match std::env::var("OPENINFER_DFLASH_TEST_MODEL_PATH") {
+        Ok(custom) => PathBuf::from(custom),
+        Err(_) => PathBuf::from(LOCAL_DFLASH),
+    };
+    let config_path = path.join("config.json");
+    if !config_path.exists() {
+        eprintln!(
+            "skipping {label}: {}/config.json does not exist; set OPENINFER_DFLASH_TEST_MODEL_PATH to run it",
+            path.display()
+        );
+        return None;
+    }
+    let shown = config_path.display();
+    let config_text = match std::fs::read_to_string(&config_path) {
+        Ok(text) => text,
+        Err(err) => panic!("failed to read DFlash config {shown}: {err}"),
+    };
+    let config: serde_json::Value = match serde_json::from_str(&config_text) {
+        Ok(parsed) => parsed,
+        Err(err) => panic!("failed to parse DFlash config {shown}: {err}"),
+    };
+    let architectures = config
+        .get("architectures")
+        .and_then(serde_json::Value::as_array);
+    let is_dflash = architectures.is_some_and(|items| {
+        items
+            .iter()
+            .any(|item| item.as_str() == Some("DFlashDraftModel"))
+    });
+    if is_dflash {
+        return Some(path);
+    }
+    eprintln!(
+        "skipping {label}: {} is not a DFlashDraftModel checkpoint; set OPENINFER_DFLASH_TEST_MODEL_PATH",
+        path.display()
+    );
+    None
+}
diff --git a/test_data/qwen3-4b-dflash-hf-golden.safetensors b/test_data/qwen3-4b-dflash-hf-golden.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6007c5c65cf0113fc36f4f3124855cde3ad7dadc
GIT binary patch
literal 82540
zcmbTeb#zr%^e&1dxLa@uP6LDxg5>Nu7h2riEydkw@eo3S6Cg+kZV3dpoIU5lifi#g
zDcS<1P@J~F`_lV+<GnH78~2afjsUTJ)?RDQ&(_+fWtvp~{rA0pj~+w9M)&I5Yjm$3
zJ^X8Z??0l);IN_owfw942l$WfHL_pW=pKW6O$Zy=V?f_g_;QtingLa+2Lx3MsG9N<
zy$28LGpNU?0TaVgz88Y84Cys$P><2W289j9Z=`&&T9uHHn)pWl0e$=8i~sYpLDfP6
zQ(j?Yuc4#98#Z!C*vL^m#^R3w!%|*6xKg$1l~S(XXY~008Bb8kwTJZ$8{A`fuhISS
zGnM-f8xmHzf7rzSy@sWHRQ~U~qbj!=7dEtN#o#&>>opkMYgGS=z5g55=wT!K^#6ao
zLQtiel>)!%Gp0&V%GgGQg{6$OYPFE+A%Q``)q?#e1^5piHfq3VjOTyd`S<>PM^6}@
za+{5+ri^7&|6VDdb*U0ity_TqfAcr&yYEJYjm9i>2@D7d?1oDW9X4Q8SdXwFy~FzU
z9Wb<C%KZM{SE<wBzj6IPm#K<L4GH}JUMnatph~r>HU7KaurZ^Dj~V@cbeaFI5l}U-
zdhmZ&`k$r!UyJg;y<kv4m7p4d|33r!?^*#hs?<!m;-upL{g+Zysqv9gUTN{+gOBex
z2WK&JWG5}qPyB&y&>mUMdDU0v$}kzEAykTo>OvmKIm`p@<ZPEvEu#Kzj4t9+)L)NK
zGYX^8dRWTy8{Vc_-Al5YFSutoxBkcv^r*51QMCRnZS+^}M}v3;FB8_y(tzjkZ2p#y
zyTxcWP2z%F2(M{5S<mq^$;7|9^>u+hB@bWJj&eYvoz+^)>x-+5bG*7o>q<77rKPC5
z){`hM$MO6Jwbgx8kato5=g}$B#}mY}H81Dma}vfIsW@%aKefL;qWYX#x9P9?Oz%@O
z_LJ_qL;duV{3=y7P}Muwv2sVhb8~PdI!Mo)K73oY&^f(K=lB~gDlMEz&KJ(fJ!Fd%
z)U{kjeJCw2;taG#9>`L?uG^`m-qRS~pg#H!SEl^>MBd9h{h5-e6_@d*!r13hdh+9E
zoW&M)f0o+xTpoI-@H73>>8tOxibhEzor^i}v-fp2ccH<YNq2D<{*H6$5oadmY&&1Y
z>+IHBx|*#P)1o%NT$k*6(CsNd=n&mXY0WNf%R@L)XX_I7)kFG4Mr#v}r5qAY!|0<Z
zEu)^iSbR9YX~KnV4c)-;p564B-&0ZdEtQrw9KbWRZt{J{i+jJ%t#~PKLQUMKBF+T!
ztJ{uFah!~n3}&T9=rDezquj!*8ptBsv^Ce@?+#CQee47Gjn2{)oZc?Pdiiny?zxaA
z=^xaEmP<|B)fp!j96x8Dw;8{o0a}{B(EvU{i`^Q$8ne4V5_ygO<z%Pdr86ZuOZclZ
zgqCY|ny!nr2OptX{H>&A(lBQ{2WfN7#qIe|4wPvUtMPoo&1KS?655c`YB%lWrlC?g
z1v65bcT<9%XV#e#%H6o0ex*2m!EI=TKH*i8%yZ-{W#W7`Q9bn1TUrJ6vfYj4q8v#t
zIUQ#tUsi4^C-e^L^))4V8f${Y=vZFpUCoCjKdLYcee?}JYV$l@L5rk<3D$MGPy1@L
zoZ$otllgLye6*BSr-q!_dU!oXP|Y6ZzM2`oGZA-}ONQzV-lv;5vr~(`^gY+*i9AqC
zvC(=w7=I_p1}=r_cDcURq8z+e3hPwuB0o!4n+1KF9~HPpqIEhK*7roT&Yh)wc_+G}
zp01E7oJA*a5#A>mP}33WYnIa{N^iG`!(G&$Q<GOW(m?%_lc|+vM{VBpT#!Jm&-bWV
z=vb|*g}E?4#~e(dzqOfskt@<qlTfGMYd&#M)0w$A-}JVTC$v>ZYABD;sr;+DvQ0|r
zV!lQDWGf%g&D792i0Wy=$K<(liH>k{^wc(>$9aAPEXmJ#xi3{9bTOrqw>pzca9Z<~
z<LN1Rc_k0!I{J$D=qet8I*X@e&IsNwZ8g!ml;)wnPjU<1OIamWifAK_<4QIQcUC`>
zz-zHm?Kz*b9b=6Zk5hp&YaR^&1|4&DqqDkm3^47PCd({Yib_A>7UKoZDJ@IYQ7I*O
zj&qknbha~04tvkjBK@0|>SJBteIzd>f{$r)Q<)EIMIc{RTUAf%M_w;WWg_<n5|-6;
z+JTu4>j%e=CddJuud^kJhieu(L?>v!7Sb%7%8UgX&601ZG}n>Kd=~4lmETBSDhdo7
zp)WDVbNR7m<qDFZQEm+GlrYj%T223ThEX_BYrZB>JxxG8rsH+!{hz2Vx8o~Pjw7@L
zH|H`OiLbBJ3;6iM?I?BhXTC2T^{`fQlIVa_#Fo$^k|2+0Iv=HS8o{6S1Xd}HZHKvU
z&b{bY+V6&o!Mc|<eNg9bG`Ag2#nebDqo{UYlbfgTG4E22ar4=M{G&6K`(tbaG&NtL
zzPbgtIYOQ~N5KeNP?;w+wHC+g{D%2=C`qbvMJ-L)>?(Q=)V<_JVx)(-syU6GkD^69
zi>h!Edg;Daz(+aT605uj3^L9+!2bNv>A<n*>j%_Ex9S2;FEL!6e+LIH(gB)Y4|xxA
zezy=^1Zth46OzxS<!G?RVSTG*ol@YpBIw$|G#)q~tZ7XRu7>#y;!f^FUC*av74MSm
zG|!nPNj#sga|oE^8W?Uq<&;T$mQUbgt!A_<G+xSRf>T2WqfU$IQTJE<lU~U^Reeb{
z<*=p&@}8A0xO1Wyj64Uu)2(uX6Xd>o&TXd$F=KZqGc6G>hvJobNq;won_8!Vk!J!c
zvY9zt(Nxoc(n<qp7glx=xZ^N-@}&Dc-&RMC0xxRnR5~vU^e86+r}6<uN?4DU<a?S~
zr|TkK$Q{*(8v=uK(;AI-{WKq~c3#qEsjfx2nq9#Y^d@(8e)jC=5*qIumh*hx?L^b)
z2D{=<4NW6_eH>~vQAbN3jnzaxq}TMeHr5i-LMl>LBUF>O0v`+OZs(w0aF!*f<I0+i
z^1Ge&B>hFjEP8_*N;dsLj=LjiDQZ5>sSGZQ;kwcQcr+7O*pBaV1=AcWl)#OFSk3jA
zdr7-HaWWD$wF-S-$2`KFd%y~bT8pPTmpNDua6j$BU-_zx(eV@@k2C{!(rEPKRK7&5
zfqQ?@ehQ+qyocJ*OmOf6caBs>r57@H=o#jrHyr?K$71ewJ5OaBkhvtSA?xYMD=Ad-
zYYWY!?P#XEn_cfG-6eg|?aSl~_0v%FU@h&b{+Qttd_`}d%OZ5N-gal}uihc<uhLDH
zYj3T~_hgF>A$6j0r*W9gOz4f4z&S7f&QF{$9*b3Mt-CR^OSC){p-Jjv#&CYVfc4AA
zwZU-p-7x;-D7~iCrm&{Bxp=BX&@^u9R?`_84&?kr*SldllZvbBdgVKP!>zQ2RH9KF
z<0R;3@q(Kh@)+onkID1+70=Mp`dFrMAWha#Np!YQG@YY(nkVnLBTttI=_(mrh5pK<
z8)c$Y(xPg$lX!r^4S|mlz=rSms+QDdd`o+Ah%*7#f8|`@VPLx)auJGT2S%TXMgco&
z=uVxZ9*uLS^A4WDVQPSV^JIn0rfg=FT%#@eqqL({)DH8Q-gb8yd2iDt`dgN9cL@eZ
ze~}#yVSe5_g=G&_)QwIZ)I=tm9;i}7U%1QkZ;9k3;K*S0v5{(^ZC%+Q)pW03p=i#-
zC#1G|WDfr+%ba|=Pfl?^-h*)#qKvix#mal}N1u$K$70;3?8D!1UQ|(28U&{QOV>NS
zBpttX4?`<w)6x7B7v@`-<D$@}hrJ<Yf-aVSoOl|-4WUpMptA?aIwzM|N~JZI`eB{Z
z+cv<n9vbagrsbWjsL@8$S-(0y=DlaF9+NcUW5a=IJKV<FS|>me=i<{?kNn2hzLYG`
zBzN75QkGKdSNBJKsM~o9WrbE+4Gg~`?NKkOG={rLS0{>!dIzHW&p=ZIa(-QV_?$am
zvr8w61S)lh+8l~GY#@htyOT|4x%0FJuC(6iuS1;A?ia~y7HUnXp&YETU)#7P_>%6G
zjm}2?D95!px1<wLhaI6&QeHPoN9tY41U5UZIqjSjpFnpsOb=2HIv`o;HuS-GRP<2}
z;W}97jB=f~(_1QL(wY~}#^mqyf?Sf9oQ8w=C%UH&PiEImpjb-BpZGnz$2?xb4`icU
z#X2TPZ0KfG&=U$Lr95=U4dEwxfY#|YaQ|$2j{EzTQ=3|}8$1<D$GL({;KN)+GwEPG
zgjHGumDo*IK?7X}2Tat}?spuG4mrV}JoV&vu-Hc2k&pgH?@=)o^fYu^GClULlJay$
zGMTL$K_xV+^w;rH5@^v6J|R6{<>`8gT-}LY^vFZ10PpgHw$pPoQ`^d9DK6`@F{QSh
zytz2LIj?QO*DL7*t;cJ(2cl)>f?AA^!{<b~SD@1SNGsQe&*^!2;a0<l+QUJ-;3hm;
z_h<$7p}e@G3!x8yfob_A9RpHy(e{*qR)fPj^UoZ?_jM&S{Rc2^9ID#|hlX>g{w`bG
z^s<8IW5sUCIUY`#>`K}%;0`@YK3K8ssQWp*mUp>J&~LMOzeeyEcaS6jQ<gYmDYres
zH6$N>b>8c6%<puKqc9!pKGKc6SYxywWrO~h2R!>3%Hs(t^}N<`?|3sompPJJ=22xf
zddn>@>(!TY@KWiDJKwJ@fXe%!V}F&I+7i{9$jfx06bJ4UL7!znUFD+XdP?_zfp2pU
zy^3nyh>=>JtDBs^+{-do?>gyBR@q9KaqaC?&TNqj{7&ccK+aAdJ>_kC%$ksu)tra^
zkyY+CIidM=iR{y9BvfA0aTz-kYrllw>nS-!tN4ZUhF_vfrm{<CDPL$7XxvtEMCLeS
z;iaRgmlNhzv;WFX38N0snmcr%7UDh>u32>!53}950~mI*?r~iz4HoXFeRzwu<gr`|
z_%uqtawM<UZ**vi;*{Sw+EYbZn^+#id7*v!X<G__Lj9-(b%$F{U+^ur8mFN<&P*Uz
z0+d%~c+}f`gIZ|u(8YY)(*w%8pu}kbrvTLbGme&KGM$6zC2;$#djRZvS@!5m-K6b-
zs^OfQpL4voFsHYX{6-(sE{XtB`)EGnt8qY~SpFLbS%&9P45hQ<_$6v53f<h$X-vCu
zAGzG#5-At87y0A5cc_eh00W<p`r1&3!i7(gVm1zy*HYHW4D{a&rw%{%E<lgEn%C@;
zEF5Tk`L$CUs8B+O^E783M!HSLfd#*%SiMFkps>$t8p)~id73`goV1lY$V|BbrufQ5
z^u2eM-s38q16r@Lb3<aa3eWdM@-u3v<)EF7^Z>HW;HtpkJaDHjEks2up-@_^fzsBk
z%j<ZjMmpnwy^Gu<nicFZ8osn9jp1qTCZ0x9d4#7J@VbcXhK@gmwY^FQNhQe*Hmmg$
z#b|2HW1iDc=oWvj4er>;emtE$v_Q8=MqaAP{F!{=k8*N%4W@3o6gu}U?tK;fP&LZQ
z8F`=640RE#fhHGRLRwqK*(YteJaUryT3b8wVX)jo9i;g%yU*QW+*J!gMP!v4^g#-6
zZf`rRP-oyntX9=~QUEB?fvRzYHz#oKi{{~?K>h@+1$VZ^vxdKuA2^t2(i**>3;C#?
z1S{`HZ4^b{mf;@!EByEY8mH+)mmsfT=_FIRfh^WxI)JljM~VT9R&;Yfn?C^l{HZ;}
z!vT7Qx4Km@4{<b{E|cSi*gKTl+~&@xhECdzZ}MH-%~Tz(iBglwY8tqLBhYsjB_I2k
zjT*@@=*>#H3R%vt$RP&sY+Sho9C0!#Dgkp^nbRWeX#ll1DTOBlNdj3q4Zd9j4Vu@%
z=z+cEc{le53;*sG(0k4zE+wm_m!8uNbkqA2w~~jN&EAo&(0~D2kUKh^^&h7pROja8
zKF*Jvsn#y&vft@n`ICOu(m<Cu{M3A>iT64!g-hv9O`VpMLCeGcR?#8+Opj@8PGeq3
z6y4;l8cSg;l9DY2aXZY(Y$$-sJOw;^0QYqV-E@LBA+Kl&&S=IPWv4Eo{B|p~1%9ng
zp6fQ#!u-bFCE49AVA;&JJXht%P@qfnn)KjQ(p@7cUaN8wor3PKqlC5a@&OK4rK(WI
z)9I7$2D80(*I-6#$w#R0;uy<eXB$-kCf8>_xYd7wt}U_FzSIl2np67HP+qO?fPu@o
z3=m}pGMuG!(9Hn6n**M$sEshoYr)T%pnY=cK%OPTG+e7n554AIb>0GVTEdsaa%~+h
zUHC0!WQSAn6WI<V=&zlXxgzz@*6v0*<z|+9(n^%i=sV<dolR|?%R40x*1R*n<0cd=
z1GOa?Bn2nnvlnxa9qw+VU%--$_=+=)8Y3f%M~D5zo1h?%NE2?&B~5*F!XlpQ9_6&Q
zG`H2A^p1<$R@zVh6%S37>H3;3>2g`lA+*j}B>%_*q)n&kPkIJ#^Svx|Uh8DIfm%o;
zlc1He*aJX)k|OY8HIc5wqWZ4MAUQ)_I2U)6+o-zcZX8tTFi#=)wq5XawUh5SeUO>f
zu=&w>O>8gS3BOJF{dzzsKf4dAAU%4nF=yvwnF56|UcQxaoKsTU>fBgXasp861+vXX
zv>CYBiuZA#=Fl^Gf|qD}_c*@4#VKc|aC@B&e953$bOz19x;Nw>!I=?M33tE2na*)e
zLA?%km`us2ppE44b^S>;$+wiBmr`M#r9U}aP}%?LUF1eTB28QbCYqzd2E8zaC!yB=
zaaUlLN=a+Vi(WtNEYU^IJ6<VIF|+>sC#qsTodj}?;Vt;;D$g42#Rd4LrwSD%LanCL
zN)(2bd_j+-3zgD*T7h<hg%U7VAKckgi)!mC&JHfhXtN^Q_#)jjBcH;`#vo@Ytj{P;
zzpar?$3V-hf$}YGR{?*P$P4c)*Q+iRMRDt=zO)@G|B=+kopr$KuaHpsh6e-J(rXFL
zBL#V?i|i;l6Ybai;NU)d0v$4&>vATZqmTHbdsm}11mBw|{#sn#I(5PRgWN>un~}Vm
zR&p4!w#)j!&82Sg-<X}RTtmmYJ2)%(X<DrFNSZ4Rflg0ktKOD5sGz6L0(h)07<n@t
ztNpnIH0f-Krrw&v&T?02e|XO|6rzVxI!h903T4!sP<EG*9?!(uen%ZR9aM4~nyXvj
zG;ce-^&&l`yUuAS)o5=_@?@Y(U1_T;xf4`Wc4U6ro!#K|*7RCyKo?~OJ3N(1;L>Su
zme0YRQRsxC+6@|blg^<8cv8csc@{K)@&XFxPN;!6bka60Z2sXpe4g$(v694X^Z;8Z
zm;AH_%KfN2Up^7=jW=l~$?hD}yY6N_i#wmF@mkMqAoG!t4@W0&N5@Rl%%&PorUr0)
z{z#uX^Cci(B|fWv$VTKYU!g7jMM4Bc#W$QH6fWo4&u(YTj$6@QqR~i@43t$AkJE~h
z9UY&{T_nHy>k+t;SNwx>6qUA%=eWb9G|l7pG8FlTk2ynq!1Nz=7FB|F3+1KgkAi$q
zkq_GR`~p7aG#zp_N-mDjL+*7c4epqsmCQ_J#1?AcJui1Wx?Jzm9v-62Ij!-*Pu7+o
z*Uuc)0{nuPy1%(wfWO~MP6_59e(L@vuk|7@b-b!)v*!ip&<1c($9NT$@Gf%JYF*4m
zYoP2JIzv-635<9bN~?r40drV)2rY3sQ4`d0vS&1vktgzmcj<KC@D`|qwpy7V$yK1x
zzj^?w_$vBzw?6jx0aZ831}Ng$;L=u1@|TRpO8*A;RlvR>q{@2G5nd+)d5e^Sk4x`7
z(QMooDMW-Grvm7cDBj^+3>HhRn`nwNS_bO`q#Tpv8ESAUX6G65rFP`yf;Jn!qX;B~
zGu;IgsJ@z+tD7@Cj-Pmn@-*Ir>?jd6zlX0uGqsd<THeNJZs^p7yxDyY<ZOY}&I(jZ
za%OR6O^+m@qSl2A7>G1s2Vdr5^cwl*EOgf+$`7o%0xozYo!~1f;;XaZ*dkHch0xh)
z_zb1uQ_?_ZK~ZngG-jS0(kr2V$t|a%<dE_7P_|12#^>A<Ker6yS&g;N;LO*e9Olh!
z?@<ye@CRf7x8-}Oi<w?RgLsvm;;cx*S5X3(|Egnninijz+V1cIJxr6_R@w(u5vW0K
z4PeGW>a5?uzf9m0V2~kjTg5dN%;b?Sn7O5Ni%N&?;?K@U`HG*&0(}$7ZQ;BMabc`!
zStI}(G(WeepEweF%?}(A1J%4p7htR#^o@Lv9$YDdcre|UlE{pv!MhH^3<heV_Hd$5
z0ok+~Pv_cPf$DG~`eM0sfrE--KXVg)Vw!sdRbQB(zT_L}so6QVX=!Wgb+GYT=}sG+
zG}_7WGp(UWi)d-uqhDl!l-C#NxA9o3%kCj+K#9mg-qAw7ro~Mx7@`PPx|Noa(~?~e
z>%Y+3mn8=K5v|-o+5lPgI+_Qiv59>!KS%YYJaDVPw+A7c8BHn9VlQ&4r}QKBB_C%M
zS3%$V(L%W<)0`UG6&}75@U*8^HjCjHTrP`ruO}DK{CbOrVh()Zggc{VA85+HS4C9e
z6Pb<KY@<2!Hg$Bm&=UASA3ITcaeDVSGP>Tpi@La_HJ-9N!PpJGFPWU>vdHPLv$>L6
zQJ?V&?MvyvU>l&znCgH{Z+LUsVDMp#_R>mxU(<0>r#f{1BTBE`sStJ))7vgs^VO1?
z7Xh)8v<{TRJT7c1AoF{JD{OP>>2awHkE+@RbGT3D@pDfNtYdoQvvYWYY#=|{Eeo_J
zl=vx4Le`T=y*UFffD7%QJD^7rc!C>88A4y^2wJD>k+V;7!r|L}v@4Q;4D<))XAmz$
z22>c;n3G$%|7aR@^dB0d53xFfXsl%7QQ(%`P=Kqw!KjZxdLFBCP(~v+X(tP?BeaCK
z(qanGV6JY9>j-G54;qHLE()*w03#ovk+R=SWdpP|R7ydxi?7LMQXJWMtmi4I3KWcS
z2k~ebt|E=ym(Cn%&ehzV)E2ltS>M6QU2#5oX25|qlc|#3+;{uvUUvtH{4UARlHqhO
zg;9V7w@^#doD$f%$t5*WL902zQ(R-<)9=xR<byKM-6wm2-3`GI*^t>U1b*f>s~qg_
znz2B$EOt4y)@@Mqb7U0f)$v@)oWW@3%NO0id-<s2L%D#ay|lG%c8*Crx_u$f0Ipus
zcAA&d>k!ET#(Jsa^?Rxh#@y%R)=bF4I`X|*XE=#-a&3uqe}MWJEoJcfLv<!<vkSF@
z;yo@8bQHKo(7%s01Pb&xw}Q{eKutX5RWY7${zd$C9V%s^mcsA)BePq8y{*#F?w?T^
zNgPLcv?g9PUXz@BTE?v5HBcs-wH(&;H_By8Ap1zo%TSx+!2<K3Aj(7i=f_-?)!jg`
z(z2fN(ihr_{fBg#+T<`@z^(gn$Ah8Ga%&D==4Qd(>j<cwX}}NcJYqz_d=UMZiKjx@
zw&p#Y&DlV^fj2p5g_FZM#fjPh-sX{P;^o{MnmiNwz(X6ojx2_{j--o{i|RR*c$McG
zGO2}dB~|pi(;c;4NPVD?Qxtw~t_<aM0bgtF?2v-qQ}i0Hasu?z5-JMBuc(*#0?)+C
zhSOp;x*65++?nR)My9rg;@t>dPR%4ec2?t&URO4+D5JApJEMNRl#gq2AFjx~_zUvK
z>E2D;NJnyZyOUh{;@$v@-g4)Fo1TG1&QMA^l$o|+Jh${SRxA-Skd6x4=TL3GV}#Av
zgLEzf3zv~>xN6%JWdmh5);o{;%L=3z>A0*osMat(L0xna)!^<<VXg(VDyEf^pW|#p
z95TGSsHu}WQCM5RDXo(jWCSm%9Wt6(aApg^$@Pf2sdmt0?6wrf9&J@VFY%Iye0sh9
zp;Kj!P6xvIm>M)qv9qkzxjHuo=dIB@;H>dHOxL-MwF=gDGWF3OP`aZqnj(4#UECEu
zu9iNeo4~)9a8c9X-+z?t$!Se<_=!QtWy`>8zT|QeLdEgjUvveM?Ve(|vX-$UJgtFd
zjr3pM!PC9j>;eVD=t=&HJ^svgO$tAN3wm=PhdY;bAaqt&Tw@=wWiM^gX;R&E!9LM5
zsGzreR1#U>;p6!bW??F_hMLY_;6Y!08ybZ=-Jne+$VPxchR6r>eGa{do$@%(I<5{3
zd4VLmCpt8XGk{rs=4H}Ui*pfFbTPQa5U9HL&^2DTRpDa%f!Dh$H34hA4jSQ-oTN+K
z2-y1@W~(^I(+|=Tc>kK#a2s5=46^w3oZ!YdDITW+H}J-CdQ)3B>RF&gMVbjWl9Qjx
zUo?*A=uGrkZc|qlB$w3hG0&TTgk^LYvXK0`jR*1|{Xp4KQB}bJS?n%(2aL~Wugf!c
zIj41E;5KidV#aF>W#g)v*Y-lrkV_I!TVME$lju6ot7*849@56>@mS<3z4Qn1H3PYm
zH!Jj_>RQUek8qtea1?1xUwDW`vVnt5eN^Z*DTV4dtp|VvLGbriQR_81wZ5lYZXA?D
z4?c=M_=h`bVJ9~iwODh^UJSLRZu(lYnA>iUZq*x9fe%r2nGV$x?xZ&F`2|(5{@e@-
zWr19DLUjjn=F`BNzxA{0YXX2Y%aP<P)<pQgGCYyfAwg>oXA#Yf=|^O4S=85^CqdkW
z_j3=orM9%u8b(F9v9rhBESF^slGeo%EkDA|j@ItFOI+H?lQD;Dq2zrv6b`=~ufv{t
zIjH7FaCISCM_lY$Yy{)a;!IH3m*`Kug{-0$m)AHwN=-R~83p})0o~gTSQSK(P9Cly
zA1Sjsx?fV`B*O`)mlg2U>C8N>gI(5~sO9Y%>PEvoPXaPspdC6+m+3h<spUAU2{gy_
zEk0Y|8qEF@pu=3$#ttqCg`2V~v>3aLW#}64d8Xc%%yzMihq~O0SB#Yq=`WGIL;T@t
zuh2hQ4Rdybu6kx-@8q6_>sU_2KHwPSwI1w)57ou`k$k-sP!}7))N}Z>?2=&T3~lG)
zK(dcA*^{4BJp31ZLj`pil|p`=!_K4^w1K)fB?<hgui*&hNka~mPmB`@d`fe|&Bv&L
z=6@ldku`@SYY4M<Yn8&47Q==008%Y>2GJT+U{|Lr|Da2y8d!NNJa0R=#hDx|%-hg$
z+n~e`A!%&Nlki)!sJ!_i;lQuiz`*NLU#@6s`-3B#25WvAIr1o|mb<(gHT(_t)cWoX
zpti3m>ZU>6exf&Yj(xZe@aUCv<#KdV{P}U{HXvmyCl!2ebLcxibjbqgDjD<`P1jYr
zoi9o;o(MgE0GcV6{YrCmowkKu&TI!DJ^tNOfbY|{8b?8T*sW}{A{{_Frc7ml1NHO)
z7^^==L*x4)QEG#9B}lgDG0CCxJ*&CCL;<5u&`3^S>zuRzC;mYb(EaK7(1C2uPvDp%
zNGJ~RUcIEJldrhjG#dG427b)#cmq%}tu{qH%+<Ga0~+4LbJ3?MK4zC2$>%ZqV0=Y3
z>mGMMQ?i@v{OJVhJgA2E@+W6B2e>TK!q=Fs!}17R+FjSnDLL<%4ut)agSZf!Xj?ia
zF-}2@ET`RqtCfO|$pDWRMGY~Vdyu?@gU2ek3DgWJbR!*u-U`QAv4>n-qP4oSK6JTu
zM};nN|CR-`1S5I`^>NXC286D!^Wo1<OG;MtNbX8$9z&~i9GBHISc%0tiLbi_B%{6q
zZ&b3r=+_ItrXcxN-+Qw0G|bnp;ILKPfLF^v<R-m%Hm~CCvJ_mC+5W+2+{$p)(Wv<O
zd_*(bJsjzrL)O$EJ6qAZnNvEo49@Jlm4o^-e~<1ffU#$HckykW<+&y^IIl?rOLT|R
z9O7Q&eL%0=_D*O9)apVRp(ivwRdIiC%W+q%%ugBs|N9%Y!2G;*(;+ur#((lU?yMy_
zOkZgcE(gS0&s}vVoMI??yb@H<1F1(Db-jLuHeaokY;Sj%?xC4Z6t8ot>QHEc#oU5R
zLctxxZ&#D-vd5VLrPYrDkgf!%zZ{f;)LwIQOZ47e%-1i}U$bNPHBr-OLDPWpa2RDV
zF3@*6WmT_Ckn`k^{cNQf-f-Ec%QTi(@=PSk2|5*sSDsz%#lKOKKB39-2lg;$X)Y<P
zwRtf&2ex<BOuSZqrgq*c{0m&f9^g|Kut6JrsWoK0V%NhRDp|$XmXK2#sw>^`^bch=
zX`s;Kz~%S(85HRzpnPtqi_-sT$8)IX>RiK)hnvlZE2O6|n#Qxx$=!9e^NMdv9!*7u
z^#V@zorhYQ!M^CN(ex8-g3jD5QTlhy#!%~pG4@D|{<ExsyNu`g+*&5;cam0HIeoRJ
zI~bhYQ3=_`3=M*cNTgR-m5LUpFx@C!85+pXq&-lf8TKv`Whr=a8^05zgeJ%wLtP%!
zLUuIv2Cm3+q(Um&=r1Rd^T<8qR`qbsWEn8;7m1UXP!C^pk@k0Sb_&U%hmSe$wE^a8
zCUiz#Ch+b-+|v#k07mE_m1&+1#JPoeZU%1)oN)byzsd={=tOfWn^zlhCC%dOqzquE
z4e+5?adu&fR;3vnsb`T>251@{hx$G$9r2n?f$rh(Sd}n?wKWwF)g?Hc8^`%=X|1CX
z+=|N?U;Z5xw?HiyaVkOy6BpurPBQhx>y@F;G(oq}PQ9=Fcngi80<;X{-lvu-aX6Lc
ziZYgW0Gan_JIpZlm*`(0!aN>}uP^1VZf3Jds=@b1a9Mj(OF;XT;o-Vi9@9PSyiKL&
zPynRK8t1)%zHJR`?n!g`6<3z^Pz?>?gWpJ~nN8C<9oS+qmE%v82yYSw?u_INNWx}w
zM$P3E((CS4bXZp8Oj$TB{YzQh`sm?*+}gU1bHhct?rWV#Igr&((Q(+b8H7Y_uiHfD
zA@MmtAK4X`x4SL082TrYugPXzF4+5~JyaeJ?Jw+umBkD!(jO^Nvzi$3pcl>|RVa=5
z{*m8^!@X%B*gvQKCYyk*A#jU^cj*Lp+f@7=pOaW_>;AwQr6ia*Qk3@M1WYZkXm4i)
zZRA++%g?~p3!21+{kR|cV;%2+p5MVGM6g~dYH<_aq4qq8n&6D7$W%SdQ>By+<R|1y
zsgX^D%Qonbo}SE_)?|PWS_hW-=xjsPXVgpJkL&JF$dX;pSw2UpG*)Zl)LImF|8CGr
zsH;4tG?0EKQ1(aJLy^GHF;GC+I8d%R2~H(<Hc<Ap{GzEfy}aRg&5ItKgBdI$*Xgnp
z;s)w!eHsS$l>nx^PeBw54IjuEG!vA}G5l^0Z`MN_!R;%MNEF51K|UlL7jyw;Jb@R}
zJ4p}i7=saP;3znv?&N2}<*d_<Qc)qGP%zk{tgb=#)zgf824{U1Q7By36<Vn^v;cg{
zcHrz+$x3T6#>~z--s4?}thyO5)D>W^Kin%EO3j^~=&*D$2YXsY=&JhhQ|iwhb%ZQ+
z<DJo9p?O-7u5*1bXfDQiLd?SnEhkUqXEz^@#Ho{?Wxf{Xf;vT~N<UQID7^#qxfv-F
z(Is?rH1d@QjG`AhZ5b~|9=IKTECCKFgDtN`;av)VOPW&^ZmNFpbWLzaeLYE(1-18^
zo}kS%Lo%9sGFPTLTe+0eQ6sUpHBVzSKx^>{&Z7CXJkaQSIG3-~$<4%{_#QYon_Z*#
zG(_j{RGdv4>W<-aI$xp9^<VJCCug4is>Ye*S&TiPME2DRj1!r(9ec}LDF?>o)eO+&
zJ)j-t@?K9yT?maBCzIqI1!`Fa-%CU19xDC}-IJo|$O(J{$^0YdIzPq^OI;$UeVmmu
z^>~5?>qRKCcnzUZT8bxlOLB}X0&b+|H5#G+Pyiq17IF(JJXm}1&-$Aqvd{Zfp0gKz
zrMd*6GOB7v_XE$Mkz%x#^y51E2mQb+wH(xcG5GTNl9pd+sI7$a5Ko|KU7WNmj2=sb
z)A|B+kX?VJmin#T3Z6R4^Qp3(N?T>O(*-)97S`%FSp)}B&qm<R1WbAXqrISW!O+Dx
zN|Gc=%V7R8@O*bSjev$u!Y)}BboEkBEq=5M<IILRx(;ld<<>+E){;%o)oY>e)0mtz
z!#RpGtqZ6VXM*1!2`84$t<Q&W|E+ie@Vby|kXMhVaOWwM(-S@9X(aizA5NUyaaTEM
zp-ZxW%?ki~^V>i5C2Hq3yYQ36pjv%6>F`d$$r(C@JAI%}-9XG|8#wv8IG2_lTpKRa
z;OE9VF>*ula5?rUQCql-Q9!sHyptc`?7~Re%Fmtr>d~L|i0ly$M}zsB0}1B<iRZwp
z)x~N=OBt;WwCqV~v=Be^B;oaI0wG_~Y?;74co+?nMQ%;)EpxTB`PUu9TcLk?a!LN)
zy{#>QCRL$smgp6}g1o6EH<#4fnB(0`P>5@>YZOQawWHG(em5_zfM<@7*;vV0V91?N
zcCDx_dU%u8w_P|^PB}}G%Y#QcL3vN--{J6P=*Hw^S+1k#7da{a;#6ftMMmhn;7e{t
zsIpzySL><UuqLrwf-_5M?hV#VbS;h1!Fm=9TpeS6g!=dbxAV|_i&^WgtFUH$@kB#*
z%?U-C3{?cqLua3qzWg1ksTFm@=V{Pu2WbkN$}K2^IIPn>xl469*j>v{P&YZ~5KwLz
z_lI`-O@pNlcBTtLCq+<6y&T#Jo~Ec|<Y@h!-bh2YqR!Us5~pSOrqs~JvX;i-#K}%B
z2IN>neKjlZK)*-n8OjAbXi6?EaAqTqDdtANw<JQ5ESBQv>JgZ6KlI-c-LBSIhKj7l
zPbApP(GJ*Yx`mzD`dpC$ffbG%)@vH&G}e#eXa3;N+|kYAHbEXS9~tvP+(AKjqu(h|
za@$hkgE<@m^m#8?O-9D<EKW7laJwKCn9t8;DvzcGK%G8du~N9gVEsas^b+~-Gx|fe
zKp~XXwAee%#&dzVJ9V#<M$hP<(Eh7Y`$@<rHo&v?;%r=#k87A4%qyT*&uA5B!(Bjk
zR6IC1USH7;3bI{v3otB{@@iQ-$=gTQN~G+go}rK39c-Z|vI5Pj>3!VUIGhn!Db0|2
zPKAP9PJ8eg`OPJ)-gqF-lazJQbreF$bV93Y3)!P-IRxs!m!Ollhm)Y+N)xD-_nHT<
z-VBwUPXigxI+zHZCY?D!!f|$`Gk2z0c*TB58LrC`)cY^86g_v|{gE4KC*FYXztl~f
zLw0j1Jt}XZ^rmxNBvg0?!Yb0)*Hp%|aO2Ry1yI?(+z*|S2l}{yROLJBYb&@tfmXF8
z0s5wi?t%i3)C%+%dU}JmA@9fO_;wmk^LaEnIZ9XPSUF7J>M2gi_)1bbbH~|;iW#bH
zfZLV$5>hp&3<?2%E#O}?#+gnp<bqsA8s7x#u-q-cK{}jg!*j2p{K%eS;KkQy1DXq!
zc?kObdk&(Vv>SC-(dmi#&uAxc6@H}elGB1u=K?VVZ16(;u^)4r-qLkXfF5^eP;U9+
znM=zxWe0QyRs&}Rs1k3`Mv8qgu-6fIz@2a)OZ1`J6R*6S{-8xXAh|U<DGIDUkA3hA
z#y)tQkvtW3d)MuziBv(KQf*qMQ(Rvg7#c@ibS0<8-oh0qwEbei7$r><cB58FDx7dp
zE=@_i75Z=;7`hxE_txQfuEDvXIP*bo^#rRFg*v#0s*484{lfklWG8bUDa*HMnlp;_
zgyM{WRDiCy%W-a1-tTtTLaKCEGeNUo^lsHY&IKN%wd^!5h%;_Ok;1iNoF)XiWHe>$
zD~WQ$u!nixIVXL!q*TK>;8pH%uA$S&xG_{4>bxC4gm>*pk<<~V9bH)hr5uQ}#!H|(
zKWZZX4ZTcK-1%DrU9c8~ax4{*U0hYqOMdZ}yx`)6c*5iZeBxZVpoiK@CO|n9L<LT#
z;{2Bsc5dRF;5|7*cjzo;AW^dMLSCeuq$Vn8gwB#3c+E4sf)DC>bbb@9XbS)rx6(r(
zN4!+k$ueI;^s{8*X<UtZX?9b|@#9R|SO0b+(JyImF6WBJ$87?pud2rF#m}5y_#yOC
zF8jh=h~2+WGF(6MRm@xx^xk%-IpaAwU_vKkYkQGapLgqVO4?AH(%Wfrh_i7mo+Q6X
zlsu&pDY{6<VXpe%Ns^NMgEyXXnH`?ZK$h&1Ovky5>4fTN<k=^Au*b4PQ=S_<?(_mC
zhx1K6MW5u2yAh`b)1do%BVWy@E3xL6;dYCo+M3{sSKMrN4^A5f=rCR7z;&BPsFO78
zqMvqQhhVN+??!aS5`4`M*9`_vtZ~*TPW$pR>^Nnh-oU<jyk9o*4*o})Y97r66!GCy
zybfGpbgfec&v0ebq~ut55r3PHE;^T}6x2==++#RAV<o7R2AY-P;gs`YZcl1{Sqq<~
zcn&d_q{8Wi)!Y!3>Q8v;2UvAc;&3OC^cehhQ%7Rvk4RU(Q46k~=RltyqASo>7vWqk
z$y}(t)tU=Vp_qDsX0MRAKBKv~6QDYj_;$)>&+<D;c0Abgo=Mwz32=1+_tX8j!;Rbr
zEV7p`$O0q+SM@J`MNxcLD}%W<>N$MgS(mt}`3Q|iqI64J;5FN6Po!FTHI+$gSJNy`
z09&_5hwh^5bVf4DYaRk!&=C7XS)q9LQ+ZVW4&+*6sFiz`b_({Jy|KWsMp%1?XGmdg
zK{1&9OwcNq4-eIRW{*VcF-qC%5XnPcJS8=pv-4fdX+NA8kE63lOQDh_R$jX0pni%$
zkv)(qdi8J?`>p;Xy}2-0>noiEH-?L)U^9unlLq>jV&SC9@HSqBKabNhy(;2-)}`15
zTt*wUzJ~K^=L^mPT;_>9kB`VZJ%U~Otk}b>LeZKDXB<wuuiXybl+&&q^qtd9R>MPW
z3oR>wNEVXdlfTD^Q%?B}lqHhMPI5AG6!mre!0Sy>YhIm*`(7tspgr?&HX!8+38tC2
z_7<K;{`e?_ykaxs$x=E9SDYPsA}#Nu_mpzRD-u|=QnpDTaJ4)uuh+K7hkm1AtkMe1
zctNg$^ewd>=MP9S>TzkjYP7o&h`5yJ@L|_$?n7fN<u5efz60`a&>T3iHp52MPRk3;
zbrZk^%@k^(@iwEKb7Te(gX*m1p-0y0Bwb>sYdd>^vf?R?Z1#|4yWf6*W=*mSIFm^<
zNCKV9l7_3;Qnm%pwlmBYPGfy)-=i3ur!VLoK+(3YId^2fZs4+J9iAl0$X)DF=dezL
z%Kz_~NpnqyVD<aqw0kjAs`f(Uph<d17uf*&8QnUZck4DOjNHzlI@$wIIr!Tmwhn%K
z9jCRKOq?03)7@xpUi-YZvHw5|M8eC=!A}WN(pTnNll{nUeXbr`lc#AT<X_L>rQ>-$
zU*HC&n7J#Cke#bDxg}1GRlt0Av!js3<mT-*kC|c0+3U73GX0g}n(Agg{cc0e&qP`U
z`(Tl#2lV{|_qP3@w{?&$Z$j+>jYXc~V@C6~BRy<{4bvZWDY|Z^t!c7hS7@oSO>1sx
z8as+R^ENzT_{3Q)Tjf`q9r~y#_89&aU%jiD;jW%Yr19XyXF5~dwsf!XBxLm)q@dl%
z0cN%2Gye9P39>W57~cR#$Le3;jbMv?F6<(F;nK+Tnt%fmzzM&wkM2jmhuMrK5^32G
z)737heGf^?0kC~H6OS1_iL@a#)L~9Mt7Xv%Gt5v9rZC`VYh)$yn3KoOX<%wP8-hDp
zh?#aw6MKj6Les1<@5$GM*um<LRjzPkn*715Ju{I}8~_?!^<>B2Vd|Ju?lN9xeW;5q
z%l~qH+uLR{CsaHcY${t;yYqWD)cj=|>rPwPCc{7MG_O&kAGE)&H;17<?(rwMuc9U^
zy!IoVi+^vX(cIjg)JdLJW)Jn}qq2n`aaDP2UckrIHHYM&6Ja`G&KH>$`c{KY3lpb3
zbb$TA=CSL|1+FeX=|k!Up3h;QX@re(2Wl4h&kJ^eVLoo+kszhUGdE8)z1hP{Wf1D2
zovC8encc{m<4hh~P!?ld$H@e<RWG<@Y;)jTQB>(euu(sxGZ|_Z)Ox0ZEvmb*=FMzP
zEoN)lwLAlzRMI}8dPJs#S%mNAwx>*G8i+mQI{cj-VfLC$%+MW6u~*Pu=b6pAncJI%
zz{SJ(f06p?6x`Vr&16zTC;iJUfMvg7|GT9vV``aUP`hW1Kh9dOF*hiW9b^jI45owe
zgNnuTsqSi91FwoFP}~YS$DRix1)4~yZf62H`kM`$`^dlilkC<<n$J$N)p5N~bloY%
z4S<pbEgN6@iLX;?xye6qKJ@7irm?QV&ky27b{0<bHP8}p@gZE<rZzXgr!_G1UZ{es
zrm!iZRZJ?=25eGKjX2uJ^wq&8E#Kq??szzk0_GR0WN%R#^CwmVi8p$EI_~f~)YMYW
zrB&@wTSW&M|0CJBk^L6q+dzBuJ7nhdu!B%rve_7%>DZu4pt<+r40Jh-<6o$-naw@4
zlJ_3m)<E+Nt9;a!x2;XGUe`6yzNhpP@~IUZZFXsA+sih$gLJdWZD#NgI|*lmck^3x
z?r&7aJ;yz9YWJD9FtphRt)$<AN5kCpGRyu!FSLu9#s2n$wkHRx6M?<$9j2ZpL49x1
z#N>567AH89_>diIhad+z0<Dx0?khXe+m%j;tze_z)`#mYJQdN|4zdBf9T`t0AkQ8?
zgT$&9>fjs9XBIpYF_~+aWzrX8yF#zHlP$#^sHa_SK1zr=sHbf)AmK0Q)M9owyi5cg
zvX8hs>i#+QM!Mk2>FhDBDq&`(oysjt9#l^%JYTs^_dB8XFI~+fY0O@JESvD`uW+~>
z<@ChU4EfA{;N(qyz*TLq<l^2oubFSaT&9FwYx?pcq)A_(<MN_UiU40`88}_&&FkuC
znxeOJ10$E&{5a`+RZEx;W|7Tp`{K;i6aI%U^ECSr3=m>pNK0OV3L6GR@e5CbUnf%>
zyV<X$wH620WV9PX^V?b`rv<m_M7N49$YpFZv)w$DGLpwumL+DJXNuisKcTh?$~nql
zreR0uvHIH()4=SIO&YJe>?JMfl(6+}VKW}JYc<?dG9Kd(74|^|d18dlvR;mX3pnU*
z@f4yN=-gyl%!`c&iLS?1wG~X1*`lj8v$rL%0q?oN)p<Ntu_yAgE|N!EX&75N%HvS0
z^=!NlO|<iEkj)2OdQ)4N)wqjzq_3?^6^yABMmH7n-p+o7J4}Tf=>t;duSkKSYyi4y
z33zl0PQm!uB4(4v&lWN<<{CUe2D8EKWG9#}`1d(z;?4H7KH*|!H3iwm(3;)Q(=F@;
zZl!<1ajr$>@8N-VxqA>?Uc`ptw8lj!jQ*ym&1UQIO7|z=(n2s|Sq@|mkfIEbHQ0W`
z3$R1e7#{T}sII?tx6U$;kQtn#{B{Vx))`Qco46bMVss}_uZ7GBDi4%TgZHY4v?UGF
z1n~F2oZ7^IQ%=foGhG}!GZbKYdE>RIi+9E_Yq^oBV>^T4g>LF;w<L1${L+p>@b6T%
zxM_<1arHjXvw<y7<#8=Mx50&^tTs4O!5MB3x)14`PGdaH4_{lBzrnKs2TgnX8n{u$
zI{@yeJkb9geWDrW2>a<Z;PtoWGVr9dDZr7gS8rlp^%eO-VXs1tlFu|UIcy)UX@1c8
z(7==IWLrH2J<&fqoz8eF>MfG9yAlq4J`5i39{-9AV-vEWY}`Q)nJhX9tGP#mZENh~
zjyCbYh<GGKrzNkg2o;jc{H-3~5fGNEn8na%8Ek}!K&tYR3R}Df1-K-QiQ)ptu@W>j
zceVG-ajZ%?sJ@Ig+;qnCzB{o7selrJxaS&X0IxG+p;-EJ6Mloa$?mw&1F3bQeZUL`
zNo!VMZ+{*i(qJxNXUYIu%#7mkZf++EbF+|1_v&_UIP}vTa~pM6+UCXc_hkN&()Js3
zN>3m+o`*!#mug~e#$bHukk3rOjQ_%xe?aF?w*Kh1p(0uf`J*51XPixISJDQ&AI&nX
z>@09rwEND<EjiFV1MOct#Ri!Ky(lBFySdRU)@4+j7f~vpi;s5WVaSeqqPtz3Wr)y`
zK(ukF$%fh-ish;kYulK~dIbo2S!-$>l*1V4p(HMD>l-g{Cmek3V=8b5^!ZkdBb{kz
z)fnUG0^sLMw}V!*iO}A8aq=|)Sm<LEInzb_pT_RgJWi6GV}?OHXEJN32hLr0(;~==
z`#={pvE%G`lij}1Rwj%24PzK3{nY_FjsfmBpseOG71!rn)~3=bZX)}m#wOZ@@|c5>
zy{tBMp%OlO!=VuRAXBXjt=So;_dMW>TiE-U!SQA}Uqjwq*GvSbF181`h;g8<o5~`r
zz(msr3OW_KYdh5bEH}_RgT_CPGi`W!%oH$w#*bF>x432`+y<U_1=kIOHjK~?riD{j
zx&l|ew}osh@U5fU-{4HP{Z7l+b3oF;=82}VPqd3IDp^sDH*l7=6~DuNbUyP$zPNk!
z1NVYLsf$zZ<tW;uw*zH^-C&#9Ty`7JvL}#N-r;&~jMfF?ZDAMZJ~v@!IuF$6VI5<C
zr4cro+nDP5R0hL+WU^~0y~&3;jOAB2F%Yg@&A;fW>hP4oy4dc78!v?1;)_Bj$z^!g
zAbZiy<t!$~hRRWi!nOD7d|@DBC)CF}sJ!1f`;lDcz6=E0HsuXG+3CY$G5)T$q)qS?
z*3Vj;hww(NZiH9zB<w1b)Rc1qtFcSn!EVsz8VKju%uF)p8PD`{Q(#v&AVqP`3{?}M
z7vSu3+Nq|Uc=;F<^!K)bxx~HfM81dam}xBj-QU)v`8pISM>Z#&OhNLI)*gqe2|yZK
z(afWkG{=0va|#I*XL{;EU4&<0@Xkn>`S10sjKQ;hQJT{XHY@p(<iQi4dCiYn0o8mB
z^_CAFFa}*W08bz7HH&$G?F{!IK==gCX*weX&tk999ApTQ)C+32o=tiBx~CZitvHM8
zqdpc$U9e(r)Vc6SO5yM=vXFi;hBNDTINjVI9=)o0iCNnO)%Y0M>kN}XRp4hH;;HEv
z^IYffXv|I7&}^I$o?$BLx(`$?5;Ho0GqE`RM%&#!_8oX)GfsixxgC=vJ@Bp?C21xU
z#&EkF+Q*mi{ItGtc2HA&#Sxm;z6BRoa2J{bxI#^+>`A&s>)OLI(he~V>{5((0N(c^
zkwffRoCIiS3~KF|;@q7LXFrn_wb4e(;^%*N|5U|pPlR~_RGkVn=tw*-#BSnLQ^20&
z+GeZF;cVt0z8_*zn~dPxH@ZN}>q_w5P0p&NY)j1AJaBy;GZ)!-kcomL|3ia-oo`W_
z=QU2&LglW4!VNGj`Liyj^T6sfHVR0439GkKi@+TQA<wN~Dw^!(CmCwW^S?R<RkW9H
zNl9DEB)}t`G><hDD0?6K3*-0z)N5{=3VQ$rvC^M)9um&MCf?bI@n!)lJO;uXL*^FE
zZ`dEo>UT+Js~E>zkc~LKp9!_?z#pF1fi?r3gmYTR?dcyaDtXNsUcjA9Tf5j4<*SU{
zUg*{Iwvl!>qi{-SZE^v-5qrP`sXMUcuEqje>Pa(Xd?PXD5tP}khX!|`zp7$JMw^1r
z=4os_sHSD8ggC8;yZJ_yziK+1DVc`%jhYHyvCkQ#J3XB+7i4OJ&$F7JoyWkhWx%fV
zI4OKqPl1^pb2?y?kJ;-^cL{qG$B-IN$Jxm7n$E^!wkGl`GXZP+m8)rEO{*EqdG{io
z255#owJ5CrQTb6zV0G)^RpS+BW5NA_Sc%-I)#2v2V)tE6!0ng8dqRxTD^NFIG!ovX
zrZa_lm?pTxgW%7SNZBj%N`v<dfsQzjQ~G0|wu<7!_B<fe3g-!~y;A~kzI~R~0b|Vf
ztOD0Wn2%b~&4RyYAd7DgTv&p-Dq_l;xja)#0VT6L9w6%s3b8{?O>SqWyAw<#6vJGO
zwk@E}rb{|#>s-*gop8z{lZ}xP#@Ec&a$xQbwvvexU(*a)E|CvwPy0y@Xlr16N)7Gh
z=HUD^;OQCYv#QX~sq`|CyEjil@1`-6IUO`^U5urXPB(jLC2ulSq2woPVHR*fG22mI
z=qQ{!DFys#WJW-Lb+B)pL}cytkWni9Wh?u}&1hyI>+zT*&8r!qd83^_B|Ual&H{~R
z0k;}JKmNfFkY5zDe`rO<sVdve`kK+&-YBiX4n?H4<2u$K2%W@F(JOeDA=}XOHsy4r
z?gpo(*YT*qt5Bw6fII8K@=w{{e5L{L0<-mZeh993ra*69Z$CouEHahR8!kmkaq#*-
zy3#B)A;??<b%n`?_f|W@1yMO|>9z9}e&r1{mqxhbw6)KoLXRRb9cISyR``mi^o2S&
zouU7q(`KhIG+7k7zB!G+NtPdMF7p%CWTFj0RmW?6?#kz(g<Yh)4Yj=44!!tB59?P#
zRzW8z3!V-L24<x<zww69say%W>XrDFCLv3GNw3j!Gt32?UaDdH=`E<*b^J5t?vZ?h
zbfPm)g-;#@rrkhkY$@+9U{DTI5u^WE-XItIo!&cZxx7iIi`{in5U*k>96QdXL#Im*
zaOe%KN#B}O7&+tbF>=pLSGZa;#arS?MO64N`V5&)Fz>Ei!m!C>Hko~>`Jqq&i*&{j
zLB3eI&L6zTjzzlK0_V900M~Xqr+6HE@bBt(q@dHDU%F+$H-qtR9cNIxxpcBE$+^uM
zy@n+5iH6kv4(rn!zHua0I>yvHx>VXCVO@(oidmSYnv&CAgnnPD<?Ij0r8b&SoF85(
zx&AMX&O6@c^8MqEz4zXGkBkuKeO+%OWK_0f7E&rgNo61N*n982xA1x2*KH&b5oM1;
zp%P!x62Is9{lP=0bKd9ux$o<~uGi~zjXPW}|5$0?%g?pl{!LdDA6%7QjBo6m#$!Xz
z=%oMuN>(@1wbn^Qg6rXKcM5-2Qxn1!y2Mk{X(ho&c~JCvfx|M@`&*MAxMIs;l>MU}
zJ_C`zrDoyWK9(E$Lzrke!;W&);*#__@YW7@(E6f`ltv+H=c^Ev3rcstHM~G9zv1e`
zn(I>?Q8*lB;-9$j!D;=$M#O&c6KtlRNhNQBjtV}q8DN@Z{vV&t3gRhS`V;X(h#}Ob
z5IHL7VLa_7HTFFtOy3h7D!LuIlU1%n%wFNXl}?hH&g4zj&|d|y%$Iav?k}X9t4ob1
zHPO_LrO@5@&(sITh57Z>FpW0XW3iUR^_<q-H}^5@=!w4I7ofEdAd1BK3&HpLty^s!
zwTnZi(Rtx+d$(Rv4kokzP?pf8^{amGW@;K;Y5DxjSO-}gK0)7H;Ra}3{CwH)M`G|e
zbZ3K2756E9wIFcI!=HnWVOsv*Z@MCE7QW}p2JOOc{8Ds@h-gvJ_w-r8RAv0zniKR@
z0mXGUzHG0K_IKzQ8)tcZZ|pC`f1H*CUo5QUbFg<?z=ltQE^w{9VTo{mm?<0)wZV&h
z9PT3f{u0RGf4Yx)r5wI&MAj9QT1mH<V;x|!{-#?__P4mEMDMB;R!0|2;H>-$lCS97
z!0^74lFZvl4I8QmCz%l3Ac`ky3%YemhN)TCt#H9%I?+EQOPJT^4aZ<Ho$WchA}PWI
z>K+!m3`Fn;ZjAeg9h#1!KNlP|#OKCqxAZm8Y8(4)sLOq<Ib8UHejocd>=VALW6<+T
z@U>~6h5MkFO5rfj&uVED*0h;c4|dnam}C;}z`m%!30o0qc7*w`tTB#^r<Krx;TwKj
zSO7kdHK?H7-O*S>o$arX*_vpzS%(xllv5eUS!QC5PP94UC7VusePlDk#lawpzyPIu
zmv9vsppVJk*9NCgrt5VCx%z_sE&qC0JDi7iYlZ)LlL-FA*LQc>@kpw>jQ);3AHD-h
zPUj=HBq)tKdH|+)Oi?bhR5;dM)a*>Zo9Cu5$t$HVLaZ5x7r5jKhZ%ikCbRvh$-;iF
zmiD$O`WC%}i}kTC0<pp=?G33H_VW3|>!9Z3;Zao5rXc4f!DaOC<emx`8%N*72^;0c
zphg~)rk+Yi^zX@GV|?1J=na_t4r}3$6Q|1Kv1VBr@|CHGg`5IgVW+6S$Y$4MzvJ|Q
z1{<R#b&z;Y(n<1(NsJ?(@2yF_3`t63J~=B}qxUUyI2Da*5USYxq?79J^Tw;|p0ztk
z2*XwN$<hBSp$;$C%u>={x6=W$*nFI?2SV>k*7B(<N)~4_meE`b!=jU;g-p}UmLe=p
z#oZ(w&gnnn()y>i-;P;El-&OQu|w4eFS<eg5}kXUP$6>qHST53>=e}A`gq<~^qvmD
zy8f1|K0+N%>Y+;FTAyouH-|m$7S<2Dqc3ejsjh&kGaq%lh^5yfve)&Cb@5k%S=P3m
zvqoqR<$Vzyg?dpsOegClN0<#9f5Qsnzpe+Y2$4R&55r~l0{)6xIak|fbPKd>SRrg@
zub^9Q4u6$5T?IefGWm6&>C$9f`hh$4$lS2McF{lawMVoh?0Byqs~yM}z9ZlBJ6^!v
z>Y;@7@L9-mWUV(8%=pN4!+J}?Q6_otFZ(vaHzA@+G#T}zJ{mKA9R_qxPLL^}(mAAi
z6Fuc65Yb|G|4)?L$>9)kz3bor(_J~9@I6twgQ-6gtq=ReZ_}FOO~#R@n`1w^2b_jy
z;E(a3kHNCN?lzl4j<cJMafzA&MP&)~WVxj?ajLSv5qxc>!^T>|H$d4w37@?ew8dUt
z2JLPO{+7LR5;bcos9*xDYbxCLIrPzF*yv-I$ExxsFXPeg@g1rQsA73ac65Zl9o^P}
zVGB)$w`ql@(lA^Z4D(8bcFwR8Ub~u~>btq`Ex$jD5;a-3qM)w)|F4(!JAy*t6**76
z@?jY3`0xf=N!jrIs54q*E!4pD^kKf>=fh-|@bjDXby!4p|D)COkEzc(0GcoBYm<d3
z?hjiA`-Ib%5oNw>(8?b`Cs>6}mW4Mg7^VxUrQ<2%^(EAn|D=)M>;6y|Oa!UcL_eRX
zX{{<)beU|k!u}PqSRcYoZ|a0-wO<_;hRqfZf5-Zk`rS+=y2WZH!g^n%KMqC7?gZ<=
zBVWr$eu-YOrs#*&!&S0YQ|S1x3>>wDb+__vPIxEy(?;qu^n?j`)_d9`%;f(kw^_~P
zu)TNuQr>b=nAwv-Ign@<tg~C{rn{4xJzzEvvG)sC#3rJ^Riy9sZgdWw&>wEQ&b}lP
zPWyk~d(#Gmm2`Vl9i;w}9$@O{^SaHY^k+36_Wdrr5JU(1oA1(xX~-?@wBnX3jG;<2
zus_{66tAI@$q$0NmO?39OeSHk{==#N#<j6~)QV@vCX3d+?z*EO9o2@GjEurAJ*v52
zJbA-EbO}8T1yD!diC||$?rpY=oal9zBK)0|4SdHaBfHhf4Tx_N_V9vwHp&{ocIk8X
zd37l|eLdfSHF()3pyBqw=T;=<Wb}KfBF*Jf`4XV?dMFH+LA_gSb1*}X*z)idqV5qL
zuP_6b6SaFdae69#{Cd=i48&kqb@Q-Y*ey(r{_-W+k3B&)`@?dhe&+LHuiGq?k?Pjk
zj-rQj(|xfN;UiZM1YevJT-;T*tol5-XfAPcoG%q-iO!N4X-^*XcO45BC>3Tz=P8L2
zStm>dhDyh}F(Z-gtJ$ENf1}r7EN57m1~LInaggNKY5dl*zAz~F9(5aJv)SKteu|Zh
zVJG2R@;tghJD&#6oIT9utHT{<5ND5qt0MgnU(|%Nm?6w$|LGHHWsSoXr~u1thkl6~
zH-Km{-i>$nB?THAx|^fEM7rw*OM2Gk;0GA{UTtdoq>mNSUeXL(x+Xb%HY$?Fpj*yy
zrNiO)=~?>3{pX(Zap5GJz#CLxLh3ZTE2nIM{^)B(<-tiKjHx?#>^-2j=B_kYqnee8
zr2}7WK>=)o4KbC|Hrinu4ENdRU@krpg(FQk$!#Qa6_0*4HcS~bwCBn1mG>p=jI5QX
z8u*<uRg;5TTF@;uSewHk8<17mi268?+KlXe7!0Gd9}|p?@{n<#Yv*+nbBJp&t@a`6
zLI-`(24O9?Ej2$gD7<QkmQVUH3!^-&tq-|_Q9%Vb+8gkKDlTTDnV|8fwgWXyjlHT}
z!%J>9>S<4Z)G9N_WWSiaC7oPD8R-AhjcA~?!j{B^Eadba6WO|e!P;pXjSJOp=VT|h
z`RI+ub#-uDbHX1tV4+hbJ2PK8fd2oW=A(|bM^{};Hm0J>N^Y(KyHhyY$BCGyZTwP8
zN!{I7)&s1)U6T4z2l)@8onWSR;X5+Rza72^yBedb_O~D})dIEIX&wI(nV@X`cd2Lh
zKuSrfRtK`E0neD{)9EZp<*I}SsRXzNx_+j|4F3aq%NE|TWZ_o((Dkx5ylrM*9Q^Ss
zktJt1N7912)}oP*0!cmN{N?kx!$IzUVI8ny2Gk5^ANc>Q1koX_|DB9wZKZctbLk7=
zWz8D?O4ZE-Ys=G^vx}da5+?XWD}>^A%l-w|lPUBWL8j%y@tpsOe!Bjml~B1~2upzo
zD$reZ1*>U}C3KWe!iN4Y*GI0C9lW4NgQfPnz73M-ZWaBupgKrluI`l5_9{GN0X41b
zT}#e;UX8+q@cRyWPFF_T@r${onx>P@S_%!m8cM|p|AD3rySrk3C$T(NSQJj3oD60)
z)a6XP%`_NiRg};%WPF|7b+vpChXGqI|CW{qxm651z^R9`2A^3O-nxYENMBPg*+dob
z-smG=$0pj3#L<=VK8kPww2F^lzpeQ!wdm}QbOLBBC-G@icse+W_H_XLtRI=ZHDq71
zGj-q>ooKnKXe>faLQ`1AUE+5f|5YfQ=0A4Fi0!*wHGJ}$az|k^t|ofKZ{&#I3;XCn
z>rM8qBuY&x_G=k?kQQHkQQEMJ)xw9MgqvizM6>a)g>DUaaw9nKbF_&u>_-oM-KxVd
zOG-+1st9!qWpu8k^y*f^NpI;6%|R@=rT?QVE3=;i>PsGu@E?Ykt&sE~t23MY&N(dv
zc6*)9<Jx2fbK;?s<L_&S>tp%IpRUyWI@mWOb}Y58$&S}S!x#)YD(@%x*U<kSM(unP
zw-!yNclfq14f<Uc-#bi(-%qNRPs?doY2$Sgw)8Mc2s?8&+v{S~j7nx`>+kvk@#*{(
z7}70J+2v?H)u$;miRo4em5R>-@z49h@ohmA?ftQ+XxJq@<re8*L^K!7)h%G4A0&bq
zEkzHC!SPetP$?be_I;>DPZ69UW<BQ?!%Xv2w_XPSSlb1<TzBgsH(h3tf4*c5(aHvC
z#xQRb=T1p3_H>4=qyM6M)YNB`F5ue@mV=1XBz#WiYt=B0-n6RWY~rYCQZ(2b%%K|i
zB<H!TZ=r*!P3YjC*d=M_J|&BI3BBBF0-k57?eN{n9A<-gzGxxVl*Ml(%aKSNEEMiS
z(fk&snw=dQg}=^@vRDl)Q#Z^jz5MCmS9eYGq4xG=O*4}-eGybXm8fl&H{553-BUem
zbMRZ6KpEpSHShP#27xvU*g&#f_sC0vrt!1A!VOse2$0brKO+25|CQe1iMo^AHngP$
z@<6j=Q&fYa9IfDu(}ne|5VjonPWo87S?;iZ_*7;^Tgmx7vV;1ME9|?4J?YH*OTGb}
zoO3US4d|ur7h7*Xg8AplX#J2q&qHMC8@^2??kx1BGpKWgiGTAri)XBgZ%(%7Ro_>a
zX;!!|EVXVSFk2CS!E$JRkQ}v@)W{CjqTzUa@vG55u-P^IR~dZOL24X_M`OuN7Uq;M
z^O=|j)SREVKu>*HKVEydcgPL0;$g0^h-UX$T{&V?f;7~X+9&)zy5$B-%V4Z^Vn+2+
zJqtRWs-MCaf0IRU#YRMt<zV0?VCv+4HLT#_{$aKU*7Z;_Yf>L%T-cMi{x0<j)lhYE
zqEu&LM&<_HV8^ve=p?~vh8xMBe#ows3cn&IT_oGaL<pi|Zy0VT|6P!ioJU1+xYaEy
zRbeZltv17Y$8uXSGUii>P`g1g8>BTcys>}i=JS1T*+4k;r`E$3$~;9)VHb1yp00kl
zktkisXNAcZGiD*^DPqhU;X~a=Rp~{S$G^xq+v!fbDd@tl+d4NWiC_92$^>eus7&vI
zpS%>-(6s(VS#67L8fx-cJkU<RRx6^IC8ulTF$it~`<V~ybCufA3SnB}!y~de^FhNa
z?3z1EekJ1puT9`e30V3n|8cz(c0)^q84eT;*V!m?39tFF2a0GHD_LLlDLT?wH{I7G
z;%$P%WVVa)28zojpBUlm$)7&5-RO32>kL-}-kPB4!tDOD@GV_!Rq9nidG2Fb>dw|>
zu=>Ua%TOV`5c??*?h3wlHSoG#MtO87JH@26d#F6WkONZkP|R?}g4f$+zm3)W5l;`3
z=IuIKId_d*?aQvSTz9|A@0u)HXknN%J1P-gP_kZGa&Mp5|74I(39I|N{wH_Rz78jc
ziR{n_zeh@ili_aJWS-F_BuN}}b-0SGc`qqtsi+xRZLe{nzSHMmP?zitY6#=PdZ<_(
ziFX}?2X1c|=__Fx*6@Tb!?u^pWql50GEM*U0okprWYCvbCo){ch?Co*-~4^+;;-2v
zPC-jIz;B^KcCju7p=L2JS<q>A5#zrL*clzJ-%=IxXY8yLi29g!^(<Rh)mFKeK+cz9
zMos%q^i<w)v$YzC{sVmtm3ljoa7<X1*m<4&R|UxlA1QB@{Ty3HW$Yn$6FVrY=h2N)
z9H@c+eK))s9G4&Ej*ZYM+T5Df+X$aL9W?MW<xR9c@#lzXIsD6LE{&u7WOe58l!mNv
zG0wmNw+~$@JK9aru&$p-{P-WW2-(6;WDr~ELY)z$$E*D+naEK8?0d;ll-sF(KC$ti
z_ytzWr}o1wS+Z+-lE@vo8EyyIC?37`JRE3>mJZ8?vtduIiFT#X94_D||FYD+9!z#u
z_>IOiq)Mb27^$Iwe6Z++bOb#FEtc0~`je-2+(l%~(xO$SrYrwv(Cb~^e7$XT<!mDg
z<zzZt22tl%J1pul%N|t7-&p@{;Z$NqernBfgtb7FH9#G&Q_uZ3YVcCuRg1H_jbXzB
zg{g0vnGD-qH`f29-|0P2IDX%c4$JsWmKg4rJgA19$$Azh*Hnn^?T2`luR!24U^^bI
ze5<7)@1Gqc{hr>TrstFJS9egJn70aHWst;|<hFDA?cpn|x<}twuIptqH8aV5hcE?Q
ziwi7ExI37@4xW)3;Y#0=e!a>*nV0Yk+DrmENP0M2dz&1l3h(kAd975~G(63z<PMOq
zN9?-W7j6Ufy%|>1Z2lFSPn0f9PT@=NN+KDgCsZA*Kn*PFP45RWa>q;ke^F`wm`PR@
zeKuDJ)|59)Mz>x0unyWnfZo_dZ)jnVnCiEB8Oz9x@>axGfUouN7hD6X>$>2hO8D_X
z=I~jpn<UlTjE1dzE(O8{;MW(Z2u-VkO5QdsY8AuOe63k%<P%kofH-oXF1Ljx-gJpn
zz5a@w?X!OP>r|*uuWM7^DQtV7uNDiY`xsp=mC=Xa^|PZ)I*;#5r(Jca+a*<~f!PU8
zIHJG1+reGUVkyI!x&p5<H<)Aeg`!eDuo^xm$h3ej>`$>eUO)1!y|;-}@FrlJtLfif
zLoRtC_+%0hWv5#p@hB<%b%B+mnmYv<;`jXt4T7=$06B=X;Tfqeo9wEb)CAWOyDS;*
zrBW&zd-J=kXMSK~xN$aW1y1-PK?(aTP<xlyzL;#tRVl$Un^1jz+a}vKV%rl*bW7<k
zdqWDt*e5cBXckByCtB1f{fauf$?R%Dyj2R&`Ck3Y?I238)Z6-+?@J|D=;sk9U+}3c
zu=J>yBefP)G}-)avX!aP9<yPq-|&ep>(@%(3l>ojPrM&qZ~z^m2RiR{`ZUBJMVGo|
ztvHGKQESV<jMGVK>c(~lo2;Xy)Un|P*VT68{c`wx;k2*`)>p?<Mc`VYbSFzjwV|zx
zEp`*(mpf&b+v0{1msMv=p?V+riPX?s*F{!6T<X3<QCjau`@Gn}W7M{i{$K6MzSZJ1
zlpx=Cmk3$Mw<W%QU>_1u_fRR3!A@vLUj_trLbGur#&m5+B~WQt(JVSPGU7>c;qlMF
zh0D9cveQ22G-uMv=-mU!ZY0>B(Ev_!DVIhD+Z<5;HGPvhy>_mjWwIgcX?0sdWJqhZ
zP%UTUO~x`8vr1T#^IjD1<IyYT#|PvPZU$+?x-!hShFL!3s}14)O?_L>9A2_Zr`<6u
zhKgFPZX5j_;@N7d2#fmF)FP$!7g@Qhwj94cBK%1c?H_Q(75e`l(_^_thR8$B6fR+h
zK5$>7n*A%>#Dv#&u+rjTH<!V`0+asK@8ry!lF>$u-T$k7Yogt#jd6MrG(Meosp-zx
zUCwz`)Vj0iGg)Ez?`sAffHFG-e32|X>lR>tf9X0hTD|Exe=N_qv11fHag)hkUxBB6
zp_6r$Ea2x(!Isjvl*GE><k5bkYN4$awKVAeef%sGkVCc>kC!2wz&Tsx>fl}fM4d>f
zAM4t%Vz^eu>^~7rjpg<S`E(6YB$I{MY<Nlgh2PUdaX|j2>vOJdMDcGfSx~i}pyO4>
zJG^Z#=vq{^##S7Bw@EW<0npDETK_<ETZv6%Bo7K-vI17XP1Qo2@BCpK>k?lB#l4z;
z+cA;CGZ6*<{G9$Ae+xa~7Cw3q?Db{eS-&AGaK7#?Z0`Wy-<5qDgQcuM^{cHLs1@I!
z<uobJk%wI#j?(c^N5W5gN-DpC{psq<;AQI*Th?>dHgi8rE;#jEYPPD{5p4}Z%^MxF
z@2wyhxC1(21DNuAflya48mk-zLp|e$`%C1A&s$6Cl=|6Jdx|}r*10yAGmyk=vW69X
z3T6tl4cF>_*u}T{H5ImH!2?3nofN+0=Y$)gf22Y*l?fh|^pZBBBD#pLh(=l{d>S+W
z*RKSxyx`9V0qnFlOyVh1#|HXg0lj4A;L`7h>8Uz;mDAwJx5QaOR3yA$Z|mpyk(*jB
z{9b+!l4f9EVPy(i3+cul<mSz4<3&xri3(u7Of&Z_sVMI0Gq_ds_k4`4mhpcRyN1e8
z-72&7iA`}?xi8@n(Kf$t6F&&-@T+AGbB7P8F06*m-`aPO;pE(!YEH6H&HZeiIy8Kj
z|1$)J(7`ir6&oI{8N#LU9pF*(Igb;mwHk@4@***!Y?xfi@%dC=zz&LnXXe{Q{vD$>
zt}6AhHFQ+8$A)vVcfz(Fq2uJYS~A|a{|V;uIX1S=`k`V@h1pKBcy{blzV9_VXI~ig
zg=9T8Q>RukT#PSXgo5YnMV@iUegGM#@GrtD|AY5u@SjO$*1uP@c>fP{(vA%CQs2@v
ztg6L8Nr~3l?bMhY1`Xuk`wvS4U(FI-(tT|sqJnj&6O~1GlRaweQ&>$@+QH#rscwtI
zg;EmcmQLG&p2aV8ZQ{qtmwL~2(AQ-<Y_SNZ@^ANws$Gr$8qKo;I(IEphMD-(IB6Wt
z)1iI>{Q}=mA<;Bk03QBIGg_qE&Cu&Vf<agGtF6BON=y42L>F*IQr(Awv|-4jtv7nY
zYIP{O;u>mwYL*L<AD$^?!%j*D3GN(eb$wCCkzd&iLtDu{mC-VYL9N_9S2|o6>yKBh
zh|LuDA(r$2yPu@zBoDY{GWlXE&0N5qZ?UV+G!rplmiDux`O=HAydkJaP1u>!dK;xX
zBi)kOY%hx1L(T3R*%;J-6bDX(J9RZ#f{r?s9*|8S!7|}#*m=_JO_xE|xu{?5txwP}
z62T@ddH?LFU)^M?-|if2oBBY#5-!&!s0A7Qb3u7}8H@AT7FZGV=}q|a=HdI~ZI`+7
z(Lncl*wD?i_I{rG1#h^|YH4cT<QH(#%y7N?fVb<f6F_RI{9%2Jg51d!!$a01lHLeH
zl$o?3=&Ip%Vsj?g?mjuq8&3vpDU|@*@wPkclc22ys6xHD8DSFHoxik$<_<gKZxkQb
zlb@`_$@@XmfZ>Xwv(5LlwWd!Vj0ii&vinjpQj_wyDg3A8{>y~lMW4Wu=<tVMKCc%*
zM|06*rjcR!7mei(9x11FL={}8$>7x{Ex|$A!Zm&*D$NFz@16Fwf;X&|N%%xY@V1eD
z#V0ouobH^~^B}FoHXA-UGQ1xxvQPP`a#n{rwAR>I+OV5181y2x=GPLz-#p`x76_91
zwNA?H@GUwN3fe5Kh6)$2|4?_{Ro8;`X9h3HF_`9#XrqqA+pg2}*wYK-au&vKChDF-
zWuxLs_iA5wSzqeIvg?X)tR9oWM478H*>Yj0<sBoK$-TS*L(2yz?|{a-SqtK0egl^j
zWX-c-=lQ9hc}k`<o|r>#Gk3;(jIOY_?s9x{HvHfRJ}cRd>GFuoN^X$m$9kB&V3zPc
zyzY;%3ui4I(QT9EB?rSDedP7~;Eij8>zlj1Zh{{gmIQ+o2{TYBN6(dOA$xV1+f9Vu
zN#5v=`yh-XXTMfYx<i^MBc(kU;XO2+;(EtTad)us58=m|sBGLFOq7?yuB>1i&lKS(
z1I(Z^9VUCIr^;;e;G9$RXRJ9lHPqL1j%1QsdQnRU=kY8xU2Q)U=8;qz-P4sp>-!S#
zd4}HRIC`)WY^=oGufahpVd?RgUq{KcwGQ%=w03wtJSgbBpyE#99{#M0JwQ9^05>`8
z?nOszR=7`6*)cK#Q^_E8@yjibj@R{Wh^0V3`z7c`734v>4bD<Yk%bKOad6mE6plmg
zo=in=Ef`jG56G)Mv&zyBg!dYHL`LdLhJikc=n{=FdxG93*Dl-|^|Er3#-4&2YB<O4
ze`<r#*SA=LWyFtvi&veiX*sFmlPVR_J0`#YbJ%Jt61GJ}eF4jyhaY(pZSZ`wQ<{WZ
zH5W)`zx%-g*03l$^nv9K(}hD(Wir5*XM?vEgS!7gt$IiL!uOJsXD^7hJRV*0iscYP
z4ZBUW=*nl#hz@j!x_|?+-d+uVv5|PV$2_B{en7p^Q#S{sQXQ;3+iHXdBre=&M|Bg=
z>FKuWI=2K>IsvQ@7oC$C{42Ni);D86^W1d4pzBX;*oJ4FVW1tnR9Cu1@3?XPTUo{v
ztJ<*eUl71LINUO9v@3lV-9Qp);Cx@oCA3QBtms236At<8_PT9wpSWCj{1uv>JsVE9
z%Rp@zUSr3u69rOnbIoz8b4Qx!>*UI6!l&1oS2~8>Ua8O5vxYOn>ztVqDC}ji^QK_4
zoaAT+*x&3~zHk$)sy15p7I@4u-RK8vsQ=;xbB3GjRWcM~TzUFqYT!rO`it%_mjg9q
zD)Fjj*pQt{Ywy9$O8I_N3#~+Z@6K*4m4QJGi_j~(g|+Y$o2<QV4finDX_MB3pG>mh
zraD?DhF$q>w=H?SeEwuqpKQ-yI)gHY9b%Wj{spMlOHXbq6%}7SIlI}w@D1H2RJ9sm
zyn8C!c>B-oQLvZ(rJY)i+vjpHL5DeBk}TN`8;gFh6g_CB)?r@Q1bW;IjWi>wW?OWN
zE&5yh7UFLs*9;cYEjHAK`M$2c?~k@s1%A5^?D>@at%Fa>NT$tDZf1fuYWcarJk&!f
zF=0a){4;!ET(~6cNrs{yl^WeGpD#sT@E2K)H!1D5p-jF&Z{BgY(~XZzPD*RCdPUF{
zn`v5bZ9n!iX(~ZQZD!-#b~=t@`nRmK(Y8Rg#oou0oV2DWl}FHO8uDpV>s0&4ZAFRy
zN~&5r@c1Im>qg7(j>=|Bl*-_oo>*{RX1`^oDrJ&OPKQSr<fDpbFMha#z8lPOO?b)`
z-laJiq2K6z{z$Is^Fd?O;OE^K%Y%Pdr~89*;`H6<6g+=-{7qZwYG`(T<5;<_pUV>B
z%4*)M8|c1>)}XI!l^(M)%=u5S+?>1jqX%-#4IncxkgA9m!@sEc9}E^dEZ?~E?8H>`
z*M;({{NVQ3cur+HY<>~3d;sdn^RBriC;R&ztl~kinc11lZ;pP{KGExXAs7s%UrYwK
zu3iN<CwGO(&Yjb}d~Zc;xdF^(J4*aZl9Zo3Zf)!n*IEYH>oP|ck~JTN)>hQLYdI~5
zGV6X#%91s9-cCoq*%A3W%CA|YZP9zuDtKV=!F$#+_P{FZFY*Pd!XtZ|eJ>2FekPN&
zpr!(`ceW(GJiR%|sNdRooyOmLTnXl1_O~I?ERf`O3-EGhP&Kzl9fJFk)yC-6_|k4O
zIl85+$39snU%IBQq6Bu_E_1WfBKMV!4AQaR$Mm=8FP*NVTmefH6y!O3?OJRsy<nB}
zH<xt#&t0stCjDP&tr8RSn#rq{j_l;#sA<%mi1nL%pdH;1sTs?l_u_}hGgvBjCr58u
zf}XOJvYTqNt@@1|($Vgb<n>FWp(|}!nW<7h*J~m+@5!?yXP1+so^V4Sh@%uJsows&
zE`w!_8b$dy|EUAf>^j@cL@6r?-aiE&A0cU(9CS}!1cL-FYp}zmMCrQa)=@do7`@>l
zf6~~eWU$k?p>Doxa6^bGuP`U>b-Ym0B+El?6;l|POky9)Uor;7{F%Na)9o9p>C%P&
zxX+p2LtO&T$P6Mmss~*faQJU}NIJXY?j84o<bl(k)7-A9Od;Mjkw5Jwac+xC&<9Zq
z8->F7z#dBv)?&4b$r^TK6{<uX*9i^3f(EKGmbd)NeNX<O4il_C!;j^sChf80rekOf
zo_Vv(fN8&O+3b+MMeN&P=j1gQW;<=G$K7jEob2mMnxFp1_PoKbL5V=z@nEagKuPLn
zEqKB*mxG_)#_1pEQp4-M(#S&1LKVX(>1T1Sr=@_kHwn(j60$PCX$dRlw!5-a)9$kC
z)=)0GBDRpK`B2Nr1XtYd+HQJDa`{)H#TIibrHpH3qhkNVH*9lPvHsDrKAyXSsdkQm
zbT7GSOnz7b^BV&uPvPGQjwlnu;3&6ZLv*LDcjIHJsEyy|{t7lxeS5&Jpk6Rn&#flQ
zQ3y+$K_)6u@3a32Zjrq!c~HYL@YcV^hPsL9K@U04U+K%_(DQ`*qK9^ZOxP%y$>hFu
zvepVqq-n9>0ic^#tRS2^mwpB+*lPD=7%MV}j;){N8~q`Qfm#=G<Hl)_*7hi`T}#q3
z+!tCx^U4fYmz`Y;dswTT$ut*r8DK&Z91S+o@zM$uv0IAKiF?XrwwHC3uGBx_1ZVaS
zWnRWrEeb*}XYW`6H0t}<+$f9J2K?>G@!haE-ft17V2v%(R8}MC<bDcHF^l<*bg<{8
zDgLc17`-q4<wv9H2@SH6P2xK<YVlx-e!-lNFWKpT@$Vm_0q%6uoUrddz|d;L4XT9e
zqPgyw`--}S1iPmxIJr%t?vhQv(DCsJZllW?jC7Omo2{5X^|Q9u3GkxYIz%P}#cV~C
zmeX-S3o)txeOm_t+Z3x3t(Sr3qvWW(MW_dP<f^(j@b*E=5sjcaoO?qgg`J94aJTkR
zJxm<A$oYFt3(Ey*rXd>b6PFpDW*Qfp6fyHy&+_w6<pnz*%Yrw3F7`Q^_jXHVf5|Vh
zgR@sp-sk=3LRNGFrWSIqFLU!=k#hQs2r-j8S@PS~Xq=YRA*_7pUSKLqHO+~R9Mgr=
zV<<V>$5d@Skbm9c;DTJY<Y626+0L`?t2o_XxzAm*V6k1$wOU#W>GN>g1oCf>WU2c}
zHsb@ov}rK2*+jiub^~0vU+FQ?R@iC-`7D@i%_Ng9a&OuamqC)d4b~m+KgcG4MEkq6
zI!n{Uii7MrYfkEli{g7e4h~}n3$U9~oZaktpPlN*xyyjv+`uo6B^IvMLF6IIxIwON
z@U^V6<<`(Q(iHAPZ1#Yh(zo%T^<hifnVOhSQ@OG7Rqz(}G|_T_LLRx7bpWTTjufM-
zzN0IK#l+JURn(=!SCoV0?6+pNQMSoXD2@evRy*$wSb3X^H-1M}NDWY)^4~k6CD{Bq
zT>_iGpfTQan(M)t98A{z1njJAaKfF_qB>i$`zH3OZFVo(F8$h-<t<i7CU$3sn-Gka
z*Qhr=N{+LZjU>BW2Hv?v;`9Q@I43JV7JpU1D%#uB9{&^#U>aN#-5a!ZpUDVqr;BVU
zYx%r(1v{RWTW&daxSecJdwow1GmoH=6t(2~TWq;IEgk5$Z46iWnRgfgMt$u15e+BH
zU6h^P{QUBuMc5+tSZY(pR|}2d=jenhsG%%(DOvBoH7CkwQ>N#)!mEt8QP#$#rG6n3
zTxy?d;{LGY(w#f&SE8gWv~TIB?C&O9>gc4s@3zWjeE4-(%O-jU=DR7j9{>2Y%a7lg
z2e*my)Er7NnDa?Dhuug+2TDdiLK<rw`xX78tV^^d_KU@7YVBrw-6)$0j$QAVXiT?Z
zqN}SlZG-M56Oq@SM)}wxRA}iqQ1VLG0i|l8)YA8uI4~&wGdUqgT+bkleIAQvr+jP_
zx$p!$=-aL!k#-8Xy3|Y@eV3_O7p*LM7n+fa4|i~S#s_WU`w)dX1OZb}-ytWx$Hi&Z
z*d|v|t6CP{0^GPuZ$<Ysa-&hJx^mh>o#ZxXaq9iwBGOfq^p=c@qV{->I4h%>I5`tE
zC6?2Z+K`w|v4!@kX7-)f!#P$%ufmKU1&N@mrl7&&auUR{FgA>LD8%&5IrJhO2&&<O
zK2;`Q=sIfz_iAV#NoCywvKUh5w(BkTWuzM+&k^5y=q-6Z=p;GZF&nI7;{W45&ll`l
zw;kT}soS@IKI+dnTZv!Vty5efKFi;z9zQxKE$wPB!e+WcQA&K+AM(9Bq7`H!47!^8
z9?kub%SE(VgDP?xUvyetb(h>ySEtT&`c*Gm0q|NT9c-s;Bp%`_RV``EyKb%oSaPr)
zmeKLm!YC?(R#4c-X?eXSze{>B#hB<5o2CDY>cx-5%hh8&GD^~?t`-fm=d`8u01q>z
z5G=k>Ki9FrXZWS3a@hSB%;RS(*dMM&ESqK{hS!kPWJllBcwHP#cG+wmwL4{Oj|_${
zrPj*f3aR9t$WJJ!Kg&q%rpJQ<ZoZ8GvDe2cI#OqtOfN-um_s&GhFX71qp7J*Z=@GA
zHT9Vr+;~{pd~k1heH2f0hoR1qo7%wdaBK8P@YE`~Poz4NpcC9JE1-3vtiFT{*QMH+
zolK4G7omqDC6R5G3aB`@ogvOg_@_tg+;(~5w(5*%riwL-uEy{h=nL)io?diyVj0|e
zE2LBL5zS#PGw^s-`7;$%{t>Ib1nuUa8?Gxsh2wRcYb9S|>+yER+M;%}mgVxBi=&Q+
zN%-gpAKPUxQz0^}S-`wIEf@S{oi5fh`1}m?MOQ>srw&RYtnp41*dy2j)3oq_WcNXy
z+fX#KN)tL}Z{W=bQ2TkFcf5gJH`8)%ba2bfA`;eu9bD1Q(p+lFWa4lkvh)LNzLdbO
z_VNi@!FUSld^l8J<BkK%7CS26MMYviN>yxisV3?Soi3NiV19y^YprWQ(RVm`D}<^)
zn~c^s#dca1VpV0(cq4l;xU8F8XKXW<bitxhaLN}`-~As^p(41re2|J8aL4O%ZAt9d
zA}hf24Z@#TlX>=mU5F)Bp0;OAlO`4>14T4~T`i@y?^X27mu(j3tc{d}Cocnw{2_&+
z6Ou6~D-GmlH%)75W|v;elKYux%j9Q#`!Nlzq`j-Bf@ZLX$*2j*C4>F|qspS0ZMB`0
z$lY`QS_V6-p9KG7WnMwUc@oj1CS9y-IMZ%%GE(A?7P%!dfSsvg6F8+e$e7N?a=w(l
z)?Yp*f=*ZNG>hMl&h~k*+NIJ5E_C~B9L)KGRY3ne6rT@e)yG<sTmM8e;@jUR`s8un
z$|8lKg&J{JE%vzzw0Fx!x{vCXCqq!owdb>>uqHZ8PP*@;ovn;|fm;8TT+!~}tUi^W
z+)ZnXh5Z6jnI)5g@-jqox$HJvt7>l<kJYbqwcJdtjp}yE#_L1*OgC#`YABAnn$(3H
zhI`bHh8lN*W3{cpV&{mi<3KqTB_IEPGN%^hp9nd_#v6B8U>yr!Xe&5%CxRd1>%pM1
zkS8l*8(qBB)wi*wMrg|YtTa|wkq-0Z@S}=+wKTt@TC{<EE@j`iTXG+SHQK7cFeZ^F
z%^PNhEoXvJ_OLf$*W4^=0hMtn;3MNWEmN!`pJg-1AUS$aO8=+qkmTfT_h<o5YHu9{
zdprrh_(adRWyXXoy+v+)UJ$2STydAhP1k-p4Fz(tTkhU)rF9i?=>@%nmfyf$iz;C0
zIeoh*yWbEM04*r!;dSW_!~HvW;F7s5!8WTwB&wlB?PL2dxTsyEoo;ZW?Yg_dYMzAY
zzDkZZQQGr1tE2HQJ<KArj<Tn*ban~6)kSt`I=sj*oq~713(I*Qh3F5t3l|$}6J(Li
z)vWe4dr~#3hQ6@Vj>w_-iFmRx;DP-*lSq(RQ&RhORc7lZ+YO7Upm;TzWwTv>?EOga
z3Tnn;c@5_Lx9h0A^@^J=%WZ1#i>m?t2(kLtwUDjBFP?Ul=mZ)erFiqgvOu23674N_
z&t7zMT{kiurR{N0So*k>oVgk9yif-xIqa;<DD}}nzH>v_mG2}TE#b{r1|2VBdG6=h
z50qKTo&{6wv)FLU><Yq>i{lfoMV&x11;NTUqpMicQX7wV|G_<Pn>7}V3J%Kwdm#Nm
zluMX8JUQy>;=(uFP$m#8bDK4_+r>S6AGqsiKNIOK9wC#te~FbBxWAot&sbHag@fi6
z&|mTuaiqPyhu4{AyMuI5S)FTRKq%yPP_XEEVwTHs>EhaQ1}n-du9yy&_ob5c;kzcf
za@J7eg63H3udaHSKl)GZQ^#SJG|eNG-o#H_cgyR}0Am%k)tX?FiCz7{77@N@Hn{kj
zTSu*DOB*cNiJ7ac1QWs&b)Ft`Rq-Gl^nm^wZNayU(FH*PA#)L(CO(d{5%!OavSL)}
zKH%qG;m`3Pi~BmLtclhaCHG-;*R9vPZj0vR$#JfNRd+9f*IUKX1Y<e-ljN}7ruX-E
zH$}(Fw^0g>xdX8BG_E}BdeYiNr8SLqhK*I06p~k~@$O~gD-tV8bHdxGxvWI2&Nf8T
z1yyW${NJ%zHj^&k3)s^6V5{Ve=7SS|ad)jH>sZ^KYAa=W73aE_T*PBllGK*Ny&PMD
z2Q9!kJ<r_5H}HkWWu|>0GqjlY!+#~%MXiD5=BAIlK5Mm-n72KMg3-Yw{AX&=LM{6-
z*c3DfI>=hiRcd_MFEYml>QGr16t>Iufu<z7U6YA2*PV-R#M%|LuJKHUh%coBw3BYp
zPpqAtmHx6+E5M;k5Ji?_<%y1aLy{P&=HpW&edb;=kH}DnDS{8&JpAJ*_X!+elluoh
z@pCMlmXf1VQq$QaNv)^cE!Ui=-&eWEiFwqy>0;>1^X}_e;zg@S;$H*V|K-M8x+ELq
zr0vi=%-2ipdVzo2f}D5qyS9VFtAH@(YF@o7Hzg+?ceJL07ZxRtwOS7Z*(JC2)F+x-
z?_phM+~nZ2J_&9}c6xJaao5pTe1d~A1P?S)_o9KN#xG=&^4#dLhRU<LK|i}2WZ;vZ
zc4=+h{zv=Yk}2@W`EInkD+gUJxIl6~#pU>T<(6FI?iTHDC1Ne3d?6F1Edwh2Lvhh^
zy{38KDUU1`KPTED7h?m#NryoQ&&yg))l0V2UD6?-{napp9=hKSI%i9)2I@$9epg1z
zL>6fhyPlSLhF`i2Sbhn#(#!5SH^$PrM14^bEm_@O?hAKT^2%tu{s1`&*Q%?tv<!@@
z7#X?o)TGDBO2;g0w46`%2Xut($}Q*I`IsE<_<>H4v2ftmS?|B_p2uK=|A2V6lE42+
z;=l{*^bHX9zwQEgfdSx={I(%TtshGbB1da@PFZd1Jf3L_zhRcHG$!RsXDiFu_{whU
z9$OHU<;;`~GU%7?UuHG#a_L-2y=D3I3hGvzea8t&=gM1lX=1xAIdu$~B$v+CMUox7
zHGquL8eJ?6s8CrbXn%?oa)(7%u>WiNJ{~(x=-ff&2*9j$K~XDpS+EqW{0X+iZ9Z<i
zR3aZgC5VAXrt-<ZCsHTsBkXB+d`}w_%yofvaSMsq6T#Iz-Tx#EimQ)L!z_oS+T)JI
z`6+I_^re<<92_N4lJS*z9ReH8Vwc_5XbIKGrDh1`lcm3GJKRwnLg&I;c=5^-f^+Iy
zURk6kU`sV*FV@l7<-i9vVlTcY4jyz>Y@$}gAD_3p(KWpJMp@`?YtnrThtR=(wEWcG
z%yxyt8Fl-|zOX^~tWSAn0$J#@R6527&1^sDI2ktew*E<GVKy4o1K7|m_nrGwKG8wC
zLiZ3+C(3F#b5G5{US$i9yD3skZ=!utjm_uSf^NPl_(VU^Z93fLL>0{ePQFNl7;WY3
zHGNsqhEJn-9j9Yhrzu*QwO&J1_+4_@bmHouV5`d}C%_z+Ejbu8k1OD|fIPC>YnENR
zldaq1R_az127&K3pw58#V^W;FS|+rH&p835Ia%r59VYQz<q7X|Yw)k=sAa;B$-aHz
z=q(C9wD0toOyc+EW3p@3V87)+F)Hn1!BKfpZv+#a($8^>-AQHJs4d!Pg^1R@z<(w6
zaj@Pc&H4Y(O0af2-RqhI-0}fPtEc9ZL~eB`N9JmwjBxM8KDJhHrD9qQhFBguJI?y2
z*Eg*-E0TbJydR}tHr*&IF6mG}k3`+<%lOMt4^Z)X@c*P}Ly#xFRFs*S3f`5$JH^}E
zb*B)e&(hP?(dy&ZcUke^M?ENifhlsjWd2T6m`vp}+3tSYPdA_?$W%#&z4f(zb}(2E
z6CCKevD$R2qqIKLU#XKgjZ%(kM9fYX<NysE;8dpo9h{X~R)OeP&mwGMD!6KdykMna
zDLI+mm6N<SlV2qtD6TTxqk{f}B3hAuPsMh#@zqmgIR?2mh>tCaaqILo*8x1R0RHg@
zdU#6BD<kZj?Y9ivBNL}#Fkh1<95>*TRu9Cux0?)01zm~HrIytO+dX$ls>^NiA?-jd
z|C7c#Fj|EQHcB=`+p*ZyGC_yZ4WCZG*WD=p^SG1rJ3R-QDWLP*2<6@xSoRT1g~suY
zKCg@QKWU*gtvFo2w^hR;ce|4EyA>wpq=!=%;$8d2Pto%ZEFo02N>O6LI8DzvS#9xd
zxO>S~S~~gLP15h>q+E;+*cdm}cA=l0N7pFh@PEXpaoQ+6>Y5UDr?`5Yo%ZbG7WWEj
zQZmcyu9Jl+WlhngKbME@wi}|mY=MsC4#E;)VaY5xeKAYA>n4SC(&Y4V9f`K-GZ1id
zc*gHCpLKjk%Oz!7qz*o;7`P;<duIt+!5DnScdn(o245}<ax5u5KyDp$6uN6yBKbD`
z-O7W!*P+Cm=fovV`guuL$G7D(O%FyoZg<x>9cFp8Xs`rZ`boFi1-C{o!%T`=zOaZS
z#1D0|K!p+YOcRJB_wkJFTs`7$Wq9!=qQ^<F)^X6(Wu`J-k>>WUdxWLJVWgHclD*Us
zrlD@-mb|H>iLBRbIeW2z=N}{swMM!5!A;XIEVpZ;Rcv3h*=_}w-6q)K7`W(J>1n-~
zm3|z&G#!jr+<p?~Zd#;Wq7hi|X)s4ldrR{&2YRFyWz~ymAl3Y>C>wgzYs|XXLw>p~
z8o(=(j1H~h+%f;Lm6Epcw_Pnd(z@Us$tCIZee+-i6@t6ENn6+iv>|E|bf}%t&dN+X
z`x%{drNr!cn<kf}jye&2h5h%j>H51Y!5)t}>d4gNMMly+)J&IKGIW<paJR0KTN>C^
znG?+dchr?4?tsgKN4^ntvRQJ-U9gfqftu;^RN&6kWl|DNdbYiQ$FHm>T*RsDBTAiV
zyPyx=C6m_CpDedtu@_L@PGcKcLoZ9M0CRErx?iyI+PdDYq0?s>XrQ*nS!b|rfBVzs
zxxL`VrLkckut}gjZulSznp{7$VPL^HdN7&>bWca4K{HFhwzu7>pq;g5n$sG~XX(k+
z?iOlU<VmnhPFcHv&LDIIlOtL%Dhe7&B$6g-O==*s5vluWV$@2~x_8hrxYL3+Ue3Gh
z(s8oGxS^Stm=QiW2PE7Z-C%=-_J=)>$3BX3GhVi$R51HL+7qRqB4k7S;h?EYhowJ>
zespDA;UJe4)f8BH3A{-Fa%e2~qSWT#0xN91e&l8cd0>34v9j91G<I#2e(b(wWoBzS
z>m(WNo{5%l6XCpzT|Kg0In1!O;UI{Td~Jy=#@DP5X2Pvf>W?C*x<xdH>>}FUz-Qg(
zbbYjciQ6t;YeOx;M2m&2`)GTI-+e<z;=c~q!&uV&Kh1TltBGH1%k2GRey)C}U(2%i
zK}J0mn$1LQC<(TKz1u2H-O>HGP=3c-Od8{X(=(g0qvI|?cg_~!jdr-zu+;661FqFj
zlB(;<!%wfGFg7Os{OFQy?b)ce_c^&9w03)42gydoawhEfE;wR8v7x%=2pY;gw*qZu
zf7AkwG&**~vXh5O7XBSng^d^2;Y7X9H4#=a&ix}b*x~&$4L-PCdRue3Y7aGwl>)8p
zbhYc8AiC~x2e9pz@$q?VV=R@+4p$vYAI2j$6W@A({O3X05<kwB)gve+he2(LE|<${
z6^Y=3<T4ni9CgV}w6G-IkCrvc&Dp2}x-8(E`I*re84o_0r>9W`X7k+VU`L~Z4%*O$
z!+zqeoh|35_VDL<O|6Ugdmpx0RW_kaXA<UhX+m(qt#<Fg?r-W8DXa-vNnb(7FKeB_
zI=6zRRt_KC)aAn_mfN%-qaAUn^hH~!UFCqZwKRIsp2Zf*EO(MT{s<JDKjbwR*<ldF
zKW><fl=WagW)Rz(+K*FK%wkqN+$2@3IQff3(MNXF^&u<T(UMtKJ&qRK6#tTf9)k`r
z-m)?pJu8i$>wb!^gQ^mUJ8?2j_PK`9P58Rkw{5>vBu-vn9oF;Ibe4uwFgK{jY{S9$
z?sX!pfx3=5ir+9Im@g}%6{v!LMd!dJN5P7xoQwYgl-U53R0)rl&PKVx?Bivpoa#j~
z#l^__qzKE{dd}d#`n9FiH+Wi8>1tn)J4}`|>C>-^KWjIvESPLz5XXM((Dl)58y&3F
zM?{jk^0pL{d%9PT5dSC1hfKPer`f?ENAY4AEQ=kB9kbhb=88H7e0x0l+1{7mK?@}%
zWB9u}66<4&(bH#ga^Hb#rLbe}56i?XoYO%`%@y2nW64mzZ;hh&(BBGE50>D*AlG-%
zZh~@#>6b9MM#RL0(G<|v3fIh5SV0>L`eGtKpJBNa(<-63f92Eow>hnNJGjka`8i0-
z`VO}d?vCtdwKriG^i)fEes&`9`FEF(h_ap)t^~UtA?<C0n+8fr4FjpCJK2k7in41N
z>8v{JPTD2uCMM_HT4HAg=HKKA+Od1RH80$%PI$n+vD|p1KAINp)WgN1T)L6lC;RfG
zvV2E2(0uYJlN}{1mzCdLH~6t`3A)I+zrixAT6Ju*6UxN-;2vvOAo`KWS=|c2qONl4
z*4h+u6{VRGwMeV!2QnYcri}KqtI>M>DiUiYbM=uvr^UmCx+SU+eeX8F=bqzqC#OsQ
zksEFQNh|Dc3^g$0V^xFX8qdketts_A?vGmKGKQnk)0^pKauI*H2kh@SK6ipGkxBYX
zbi{SDU+lQPi?=Ll`Ry;6p_`+1GFLMC_iT}Ej$YMMwpmt!*bCciRGD|Jd+cjg?>A|v
z3-P<tU~LWDA{gLAIU}FQP<!mM)E(!tg&6}lEjPPl&`p-rQhU0sWMWV;n4=Rc!KS!c
zWDC=fmE7f42sKvJcf}Di4_Hr|p)<L=qn6~rm;Ov1dnIf79?$I=)NwCsDYuu?`=y(!
zFXPc$*f}>*me?{pM=vzK_Qdh(wkaa-6s(gpwpO@Z!i7@IPNN51b06qOnicOcQ_WqL
zwbn7ni9hDK29L$&zek<z>r&MngNB?*QV@CWyP}#@cej&@lUaD3Z{>nrld67jFdWbF
zE_TsUnH5Jq_GZk1=60hW&j&a5#oK-Y*7#c5k^y8ch9kR7UBy7R!cxMvl2qBUoc-%+
z(uDI+5tfiP{1n^zNJgU3K6V3GsVZ)`WYGmM^h9ht#NK*hCl{oR{SVb6CD8;PtZzyq
zJ!^FE%6<JK{x_Q=Pu*}mhhICOIpJ=%(NLJdu7_fob*KC$CwPagSo~(t&Q!_k7P*7^
z#4VH)R9AfN7TPSdvMfr53EuRwE6Yi$k0!I)y4Wwl6r-L({!2PBycb;+Ix#wAnd~3s
zW@Y^XKYvu0ME$iSyRuNnYbsl<-Qis)z)EZ4bA@f-XrtX(&gev2s>}`{E741*+jy+x
zRo*rgxta9}j*@)Le44Bu5LGK%YWKBm1)n7KWHTuhH6uYI`8PN(f60!hq^&W|1zazy
z{vG`jq@eQpBUvMy20Lp{QD#q)U3{LmNego^)V?M%B&=rD<a^10;+ROB*(oW=y(Fm{
zpOT?Y>!<N{Z{i{D;N!<y(mcXv!AAQLt3U2aS_N?BWj7Jexie}?&1MN|=&Sl0Qi5KE
z6m(tOkYrX%ZpQApWtv>aT3>K+DPbnB%?_wC(j-(>HrUQ?wzV&mo65CPT~J}zC~C($
z(}$_|qzln14c58~ub7CIJ<QeKpM$8~PdnLXvPyf%G^}!!D{Oa(^bcK6R<1gzF%1#k
zfy<xMrP@q4OJ{KUFc@-Y>&RI<L(c94s|>Sf6V#JdZnga3*4jXAqq&2+mQO#k;rb(9
zVw3c6^<{hfGkHy_lSQt;C$B+%H3gotkBjgp8!V5O(pENC<AbsPpII#d(%l12SZNF0
z`?iU>uHD@s(0Df;E4kb*&Nnw+z=O}Y1Z|;7>;IDFWa4QX{l_)189{%G*sp<L^h$OH
zzfaXF6_Oj>WTNcKXaaG3fxYVXSx%W3Q|716)QZ-fd;ae8shI8!{+;2jMZ;W0%dZ2e
zWXr5oEuY&2CtYiA>38z84CMr+m%T8iGU%4Yv54ETY+)98%6^@-lUVv^)Y3GI?z_A?
zOApzDpeBkHlZ@pzRE5>}v_IsQ#8G)WTPuJEo4du)f}mxDrV`}{=c853BU-GrZf=$v
ziv52muj)d)*#=jIRZE&7c0f@mqCezgcDseOz{?KD9+!culctWfuvVx}Pos}?SL~ww
z8^LklA_?-ccCh|<scJ!Srk*6)4ms&|#@^EG(p{-$hYkHD%N-dayCEYYYPvw0&$|WM
zOKV3}VVTT-A;<YT_-l-1(z&R$<E@EokfX|dZJN$@1ly&4@R`i9n{t4&mC@F^jHrg`
z^jR!})Ti<xg@0KK(m#?s=)^6*+k(pYoa@?DSMi&Q(OG&f=w-e3*XHiDr|=SHe(||u
z!6Lq6PSi!GNk6wvKcp_GsV38P!R`2u_?gyt+mkNQLKx|DoYs^e&nfcQ?#NB~4e!<!
zy*!PL1wqarzWm1-9gS5FbM>tXjO`Hh9UCHg%dCw3vVSCqvUXGmYuu-cg6rDZjS9|#
z%)5hSj)0Ah!zmld2zwh(mKr?yj>|;db5Gmq=Hc0f%VBU(QYB&>ekK(dHJP6r9DtL)
z6%^G`*a=fG@BvlqZ%f6E1}o)HEv{YNc>RaC|HuwY9u%hM<R5t_*rJ=YsH;ae`Mdfh
zXrmElITyIKAt&~8w=z1<ZyMvqYR_n*`%9mvkFzgG`A;nt^-<&dKd@gSCL3El8L6YJ
zn>CYsx=lYvvFRs!<Wz75Tgbv%?$ouhDZvX^ZBHFR?&7xFEESCVVL2f;U4nZ_xZTrQ
zXm+v??X>`m>>xjv3uMy_4K%x(M6_*;H!J`bsm&82_pRk34)l+5(tq{2RA9~1`Tao$
za5K}+b*Oy}BkYBT;pV_#hc46W@apf~znUB6Hf7k~YRVJEr{T-H*)jbNq<u&xSZQ>Q
zE_joYQ8jH1&isS*zbH#&D6#skB@ijPg2DfF9}@Zdfb8~g3Oj`x$;b?f-lZ$&SajU2
zjq-p~)}Z-Mjegb#64P?v)Bl3yR?Fhx<1Z2A$I4vdQx{F*>BYd9CG7<ppbI6duS!2=
zD>unLw-R#OJ(DHwX7CzO=OePsDZ?I8)oK{Em?)RbQgBnGMHJQ9VX5hqYYJ8`0vcap
z7p;txa|hrlU34@)eYHKXeXfBk#zdo??9+4P9P<ZbI89_d+-G*&uEM}>2ix2`E^h?e
z)55M3XK!9G+I>s)TxtCYpKuZ-uZBHwtw8{LTs?9S8^Nk;Ejjm8^axJl8B@Fe+%)ZH
zpSsc3Q(9V8dm!hm8K0~e+`9xdbMNS(=mLAyU01mwRO8fgTjXbQrpwv2dG3=ajVuVJ
zTPbrA*r;FwcC*q=Mqk?GMnvD(#9)OS)C|^I9_m)jf_LhJ?v=?i56LEy-Tg<(M#;1#
zpLLOCWVJWpH7i+rIUgmnzsP5(Uay<p9!LE+^;x54c#!Ve1D>9W{-rtgjWo4?qFM1N
z@Jy}A39hz}sbijKi}V7XJTLWrf<}@H48?6NI+^OQH|(&xW}*Ai()uudmlf8&+Bg`F
z1~WpN;2%!osS|}MZ}Ph>vo%ESv6j<bv&^zQzK_)o4+O{E751jNF45HXIQCtTo5_2n
z?KOGka%(yD?jzl88C?_N`vQ9e&b|$5Xa?g}-D9CW4@&N=BcnHsE;RcidYO~B-i?!p
z??2*}yBe036S<T*Zws-GZ>1}5)XS1;y2nQUX=YGJdc7QF(6&0!ZFOB?2Tkb>{?B#O
zR;VK-VNj+0V)#-Il(}QD@cXhvsnBt+#ICUJshMZ>KJ`4C=!tDWA9!<D4%?lNW?mn?
zZ&+}VHLplF^Z8&zY&A;AB=)O3=b|-H{|c<KuGETp>O!zz7b_D!)7!3^wjmnK#0L(Q
zShUeav7hj^i>NF(tK;Q0osJdWl$Bt#P|w=CT0;xb1Jgu1NPFq1$*iljMEz-^ORSO&
z(G~cfJK9N#g0(hV2G_y<M8oN)OY8R5!|eHakl9&Hx^=j<ln9cd`w1Dry2gkm$?@e0
zS{_?3q{qn#=8@Y$A!1E7`b9>%Gu(((muyoTYsT#cw`Bl0{xgvFK3U)%*L{Ilv_hV-
zT79)V^9Q>{E#z1C#8pDyoy0RbkrTfgO_V-SN&H)r;A-%#_9rgxiH=wg>*r2mL0?Io
zXaqYpP<MgKJCnC!7L1-Z^gmq|ogo9MHX!&$2Fhw8%6sTDXY^hBj>@fnjr+D)rMLBU
zvXdUxk|1Y{+fhM;#hEEV2M1CA4xH>~xLg`e*hSgzPDV+SA*<n82WdK{Cua_`5*=UE
z*{+q11$BNCt#`%wnFKOSC)mLw(ckiNG{cS28kPyKL)DTu;@J-QpZiEY;_jkTlAtk*
z<F=qKw$%1(PJF^?`6IT#r2;+gwL!K{e}fgz(r?*=>^csN@<}iPwDW<@vq50X5$Fgt
zWuHEcJaMpsq;%!AA6W3FoUkkM*!^ku-8fByy7ha|Q=iG6U>_K~3z|wn_ID#O=B!(&
z`8h>fu*}K$hqUy*ts-KMcRln1KDN5uN>cCbqRWm|<wr+(!|rQ)eZ{>2dP|f6AmLG3
z2dwduYahHD)T1k#jwtMUhHE2xbh!Q*90@*lv)u-}q|0@-nJYt|)hPQwcS$izx{)-s
z-Lok?bG1~WYJG*Rwo=+2Z=aU9yNjq&5DYaj*i3C|b9+Ze$sH-`XTxJ=;t>kNy?zq!
zdPTQ`^Z1A_-N%l}zwQTjGd>Sm`x4nGZ^$Fq8TSUc;Wmxi$C9+CGBBB0x*qnm9!&V6
zo|4TnI7*rodso)mVjXPrnK+u;mg&h@3z$j;siptN(Rn~gRV`hZJOs%(=bWR!^sQ==
zN>&lcsDPjdq9{>Nau{;XNX}W}Oy8=O3=%{{Q2`YMMMN;5DC+ya`PW;%_k@}5d(Wxb
zwQE<MIydOp{6|)(*)KZDbBFok+^*qusrKfOjJ_(^6-?7Mpv)3`n(XXqP(hn}L1wx<
zVYlRu@K-J2<6wXle20&gW_ahtlo*{l!?^#lhE5Ih%WdLB>b#eUHja4yrM#?vSQ<+`
zOL-EO^$L-Dv(EN~+Mj5fdQR!C6tXo^k?D6;h<6pW61vPg*l;VIZHsj=*<WjEB|m%V
z_+W2}c2-s=_ym7V2gnDWHA?+RZ|f1>-~c_e9fV0#@S6t$JGD|i1)O*uzCGH9_$f`R
zjs2m27!1{~q>i=Jl6DN9I9s1&(qaXDpk-{DOd?7=fV1X<m0XjxVK2NU^)o7>Bm8VP
z!cVk*Dn@I4-RnIhW7NaKo{P9J$M;D``-|w770+KuPQ8j)Qh{1hEbIdN$cW#Uas#Dn
zh7ascFv!>H@o*QbJVoEu$~Ml%SW!NunD+?l(RcQ;owu_-1Y2pSiTb-d#iw_nk9$D;
zS*^pVff<pO_N5lW-%?}XMDAs0@;~*Kv0~QAClZnSdQnNHVw6p*29=237v+0B5zf_S
zSnm<PYDqG}BgCod&Uq9#M0MNg+jOkGNc{fRwpuoC?pbuOU$-UxoE;<TPxQ&U9lz}B
zXLPftWhV;9AH*)h?x%*G{1v-H<!+lU(vY9);~h1DiV^qf5^eX(0vYdXZKA$m!>GH|
z_K}`j55luo*=eJLk|##})P~UnroN=A4V_q@2)@%!$4Nt2&Z1ZzF8~e~*2&aCLpoAk
z)gExxU9ri!N;m5{nP^9S6&(;m>07xQPKPzUM(+MNy`cqy-|Y?Wt&4ev{@_gMu$MFK
z0Y36b59qsk*yn0IR-I2ya%MGz8^MdYUYg0_Pr|NGqWtX9%sw1#s*z8Eq5MG3SHfr6
z%XsYy<E}ig_gzaP4fx;Tc<21sxJX-a_81X(q8>@fMa-y+Reu`HwX=5JigIT{S!R&_
zC42lS6uBC#U30MPNk0qbWb`Dgc0bScw5RHrWvl^;YaclB4)l$ZFpw=iUNXcR!eSdn
zbKoO?hJ}2XEtQf|!*`)4u9q{u8{g_3&hW0>1vHHPTW!tI{Z|?GC9iuj=aW*1sxztU
z?UQ;sS0B<FF`24Y>YRnO){xw)x&ERnEfcY8kiX4ZXA9DCE-FK`Ts*yX*H%o%%I4$I
z9vdjPRDiyv@afsw2bCmswtsJ3L-Z*QIv?|r&V4swX-&_Kgy}84AJiGLL5HK<|Bh!>
zpa$@|Pt}o@!+*9nt+JJ;DwulrRyq5Mxbho)XQhKt#Ea#YnL8z#@GL8^`g^Fexpg`x
z#q;V)?1_8Vz?bZN+8#3THquY;!4IbK-Z|ot=;JcL&K&HaoI&twl0EX2e~0Swg1_Vq
zQkgT{Zyr|AHuyzf=Vz9xfTnvB560RA%WR6@Va=3Pcuia3)6B1wRq`o+p5Up~;y<(m
zSbanv4{wCIEy8K9vqY2TMCTStpRHV%Z-Zh!Ecqf9-QVWQR$tFEjiL+WzSm`s3dj4h
za;hKmBv*-$oNBbmw#44YBc4K6IUdrhEDvKZ>-S*EATRFdl@<mgtQ-6Ovs!EgipDnW
z>xxD_Q)eG4#`-Swj}sjPpP_|yljpX+GLwJ)w02m>X)SIWV%hXC+1X>VL|=r7UGSQ4
zZIm<Z@1|R=xs1aKlC=r?a~@Bm0@&2Kal$ft$FRT6wk@vKHu&D6M1&r})krH@?9WLb
z^~5j3iB``7a+k0Cch+ZhAihq|X$ASni!rtS7EG!dIGcr;nVeUJkqr;thM#;R-;p)F
zEKU3*9D@5u$eAyK3+-eIe_G4`wpO6>lbSE_gWz0HfxM}Zw)Mh3CM*(;B}1wmX+%%Q
zb=xdCJhP1pqCCOH@FVo{ec<a=yKB$N2$|@;w1B*V4l_%-${T*lI>}4K=N|THu-Qh#
z@#q*J2YdtneGi<Yd&73i2X;^v1(}$fxj>ft*FidI$_e;=`Y_lix8xIjTzdzPQF;GX
z9|g^9ft-?#pn7`gA)AT5lP&d*kepfr+!?2D>o4TG!?2p~z?MX!^<6ZotwASN<aSC9
z{ocCUZA<YYsFDjUKX$y!*YSkuV0Zr#?b?t><_^YdIr{C!c>xKnnvd}$D@Bw(EbYCO
zzwcSCZ1}Hi*0J8tPx+T%WaUUvZLfzcj7*>|c$Kv%!@f20>iQd2!@Qob9tctZ*1yg6
zV1t9ftnSW?R7=&^i_*Ki(RZTA*TKiuXjS&7qgU1HWYU#1#WTTLhUh8#$<yh;uo8TA
zj%`8TSca`mg^8Tgw@@_8p>ZZjUHl_iX3ON@p(F$iU>gyw7~d5>>D?7=oH!G;6?W5Z
z!g6Z+*E*5?9LF1U(O!~zQ)^Al0-M|Bzt{x1>P7r_DWKQwpid^|RH4J|o_!PM@|C{P
zpMYt!3es}U<7bUYdbp=)4|jIwR_p<oaa>;01K}iiU4!^}IpJTy*D`7spXwXrKfA{1
zupIH4M2xJOUgG+VP6`rqCv~P*u+MIu2?jn1CQzAPS0;i5`+YffLx+?7gC`!tI<q*l
z+=Si%Z4v8CRbeLewHN#+_WKSIrU^BxPyB)=NMXqc*8M>w%1d^$!<S)gt%-frd?(1T
z8$4PQOVI4rMejwv*9|($o62a-El2%^ud`)gT20Z{sW~q3?!13bT>$g>R(jLNkPe0H
zDIdY^)P|GIq@(Sy?g2li`n$SLTY3eovk!>3U6=S2NmWiig)z^EVa}8W<c&!_!hY~3
zK0p?Sk8CuV)^YeoqBr5EYQZSics|`@nRSF+vNA;aZFupUF#T#_cB!q=_*cPHDJtKP
z%})2XG(FLw5qa)$c{|)-i;17_*}Ga84;?78e4zhN{`L>B<s9;bt<&e(+ZXLe`vt}J
zH#FDkzC^nDe^wWUmQD`)_rlE#M9Mme>p}46QA7?&9bK$BK-or?(`(3sAoQBd{mYJ4
zutRI2zGPuWLe#G+r?TvrJ&9GFr^5TBPlGpR)-qIwsuKNHXlZ<ZyL2GmYUwLcTB?yv
zb=Q|<EZO)H@8RvxVJ8QtgRa=acQ!!h>EjYGIedz&!6LF_9j!Ev{$w@$eXDJiBA;Oq
zX~Nmzhp{2p^I;-aRUNH=Yad_cZQ(dgy)xdi)VC1{PsP&krvv?}D5y+5MsI3Dv?A6r
zlS*dZ=p@~yMPTd^tAfw{5zf*oeB(NLbN0$><O4_i3C|ke<@@RF`p`2*8(S`Bk+cj}
z+Kc$_BD!l9>sXNTzA-D%2Y6?F7cKOdejBU^>VhyrM4IJ&W3&1G=F~<Hg1^t&QIrCF
zOcUad>vPh`w?<y~zkNP>*K$0&aqJ+~#(PBbD$xp_DSn==qeb$VzM^X-&}qRDtDvcq
zC35Q`-KnZu$(Bm_QtRO}r~;Msz1o3)-qD6ws_1t9|Nnv$cEd6UX@Ut_M)RTEe<B&Y
zns$>bVPB=sQDzgXu4`WJ0Bg<|?g=a5x6e^kPwmR=XjQy|uJl{h+pE&ujCPTnm2;1D
z_C}^@g8zU%`jHLB=dSq!tn&q|c&|N&;`ub(_!;(QLs&j>pDvNcnm$@b^7>_cLeF7E
zd6ZK<c=#d|ghpW#eL1L()zE*T2Q?#}_7+TGi3CAbSXED}WL?27X1@3_{SezP2QS<i
zl#Wc~9bfaAk#}W&P+fC+J!}q?#pA}o%9;{&OVA0IjAgV0jjIte*7?^~3KVl{`mbsh
zX82W#PJ)*#lj1yG6Wim;>6{m>ucppSxM8!su{}ox^R$2CyKROx;_1&@Bf4U~mHs-#
zUsRKg{v^5EN$E~ZJA;P)ubfFfVt?38X(*NaTW@I_;p^#`96c+TWz+EcTw2JM+shh>
zykPWU>RcGq>&E0KOA4QXZy%B*P;8Mr1&2O~<vwBcZ7OFka)#ZlN_2S;kk$9}zE<34
z;%D1X!1DTP^o38sthLy6lb{&=su|;)qqRvoi+&Q9r8bG2KP^)$#_D*gqu)q2CJGhw
z6~vNmdV_b%qlI*&w}3C-<Y&Jp-ZTIaFZiZ#qg3~sAo^!;pts~-@FRP?kJtD8@RF%c
z4(+?>C3wP;!Gx!LGn`?KHTQ|~zNdH%*jr}qOiV9DwJqNDmy8Jx1uf)8_>q;-hf-e#
z(06fFw%ZJQ$D7fQJPo^S4Tn$QQ=5}J{6q}d&y<;ete$oaPl6L|IL9!KRm;v~yjE09
zN@#g6E!@pV1f^3;3kF@C+kd3B&yAFntun(tv-?)rmrHhS>KCklt%pt2Cii;H8`)lB
zdl|CX5_$}sdxq|!hjE5i4aJk-noax>IP{Ya@Ulel$uQdzc<yHG;{Z1O(Dq8M@M(}P
z@cA;1=b!JV<v&@fsrPf%#iCPZ?``!F{-5mekEIQJQNU;VJm2XsdroF<3(reu*y3u<
z&ri<`zL2GO;!fRU`-qDJK=HgjPQKP9`c}9reAQ4xCCP7x_ro70ee{EH7)lhq9?agk
zMI@cZ?p?I=bmiQYu0#vYTlu9hccc^SWxiztc^3Ev<7_(5xC>8jPQ+}lT`U1~*r&{O
zmFIk`^}u_^rSg7V3t#VRp)@pSW&2nMuNG|Mzjym)?6R8f_KQAU_j@ME%ni0rgB2^V
zzc{v0(SP+BLBV)OR<B`n6q!jYE$O`_E9|9#q<3`6V5z6^x#0mk>P^_*UQT-q18+aE
z|5<h!<DUhEv@vn$`=GGj$EF&f7mWr}R#+n47AGx>%|tbHuzGv=y6%=d)Em+=%X_*G
zB0{}j(}Vx0)-qc;%qOqt6P|j~BYkv-j|gHx7C&Ug<DGo0pYWrJ8Hu#pbf@(SvSYz_
zVmX)zn#y`gI4AaCm$ziR^+H$4NrzTGdqoQpEv`!IU`CidIx+USb=5UtbDigL9q0*K
zTsPZCk%fG7E8cQ0=R}ukG0?AMw3`2FjmV`dGk5v}T?OxW+$WKhGSkyb=tf=c$En|4
z_AHtb{zk6K?`zYr4g5TV{~Vd4^}RCFX+~=&pO!KwC=YWih`-ICmi;)mI9pffWAOxP
z1zohZjkc#FhowMpoLzn!l-jCSf_sTmd4dw;GW&d`KY^bOr-rgYUh@1*QCmxPJ=>p?
z1RrJ0@xvZ>>PdX)3m-&9rm$xZpY`(LXng{e`IVpqmGs{Jk=*pNvXH*6-u`h=8GgN4
zbNgi(hZUsLdSpu6@&Q+RR!-wH-&uZ5z1d?bJhloJv5=F$sgqE0=vHDLy%%)2*6_o=
zSgPPtMf@HAIFdcOz(kMhGW*xg$pWiHci6AKMRz)9>3k?GItLsp_3pXHsOGlSPr|g!
z7#dHn;-T<)RFyAbl`FuhoVHpgX?Et9F0*gL&n-vfbA8oLM}~U--~jtR3teS1%zdu^
zroVZAZ{s_>GP^zvn|h3%)ttU2RyJDRM^gpaOU<Y)$XLYpI%n8GhTNWDon?{ll}0)u
zxMS!1pe0MSV81T$C4SjXp%Sfwi++kaen~fi56!I=F?T7<;APe#1C^R3O2;?W@PTYW
z>&maCWR5R_zo+*kG_CSd);HqkGklgXn@?9s`RHPzQbnwx16t2~O9!f*rFUTtbr$YR
zh08n!&##9j_j$00^IoyAF;%+~u!$@-oEydl`B7H=dHqQ%>etjPU49R{vTx6WtS9Ux
zeP0sD6JO91@YC9UO3}mciLI<m3oon>J-;3F$F;oXgfD#|hqM;FATG>mu#=WzQ-Vy|
zTN{&eKkJz+H*EW7S*caz9*pH5IY6#)N?w<`WL_VKFL)r8m45u-6{~^r)zJs5>o0zl
z?>!^Sh)-iuas-dbEGgur;egMCFW3j<5x4NLzYP6^6EEBqhfX($=d)muEoU!J+UGjN
z>!F*@AyXcuMT10eX`!dcdiy${f(VmJgxgf~+1AjuSnAxf{8&jLts4%3L!3h6I1pTn
zRA6@GH}tF&L_cnh#|=cq8l`*fXY1%sat~KGD-?eumvtcTKU2?W1OHJ2qVmT+L1(5k
zhO7KzJHkYAf}NHS8>5Y|g&Q6Ts4RsW!xS*?Nph|~HJJ{bI^5;D$!d`KQ~T4dTEjP6
zHlpSipjRI{5o+pOyl=kdwjDkZ%POJQw3U{P@0Kq)<$RL9#C)*%FW^GMd@1%b9>o05
z5~UyYqQ_xY8-ovIig%&P^qRJW?I-Jtnn$0q?tUSd?tzZgarob5ds`m&>^fPFYB6Sv
zy?}<=+#cB9Qp?Xp>ND4O6zE<QMDDKnqxFMZGRtzxbv!tYZ}6kxDi~g2I+A;O6*t9p
zd{<C`%IE}t6V^N%3qB=LSuXv_Ba*0v_Vpw>I4;6fXUbUp2qf(#J1zB$QdRD>Tt>8?
zLHv43>ktWxX=`G{S6U24I{^G#pu=^fI(9bPx_C}hiYt=a_Gl%`!adj-!MVfeNWG*x
z{#J(Eb#lsJ`7xNAIM#-c>&(;(cw2^OW^3Z_$7WK~y(E`yjem&ZyxGP}Oa|+hmO;PA
zCbL*Xzk%zfX2K=VsBijt5cENK$42-$J*t)BBgkQk`(rSqW!}LSOJnlnyQoS#{Fq#p
zPl+Q*o-^12N`6ls{f^w!+2G*~n*&GNXfw$GPiicdD;$R|f8P$H*w)sxtfXM!F>>j}
z;Q&4BQ}k9?op*T#y|xl4e%&|uZZAY`GF)D!1K<VE9WRg1&LH3GNe$x-O$K=y_z-%&
z`f-EAQ<|2EWK+ma-=~t6DV{F+CmF+B`7-#IwXZ_ANrBjKs%Zlxtp=K68Kf9q^1OX+
z>A<j1z78(vWU4-CqPa95dGa)Llmg&wM=hY7>({p7uX;m@FuRcR?0VO#!UNj#Z`@Br
zWGfgSBx@}rIfpv83u{SYQw&c0zJ9NnY?F=Gygm>fc32zwhU7R1G7WWoqV^zyF_Xkc
zs`%^R(pLJ3r+BHz5@}<%$i0$?GK1lF2hec7pyHG!UO(VIB3nTGxgUN30(?#0`8F1@
zD`<>g<ib+ZVgIe-nfyaP)3`0`nO@q2Sy{uc!^-w_^nbodZ--BZ{{?xY$CO)!{Il3@
z*0VJc=Vl~zvQ%@)>6?O)x|V572lXsmc%jzRX7-A2)tO#gX9NdWv!blhBx|mFyrJju
zlG-4p4>6>My(4XHlP-~`^#hn%KQ9rVMrTAV_H(#?mU2t8=_K&xF`uP1y$E}jNG9|+
zPwZM*^Mr$R1MiuZUGJ|iOIg<9ReuR)Hc$WY*Af>K5h{|eFQEQfRg<al6qcm0vFz1J
zRt$dmg+Jwgv#%p8?z!Y|ac-197$ktOtG$gRp<txPbE3f#SxQB4y3J2TTAuxaW#f!^
zJO9&Hz>Ct^$9CA}N2{P<#i=0v7AxVe62*%94`E$U=tR(7UnWMRfbeDHG@0of8zQ^4
zJ6Ui#EV+w6sk>x89Z4NfEm|ulP<%UYnbq#25VX;);S61&f7=)6W`B8<`zc)WP`CbB
z*64ldYtL$&XUYt3nQX^7>zp~-!`J90^!Oukg`K6-kXz^$>1a6`%mdpWf!VL=vx#fG
z6N*S-;`VYYgDvz_=19r2Fyu$<+9q;?Ioe-(+2`<=fE;kFJ#L%%)0ce%x<Cml&WzkF
zyw9gZ{NFSax7eoICVdRHLV7yN5V+S|4+p=(fqMFq@Jk}gJba;nzK9B%OB<og?eLz~
zz_Ul2>e67aP4z{rTyB)rthzF&2Ew(awm6uI>CW&ebiR(k=k`&!Mfys0xLbSH;UV7i
zOXNBGPo@xg8iMzuv>oc*NutSAn;Yb?G(Lva|IH5hQTmT(%V-<mdGWdh@V$B9+>LOP
zM_}Z&b$}M}6wA-&qF>pppe?!CBRlE|x?0hrwUJJcNqp+>)VD%QAfAP=3nt4l&vgoQ
zwX<-JmcdW9gY0QOmhp>R<eTS$W|Mi|Qo2Rnb<uOSKT;P}eye7JKg=O_?I}zMmI_ui
zgfCGoRC*d0%WF~qw%C;%vS6&b9JDN+59ZU=8fq1Ct8cA}#buliAjiq7=V52}f=9|#
z5lrvd5s=^MWvM|{ww>q~^+TG=lF1|X!r+$L70XVJIf4GUJgodyB2YCaFTfij<oXZE
zh}VHuQ*9)>_Y3P=$ezF(Hlr()gJmb`7%8A*J%?3*W%NdK?5Er9YS2oC`hP)De?_-Y
zq4_m<kLcJOZn49rSP3oT%k?HGTb*Yq&91$TCB5mFtTQJX_V7lZ+Z~xHFX&%>FTAMx
z><KN$eAGeqJ9VSL%5i%sldn88{Jy&sVnXyGDXrY@7UtKo<SLt8SshOP29D=vh`-CT
zuL-OApVG+E>df$JxG1Ot&ff@&p?<xA&0nw-9R@;ug6=q1j-gYmm)oc&U-5TudWx1{
zrT($5;T#<SPN>g=2c%9sd4s1MpxJCuaLy~}NNuT)Mc=Z@zACYMq?uF)S)=lfQae$)
zZOJF*67O!va82?Z$(gyGV45EGOTh{4Z?D^NS?qHpQBTWfFqMCTt6@IR0JF-BUH>3_
zvS#(=p3`p=8Q#NchY_#V#(LSI@PK8~LB86*^KaxSzNw)v2HU#nHh6hOzMGrjy^4JZ
z<E;uKt><Snm;8l?4D)K?C0&`ynQS&3cdq|#=Cy-UK40$#TiA)eK!XJKVWEaWg#8%@
zo?fB9U`vq7yVJx=`O~)8Tiap$tCs#sJRRii<Y7o30uhrFnc&Dr+C_Vl)knzpp5UIu
z{r+yOf}fC2{2mxQo?L%-<gzcbo%m@Q`uEp+Qn&_&*I#y%!+@8%gjFb~O>_|!H&<HN
zE^e1X50@4GrKM#W_##e^wDBBR;|Z%A*{%D?j}~fiZqGRFUBeI2<meZq7GKs&+H%&h
z7|%IN4h3!e)9^1V5ar%cB2G<vTMN;*-7NB|{-C41Rxll}ykI@h?Vs~m;rn_(UNhBN
z{y+94W+U|@Vq`&B$-v|^)DnB}e5;6cwb+?V(VMaQ-2c?cYx#}fB>uJ0?pfJr%W%2W
zvt{};EP>A1u(|EExo%!6H5&Viu*6)^yRy>rOAY@=_IbMS47Nh$AY4taTz<O@-<zP9
zQMOISTMIkF6SjuS3<O8J`)D+Y2A(=EW&-+eUCkIT&J7GNlAFePlCQmh|7{1nC$ls@
zl^q~UMY@-&l577LEcEw;$qwjaZD41x+u&JGge@$_Q-}LU!Et%fx?m#>d<Xx%F&IaF
z(}M4rN%f;I(Po}kW<{I$V!IKPqq4e0?`U(Wrp+4N(iEKn%baf2$ypMZwX9h2cIoWb
zHD7GGWW<x(+66f#SFD~l)#>DBONCR+&ed*u)Ovf=+QKOt>8aGpGMwiZ{m`~)NBK0C
zs&LP+?ct5sd*Q9fN!cv3*ulTNglF?h<VQEdrF>=%doq02vl3V7Fye12NqZ(1+)4SE
zXn2EInu|Lha-%|J<}6A2_(!%VOr7IYM}L-?@YV`m$-mYarsSA+@sK-qSr&6nF3Bqr
zeZNbouS4|@HO<V^O~uZRsuQ_uz~W1KL3sFSGc6Az`rh*Tv$}`;`mVhvReUKZ_XD=a
z%^o0eBHi?hCD8n2y;C_w(mT4{zO%DrIxA!hd_IXBrmuY~qiiSq_&70sKMLT7@)Ozg
z039iDAFa#$JI!Ga?FVbCQ(-CzaQGVD-OHm3<YCPlV$;Ke-$2dO9?YEHLx1*bw#L#&
zbIV&=MyKf~k)zb<lC(0s{zjybp7O<E7kwxh{4rwuyLv)e`dRAiLo|l3?e$K<CfUaX
zj)eFyc`SZD$O4uuq|fbLYR?n&AiDPsTc5HB?zc|3KgRR$ROeX7(RSVb2rTT*y%s-t
z07C!B`#rRnsSJ}EwjO?;>RHESK3tgl%jo}as@d?g1bQ*9OA9|3_CT>~qDj)jr`c^3
zj57FQb(Dc5nC$|+B|XTqkHTd?gzH|=<$jHS8im)i6rarvu>L_L$(g^WIpYh1j*<Q0
z*Qf+BMf<c--Y0yIxRg2465se)mZOulf^T<{v?%pCsf|#t;Y~vPie9rI=1<vHyAo7E
z6Fp4?InMmbRCVMEImg1Frst)aJjxF-Z>Nkiuffj8p7GB?#`4ixULjblM`Q#YPEDz(
z4D{l`doa8UoOz$_7pT8IWJRkH^}h;gfG*8sjc=om?t9R`pnXRs@?q?FP?%enXX#dd
zpA#6X@RbGrs+NxLw|#cqKlM-!L_U=-P(oS~H&-)PB|Q@{Pla1iOkYJMNkp5v70f^_
zUqP(ii}qMCUWItvN?!;@`tC?C{V5m=FSusawJ!amPdl9lz61tT&Zl}3dG@6+O|&?%
z@=JJ5O)PjmF)=GXlflbzuWnl`{f-o2cUE{i9qj!$jX#Pk>KbRhst~Wf3I?E7Ox7jv
zx(T7g&I-3l*>3D?eQ-k>^K+%7CF<KAxLJ;X+vDL^8=Wl8Mr&3&C#UJRc-IE~O}E-U
z*-37&2~>K^-ta%|RL}-nzNo{gVa&88_DL%GOIMz!fIgBgved75$sn80(@EIq``G7A
z>}Ib0hi=-2ZkOElqHXt)N)LuU2#R<S@15A)zl)XET*`?uE39LDGRoqAWWv{M9++`H
zJdbx$v-8f9fsU3%k+0wf9m&jglGokjNvXTZW_ZkB;KsekW?3y~eU1DA2ZagwM!yni
zPhR}H74+Xw2i}UD(fq#Mr^<Y}ASc0zj6TM%ifYV?dKqT1W<ziO!HVkqU<Yr$8ZD<t
zyhb!%&{@YyDR^Wb_{Uk9tTW(Q3A#&5p$?~ZD{Twk@T@TKk&#fBqo7U1=Bki|4ED?V
z9g5iLfV0<;{C3+jplF_uy!xz;wbWS!>2<ICD7V5)pdRyz*tes?ZFp!cg*a<7-apa8
zk=KGxy)8WUzE1H+_B9!DZ`tHOc}<@nAA+m{Y#i)5O}G#L-=vFWIl1&u^3uYwv_U~%
zBE_sO?ENOOFRy>*g<)lnsJC{~OS&~RDsaZ4hz^Dqq_X+0o~T_UzYNkSYnV04%x_62
zyNJ)*=*SrWns2dAwjUk(8_5{Vl;JWT8!FA*{<r0hky%Rm_@Bh=2uRd{x=9am=?{FX
zl<@Ud96otiXR!lIC4-Fcp@}7R8CqZqE1&qK)TjQL*5`O-I&!M(eOTDXI#W96*XYR;
ze2Z-LnMC=Ih+u!}SnUd@T_mHpp=!5pvFEi3wVaq0v(>EVA$guWd$Di!R{D;g6Cnze
zqk6cUPnhAg_1|DCJNvA^?R~&J(FD#X{U!rFFsjn>KhQPV=KH%awaG-l>Dc}ZuVy`A
zfOo^bSXIMNWtrs#le>od=??A-Q&}y~co)85o=kMJrk)dK@xETr0dSV#VEqBV<`4Od
z4>XzGnkZlDKw?%#tp2`@j&1gr{J1rfenh{1kr<rn6)Wwduz`i1hxoHuOM=>aY^ry$
zBJ9sAtWkN*#~i_&wnSGR9D<(w66~-FcF+R8pN*PHC$KxKZiMF)W3Q@(oOj3ncVk0a
ztrD^Kkj$ZWS{O|(1ubed+Qb^oNC#0ul$j6yj(j8|Vb^P|wqK1XYW!kp7kdZhnBDf<
z4G^t}rh`@W<*6oNyB}#?a_^Qzjf=$NS>(gH;q}*GLRW)kXk&+B-}2m?Rk0pK4W?sr
z@^86Mv}A1z@~sF`>qGrOfV!TZ=s8Sx+b`bR#}X5sBIa&p&0s$MoA0CM_+P-uMt_C>
z<kAQ`-=0WtM@DMi=tJJ(jIW_GlO*-{y|vO=)BESK%jjhn$n(CDsg~UvMA!3kRbo}6
zkK_q|N_)lL!jC3c7p86GU>`=XmOXs4jFsuWz_NqTulRtJHTIlW)&5v(Pi^gWP-Y(U
z)}9P{XXEaOJg~t$_{g1b6uk1FRsv&Md)uIoGJ7>%%f8b6u`FJS%;_DvEnb107qRki
z$SK-_s9Y6wVj(%}H~4ifGTlFQDZ1ye#xwL5iqjr+g+)Q4wYM*=to-N6a!LmHdY-Pk
zPL;V5`XYGRY5y|iguE5Jg#PV5N*m!z=fL6qcvL@6OU~7ZX_sHwk8(%%>Uli4l>db?
z^p3YTMPpg0y<n1`>l{5NlkHo}&J9eQK++j<iCA^m=Y`LJH&?V!Si!b=dJA<0C#>g$
z<8_3t=buBt1T1!ltDSHm|Lh}q(E!eay^BD=Bly(_B1>UzSiSCDbvb^z!1hR49f$X}
zlghGz=<@_oxGT6`jEFc!hT(hSN6~ase7s+fFJX{r$ouj}KPMa9AnKn;er)d;jJIC!
zsbt_jkJg?E?zIgKFg3!A*6jL*z6eJhO2=n9*1raL^NtPUuB1iUH!@!a>up#Lb0oDh
zf1B2(>Q!6fdBX9wl4zWZ*-9H_oBblc$Qo@&F4)|j)ZB`GgP&}}H)bmxm;M74jz-Dv
zfNRIYgRsm*&!_Y7kE6C-Ch2oT^Q+XkhWh}%WrdvZx?~Io@wa*50%GTSZLX^=qJc*3
zf1W8a#|ol4l+nVv9$he}uJH?=<o^rD=|y5pdDb(NZwN|D1s@pvV@#mn)Bdr$!h}-&
z1fTjMd`oKMEnkxl_F_lo!|`*Xu^-j+@iR7)x818r*3vJ5q6hRFtrF&;cOXaf6IbtS
z9qA+7Xni@E^c7W{Hr_*D(_f`Z{5JfqfmZU*>>|E+g7>3-Lo6wzpUKw52lh0UmL<Ml
z?rJs8GG+`;`tK--JG40Y>{NDggEb?z-lNufg?wwV&H<lqOCGYyqCU<t>#woYiK#E5
zTF#VH{!O^YnS!s+`kVHF&$P|n%XZQ?pEI6YulZWb>ZhbUo>{`U*U1`Dr^@fkwL>sN
zpYn9ke$>8ZfcXQ!;PUc3{`3x3^#QSPD~SGwEaQ{Q_(i);?p#aPYEf#M9j&{aw9uA&
zGiw}j;!+M7cM?)J?#n5fBgE;kJ}Ahk%&_wjc>8`<wy2DSr8FYXIuRKy)6qDuST)_R
zojF0iHMu8etQ^a$;A^P^{3Wen&#B!Sv&lD>5R<0rB}rgEN|4W`VYdhB@W?KG=)3ui
zr(lVbLvE_KL8&a1Xpo+pXj<9vAPxN`k;pf)UAri?L_GYF@A1xlmS|QR4_u?SIQ`dK
zU-QXSG5h;C`aYfvdlQG71*ttIb@Vrj!BS7+zf<LJDKG9l<e0axlEFw&`z6hf7ctS4
z-x$Stv>SF;XM<?frE}zb&>i%P>NURmO>(ArzQw;+rV;6*U><n$2HEo<E2$p@zt|cr
zPJh=UP+^=b54NKu^^-Qb!%qj)g~FeB->P_b2UcXhu7dks_2HoN2w4DYnB^Hj_Kze_
zoUT#MHPNqxzEK7ANvl-2!&NeeXd6LM{WM%<?ZtS@*|rMnoI_7?6|$IL<exBugrHy-
za=l_yOs^WX1E23%{CVzk%B!zacRlA<QFB}i+qTH#bO%)7B<V%%Bh&2;aiz1*1_9g0
zzaa)xpqeyaHc4HXf<5iB8h8cw=lW7Iz-K%)LQM9?JoOs-Yd%zNy^URhE338BN7mIh
z+E>EqbGm$b!iwfHjU7!E#d$0$Q|GXy(!uEau(i}GvM?FyGg-(v`oRCO4A|_u_8e^S
zyiO#4ISUdbP=kL4beX3u!oo6DbJGt~ODpIcX{IYAg=$<g&EkvosINIRJL~`t$OcCJ
z&rcIsuInaCvIH1Qub_Z@DT(l(j{1UMfkn}qXf33eFR^X11N|)|BBh~rUk`sP9nR2~
z{ogQkQVVrx7~gz8g?XmZiqj75<*@vs-%BQc!YhzBXZDYitH7Qv1*f%R@S2YU_dj)}
zGx!o;4NDrqCoiyHf>-?qDyNloniQApc;qX(-AfS3&sd)LCw^A{iXFigYjWqtHYWZT
z;nvKZeg~G&PB$>Uo7!kNlN|e+zF@P1j}y1(@3sxbbtOo$Lud<Gq6P7f6}m+B#GWOJ
zd?l&(hhAn^y21UI;rZe_@x_K-J-XVu$&}<xdeDnoE!_vhI;uC+!b-Z9oU6BVjT|73
z_a{P631`_j?w%+crr|w4lgB+B`28qos$F65pduW;mo}r4Q!KhC=*CoriBeCyk-IUy
zT`~}#YIsdOhi5Ldm;3-+<$3Q1pIYs6Y%<LAy!{Ke<rI~+*D3f~e}C6v^p9r?r}?j7
z%y!vs1E`C1B??u>B9dS#rJ|GNjt|%WfhJXS5OX*x;9W224JnB(Q7pbPQo(D(>E4%r
z^q#MeWb{RPMUQ(Jxkk=#*<OjE<Ff+S<${h+Sw`(*tUvVJbSEzH!7#Mh+B_&~uLNCm
zqZal3nt>bAKM4{!>po4cg@bLG--?vMl5(Oi<d5v(?Aa)3A1kFUE%a-<;bW~QZ0>mE
zN8hL2JWZs47E}6O<O95UmXyU0rh7%phtl_k)C9MG^wnOEDO(fBM8{Gyc?w@0qpxan
zPv>Q*z!vuz)Qh<Z0js5xhAxUc(VbQ{$V3n6EVy;1_^*~*S9lvQ?l<j(R*%l~ESe8I
zyDIhMo3Okdj^wn-`j?c`eR|F&%4(;R5{!D>oA?;}SQ=<gICusoEYH_2$=`Y@ZbUEd
z5q9H89YF@Vm6-ZSQs+XP@dH*hPLG(?_U*P?F8dCjqQi-PU&0nH`+Rt`Y6E<y0yUB%
zvR~K20M^N+klZ|s;v1bkV|*2}P^#-8FJLLwFZ@UrrDTkX@6}I@`6r$kJf0UU)dVzU
zZa07>KTDqfh17yQ<+qn5CdK3F{gi!ViM}-GLC;!C@8Stq;;CRL8Raj&k_>XKw<J1V
z_x9lE&%~lbk-b4pR*TujL0_$+ulh_~Xxti3R$S8;#ioXTTX8XvYO*>i%@OHKTz{Iq
z`HxKC)3B+(6K;?y%=URv!^o35--m$DNyM|2))Y*CQOjzSpROTAvG`MVQr7dF)indL
z;VRj4R~U6O9mGDC=Z>z=rM{Nn6wFpIPW_55h)hD8?In511#-o+So!d9Vh`P7Ikgu2
zsVr|hDah)RB}-VJ%;8Tf8q7`pPX6*n{tS%aG;utQEw`ESh#ar0Hbn6rtA%Vdo<9u)
z4n4qIXV@=xDUvz`ElJ+>nc*<$<6rqHzayN2l2g`JcN#N-<GCd}e7zYKxCi{18Z?Zp
zBOZOAOUSd12B&lucDcv)gm;Ork+5i_GpJFI({Ga`9lElBd03`yK@pg#m-y2mwqCO6
znV=h0$~tn5KCe2g{xZ4-N9scVmrjB}6RfPi88nv8+MP=MGLU7B7K%@iM4sV1QLej-
zJ&|%L$Q0Bfwk*Pf+Sw(asvp}_>E#)T$~$<IOR%Jw#_0plxT1ch^JS1F*qF#N@UKC<
zW4MBKdWL*}^IXxNrM)LnMX#r|t-9}kL5^nv)-idH>e7ckP2cl^*xyb0-pct7t3cc<
ziJI^Vo>g5p+j;E$JsGHN&|5NMQ|XyeS4}<qNyf|cU@+G9W&n!YVTUX0Z{Y!5Du0C&
zsX^{UnfhP2L@%RN)q+>AG*0_*-uQh#W)sOL>AZ)Znso=e-a%*BADb<G!tb=VR@0uI
zDE)nAu)#;!8gI>{&FA2~)9tsQv`+GMWUCX|vG&r=&x0AoSdm%!kglvJv>gm|qD=CN
zo<rtoKPua6^_;w@6RiP}Z#ldw)Sgz{=#2KC!<;b3R`O5qF{`zVY_VE6M7x4fCD~>A
zu%$cMMp3*g6Q>_1#)|s3Ae7XZR(q|oRV5~swDGb3d<~gV4(Ij)n8lC0_blSV274Ge
z9B$Au>RJ!2aKB89bwZ1H5LS;j_N-D0>{*XRB*3<gv&Or^+S((ryqD8FKGL6*LVDAW
z_{s2R3CJhbuv72qY8@9I2mR7=Utl3P#INL{h2Tl8yc*|@6It`^_CT}yZq17(`>8*n
zi^;9(dR5-0lg7|Ow&*4)0N$2|m;CA7sP%s@eIhyF>?^&a{o(Dkj{m@#yy@J1Mqcth
zxq3}{cDq?8l=ESYYr$>Hql#4V!LrS%w#94uHu;fEqO8o5T0RNXp5Ue9Q)P-b@HjEO
zyI!$)a4&o=`n%mjb1H$3y+D6r=0$zK=et3l%;B4?GgA|T!(<R^n2J3{CxW-NrG@{a
zv-CeN?YZ=al;(OyZutOTm&nX}GW~zZZU^%B3pvL&2Zr*PcTjFy(o?=nI|)B0!_j+^
zbhu5DvBdOKauRgR9PJi8YYXwKW#CcNk8r+Wg!cCA?%p!wL=-<W9UGk^O?|nal~1HL
zma<(NMH|R>^fFx!D*HNEbrbppTk2rmZ?68W|Jo>h7Dhjsd;fa*yJ0WQ$Xe9)4JqyH
zAY8JN{wrJLkv-uV=`^VBbzv--;xXT#tu+ljH|e=k_#&URPAlMbUlB2;+HQ7S{Vc2{
zQ+&J>_cy|mwo&rPTQ*)bq-TOCm5JX~y!Wi-u+@?u%edq_gB$P}YJ@sU|CX$dChmFS
zbVx{PtaBwDVCB%pLi}vCq@#1-ls974mlD~s={cRF-SsEQh#zIa{>Qk<cUD|GXhXYX
zW7*>c_5id!8vIObosYt?R>ymZ=vLH{8gf&I*dTw}+vzbr@g?jhFC6q<kVS_CFZ!EA
zg|%K=-;w&D!Tn%^jMZPmYbYya;%`KbQ*YfPCBed~dcxK-NwJN-ixRjUhH)QF`fYB7
zXQCTe_n+Sq^TD3aGWsy-=I_}o5Uhgk4bI79REi(?c(Taxk|q8@n9Wb>TI;BP<28HW
z{?~LU93`XD+l&t^VI}8lGU!zdR@R6P=yoYLgZtK5(s0u8VWfsnf^FQjMRb!DjZf04
zYWgkNa9^U%=CENjOFSp4+q3==(RPB)^{=$NCTec1AqRc-YseW(YA5UN734|k^JR&H
z8|*sy{Cp{+tG$&@)0mYZ)|~*SA7d|aYKk|~pZT77_NRu4QNQWEjlL1Gz9r$Gy4f2M
zK_6J!;9c8@pH$Gd!T))>l6<WbjI5~F;neJ0*&ZpYGd!E;h2xEiOvCHb;;Sd{%j>ck
z73EDd;4ih94-S5fa2s9lUAQi#8wglaAM?+2l^qM$p};0fQ<UQU@|vcDW6vita8^J!
zdb(&Iu+QQA_k(;sf#=vHOC|NTt1+CY?u!LA*CRx(CUgLHmuVnDmiPps^BL(LxfC3e
zzOkydMJ!0Y$9*X}0p~+Nr5QmF+r>PB)x@01K}2)<VcAO;>SAg;3Cs!1keI_~$12!o
z_8>^&DHa9a@vR$)aW%D$w36@jS)HDEJ{%!0dBOMx{%NoXKOGdLc2+I6739>HZJH-w
zTW|Zv;TbKUvy~e>!h4zx2LBw`a@Q9~Ti)<MO19{mplf6D+8%OIuF7<LydL@Fcz+or
zxR0{a)5lVg-m9aHa~U$3=MTfWHrI!04!C3vBF#_1M_MlWbo7$s!^Y-gMGx77iC$Wx
zTHN2)wA@I1E|_Qwv4o>C4!b(4pBY^SFxjQbWH)YuIWEc4MH9k{T9s&5!C%4p7xPvx
z`(%9~JS{zaI0{4uO}&$C97^~K9U{v-Ax!k-pb56MEPP1x%z)a}QIq7Br7AmP(Lt7J
zYuiMAk%2i8Q*|q~>(sve58-4j#bl4Z8BCJ2dfjVzuJ~}NATd2Ht+2ATnl4<bEt%r-
zIQG_E)A$s+H+~4m%e(rvRic;R9xGW`%12)$3NWM2)>;}*gjfG+b8I%OrH7BCpC~4e
zN1xO0t)J%#zO@_j8nY~3M+dxZTkx2R!9_6rYB<RsSY;6Do-gtnep?!{_QN&eRqzq+
zoCG7Tfl-;GyR0~srT6_K`^OIW8UI}xMVpXI-?qQ~WBmxfToB~pp61D5e0m~VeJf|f
zHFvxgS<3{JronpMn%XU!XqoYgOrBOJ$pzm3Cf`qoo<A+SP`p+VFJF>_HbMg!ZXwPs
z&T`exbXTNK(CRM@{6Z|Vyup0zo#5JH&h9Mo@w!(|*naRo(s+mDBx9K6Yhb7SeTa_@
zleCEU0f!Pi4;bFoSD|C{i=B$xuy-`|X4bRvJfDBqZc9E+dn}VN#P@x=f#@+?$C0bn
zjb@H!lL}<qtMyMQ<7i);*UG_eR@Wt#hqpX~@0|!Hqs~u2)o38EqY*9um7B;ka*DxH
z+Pa|!wh8a?^s~Z^SltrrcRD&p8LO>>{XycN_+dqF={x0Ze^cnM;axws$yjh+zmk}a
z3gdh)K=e;`dWvKldeRlXWue#7Tb80m1?^4VY_R{>c(ivX>Tn0K{voc1LAyjel3SeO
z-vx(kfIOkU`7})t)rw@7_h3Ouaz&<khxi0N3v<k_8{i&=?J9V^UDI22{ZAHZ9xydI
z?C*1QpsezIu^!~g7h)~sN$;s0eW`Sow5;n~i`l_&0$35%Vp5$vuBMay-~p2oYq8cj
zMS|<vKtH#A!8oIzN4U|Frzt0wrHF+()UMiEPN$dgLg?*3@aYM3Ob>)-Y_@+?q?cB<
z;r1STRf_2P82w_Uh~)D*cb8Vv#os5#`rYb~RX+#vXV5SGLz(N_rJ;^zwFmh|cx_GL
zbbF(dM2Rh76`ANceXqPFv+?6;dR*q&34fc8y)gkN?c&+xv^BC1buk^8+rX2!L_NE`
zr#U!N)Ha$fx(Yrr)|O~n-zV?+d$8HzK~_~A4AY;XSEwGjx6v%bMQ*ANYWR9z4AL(2
z&05P!qQn#iM_z-Ka~hF;?JLsJ+e=UWyq<A>-dB0*T{YBhb*`?}i^iFQ;F@%^Ytm1L
zY6bhAOkf;OGr$*yC89OqWb<tcRe4SvYLYMZOn6&qUkT5>>+hh(*3lx^&~e=v`9q5m
zMI)BPdEn3F*W?4fBv#qqpra@&GX~PdOJeKYg3Tz$^{}~<^#6Z{((^5!(24VFbLDAy
zO0xM1`1CUx#@>!3#_M`{p5q?4^`W=2Pvmp2>HAO<vT1v`Qh8m#Ig?#l6!aLX<1{H;
z505w_<KgRMe$l<U2=6@OtI<1}YwPG1=B%I5D)9wgCHOPU5<Mw}V4b;bFDlzs`<uR)
zUm|swDsYV`x<NkBc_@Ps5cg-#6QuWU(cgt`A=HKv{+mt&8Gf=`@}s5oT2V)1$VheK
zCGcQ@ZnU$k$AhqoHp9QZldq&Cdej$wESyTU@I3o8(^IhEe?cMUBZDQg!^WWVmmp`}
zc<QN=j*--JEeC^sn$=$?Bh71v^pvg-OM&|xz@SBBf_31)N3qdiSm!6a;|9<wiHxDX
ze~N8niLVGtXg79knr6`S`lD2gxARUY2UPjBsN}{H66gqP7;Q=1_*6@HiZ%3itTPJ0
zE8*baRkG%+8d6=FhPT!YKk;0y8U!1Nc1yS;r=Q-zzL@%|Q)HcOqE9Wi?pHbh>1}g%
zbc*iQ&HkZ3M8Bx!yYbm(VDlBQaxvN6?Vy@}777{4n{rf^2BopQTwc^#YtCpRPEs=c
z3T3LGUa%LnRpJ4ui(-(?H~Ucenp>*Y$RsUp>EbIrvrmKz{h+V=Pju82jc*EG)7!ek
zhlct6p0)6*zS^1T3(MN?U-N`NcmdB7Z78dVJB2*a=i<LF*tbCn)#9_*G`9_ybKZt{
zV7~|LWFj3A*X^#p?3Z;66HT)Eeb_MPDYdpX^r^uL-uHLjsk<!ohD18Dllb!RE3X|c
z(w2VE@<r45U$E^?cx3u$I<%>@zA`pl&ghfUIKG>=I;eqn@Q3!H=d^+PJDBnf%E|w1
zxx9pyKi7LOgZ~NY6&t;meJlylYpj2AWTY2}4!8gO0(f66dP3?@`REyGtHpzitmz_q
z9n7p4U*Qpuh)FBxhGldFEa;}crHjLZayq=NOSPIkiy~Meep^f0GOKR4VyAt&WF>~T
zk8*PfO6`5>8n1@qeYVxr3!a^|Y!BPm;60Snb76fQMm2wIFi2Bpp)T}qOg%k#c2qx+
zZz9j=UzVTy<KOo|_{$P~1ULR%-shCro1V<X;YFs(ZPw9Z?9o%2S6lfacxI7cj&If7
zPA?I9>r6@X(YDAY_$Dt*_Hr=1Nxh>Ny3_^>^fS699_M`SG%M+!durdNdJT=}NRN9G
zXJ~5*cd>%RuUS?&M>RN1z|6sE`ff5e_lEc7k06^aBJz#JPq|xDhG;&0oGf#U%nvts
z;6+gF(OUf9h@%V@u;26>pA2W|AS*owRjNbe3GeA_`2TZ$mv5aBnc;`f)(h)YSj*G&
z41}@Q<#ePi6}z{<x6IN(XX^$S&{jVZsm7V#y+I$nAZy`uxtOK=IL!NhWFl4QxJbup
z=MWf8ydL*AUbl@{aO%BBcZgEY*b;o96*l#rKabM&DxI|*yes+eI_nYUmzki-T)h4~
z2tSofZkLa@+VX>cDb?a%k$rsVEwll=GLL+$UA!xuF!g01dd|neY~P_jS}`xFyL1X&
z6%*_tX#W7)tf_6t<C1NGl#-#W*cf_Z(PA_oeI8GQ!?3@z{-i$^JuBPn4flXaYcj*k
zZ))ldU(D5{61dB^Sg0$*H1WGq34U=zI`S=x(cAvCiS%$?)p0&h7RWQ2I!E|BvhPW$
zmLUgxDYc@<;;#ld;iW%&T)vOJP5tSrboNwMy*r$Ze$z;g_;`>ZiOl<F-ukkwViG`F
zZ3rH;0V|L5)V-`2%FhUC%&g1hoEFOxEk#D(*t39b+#IjXJilJGvUCjB_sddHQ#<+c
zc{+DE)LLuDzMTy=g_V6ZHaQf;EMj+p!K}&a#*A9IY5mxBVx=GQmfFGx$Zbypi#9(H
zrUPx5rlhS=oO8y1CQ6o!4xw7GU!Q>2#cc*SzFi*!xleOD=VUFYErLtvv<t(Z<xTQx
z?uUWTP4Z1}g&%{0G69A<)SHDDY%9q6ijDFvM9H<-`qQwjao|NO$%D<5<ljEQvu?|B
z|C-s5>E)VFk~6j%9?;L9k*eg?obs{#FpIrLw`w#A7QIvNg3uQ^-PS>`(|eOA`lSCR
z4T!g;$j1+A9{=8wSaTSmzoPGG0*YK#TSS)ptDKfhcxGYP!Jpx38xOuE(GgLN4!Qw0
z#5&mDL5AQRO#@fGP5;E>;r3t@YrsvG*vVyhV5Vr`FY~m|Nzr)e@NxebY;EOJB8A!a
zn2(4R^=b44R`d$q*>m%LJ3+zM$jj;FfTeY})-bPLK3n%mQSC-8W`tacoRNLHT_=W{
zym~kltQj99Tj^kycd+?bUV0y_GpUDd^7WpdS-6GdG47>qrL(Bq4%FWw-%4%I23{s$
z-;bn*9AnpZIkyXd*RRG_TK4!+cJH>gz@r`yyRw=C{UC@{kB+Czwo5wed3Jw;f3432
zizW3&y0qR)=kQ)BWG$cAI$^r0Ea#N)UUc7{SWiQ(s@>rUOTtQ2k^ix`e4T$qtup3k
z(UP}Vckkp^yrB@2H1$UC_k&)VsAKgTrDqB}xGgRGIeHTQ;pdl7E1c@7_bT1juc)z<
z_7n1X$b=!ymhz=Np&7aL;TsF^)XU^XGqCoZ?9F-j^={7#Q`uy%dOk2JIozzJHILK^
z_t;E*7Bs0Lli_fG>8qZ{^V5so)^k}^pX~$nm>siP+J^f4JbRPp<W3u+ZyA1nCstKP
z3+PgIdk%WUo3`7RMe>J>q;cb2pv*ql(mt(kTQ#-Ayc9fmhYj{CGRa!TQ*CgFyie9T
z(9TgoJfRuz>LefFoA^d%ih3q&Cbu%@(PvqL=M9$IAbkN&dYWu@uGOcTI*YEB%=Dm4
z)iEJ`3SlkX9;DuHJVN94J$RqXxlhAqYF99Hua8h}HbJ>>4P(!u{fJa$f`fVhBwXnC
zWul&tj=q|wsDSTwLtCD$p@NB8B1oN6k=EMD*O9rl$A`-k-jSHLT1t6s*za-}bVVZ6
z8BY!~aQ?QvjST;QEsXXx_IKD%3-DG|JUut{4wG%cbk7&vf`wg*Ow{V}b+(e-i1|WE
zvh8#ww`DC0$fq_IoIB0$3`QwG<>gSma>ZY@mFy&E+Q7GT`U#);DxP;nTktct<qzGb
zU5H4ZbHXMdhR;W3nk9#^yy9dCP5pqZw$$2nrIZ_%dN1l{{urz&4NBc*cy3OwjORSA
zqo_u1hts82W)=|Z`@)0j`*K}HB_^-!!YZn3Ng~@-(BV*!hu!E1A6u()d{5BdZm~aG
zHPJ&`q$TAR%%FvEmQ}`v+x<b9mb@*Kw;|Ubtrz7nTWxKXX)ow?L%lwl-cXcCrRr4(
zjcb4<+H9EuTl&n(gCgZ*1HBkoJs*{R=J2tf<)Yn!Z_rx?8z;y4%-@2}ev$P!0=Jl<
zO)VGs={e7%8?do)aM=>^CCs^c)6z!ab-vi1wSHt}JK*p=Qby?i&_FYJx_GmonzRHd
z#>+3kF4+Z7&EV&3I+&Xr?v^ak<JJU3dRo)QQ@a*Z=fT#XWB5nbHLquhW?)}t%WUaM
z&T|g!sEPk75v@e*?g%FtK{gbEC&lG0>R3}en-%e0R)?&7H~yW-6ZX+(q?4bAEw-nd
z^{O#PD>5xS>`__C8z0waV6H!gLz&u=MMqjGO~zjLhhKO{U5zGOFj|_-s;XDA8^nyC
z$P%y8$@3+BA<O-0c*IX}_PG*I{y(YcLv4qrjc3wPQqW$+qBro_!@Q6zlf{}76lusB
zS58?+e65Jb@8tf(ySB-C`C6@LRjqpTAShbH8^+^W80&chF19voB|<!N6r1&)pp&Be
zym{mY`8U?to7ylqR;W|SvuDXQYXH}4j4fOQ2{Xd7o<vFgRf_P2?YvR!Hj^;EuyWij
zkj_g7zlX^tpxq9<=mUDi+UNtajKQAa|5@gV;FjhNS9oi^7HsrHEsh1|h@R42RwdXH
z^u|k)!!hKAll7?e0Lx}$$1}+XPvN_jsQtFkMm*~vqVdP}HZ_1B=noiXIkZ`<mXDDB
z=#BT8dC=dFNj3Z+f2;`1cD0XGZjvE}aEq#xbk{|!RufAiJAc~-)78w>MIFxhqY~sz
zgLNZ$F10++{X0E{3X%{XYqiKu_K|mHh}WcN?OWE|v{~>}v_!mzALVrSKvuP)zT-!9
zAUJkFC&+T67c@8>Ss@D|1@$3om?65?H`+Gd{R^!{zTH-r^Qp{r^ZVg*dY@f*72H?P
z@9T*1Zxfd_{DXd&_q{pniTlAhHIPS(MoKEDgW@Z&(j!{MV|Ys+&#c@E5uCSaFt!}g
z59DQSfmc3FXY5LJ=mR#AQ{eseBRi}O$iC|nRnkN^QQ517YS6&C`EDz#7s*PGX#Vhq
z_a+Bt{;Jf`@?IR=nn9Q9I=wHOJTW#+%krHkf~LgP+{y_Sy(2BP9Q$&C%Ki|2Rc@fq
z?BhvidJ)?IOR5+xWHsSOL%cS8wj7KqJM(|$YdXB>PyAr7Y^Da?A>P#+5?ymirC<mM
zzd{x3%RsiB!FT9-qSqE;^$Gf%3c_!SqQ{~>qT%e((*B|}1|M^KLG7=ZG&!uMx%JQR
zNm$sH)Ms^0DA@)SyXx8~m`1PN?=m7*Sf9}ix-sly`810J`n){l|8Q5-6!3DIx6^vM
z+=l8Z?Wu>XPHd*%lNY3or%n-Q$ULD{lFr5VfvmTf)SZSK-@di&oK)Qh=Rd8>^~+#f
zc#Ax^f!(3r(1YLmgZ`h>!N2mRXT*2s>lVF>x?kI0C6C>%Nt(tBMi0R@%EIYh_Hy=Y
zFv+i4*61NAhANg@QqQKH!UkG9cM7vN%x>{AobT$P)ye<X5OZf}Fa1`Nr4;tHT6crv
z9r3FLe!>^CDywW2ar`Cb)jSW{r)M%yF1s)P+AM)n+IXMGr<W$r{L!w$G2S3H4U^CO
zKu}tTTjpqQ$tMlTV#fOvO9a~r+ph30JUb*KDiU4_Lt9T(@?FWU^K?(JOc&r|o&0y0
z&C5LTQf;Rn%8Zb^#570zf}NB>)B?K`@BXmanu4ZQAI6Y+dS{#`8+W@9tD9m|ujnnE
zW*f9YJb$#Tt(F`%5!T;CbI6A>Me2ZnCxTq@k0lFHaV<}^TfRmets684rT&36EU+#x
zh_spjqj-{>oEzQzNjU1GaHRhj;Vy>gYRLk6&h&r$0DN)1W(|^qH+cGI*pX?vS@MxP
z_m^DJ+FH`zksiMP&}q7yj<DLN^f<j;-voK_tC{|tY)<(*Hp(aZSexzb=$*Nsy?i>=
zwJ<EFW%Z;@u(fd1|H(3aSC@eI*@>a~t)y0p&zARy=S5Is)_N|j4Q7KMUPPwBS*z%6
zo`0ghfzH%7oZ(-4QT+Z<m^zbrqK?*fApTT3QvdapGE|?}6gbBjxe#;NFLT1%@)!8m
zlRm$<z>F<K*8gO@mZ0KTOz*PZwfvSf@<H}2tm>jp(xtXS*XS&5?i;YSHmG>NSSBsw
zYvd{%uBG>2FZ%@#bcSq(_088w=!Xl4scCd9b%Q@6<%qD%?jU0>jP){OpZu>NOr{Ok
zu@P?`PruC_0YSY&yeTXCnN-z#1_J3v`hOjr3Al}A_r{IKEb}ZgnI$6jyVfQ{nL<(-
zLPb##pHfLUremJxd7kIl+3#B0EMy)-iHbyXiKhQ=|F7@*zAriZ?Du`1HQe{T*0Y{{
z=oUW)Ikckp<K9-zmVvF?hL1(R3X?69v;qykZRcz~dvq=`haR|X`mQ`JrNAt=sVw{z
zCK!A+YVD<>X4%0~%#89Zx*I*Dvwb05h(AA~^=}Q2Nd+6h?P6skO=YWI4)ghgaIsGJ
zky<|}Of3H`;D$@Qb~ftM8=kl)Z(uA}^mCN6>9Fj<aujUUm^%Lm-OMhkRn@O;jXv-s
ze=5GqD$7;kSw{ACpRA>3(Okz6eXeOizP%NliJN7lo+idFwX*y<gO-v}#JQ>Dm$u}V
z{aD@$u-8{)j~|uUL4^LN63hqU3^etO%6@YX_2b|L9Z5Fi{&@Dbh~|j@Pp8TuorTR-
zk9|T8Y3dexRT`6%8%Zyp2#Wcc$oM5yn#G`jn(<~flOC;a{6E{td;g|!{U857RSFVM
z#*#C(`k=@}zXl_Zg1ou~H$jD{NyNg2meq@Z4o2XmGoAT!vQZx+TV}FvgTC6-CV@Go
z_@BO(?4|yRPK^B#&bH4btA0#8Q6hCUbZn;h|399}iOpQ~(x{DV?FZqW_{cyX5uRjH
z$bESjJ7x#5$n8}5_Tr15Q^Q)J%WW=sxluR|zvE1h71!oqfpJ=mpS)nj>B`+AqhUT@
z$pV=KPPrH>D{=oyOH#LJBfkb8Vt@U;EUfx~JfWk)Q$80pDFZW`i#l}#PwNL`yA8e3
z@{l(=>LmZ29Js|-hXul)gO&A5@pt*-2fPkwd$C7!nlD7LXUeEv_J6@r`M~k-c%|q;
zG>-q-qw#&jvR+yUo1TqYeF9WC*zfo)ko~jT9ql%g<`5<U$s)4BBl3oDrjq%{HQ<X|
zVXg!GQ_$Sma4fu@`_lZPuChsC4jF-}S4I=+X>)yo2{a9@3To8nK?A=Wl#t36*Ac{?
zrrev91Ai~!zxW~Vz`ZU9yo>ITW}u6lnjN3JX89}y{dT=2PP3jc^Agcy5W&m-isz^5
zwNJ8Q!}+|PHxzE2K})a6{be2G;~<y!)MM=P@8J@eZHby!B}etN%F(Xyv^4rC5x2Bk
zFb8W&oLZ_!1}P!6t)<=v&!6^_+Ebswx@u_(w}s^NG<-+0wuAk*jeNr%chb^&4GkcT
z{uXYcik`T$YOJSK&c6Ce7|OQDLYt~zSbpElCufFPnJD%QjC4sb-rM=W`UmBdz76+z
zEO?qcG=lGZiAcwUA!2KJFUe_(6dT0;ZPoYi<|O<fy${zXd@eeEVSZNzyzmTMDTd0H
z))rcBKKH0C(PwNpF@GGY${J-Jv_>>nusqD@Rbd*7!sGPBt(UFd&D-04ZR{Ud2BOL<
zx=LS^PlH0R%T7e+r!)oo=`B@!o-fn|WXWR8D5#F6H8a?x6Z8uoqOH}f8#Rl+gOgx@
zv{Ee2?Hszyijxuc$^mQX_iR8g!-qJ$)w9yOzEN_6I`-fZP3dJS7M~vOBD#L#f!6Yp
z+SBWME1%=#Ee*Yd`9WY2O_R8h0;OY<PvUl#TN>!g;piB3MPoji?DbmS-UrA4tVqCs
zd%U7=@~@1$a`gz-nhV<+Ku^q7vRcvLY5ZWR)0M+SxFR})zOL!o9S_?MCTpqNK;ZO!
zc!j71pUVYsOD>~s?8h~o^do9d#%6Oxub~^K2|v+%-b|+i1MEC%eR{OILY^#_!D2DJ
zLigU%u(xE_4EBL<Q#$fwmJimo&Z$Q24w_&(p4^aPzC%Os;!l#mt~>N0{O)y5f*%9P
zyVTRO_8g!5CMbq?&a#r4Ianvl=!i(@?3=0P^g%6J?C*Qtc!2Nx#(rGWk*N#dW&>n<
z@G6Sm2rcNVV}-asc%O`PrlI*PcI!=@%F1npg{IT$WWMLX0yR-ekFgr1S+U<@?^476
zMURtJ`dMY&NVFKH+{aCS#z&e%7sHgtOCu@>TcmL~n<`75=veZ^e7g;ode#f*L@6Sn
zrlBra#GeahNO!qHWGuq|mWmDzPX~N&u-YH^8u>^L+EyQF^K=8z_M}WBpO*zA)DOx_
zet*mth8L(Sy{0w7vEG9^Mh7nq`uIlDgU~yX2?pAskb9u)0{TEn|6XUMzH5WzS9=c5
z*Hd=*I?u^G^rw|-r{Ay_f-xcY%lS>)1#0<>oVQ21`TsNpKbs+gEsAAaLT7yt_SZIG
z!pwdp;2vo9x4eEuR(@ZX!(fW(Q&y8$_W>B85$Jk){Xl!`Mk<b_$UrlskR-fdMvNJ>
zM1~PEMW^^}ONxG{y?j;bM62Zkcv|McQ0b`>PZK|3rNH8A_{=@;B7e(wvPlALzLEc<
z8$jkmiGFFloK9rlOM5?!SOI5lnNJ}8yka@xFM)(UV?}D~4l4mm|Il6s0hS`twfCW#
znH}4#lfl(L+XB>qWjf18=~iE*RIK4ZS1p4Tm7ndbl=IM+hI@GHxn3yVLAaqnw@6%a
zSuPzNyJQukj)kUBKW+_r;%*6jAuQ}QQG(VHXAjvP9j2qCT6jS+!iLXUi7;7C`3gJh
zY4myD8+O$%$t4@*d2;(R(P!f0<Tka^59PYl3GVCTI>KKBRkrd&+z#^+s`T?Rl+PUy
zW)b*wa7i^`f|HhqWB-Cp9`}bfAAO`NH5E>af}lRIzS@d^o+2BFmaD+M>%F@@PMlt2
z4Z~d8$BWsnNV41udwagray~86Q!bM?7lOSKw@p;H4*2$XO$vU6>wbbgBx6OyZ4g2(
z`_6B1TBxKx3|$71*=C^NukswM?iRU*8XmVkV1_Q9Hku=T7wx=4WV=5R|1_*>J^gvj
zXYY~A>QQNKWozVvAa1|Hm`8?>sDlcs`vjW-!rldUsHJCPS-At?6&WnqqIJ9q&om15
zGd83LpN{fdK3d-K*Wk0C5~1S``{wQIqDcIY{(u~wAy_OIWTuzazsXfkdPO@gS0x_y
z1P7i3FVtY?F3U1qXN@&0Q!V}?PxocD%Q$rdvOqEL_QpsD`Azbp^B$2r+JH>IjeSYg
z>EO5W;I69d<4i32Qn(Ammqh<f6N&kI!4NAS<&Ft>FsFM_U(0Jvu;Xx_2E%>Lf03{3
z1NOOx?b3t%+ebLL+KIl8o%}f`Dq04RVM1fRLMCm5#jms%!>6q`tJ8$&|BF{kOuZo6
zM)Z~7u+X`J^7WqAlb&F<_q<wkpmzq>Rwatvv)6Pr7D*4co)m6&mu<l?>kKcSib9y%
z&&X=qAP2yt9d&|j55M;Q<n6?K>t5i|4t(SP$o<??tW~vl$gEIb<jYV2ZrDDjua5Vv
z5uL6t!7A72HJ{=Knd$Z#HuIys?YAY-r+wIKYh~#ZdCps+n0*ocCXf11wA>4_TejH?
z;LC$B+#y;~_vm`b#reSJ^(Ho#CR*1XiN;wsPLN~Gx2=dj?s@cqULa$YjECgqv2eW~
zZG#@Bn=QL@5)(F=$(|<)wA7P91s#imFb~vw+XoS4`br-Z-xacj9P<JCV_nG_Eke9Y
zi+1v7upGOjFT<*YC2eQ0Ryr$MS~!7hKO58$%5t<~_9-kH|C$Ju&Oh-z`l(FtmwmoW
zl~q__J?7)jwkG_}ruw{Qk8TN{!8!_~`;3rPK1Jsd#fEBrBE#S4^2;6mA1Ul5z#WNM
z_07RoxruH4iEAxbqt>WZYrMUz)Mez&U*#2PrLWmgA0+8uBbRlgbL)$z%6QP}QNHPr
z4zx=!g9gsrByYsG^bP5OlPW#~&kE&_$hVqa7x*;&G`z3tz_s~79&^KNdQL{`HrC;Q
z_6Vwx`)>tpVR|d9kT=tZwi&N~L?`IIa4=8Vm*3X~4_$2kT7Q2=bNX<-E9W9*_^HLg
z1kXlaQeAo79~EZf%7w6jm9Z6mO&9ysaBKaO9%J8g=vc3$3oWxv)VjWjCwkK2HW{Cu
zZb?|$29VngucG<ALeO0%qloMcF58E?PO3`L;D)?S##yD~>^c?ZzI>iOcN<HeLT_Ti
zYVT@SWjgWP?U0|=0{Vvz*0x?!iYK^NxSZer67h8dQQ~s&Kx+F>oFd>nE|cR&v62^w
z!S|(uKf!#}*NJRj#$J%L@xDZg_pBmn#n(XxqSZvuL^`Z8+5hI-7h+q<`ni0h7lI9X
zOy=O(SMad*DBw#`_d4@?^II1pQ;z5nKLuYIr$1?WUlW@Q7CP)LVTY5ft(^86_;GWx
z`zzS{9qUO=cn$l#FO2DaI>ff|Je7Q5{fF#xYpkWEXA8D6t2T$IGNm3*s|+^y!9ENg
z*QcoOXYzxVLCa_vD@6zIUOf@cB7e_d^}e*nwI#@;E^1mC*l>USNKp0T?PO6{$tvnC
zKjLrr`##<a=^{ODt>Rn!G5LT#k(+urlF8P=Q5Jd3l4OD9_Qdp^1v*M=pzs#e-th4K
zUfkRG0y``p5R)&+O1ng-{$Ui3d%;YdLUn{1X|NX@bkRnTg}-BBZhOxhpTu*VfyEJR
zBu$)28F+kl8}7R$IlM*GFRZV5ddsf!ttnHG4qIIqRssR-L6iT<@`4RhZ7m4;FUv_}
z7%Rhg=6;$f+RQ%J;><e`{Z?j$8Mt3=G%>a+s`^7);<<eoIH@$uBua*EU|-q`vV!ki
zkIgl(KA_{iJ|)b7T2_Ny+ll)1TX;m~Nr&JHT}6E(WAN?cLjQv;!A74U4xPbkfAYn6
z_D)ODFH>jf+Hks8*B@nl&@||zxg+(pljhU?yhAO2HkKKlR}p<TBFRn-Hj+3QRgS9J
zD0s+S>5ab(3@Yjc-3cCU$Gy_bGa()y^}5O(<DOS?vsYuazFveoEeiWl72oaY;Ga!A
zM=WY<!B214DA4W>c5S1D67Y$^as`$(Qv1S0lc+p4v~u<(TEx$io-=6Nc55T$DgVZ%
z+E|Nvrsy4e(Mv{8+7PYjDM8|XxUSgV$6h`BnRThHMdJ@Vmwqq#;`d-Rd#spN0m=4A
zAbp`q?`iE_{m2LUs>mMuo;aA8xH}13yy1C?qZfk|EUm6Hy`21;F1lJCvNw}_gq*O!
z;SS5_lfARnp--Wod`sqE9bO5>$`Z?}nWYWbcf7ao?_jwPVws7vMVa<Jl;5(IfB%3C
zvOnAzCc{1xOnHnxg0=2vJ?4c<WS;@Ae@Gu+G&;wBx9+mQhnaasU+b4breJfQ#c6#@
zr=a7Nlv?O;qa?-46Jf?%jbI6x?1q;GbI$R#mIo%$T}%3GqQwlqNqi`$ufkS_*uh8}
zOV)iv>|bn!7LV2oZV>A?gFepT^MA|zU>To##<PV>bd>!ii}ZkuhU1S8GHO(ZC-SrX
z3(q(N()u;n<9U6K4v&rDO*cq+O@qhA^^_0ug|b+OD^qcTf9$-T)bCQW5V=dik<$BG
zSbe~rjqtJdz{}a&`hyL#DR@eH8?MPj+3nOB@Ax0mz&rXye_2|yMya+?rt7U(BhXwO
zJmoOBcf0M7iA2|Gk_|N1O)hE+_`n)32)caIo}z}hjb4>EeV@EY&rY}aZM3TI!Ua^s
zvV$1+rIvJ+4ptBz8N-U!@V7lZ+1uDUE$9DXtz%Js=6OSQ{HGvs)}nv7(^h$UklJEl
zZY>(i`EU_Gmw`Q+gw>XW4^NY>V5UWpk@{!sN?28Yw@S)sL%VAuV>jg;eL<7rW4x&b
zo|vQ-lVL=)s#=Wb*iUPF1rY69>_qL@C2JuMV%6avC(+7BgM_BAj)!cwtgvogiT8Y8
z#(73g)Wn#<yc6c|tK0!oOq6+QejN>CtzXqjzRCXN+Z%gsCXp2NarQ1uragPp(m&#F
zE9-~a!n4G3ld&d&c&pJ(l|=WK`v%)AjmVBsY~^Kma5|=%x3!v@24<Lt{=juah1%Ly
zzqMLaDKbY-YZrM94?BU@mfo)GHeK$+f-ZWOpNivq>kRY<doSrlIcJiUbIz|~YfG`l
zmwmmxWJhcwc%9Q0@|Qe<qB%;N>RDn@PEO$L0_U{XPdr)M$~RIBypS_21&Ue<`b(o1
z$RcxfiIml^HO4AQSkrpThd!Cw0R3X*+Z^1k@RQAk`6T-UJS}ni`+0p4oIF_{3lI4w
z$q-IK*FEexZ8CrRK0CaKzfH9!cF0ysq4+=HsK}$7{p{oaYGZiGn(z|`q4RyEbh|$c
z3ZX9Dm!-Zx%%hXRqi5(c`Bhee<j%qTj%#nSP$R2C-dzRO%!lT2hc}-uN68xPwU!OB
zB6LM|h<qE)!zR{}ff6?dchY!$?&OrQsDm%+-&C=C%lpKaCE)4t-dH>9X{KA9<@=6-
zBZ_+`G{<GKS~G$l>DjPTnk8D;lI;NRf6fobCc_5jp&-rYEx75~zOhFYycc$H`p&$Y
ze{P%fH(3!}4IXMc-ZGOG){fZ7a9>PJD-=H??}ioerYe#-n!yuyJ_JPc<ucqX?5I!B
z#a0mncbi=q>@WK=G|Mvnq5Oq?_mMgh^*wYPb@jrU!{1{!%S-Wav8?o*TH2zz9lqTu
z><unjLJw0(cAy<7v9kRW7TPn}=a@&XV}+O0qy`Exh(cFr7czD>9VQh(PZ#uAe-$RQ
z*D^*s`ri@mQPN^Cl$=(8)wp71{6FfudBf4<<vqep89|CL7sejgW89Cy{crx3bDM-@
zCxgBdPViFts_b=%F7P|r1!QrR2%nC>&x~hekGF7>O1?Of4%OnCJ*i#n4;u~snhxfy
zYtKiP<0VCON!Uai5>Kv?W1r?mW=?-f6F)A8q=roL#LbN-bhCG~v69Z`68m_wSVhkl
zzXDU3@4tpH6653tI_VbqocLNz@52RlSral-XX?xS70yq+{zt9O%&H`QVt=qkR%$k4
zds-i&b;C~HfZ5Ko`13h=JlaVQ1!cnLeS=T;Emlx(Ngmi&aaMf1j?^i_-UM~Mv`2Io
zcr{69X*V6A`=k~+#dfa<=J-_kZuYmESM~2S;u-06=JxKeMEo|<H#^aCpNGLgGQeU#
zCsml4mOcEKh)|I^CL3gG>MoD!TrkEw&x3cRiFDUt_5`f8qGgIs(-&;EA0ejyXw&$K
z{9a2w;^$K&Nekh_r@WUh<f&fd=AU-{Ds{vpkY7n%%=c!Np6GZDv9qpWj$i~kQ6zpE
zpT6(MiP*`!?+)wjU)cQcd0Q0O1|#H@r<78<!pJW}eWK>``Q*EFFwD(Z@&Re1@B2HN
zC(3k6%VN`IH7qnyad<`kvAUX__}5;F*f}Ch@29F+-q;i$M=qJ^4^u%?Rtu)FF!q|w
zw|(Ik(i4VJ4tuQ|9~Wlg6mJHbWXJ7&BFG_~N^V<^E;@sM65Xc<v~>6>IVr1e@~mV-
zDuVRM+_q!-A2@!CZt%<DM8Ax;W{UjEQ>WKv?9fws(5jQiBEAL8mjrgbfL8JkS+1tN
z?yJB6uUhsfvsJWzu!+dQU4XV93|5~wy@KAiqS58d2Kn13Me2lSKwA%Ns}><fWMu75
zpk#j`mE@GZh0QDP`KvvpwXFikd_0-^3>KY>cP_vwi0wX({kY59WTB2Qn!e}XJf}aW
z9V|6yLQF32pV=T^5vKQ#iSvnj?%%T!KEsRYe*rfRfuI_bJMK^cJtj>p=2O9>)#!Fv
zM4bAaJlaCK%Pt?PN!D5a4QoYL$|j9V<>+NrvX@rkBtceR?aW9C(f;GJV=ws`%d4UH
zp+7PQv249=lr7+{lRli2)Mu=)74<AO8mxHR6Pn;%%j7pAd7>>niD+|8Hu3BaZ7BY^
z3O}3={w|9bmx5!Bwok+AVEZpD6U=-;aNo*%A#z(LKT9WQYimblAFMM$Du197F2RrT
zdBZ4Yb@f^H^!xCE&J2!))1{71kVfGY<(4v83|7gbG2(I-o9|3w(_vuJ`5xDPK9Mt)
zf5=;O3{TLy#P8f%$9w25pJ%tply4IUTF6MxB;SFl{`0%RSk6hch^7N&3bpiC;paa{
z9=u>ZEaX$K6{e5=62$!r>1_!WqQ4gPG@y*ncvd_$@<7q_!GlZn533B8KMvX{L8Q3m
zZRG`WZEL=JqRrFpHWY?-)}H{g^aA(fQzZhV*R;_&ptglFkNEnB4UrXo!Xsdmld)a&
z&SoGB7KQh3_bTx@mf4fyU9~9t#EI>2A-)JR(8F@fGmzol)<@CjD%ewUC-%F2fKoTc
zM)OSV$o9Wsy~cj@588v=e~2vuW6~E#hM^BI{D6F!q$hlf=Z<$HtJeqR9``cwnby@?
zg0zm=<X{mfY%;pf9zSgDV*~tiuv`+H;u2Q;rkt1So)@;!gIH9@PTHeV*6Z>X5l>6!
zQI=@ka4tNuR5%q@nMYd@Gjj#Ay^6N>+1>^X>5*t{{T(h>%5svQU#H{ZELeK4CV`z&
z!xnnUe-9S=_mak+m1bT|f0ogy%l*1_l|1NeqhUT}(YU*!U=UY==D~58)G&YAL#+_4
z0k_>Ke_Kwt+&@NNJ8Ip>!6X!hYViD2V&Ow_@j%NKoo}Zd>+&4&DbkSmaZD>}cOrgD
zFhJAzc;Z~Eu(dx+UONUB-79s8(8Z!9h)`chvQ@FAUch&OwNL0!6pJchvD7_agQL>i
zo1+4f;bA?A{-mO;0Nq8*JzwAjL7!RUCnIOzT}Q)b;U%|;w)Oemfy9G*k{!e~D6&xB
z2zv3kZk~^vyh8^2qu|stXz^*|mC?VWkrcls4>;l3M^}0U?L+SVUO%$Tx>8e|^J<aV
zegJlJ9-h8MH~O0v2Ib@)yFQ*6UXxG!Xe)y@zKbf+tuSE~rM(T^cF$`Aouqr^Q1EZK
zQm=u9b4EMs=em>nOj>l+t$gEoyuu_0ytJ#%3zFFLPvze*^wm1U&qzOQ6wZO|lmtKj
zB-{{$SEu)HJ#i|cls*Rg+z@W_OvIG;(MLX^M*2P9RDlkGoi;;%pf5DpKO$D9jdj$`
z;gR}fiA@i}vEdbeg!yFW@&Ak<;nvuBNqFCK-|w0D-irPO`LVh!g(ofutNJV9GNQ^?
z{tZa%voL40O)wP}G+GOyX<ZM>#jD#xR%s!)X0`3cFN#`G^sPV1^)r2G&{*Q0Vt1uk
zI8-9|Vm46bI2cSxa?dl&`WS%S-;-oe@qKC!qa)qyrjG(8Bqk-~#hcrJy&qBTNY=!?
z3>)nwDnTQH%<PchPp7T4H;+Bk66E<MzC+e&_Mo``983!ONBZ(qzhH|`Ylq->+ZdFV
z-M(EKSSmPf8L=#H{jHI?y2(BZzV}7iL7HoCv2Z>Wn)96Fnynq-K?A_IZ`zZRD|n6-
z9fHoe3~V`(laoX2OR49F?B{SWh^oC6M$4+JmAnt#@H?>43#?{}rq}VdN>Xj7Wg)`0
z2dk*A#{-*tJL33F{S1Y2C7p>|(P75wx1h(9;Y{}SIsM1tSioP>p1#m^u~zY1tYWud
zy00N7-?FpPQBK<qSWgyKu8X(O!eG2~OqR}yDso9Vi_EDylM3EX$4RxIA<sU_7x-|x
z+g{RbVLKn|dA*~27F*<py%2p1YxP?AIQlPlxy$i@v$s;#*4o=XThq&I{xe$YJCk0e
zj~)r9!saUY62D+K?QKpB?I&9#?t@MPtKZ_6{35@7qqd<U^`0gZnQybp6Mbq}L(`>>
z!<*ma=f1b<KE~_e>vKFE`KPJuaVI)wjW!ORmyN7Tu4vb2H~USh6E*768+^%gfeact
zJK`R0lnF4LH+={G_ZXdvHDw_T<}=-ZR#rM%EP4r@|1;Zb*?m`7gXmh+e>89qdo|c+
z`TfX!eEA)3%q`9{c*Zw%RIo4nRtkAz;_+3TD}B6gxJDB9D&`~B9%gM9(m(P8{nk6>
zYg=Ih;awF$u+!}e{A3Rqasb$I7dBH`pVAIQixSL|=tp&;KHi>Hrek&Q2s*Cb2pjXf
ziL(<OLEi;HUhVA%JSP<{)5lhj72c4VM8nmVL&md07lW)+G;8r3n`8m8>&0-c|7p#!
zu9E%|eMULrDX}uzE!gd4c&Bc9UZ}dj8AsEv*E4mD^w&eNjnwH@u`4Hivd-b`-nHOV
zB<dM$9C)fRIdXxlGO8Mqh2L<%<`DO$Y7!ZFqGSVsZx8Om<359Hes6!kiMG_w9RI-A
z$T{oA_g;_e_rpRD7M%5QyrZwB*AqcW_J0q!WQuo0g*fQ*v=C>3*7;mHDzC-Q@I)K5
zMm*~G;It9@Iyh@5<RrYF)6$wO)$yRb(PsW47`rsqv4P0_z8%(r(nMYUwIkjg?AMuk
zi~C5_|F*h@i1de^_P05Muo|n~Kn5!+1N5=TlRi@R$C6|ZnJ2d&#A2I7%R1A$WumwD
z<^C>J{c4(pJk?I`qRXw*P<!Zf_{uW9<zGod|G{d4pDG5-xyOq~fg+~xRMj*odc?+C
zIXcDa*pslgIpm&Xc7M9fl{GfYs-dJ0k_Gy>EoL8^`b$=jnD&tl2Wi!Zm*2KCK|^{N
z)7yHVZG-WVz!Rq<nTM_CW7R-l%{`50*PfbcS$te<B514tS#BEGw^UF>DxfL;3#U(u
z^`zK!e)DMZ+@pThj%yz8ty8p~{z{&iCq?vW)De1qd~EonzwWzbM{p^4GgzSu^jrGY
zCQyetYc+kjZm}AkjktABGJ{7hlJS3$CE&4SALj+Z&Bd{&gILmLzb;cOon{54Ir($1
zu9TyG6yCU$x4TJBYDX@qt7U>|Fu-D3SPq~-u3}d|awcTiqv*Kzf@b8xzd~mJk{w1n
zQ~Buo>rc1iE8!?^vgrNP98C?Ic^~*yAQ}8Ko@BA_lmal|S-kHQdYHbk$$E}eZ0m8<
z!9n_j{t&hadPy(+F4jRqTSrzXMBJ_(ZpYF-<}@drqVV|Jdf#$WaoC|NY@?Klp4Vi~
zLso~`57HNOt&H=vdc@n44f=;urM;sY>2)7sh3M`0+A45ntd%a42}+L#-_PWZ_!!F_
zUXhEw+Kc#;Qa`MN68i+W>o`*wI2ol&sB6<#W3yrAH9bj2NM6n<cJLH9Sa%|MUu}Zl
zHTFv|(<l8E-YcO_y&Kd*r!A!=`1~q6Do;ntN19ntrIUjleV5f9t?m4#)eOIszSaz_
zF`W;qH<>DIb+F)Ckm*#JEBXD^$P}ah8YX!;IO>OZ#?rpfU$^Jb2ReH{*%|%;PyN|B
zX=lsv8}ruO-FMmsTWRO^!60XRs<pHy@cPox+0xclg641QC(2wjy+U_URy?l1KE*nH
zDr>RW{LzQHIans6u*C8(f@xtJe@UJsqMQiU;R_YOU%C7fdJ4z8p=-WjGg+fk*v)P@
zO%Ob)Pt!4Wnkbi67X&YR3C#ea`B9j{X(OyubS?aTAKvkz*U{lT^8_tO-|J(XjvExd
zsm;7b*ijGKe0x2%RQE^T_3V*#WT^F8&OgVeuTZgi%df~5p8}uh63q7w`a_V}cS}Cm
zV)cj$`$4cXyoA53&pJIF{+>_QKK`e~z`DKaRp*pmAGlI>PqO8nLs!A`o01*p+jufm
zMSmxJLgv^M;>%GRhew0v!9Y2%&qeI&aw-v<;7><PVV@(tK>Q6mCp(D6$#~pBzaWpt
zhKJ>Kmpu!!N*|r(Uz?Koqh17D(7;Zjhvt-2?cqIRgUJEgJ%d))IpLes#HYbq6B9bi
zOWydbdY{0`?vZCcM*W;(|4}7d<G&Fv=aGx%&}VQm=*xR?h5-~-6<c3m{{&<FzILpi
z2W(Tu+R%qoScai1d?m~|k-2)AKD*3XmK^$%_6@(a7U-?t247g6_>{nGlpfRuu4K%|
zBc0?;pX6`xHk+w<u65>22Y+f?DMNHQZUy5*L3LD5tiBBNYnI=C@x6Wrb~+k8=Ny*2
zoAtYDw=|*P?*}h0vdLar6TRWRrJS77olM*t7ug|=VR8rXmT6wihI#gQ!2A4e+!rPP
zX#wvcNj^@`Xa%s=-_!tefB+g`FZ5(lvp(!aHKWgqYzi*o`!jqb9{8z!!kYG@LYb57
zQ(L}^G5HR@IuN$1SRmaoHUph?GyU4<ByL|=;(q7CFv<03w|8UR@ypso_r8*tN7)EF
zO{M>>F>(AG<z7x=$MbZ$Fi9?2KYUTm3dq>lKi(VWR7BR=Iy!DofPtIndt{pjR?=HZ
zYt2cv+hJwn4eU>=?)A`@-n5=Ds1%un4Gx8ctkTcCC3WvRa?7jhemUzeXbx}TN3=OT
z1na0ErG{mR9>f?eNM~{#ICV1T$MR_r7vA_<tgenDKh6gM&#_Th)aP`v4@ITUZ0B{7
zt<>u_L>p;+9Rqu-rH>F5(g$nNt470YUeWhs&6vd31C_Zlk-8kHc9^dRuOGEE;bxn!
z19i9U)AaFpI7=E?5{ToW^bAwzAX}-E$%t)&SXkEgk&T8@xht$cVhtl?vt1!mtnnH4
zP+J5erAaVa-}XZ=uT}5`I(k8=eZbb)b*xjtm$;VlYr4q#ke}j)sz;Tfhtww{O`t2F
zLeNRJphYZ{cQ|o;L04KYr;>`d#L4;hrHN<p?_w$1IY`G@{xQBEpM4sAe2zb=MP!Pc
z_S67fgXmTnyt7lD_m18i^mIyF^XYnaAxszlPt#gs(P&2|5LCnFsq^BqhqQFOJRMhq
zwOp{&%SWFyljhMzRP67@+Ja!GTL!%Qcvv-9>77(9vkb34%QKP_vwLALV3pBahl4Ip
z%M871ONiZ7Z6GMHgqBC)Stm!NyKlB}`md$vUSis2ID8CcumMcCTD-S5L`B;tyNCj<
z+-$R@i5JwHvM_c?vQdX!8XlKN<^Sh7_;!%{Y}x4FYMwBg?m=}w8)Rkv?Q-2LR6qSX
z)hP9MIh(uOPWo|qi|qe1(VKgq{6O%nsOQw@@b3@6ny*+1uLcTyl^F3jd7!EPB{M-~
z1NE7D<$|@5IdHWTSmSBdotZ9ll;opZ@OkaWv(=PA_;gC_b+7A{qmxsMNEgXO>~7*+
zG;xY7om{r^vQoJ-f;CEGCuOP+_xFQWJ+svHVRl3+CF;k~lEGjm{A>;vcrnlFRo(QY
zm5?99S?C%qKxCICy*@+M{f_*)2Y&b$Hr|M=lvCc)v9_6vJ)PRbc%PwZ!lt?n&YTbJ
zU_Pi}EO92iUXNt;v0#9|VCapsDq2(?tpWB?83k5t#6;}H@(U_WiXZbBd15l}_a1er
zlQO_}ho?Awls<aXpGEC%jUCqJ1nE0?O73`~AFZC}u~c|w_TWR^k4AmT5BQ6gNq=HK
z!Xn!b{>&8rR|i=Uy#EfG!w|io*F#R;Fm++N&f^J6dojr<6Y;EUmctf>&uT5AAyubn
z;>Oh@;Tm{Cg&@#s#G9UaR*!1~%?@vC$W9a`v%c-kJhya05g0FH(9w$0R~NTA;SWK2
z-sPxO(m`5ISB05<p;ZM7Cff@{vVX(u%zwK@d{5ktS(<3Oi->WOEIKZdtRvA9=_Jy7
znrz@{DG>e3?!xgWhgq<XGcsIvkng5hNtw?3Zn2^8@i}xPKCmq?Omy*hS=et9r^9Zt
z=ks)&ou+Q|oPR{t`OMGD4ZP->XN-6BbFqC=Pn&pRLcnUfYMVfbt#q~(ut|PU`|$lG
zLHJB#l}lb+|EB>~ni6~?1!bjVjArq?dOMt_*}xccwT`y(&VC>JzC!ja4W`T!?`IA5
zAKi+*y&rTTpLfBFdypR=(GoBvZt~Lq!Zon#)-pr(Q=Nfjv(7omd{4`-@=L57`#qc<
zieuK^x@Z+YCS`c8B4n&ezSKsB{W;~Hq$Q#g`QK66HYkAx6xIFqfGk$TcYrzP`+s&H
zzYoD}X~_pU>@O{f)mQS|cyW9EKr?%mXiIy<4tXPgKqlHuRk02o179jV9??nqseTgD
zGixvD>)^My@p9&m8FOOvs9x1?K=2R3b|8Zr@)g)WujFF>VGE7<Mo`-mV5GVHH<wP5
zR390sY17EnRrwuX`%|8n;m}69${v{DRo1V(?~K)~x8Hl~;qU<3!*@O(mRW_~(7xp2
z3E-@{ek{!B|B@|U*TOznE7~}|XPyrwS2VOTWST2F4xONoULjK#@m}a_OCquV_oNJZ
zid9)g?Wu(<)3*8!pZnQ|!i-+_c7c*1MIY1Kbate(&RPS$lEX8SEgtw+vBdqHOM;$J
zW-o-9Wuc`3gLT){V9(*;?xkSnL`7(Z&XLzp5VC<t+Y&P>ku&BibKZ!V)%~(H1D{sa
zji`+<R#tX=&_GM;Qn|)A?bXJyKmC7j*AsThH|t@2i+J&}m!Za)E&i>{)2`asbHx|=
zK^ZA`BX#_39nLwosbOiXIa{nKy7X$^ZmuupM8-LG*mbkj^2E%!AMKi4lm+ziF0(@Z
zBo?y4R-pg%=Xp;B<*cw?vUxC_8$q?OLio8&#=jodw=9D!w!iH#=<YDQGCw?E%${xD
z19f+a{t42V;g8!-pqy8=q-3+4x|~(X5M9U)tkXT%@yopR3)E{r^3eNPmUwewekXi9
ztxhDy4Ag2wj|tKmO^drf^kTS9M!>F4+B@N}V4N4V0?tinkwM_S_rZNbeV{)YHYduJ
zK<79b(idxI?F<%NE`bAmoOSTT1luf}C#mfPz}rdQ(kEI*&BD}{`gCq@C6i~8v-Yzb
zlsY<wzRA>BF3+#?Y_q?pxv6~JK=J*@$4UuLXXnBJM6jjg(ogHtPp1*DEtlz$T^Qcf
z^Ex(?UYmPfeEs)egx?{noU~PNy+r5H8GF}n$<xBEPjK1|wuek~mT5=BsayPp#TngO
zg`yR`mk-nL*_Ypg0)9W76?{pBqCNUiccMrw<s5^w)rZoTYEL=+({iE#|3#)~<n^q#
z=J!gnK+;KK{$vf$>@~wb!kE4R|9o1GdsA^w@zK10d+%Y)h?gJb`>?o9WYzaa_S+5m
zF-GVGPXV#@rt&^3wkTYvS;H65&G%6MVj`s^=F$(Na=%8qYd&4;oqVaBiWKr8>|TZF
zDcb>Jtq5zGuD=tz#_OD*upb9Ie*mUVFI%;!zDV}xbYA^(*w6%jlBZiuN6a;U=u4Hm
zE6^}nvs&3Wp}9_5!nf!#hflXaXKd~VK=d0x2=`)l$X9t#dY+1o)kmbdcIM4~)sCJG
zUrVvX?AbYf0?gdoM~6&>!Yh-A=j&vcR5PU}dj>?2MsJyD1MISwGGRyIp?cgmX%YU^
z0`Hs$_W#Fz0|kxro;pt58spRTz+y*1>aS=%8Kc~>E{WR!X30X<rgNl<ZbwCP%Z*2O
zBzx3>jTPn=sL#o=KL=OCajIco&CK-WQ#LPb>aF96sL(-Ign5ll!8jTWHOmX&nSpvi
zzL#m%%@<gEvcR@jZ~Yrhxg0)plziDlN8zoLK`|+M9sS@jtl^39o>bIp@-yuIs4vF@
zQss$gvfl7GHxG6WCgLZRH47Hj(BHCVau7DX*yG_IEy{hA71_nP@`tTU-9=73$j@D}
zU%*65=-{{$UiLYX2PT-+Ps_j7OZM0;n<%f!F*z&sy)ALFf#wLmkb?32I#ja9TRRms
z>!E)IX}nPML%E4(&iCvV=*Iz_-}Oq^f?$;HWbK=Kahpjdnh|7xMKvJ0UXwNS91Nq^
zdJhOGi!Okf_3%tKg_9`<_@>u2zihM}VM~9>A0hL(ObNJ;-HT!!OUeBI1&2bjET~Fz
zK`N`gBWO6U4w38LPVd+Oe++wkm+Il;oHOH`s2t{}P8s)qTNUE#5zPfQD+uoFFRO$b
zLfDIq;b{NC57u9cA3uh@UyO~^#65Fqxw&8ktiG5>ygzf+GI}W)Z4vK4mdcdqjkSHw
z<Q1=N7j3C0=5!?cB`+B-PF_qK-yRM^f$V4LsUgp`vic+lvRXV_YPPVlZgKcjFv;FR
L;r%8^V=4N7bqkmF

literal 0
HcmV?d00001

diff --git a/tools/accuracy/bench_qwen3_4b_dflash_forward.py b/tools/accuracy/bench_qwen3_4b_dflash_forward.py
new file mode 100644
index 00000000..fb232e4c
--- /dev/null
+++ b/tools/accuracy/bench_qwen3_4b_dflash_forward.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""Benchmark Qwen3-4B-DFlash forward in Hugging Face and OpenInfer.
+
+The benchmark uses the same synthetic fixed inputs for both engines, so the
+result isolates the standalone drafter forward cost. It does not measure the
+full speculative decoding loop because the OpenInfer target/controller path is
+not implemented yet.
+
+Example:
+
+    .venv/bin/python tools/accuracy/bench_qwen3_4b_dflash_forward.py \
+        --draft-model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+        --openinfer-bin target/release/qwen3_dflash_forward_bench \
+        --out target/benchmarks/qwen3-dflash/forward.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+
+import torch
+from safetensors.torch import load_file, save_file
+from transformers import AutoModel
+
+SEED = 0xD4A5_4B16  # seeds the torch.Generator in main() that draws the synthetic benchmark inputs
+
+
+def stats(values: list[float]) -> dict[str, float]:
+    """Summarize latency samples as mean, p50/p90/p99, min and max.
+
+    Every field is reported as 0.0 when ``values`` is empty.
+    """
+    ordered = sorted(values)
+    count = len(ordered)
+    if count == 0:
+        return dict.fromkeys(("mean", "p50", "p90", "p99", "min", "max"), 0.0)
+    top = count - 1
+    mean = float(sum(ordered) / count)
+    # Each percentile is the sample at index round(top * q), clamped to the last index.
+    p50, p90, p99 = (float(ordered[min(round(top * q), top)]) for q in (0.50, 0.90, 0.99))
+    low, high = float(ordered[0]), float(ordered[-1])
+    return {"mean": mean, "p50": p50, "p90": p90, "p99": p99, "min": low, "max": high}
+
+
+def main() -> int:  # times the HF drafter forward (and optionally the OpenInfer binary), writes a JSON report, returns 0
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--draft-model-path", default="/home/hezhaozhao/models/Qwen3-4B-DFlash-b16")
+    parser.add_argument("--fixture-out", default="target/benchmarks/qwen3-dflash/forward-input.safetensors")
+    parser.add_argument("--openinfer-bin", type=Path, help="Path to qwen3_dflash_forward_bench")
+    parser.add_argument("--openinfer-draft-cache", action="store_true")
+    parser.add_argument("--openinfer-context-cache", action="store_true", help=argparse.SUPPRESS)
+    parser.add_argument("--out", default="target/benchmarks/qwen3-dflash/forward.json")
+    parser.add_argument("--device", type=int, default=0)
+    parser.add_argument("--ctx-len", type=int, default=2)
+    parser.add_argument("--q-len", type=int, default=16)
+    parser.add_argument("--warmup", type=int, default=5)
+    parser.add_argument("--iters", type=int, default=30)
+    parser.add_argument("--target-model-path", default="/home/hezhaozhao/models/Qwen3-4B")
+    args = parser.parse_args()
+
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required for the DFlash forward benchmark")
+
+    draft = AutoModel.from_pretrained(
+        args.draft_model_path,
+        dtype=torch.bfloat16,
+        device_map={"": f"cuda:{args.device}"},
+        trust_remote_code=True,  # runs Python shipped with the checkpoint; only use trusted model paths
+    ).eval()
+    device = next(draft.parameters()).device
+
+    gen = torch.Generator(device=device).manual_seed(SEED)  # one seeded generator feeds both randn draws below, in order
+    hidden = draft.config.hidden_size
+    target_layer_count = len(draft.target_layer_ids)
+    noise_embedding = torch.randn((1, args.q_len, hidden), generator=gen, device=device, dtype=torch.bfloat16)
+    target_hidden = torch.randn(
+        (1, args.ctx_len, hidden * target_layer_count),
+        generator=gen,
+        device=device,
+        dtype=torch.bfloat16,
+    )
+    position_ids = torch.arange(args.ctx_len + args.q_len, device=device, dtype=torch.int32).unsqueeze(0)
+    fixture_path = Path(args.fixture_out)
+    fixture_path.parent.mkdir(parents=True, exist_ok=True)
+    save_file(  # persist the exact inputs; the OpenInfer binary is pointed at this file via --fixture
+        {
+            "noise_embedding": noise_embedding.detach().to("cpu", dtype=torch.bfloat16).contiguous(),
+            "target_hidden": target_hidden.detach().to("cpu", dtype=torch.bfloat16).contiguous(),
+            "position_ids": position_ids.detach().to("cpu", dtype=torch.int32).contiguous(),
+        },
+        str(fixture_path),
+    )
+
+    hf_latencies = []  # per-iteration wall-clock latencies in milliseconds
+    with torch.inference_mode():
+        for _ in range(args.warmup):  # untimed warmup passes
+            _ = draft(
+                noise_embedding=noise_embedding,
+                target_hidden=target_hidden,
+                position_ids=position_ids,
+                use_cache=False,
+                is_causal=False,
+            )
+        torch.cuda.synchronize(device)  # drain warmup work before the first timer starts
+        for _ in range(args.iters):
+            start = time.perf_counter()
+            _ = draft(
+                noise_embedding=noise_embedding,
+                target_hidden=target_hidden,
+                position_ids=position_ids,
+                use_cache=False,
+                is_causal=False,
+            )
+            torch.cuda.synchronize(device)  # wait for the GPU so the delta covers the whole forward
+            hf_latencies.append((time.perf_counter() - start) * 1000.0)
+
+    openinfer_latencies = None  # stays None when --openinfer-bin is not given
+    if args.openinfer_bin is not None:
+        cmd = [
+            str(args.openinfer_bin),
+            "--model-path",
+            args.draft_model_path,
+            "--fixture",
+            str(fixture_path),
+            "--device",
+            str(args.device),
+            "--warmup",
+            str(args.warmup),
+            "--iters",
+            str(args.iters),
+        ]
+        openinfer_draft_cache = args.openinfer_draft_cache or args.openinfer_context_cache
+        if openinfer_draft_cache:  # either flag (including the hidden --openinfer-context-cache) forwards --draft-cache
+            cmd.append("--draft-cache")
+        raw = subprocess.run(cmd, check=True, capture_output=True, text=True).stdout
+        payload = json.loads(raw)  # the binary's whole stdout is parsed as one JSON document
+        openinfer_latencies = payload["latency_ms"]  # copied into the report as-is, not passed through stats()
+
+    report = {
+        "schema": 1,
+        "draft_model_path": args.draft_model_path,
+        "target_model_path": args.target_model_path,  # recorded only; this function never loads the target model
+        "device": args.device,
+        "ctx_len": args.ctx_len,
+        "q_len": args.q_len,
+        "warmup": args.warmup,
+        "iters": args.iters,
+        "openinfer_draft_cache": args.openinfer_draft_cache or args.openinfer_context_cache,
+        "fixture_out": str(fixture_path),
+        "hf_remote_code": {
+            "engine": "transformers",
+            "latency_ms": stats(hf_latencies),
+        },
+        "openinfer": openinfer_latencies,
+    }
+    out = Path(args.out)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(json.dumps(report, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+    print(f"wrote {out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py b/tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py
new file mode 100644
index 00000000..3ea144d8
--- /dev/null
+++ b/tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py
@@ -0,0 +1,466 @@
+#!/usr/bin/env python3
+"""Compare Qwen3-4B-DFlash HF drafter vs OpenInfer drafter in one target loop.
+
+This is an end-to-end drafter-substitution probe for the current
+`openinfer-qwen3-4b-dflash` boundary. The target model, tokenizer, target KV
+cache, target verification, target `lm_head`, and greedy sampler all come from
+Transformers. The only variable is the drafter:
+
+  * HF remote-code `DFlashDraftModel.forward`
+  * OpenInfer `qwen3_dflash_forward_fixture`
+
+The script intentionally uses a no-draft-cache loop on both sides because the
+current OpenInfer crate implements standalone draft forward only, not DFlash's
+Python `DynamicCache` path or an OpenInfer target/controller.
+
+Example:
+
+    .venv/bin/python tools/accuracy/compare_qwen3_4b_dflash_drafter_generation.py \
+        --target-model-path /home/hezhaozhao/models/Qwen3-4B \
+        --draft-model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+        --openinfer-bin target/release/qwen3_dflash_forward_fixture \
+        --out target/accuracy/qwen3-dflash/drafter-generation.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import torch
+from safetensors.torch import load_file, save_file
+from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, DynamicCache
+
+DEFAULT_PROMPTS = [  # fallback prompt set, used when no --prompt is passed
+    "Hello, my name is",
+    "The capital of France is",
+    "Qwen is a language model that",
+    "1, 1, 2, 3, 5,",
+]
+
+
+def sha256_u32_le(values: list[int]) -> str:
+    """Hex SHA-256 of `values`, each serialized as an unsigned 4-byte little-endian integer."""
+    words = (int(v).to_bytes(4, byteorder="little", signed=False) for v in values)
+    payload = b"".join(words)
+    return hashlib.sha256(payload).hexdigest()
+
+
+def sha256_text(text: str) -> str:  # hex SHA-256 of the UTF-8 bytes of `text`
+    return hashlib.sha256(bytes(text, encoding="utf-8")).hexdigest()
+
+
+def first_diff(left: list[int], right: list[int]) -> dict[str, Any] | None:
+    """Locate the first position where the HF (`left`) and OpenInfer (`right`) token lists diverge.
+
+    Returns None for identical lists, otherwise a dict with `index`, both tokens
+    (None past the end of the shorter list), and a `reason` string.
+    """
+    for position, (hf_token, oi_token) in enumerate(zip(left, right)):
+        if hf_token != oi_token:
+            return {"index": position, "hf_drafter": hf_token, "openinfer_drafter": oi_token, "reason": "token_mismatch"}
+    if len(left) == len(right):
+        return None
+    shared = min(len(left), len(right))
+    return {
+        "index": shared,
+        "hf_drafter": left[shared] if shared < len(left) else None,
+        "openinfer_drafter": right[shared] if shared < len(right) else None,
+        "reason": "length_mismatch",
+    }
+
+
+def input_device(model: torch.nn.Module) -> torch.device:  # device of the model's first parameter
+    return next(iter(model.parameters())).device
+
+
+def extract_context_feature(hidden_states: tuple[torch.Tensor, ...], layer_ids: list[int]) -> torch.Tensor:
+    # Shift by one: HF hidden_states[0] is the embedding output, so layer i sits at index i + 1.
+    return torch.cat(tuple(hidden_states[1 + idx] for idx in layer_ids), dim=-1)
+
+
+def greedy(logits: torch.Tensor) -> torch.Tensor:  # index of the largest logit along the last axis
+    return logits.argmax(dim=-1)
+
+
+def tensor_deltas(got: torch.Tensor, want: torch.Tensor) -> dict[str, float]:
+    """Summarize |got - want| (taken in fp32, on CPU) as mean, p99, max, and element count `n`."""
+    errors = torch.abs(got.float() - want.float()).flatten().detach().cpu()
+    if errors.numel() == 0:
+        return {"mean": 0.0, "p99": 0.0, "max": 0.0, "n": 0}
+    count, ordered = errors.numel(), torch.sort(errors).values
+    return {
+        "mean": float(errors.mean().item()),
+        "p99": float(ordered[min(int(count * 0.99), count - 1)].item()),  # index int(0.99 * n), clamped
+        "max": float(ordered[-1].item()),
+        "n": int(count),
+    }
+
+
+def merge_delta_stats(items: list[dict[str, float]]) -> dict[str, float] | None:
+    """Fold per-block delta summaries into one summary; returns None when `items` is empty.
+
+    The mean is weighted by each block's `n` and the max is exact. The p99 is the worst
+    per-block p99: a conservative stand-in, since an exact aggregate needs the raw samples.
+    """
+    if not items:
+        return None
+    sample_count = sum(int(entry["n"]) for entry in items)
+    if sample_count == 0:
+        return {"mean": 0.0, "p99": 0.0, "max": 0.0, "n": 0}
+    weighted_sum = sum(entry["mean"] * entry["n"] for entry in items)
+    worst_p99, worst_max = (max(entry[key] for entry in items) for key in ("p99", "max"))
+    return {"mean": weighted_sum / sample_count, "p99": worst_p99, "max": worst_max, "n": sample_count}
+
+
+@dataclass
+class Runtime:  # shared state for one comparison run (models, tokenizer, DFlash settings, paths)
+    target: torch.nn.Module  # HF target model: prefill/verify forwards, embed_tokens, lm_head
+    draft: torch.nn.Module  # HF remote-code drafter (the reference side)
+    tokenizer: Any  # tokenizer loaded from the target model path
+    target_layer_ids: list[int]  # target layers whose hidden states are concatenated for the drafter
+    block_size: int  # number of positions in one speculative block
+    mask_token_id: int  # fills not-yet-generated positions in output_ids
+    stop_token_ids: list[int]  # generation stops once any of these ids is produced
+    openinfer_bin: Path | None  # prebuilt fixture binary; None falls back to `cargo run`
+    draft_model_path: Path  # passed to the OpenInfer binary as --model-path
+    repo_root: Path  # working directory for the OpenInfer subprocess
+    device_ordinal: int  # passed to the OpenInfer binary as --device
+    collect_hidden_delta: bool  # when set, OpenInfer hidden states are compared against HF each step
+
+
+def run_openinfer_draft(
+    runtime: Runtime,
+    *,
+    noise_embedding: torch.Tensor,
+    target_hidden: torch.Tensor,
+    position_ids: torch.Tensor,
+    temp_dir: Path,
+    step_index: int,
+) -> torch.Tensor:
+    """Run one draft forward through the OpenInfer fixture binary and return its output.
+
+    The inputs are saved to a safetensors file under `temp_dir` (bf16 for the noise
+    embedding and target hidden states, int32 for the position ids). The binary is
+    pointed at that file and at an output path, and the `openinfer_output` tensor it
+    writes is returned as bf16 on the target model's device.
+
+    Args:
+        runtime: Supplies the binary path, draft model path, working directory, and device ordinal.
+        temp_dir: Directory that receives the per-step input and output files.
+        step_index: Zero-padded into the fixture file names to tell steps apart.
+
+    Raises:
+        subprocess.CalledProcessError: If the subprocess exits with a non-zero status.
+    """
+    input_path = temp_dir / f"dflash-input-{step_index:03d}.safetensors"
+    output_path = temp_dir / f"dflash-output-{step_index:03d}.safetensors"
+    payload = {
+        "noise_embedding": noise_embedding.detach().to("cpu", dtype=torch.bfloat16).contiguous(),
+        "target_hidden": target_hidden.detach().to("cpu", dtype=torch.bfloat16).contiguous(),
+        "position_ids": position_ids.detach().to("cpu", dtype=torch.int32).contiguous(),
+    }
+    save_file(payload, str(input_path))
+    # Prefer a prebuilt binary; without one, build and run the fixture bin through cargo.
+    if runtime.openinfer_bin is None:
+        launcher = ["cargo", "run", "--release", "-p", "openinfer-qwen3-4b-dflash"]
+        launcher += ["--bin", "qwen3_dflash_forward_fixture", "--"]
+    else:
+        launcher = [str(runtime.openinfer_bin)]
+    cmd = [
+        *launcher,
+        "--model-path",
+        str(runtime.draft_model_path),
+        "--fixture",
+        str(input_path),
+        "--out",
+        str(output_path),
+        "--device",
+        str(runtime.device_ordinal),
+    ]
+    # check=True turns a non-zero exit into CalledProcessError instead of reading a stale output file.
+    subprocess.run(cmd, cwd=runtime.repo_root, check=True)
+    result = load_file(str(output_path))["openinfer_output"]
+    return result.to(input_device(runtime.target), dtype=torch.bfloat16)
+
+
+def draft_hidden(
+    runtime: Runtime,
+    *,
+    kind: str,
+    noise_embedding: torch.Tensor,
+    target_hidden: torch.Tensor,
+    position_ids: torch.Tensor,
+    temp_dir: Path,
+    step_index: int,
+) -> tuple[torch.Tensor, dict[str, float] | None]:
+    """Run one drafter step with the HF drafter (`kind == "hf"`) or the OpenInfer one (otherwise).
+
+    Returns `(hidden, delta)`. `delta` summarizes |openinfer - hf| and is only produced on the
+    OpenInfer path when `runtime.collect_hidden_delta` is set; otherwise it is None.
+    """
+    hf_hidden = None
+    # The HF forward is needed only as the result itself or as the reference for the delta.
+    if kind == "hf" or runtime.collect_hidden_delta:
+        with torch.inference_mode():
+            hf_hidden = runtime.draft(
+                target_hidden=target_hidden, noise_embedding=noise_embedding,
+                position_ids=position_ids, use_cache=False, is_causal=False,
+            )
+    if kind == "hf":
+        return hf_hidden, None
+    oi_hidden = run_openinfer_draft(
+        runtime, noise_embedding=noise_embedding, target_hidden=target_hidden,
+        position_ids=position_ids, temp_dir=temp_dir, step_index=step_index,
+    )
+    return oi_hidden, (tensor_deltas(oi_hidden, hf_hidden) if hf_hidden is not None else None)
+
+
+@torch.inference_mode()
+def generate_with_drafter(
+    runtime: Runtime,
+    *,
+    prompt: str,
+    max_new_tokens: int,
+    kind: str,
+    temp_dir: Path,
+) -> dict[str, Any]:
+    """Greedy block-speculative generation for one prompt, drafting with the `kind` drafter.
+
+    After a target prefill, each step drafts `block_size - 1` tokens, verifies the block with one
+    target forward, commits the longest prefix that matches the target's greedy tokens plus the
+    target's own next token, and stops at `max_new_tokens` or the first stop token. Runs under
+    `torch.inference_mode()` so the bare `embed_tokens`/`lm_head` calls record no autograd state.
+    Returns token ids, decoded text, SHA-256 digests, per-step advance lengths, and delta stats.
+    """
+    dev = input_device(runtime.target)
+    input_ids = runtime.tokenizer(prompt, return_tensors="pt").input_ids.to(dev)
+    num_input_tokens = input_ids.shape[1]
+    max_length = num_input_tokens + max_new_tokens
+    # Pre-filled with the mask token; block_size spare columns let the last block run past max_length.
+    output_ids = torch.full(
+        (1, max_length + runtime.block_size), runtime.mask_token_id, dtype=torch.long, device=dev
+    )
+    all_position_ids = torch.arange(output_ids.shape[1], device=dev).unsqueeze(0)
+    stop_tensor = torch.tensor(runtime.stop_token_ids, device=dev) if runtime.stop_token_ids else None
+
+    target_cache = DynamicCache()
+    output = runtime.target(
+        input_ids,
+        position_ids=all_position_ids[:, :num_input_tokens],
+        past_key_values=target_cache,
+        use_cache=True,
+        logits_to_keep=1,
+        output_hidden_states=True,
+    )
+    output_ids[:, :num_input_tokens] = input_ids
+    output_ids[:, num_input_tokens : num_input_tokens + 1] = greedy(output.logits)
+    target_hidden = extract_context_feature(output.hidden_states, runtime.target_layer_ids)
+
+    start = num_input_tokens
+    accepted_plus_fallback_lengths: list[int] = []
+    hidden_deltas: list[dict[str, float]] = []
+    step_index = 0
+    while start < max_length:
+        q_len = runtime.block_size
+        block_output_ids = output_ids[:, start : start + q_len].clone()
+        block_position_ids = all_position_ids[:, start : start + q_len]
+        noise_embedding = runtime.target.model.embed_tokens(block_output_ids)
+
+        ctx_len = target_hidden.shape[1]
+        draft_position_ids = all_position_ids[:, start - ctx_len : start + q_len]
+        hidden, delta = draft_hidden(
+            runtime,
+            kind=kind,
+            noise_embedding=noise_embedding,
+            target_hidden=target_hidden,
+            position_ids=draft_position_ids,
+            temp_dir=temp_dir,
+            step_index=step_index,
+        )
+        if delta is not None:
+            hidden_deltas.append(delta)
+        draft_logits = runtime.target.lm_head(hidden[:, -runtime.block_size + 1 :, :])
+        block_output_ids[:, 1:] = greedy(draft_logits)
+
+        output = runtime.target(
+            block_output_ids,
+            position_ids=block_position_ids,
+            past_key_values=target_cache,
+            use_cache=True,
+            output_hidden_states=True,
+        )
+        posterior = greedy(output.logits)
+        # Accept drafted tokens up to the first disagreement with the target's greedy choice.
+        matches = block_output_ids[:, 1:] == posterior[:, :-1]
+        acceptance_length = int(matches.cumprod(dim=1).sum(dim=1)[0].item())
+        advanced = acceptance_length + 1
+        output_ids[:, start : start + advanced] = block_output_ids[:, :advanced]
+        output_ids[:, start + advanced] = posterior[:, acceptance_length]
+        start += advanced
+        target_cache.crop(start)
+        target_hidden = extract_context_feature(output.hidden_states, runtime.target_layer_ids)[:, :advanced, :]
+        accepted_plus_fallback_lengths.append(advanced)
+        step_index += 1
+
+        generated_so_far = output_ids[0, num_input_tokens : min(start + 1, max_length)]
+        if stop_tensor is not None and torch.isin(generated_so_far, stop_tensor).any():
+            break
+
+    full_ids = output_ids[0, :max_length]
+    full_ids = full_ids[full_ids != runtime.mask_token_id]
+    if stop_tensor is not None:
+        stop_positions = torch.isin(full_ids[num_input_tokens:], stop_tensor).nonzero(as_tuple=True)[0]
+        if stop_positions.numel() > 0:
+            full_ids = full_ids[: num_input_tokens + int(stop_positions[0].item()) + 1]
+
+    full_token_ids = [int(token) for token in full_ids.detach().cpu().tolist()]
+    generated_token_ids = full_token_ids[num_input_tokens:]
+    generated_text = runtime.tokenizer.decode(generated_token_ids, skip_special_tokens=False)
+    return {
+        "prompt_token_ids": [int(token) for token in input_ids[0].detach().cpu().tolist()],
+        "full_token_ids": full_token_ids,
+        "generated_token_ids": generated_token_ids,
+        "full_text": runtime.tokenizer.decode(full_token_ids, skip_special_tokens=False),
+        "generated_text": generated_text,
+        "token_sha256": sha256_u32_le(generated_token_ids),
+        "text_sha256": sha256_text(generated_text),
+        "accepted_plus_fallback_lengths": accepted_plus_fallback_lengths,
+        "hidden_delta_vs_hf": merge_delta_stats(hidden_deltas),
+    }
+
+
+def parse_args() -> argparse.Namespace:  # command-line options for the comparison run
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--target-model-path", required=True)
+    parser.add_argument("--draft-model-path", default="/home/hezhaozhao/models/Qwen3-4B-DFlash-b16")
+    parser.add_argument("--out", default="target/accuracy/qwen3-dflash/drafter-generation.json")
+    parser.add_argument("--prompt", action="append", help="Prompt to test; can be repeated.")
+    parser.add_argument("--max-new-tokens", type=int, default=12)
+    parser.add_argument("--openinfer-bin", type=Path, help="Path to a built qwen3_dflash_forward_fixture binary.")
+    parser.add_argument("--repo-root", type=Path, default=Path(__file__).resolve().parents[2])
+    parser.add_argument("--device", type=int, default=0)  # CUDA ordinal: HF models and the OpenInfer binary
+    parser.add_argument("--skip-hidden-delta", action="store_true")
+    parser.add_argument("--stop-token-id", type=int, action="append", default=[])
+    return parser.parse_args()
+
+
+def main() -> int:  # returns 0 when every prompt matches token- and text-exactly, 1 otherwise
+    args = parse_args()
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required for the DFlash drafter generation comparison")
+
+    target = AutoModelForCausalLM.from_pretrained(
+        args.target_model_path,
+        dtype=torch.bfloat16,
+        device_map={"": f"cuda:{args.device}"},
+        trust_remote_code=True,
+    ).eval()
+    draft = AutoModel.from_pretrained(
+        args.draft_model_path,
+        dtype=torch.bfloat16,
+        device_map={"": f"cuda:{args.device}"},
+        trust_remote_code=True,
+    ).eval()
+    tokenizer = AutoTokenizer.from_pretrained(args.target_model_path, trust_remote_code=True)
+
+    stop_token_ids = list(args.stop_token_id)  # CLI ids first; the target config's EOS id(s) are added below
+    eos = getattr(target.config, "eos_token_id", None)
+    if isinstance(eos, int):
+        stop_token_ids.append(eos)
+    elif isinstance(eos, list):
+        stop_token_ids.extend(int(token) for token in eos)
+    stop_token_ids = sorted(set(stop_token_ids))  # dedupe and order
+
+    runtime = Runtime(
+        target=target,
+        draft=draft,
+        tokenizer=tokenizer,
+        target_layer_ids=[int(layer) for layer in draft.target_layer_ids],
+        block_size=int(draft.block_size),
+        mask_token_id=int(getattr(draft, "mask_token_id", None) or draft.config.dflash_config["mask_token_id"]),
+        stop_token_ids=stop_token_ids,
+        openinfer_bin=args.openinfer_bin,
+        draft_model_path=Path(args.draft_model_path),
+        repo_root=args.repo_root,
+        device_ordinal=args.device,
+        collect_hidden_delta=not args.skip_hidden_delta,
+    )
+
+    prompts = args.prompt or DEFAULT_PROMPTS
+    cases = []
+    with tempfile.TemporaryDirectory(prefix="qwen3-dflash-parity-") as tmp:
+        temp_dir = Path(tmp)
+        for index, prompt in enumerate(prompts):
+            hf = generate_with_drafter(  # reference run with the HF drafter
+                runtime,
+                prompt=prompt,
+                max_new_tokens=args.max_new_tokens,
+                kind="hf",
+                temp_dir=temp_dir,
+            )
+            openinfer = generate_with_drafter(  # same loop, drafter swapped for OpenInfer
+                runtime,
+                prompt=prompt,
+                max_new_tokens=args.max_new_tokens,
+                kind="openinfer",
+                temp_dir=temp_dir,
+            )
+            token_diff = first_diff(hf["generated_token_ids"], openinfer["generated_token_ids"])
+            text_match = hf["generated_text"] == openinfer["generated_text"]
+            token_match = token_diff is None
+            classification = "all_token_text_exact" if token_match and text_match else "drafter_generation_mismatch"
+            cases.append(
+                {
+                    "id": f"prompt_{index:03d}",
+                    "prompt": prompt,
+                    "max_new_tokens": args.max_new_tokens,
+                    "prompt_token_ids": hf["prompt_token_ids"],
+                    "hf_drafter": hf,
+                    "openinfer_drafter": openinfer,
+                    "token_match": token_match,
+                    "text_match": text_match,
+                    "classification": classification,
+                    "first_diff": token_diff,
+                }
+            )
+            print(
+                f"{classification}: {prompt!r}; "
+                f"hf_accept={hf['accepted_plus_fallback_lengths']} "
+                f"openinfer_accept={openinfer['accepted_plus_fallback_lengths']}"
+            )
+
+    result = {
+        "schema": 1,
+        "comparison": "qwen3_4b_dflash_drafter_generation",
+        "mode": "greedy_bs1_no_draft_cache_drafter_substitution",
+        "target_model_path": args.target_model_path,
+        "draft_model_path": args.draft_model_path,
+        "openinfer_bin": str(args.openinfer_bin) if args.openinfer_bin else None,
+        "block_size": runtime.block_size,
+        "target_layer_ids": runtime.target_layer_ids,
+        "mask_token_id": runtime.mask_token_id,
+        "stop_token_ids": runtime.stop_token_ids,
+        "torch_version": torch.__version__,
+        "transformers_version": __import__("transformers").__version__,
+        "case_count": len(cases),
+        "all_token_text_exact": all(case["classification"] == "all_token_text_exact" for case in cases),
+        "cases": cases,
+    }
+    out = Path(args.out)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(json.dumps(result, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+    print(f"wrote {out}")
+    if not result["all_token_text_exact"]:  # the report is written first; a mismatch then fails the run
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # exit status 0 on full parity, 1 on any mismatch (main()'s return value)
diff --git a/tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py b/tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py
new file mode 100644
index 00000000..5efc34b0
--- /dev/null
+++ b/tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""Generate a tiny HuggingFace remote-code golden for Qwen3-4B-DFlash-b16.
+
+The DFlash crate compares its standalone draft forward against this fixture
+without importing Python at Rust test time. The input tensors are synthetic but
+seed-pinned, so the fixture exercises the exact `DFlashDraftModel.forward`
+contract: selected target hidden states, noise embeddings, and absolute
+position ids.
+
+    .venv/bin/python tools/accuracy/dump_qwen3_4b_dflash_hf_golden.py \
+        --model-path /home/hezhaozhao/models/Qwen3-4B-DFlash-b16 \
+        --out test_data/qwen3-4b-dflash-hf-golden.safetensors
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import torch
+from safetensors.torch import save_file
+from transformers import AutoModel
+
+SEED = 0xD4A5_4B16  # fixed seed for the CUDA generator that draws the synthetic inputs
+CTX_LEN = 2  # sequence length of the synthetic target_hidden input
+Q_LEN = 3  # sequence length of the synthetic noise_embedding input
+
+
+def main() -> int:
+    """Generate the HF golden: run the remote-code draft model once on seed-pinned random inputs.
+
+    The inputs, the model output, and provenance metadata (seed, shapes, DFlash settings,
+    library versions) are written to the `--out` safetensors file. Requires CUDA.
+
+    Returns:
+        0 on success.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--model-path", default="/home/hezhaozhao/models/Qwen3-4B-DFlash-b16")
+    cli.add_argument("--out", default="test_data/qwen3-4b-dflash-hf-golden.safetensors")
+    args = cli.parse_args()
+
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required to generate the DFlash bf16 golden")
+
+    model = AutoModel.from_pretrained(
+        args.model_path,
+        dtype=torch.bfloat16,
+        device_map="cuda",
+        trust_remote_code=True,
+    ).eval()
+
+    hidden = model.config.hidden_size
+    target_layers = len(model.target_layer_ids)
+    # One generator feeds both draws, so the order (noise, then target hidden) is part of the golden.
+    rng = torch.Generator(device="cuda").manual_seed(SEED)
+    draw = {"generator": rng, "device": "cuda", "dtype": torch.bfloat16}
+    noise_embedding = torch.randn((1, Q_LEN, hidden), **draw)
+    target_hidden = torch.randn((1, CTX_LEN, hidden * target_layers), **draw)
+    position_ids = torch.arange(CTX_LEN + Q_LEN, device="cuda", dtype=torch.int64).unsqueeze(0)
+
+    with torch.inference_mode():
+        output = model(
+            noise_embedding=noise_embedding,
+            target_hidden=target_hidden,
+            position_ids=position_ids,
+            use_cache=False,
+            is_causal=False,
+        )
+    torch.cuda.synchronize()
+
+    golden = {
+        "noise_embedding": noise_embedding.cpu(),
+        "target_hidden": target_hidden.cpu(),
+        "position_ids": position_ids.to(torch.int32).cpu(),
+        "output": output.cpu(),
+    }
+    provenance = {
+        "model_path": args.model_path,
+        "seed": str(SEED),
+        "ctx_len": str(CTX_LEN),
+        "q_len": str(Q_LEN),
+        "hidden_size": str(hidden),
+        "target_layer_ids": ",".join(str(layer) for layer in model.target_layer_ids),
+        "block_size": str(model.block_size),
+        "mask_token_id": str(model.mask_token_id),
+        "torch_version": torch.__version__,
+        "transformers_version": __import__("transformers").__version__,
+    }
+    target_path = Path(args.out)
+    target_path.parent.mkdir(parents=True, exist_ok=True)
+    save_file(golden, str(target_path), metadata=provenance)
+    print(f"wrote {target_path}: ctx_len={CTX_LEN}, q_len={Q_LEN}, hidden={hidden}, seed={SEED}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # the process exit status is main()'s return value

From 8011980c5ad8e02ba5e96e0a8b71f3e219ddaaa9 Mon Sep 17 00:00:00 2001
From: hezz <hezz@example.com>
Date: Mon, 22 Jun 2026 12:13:25 +0800
Subject: [PATCH 2/6] refactor(qwen3-dflash): collapse batch buffers into a
 single instance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aligns the DFlash executor with the rest of the project (Qwen3
BatchDecodeBuffers, Kimi/DeepSeek scratch): one allocation sized for
the worst case, narrowed per forward via set_active_shape, instead of
the unique-per-crate HashMap<(batch, q, ctx), buffer> cache that grew
a fresh GPU buffer set for every unseen (q_len, ctx_len) combo.

* DFlashExecutor now holds a single DFlashBatchBuffers, allocated once
  in load() for (max_batch_size, max_q_len, max_step_context_len).
  New DFlashExecutorOptions.max_q_len gates the q-axis capacity.
* set_active_batch(bs) -> set_active_shape(bs, q_len, ctx_len); both
  forward paths derive the shape from the requests themselves so
  callers no longer pre-set it.
* prepare_ragged_plan cache key now covers (batch_size, q_len, ctx_len)
  — with a single instance the shape can change between forwards, so
  keying only on batch_size would reuse a stale plan.
* compact_host_inputs stitches all requests on the host and uploads
  noise/target with one H2D each (was one launch per request per
  tensor), matching Qwen3's sync_paged_meta upload pattern.
* Compact NoCache paths materialize the owned output via a single
  clone_batch_output dtod instead of zeros + copy_hidden over the full
  span.

Tests/benches keep working: create_batch_buffers keeps its 3-positional
signature (now (max_batch, max_q, max_ctx)), and test struct literals
gain the max_q_len field.
---
 .../src/batch_buffers.rs                      |  62 ++++++--
 .../src/batch_forward.rs                      | 105 ++++++++++----
 openinfer-qwen3-4b-dflash/src/executor.rs     | 137 +++++++++---------
 .../tests/hf_golden_gate.rs                   |   5 +
 4 files changed, 198 insertions(+), 111 deletions(-)

diff --git a/openinfer-qwen3-4b-dflash/src/batch_buffers.rs b/openinfer-qwen3-4b-dflash/src/batch_buffers.rs
index 84739906..8d7cef74 100644
--- a/openinfer-qwen3-4b-dflash/src/batch_buffers.rs
+++ b/openinfer-qwen3-4b-dflash/src/batch_buffers.rs
@@ -7,6 +7,11 @@ use crate::weights::DFlashDraftModel;
 
 pub struct DFlashBatchBuffers {
     pub(crate) max_batch_size: usize,
+    pub(crate) max_q_len: usize,
+    pub(crate) max_ctx_len: usize,
+    /// Active shape for the current batch — set by `set_active_shape` before
+    /// each forward. `q_len`/`ctx_len` may shrink below `max_*`; the physical
+    /// buffers are sized for the max, so the active values only narrow the view.
     pub(crate) q_len: usize,
     pub(crate) ctx_len: usize,
     pub(crate) total_q_len: usize,
@@ -38,32 +43,40 @@ pub struct DFlashBatchBuffers {
 
 pub(crate) struct CachedRaggedPlan {
     pub(crate) batch_size: usize,
+    pub(crate) q_len: usize,
+    pub(crate) ctx_len: usize,
     pub(crate) plan: RaggedPrefillPlan,
 }
 
 impl DFlashBatchBuffers {
+    /// Allocate a single-instance buffer sized for the worst case
+    /// (`max_batch_size × max_q_len` / `× max_ctx_len`). Each forward narrows
+    /// the active shape via `set_active_shape`, mirroring Qwen3's
+    /// `BatchDecodeBuffers` (one allocation, dynamic `set_batch_size`).
     pub(crate) fn new(
         model: &DFlashDraftModel,
         max_batch_size: usize,
-        q_len: usize,
-        ctx_len: usize,
+        max_q_len: usize,
+        max_ctx_len: usize,
     ) -> Result<Self> {
         anyhow::ensure!(max_batch_size > 0, "max_batch_size must be positive");
-        anyhow::ensure!(q_len > 0, "q_len must be positive");
-        anyhow::ensure!(ctx_len > 0, "ctx_len must be positive");
+        anyhow::ensure!(max_q_len > 0, "max_q_len must be positive");
+        anyhow::ensure!(max_ctx_len > 0, "max_ctx_len must be positive");
         let config = model.config();
         let ctx = model.device_context();
         let hidden = config.hidden_size;
         let target_hidden_dim = config.hidden_size * config.target_layer_count();
         let q_dim = config.q_dim();
         let kv_dim = config.kv_dim();
-        let total_q_len = max_batch_size * q_len;
-        let total_ctx_len = max_batch_size * ctx_len;
-        let total_kv_len = max_batch_size * (ctx_len + q_len);
+        let total_q_len = max_batch_size * max_q_len;
+        let total_ctx_len = max_batch_size * max_ctx_len;
+        let total_kv_len = max_batch_size * (max_ctx_len + max_q_len);
         Ok(Self {
             max_batch_size,
-            q_len,
-            ctx_len,
+            max_q_len,
+            max_ctx_len,
+            q_len: max_q_len,
+            ctx_len: max_ctx_len,
             total_q_len,
             total_ctx_len,
             total_kv_len,
@@ -92,11 +105,18 @@ impl DFlashBatchBuffers {
         })
     }
 
-    pub(crate) fn set_active_batch(&mut self, batch_size: usize) {
+    /// Narrow the active shape for this forward: sets `q_len`/`ctx_len` and
+    /// recomputes every buffer's `seq_len` to `batch_size × (q|ctx)`. Buffers
+    /// stay sized for the max, so callers can freely vary batch/q/ctx below it.
+    pub(crate) fn set_active_shape(&mut self, batch_size: usize, q_len: usize, ctx_len: usize) {
         debug_assert!(batch_size <= self.max_batch_size);
-        self.total_q_len = batch_size * self.q_len;
-        self.total_ctx_len = batch_size * self.ctx_len;
-        self.total_kv_len = batch_size * (self.ctx_len + self.q_len);
+        debug_assert!(q_len <= self.max_q_len);
+        debug_assert!(ctx_len <= self.max_ctx_len);
+        self.q_len = q_len;
+        self.ctx_len = ctx_len;
+        self.total_q_len = batch_size * q_len;
+        self.total_ctx_len = batch_size * ctx_len;
+        self.total_kv_len = batch_size * (ctx_len + q_len);
         self.noise.seq_len = self.total_q_len;
         self.target_hidden.seq_len = self.total_ctx_len;
         self.target_projected.seq_len = self.total_ctx_len;
@@ -123,10 +143,17 @@ impl DFlashBatchBuffers {
         model: &DFlashDraftModel,
         batch_size: usize,
     ) -> Result<()> {
+        // The plan depends on (batch_size, q_len, ctx_len); with a single
+        // instance buffer any of them can change between forwards, so all three
+        // must be part of the cache key.
         let needs_rebuild = self
             .ragged_plan
             .as_ref()
-            .map(|cached| cached.batch_size != batch_size)
+            .map(|cached| {
+                cached.batch_size != batch_size
+                    || cached.q_len != self.q_len
+                    || cached.ctx_len != self.ctx_len
+            })
             .unwrap_or(true);
         if needs_rebuild {
             let config = model.config();
@@ -138,7 +165,12 @@ impl DFlashBatchBuffers {
                 &kv_lens,
                 config.num_attention_heads / config.num_key_value_heads,
             )?;
-            self.ragged_plan = Some(CachedRaggedPlan { batch_size, plan });
+            self.ragged_plan = Some(CachedRaggedPlan {
+                batch_size,
+                q_len: self.q_len,
+                ctx_len: self.ctx_len,
+                plan,
+            });
         }
         Ok(())
     }
diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
index ba9ef95b..1ca49050 100644
--- a/openinfer-qwen3-4b-dflash/src/batch_forward.rs
+++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
@@ -23,10 +23,10 @@ impl DFlashDraftModel {
     pub fn create_batch_buffers(
         &self,
         max_batch_size: usize,
-        q_len: usize,
-        ctx_len: usize,
+        max_q_len: usize,
+        max_ctx_len: usize,
     ) -> Result<DFlashBatchBuffers> {
-        DFlashBatchBuffers::new(self, max_batch_size, q_len, ctx_len)
+        DFlashBatchBuffers::new(self, max_batch_size, max_q_len, max_ctx_len)
     }
 
     pub fn forward_batch<'a>(
@@ -41,9 +41,22 @@ impl DFlashDraftModel {
             requests.len(),
             bufs.max_batch_size
         );
-        let q_len = bufs.q_len;
-        let ctx_len = bufs.ctx_len;
-        for req in requests {
+        // All requests in an exact-shape batch share one (q_len, ctx_len); read
+        // it from the first, then narrow the buffer's active shape to match.
+        let (q_len, ctx_len) = self.validate_forward_inputs(
+            requests[0].noise_embedding,
+            &requests[0].target_hidden,
+            requests[0].position_ids,
+        )?;
+        anyhow::ensure!(
+            q_len <= bufs.max_q_len && ctx_len <= bufs.max_ctx_len,
+            "DFlash batch shape q_len={}, ctx_len={} exceeds buffer capacity q_len={}, ctx_len={}",
+            q_len,
+            ctx_len,
+            bufs.max_q_len,
+            bufs.max_ctx_len,
+        );
+        for req in &requests[1..] {
             let (actual_q, actual_ctx) = self.validate_forward_inputs(
                 req.noise_embedding,
                 &req.target_hidden,
@@ -58,7 +71,7 @@ impl DFlashDraftModel {
                 actual_ctx
             );
         }
-        bufs.set_active_batch(requests.len());
+        bufs.set_active_shape(requests.len(), q_len, ctx_len);
         compact_inputs(self.device_context(), requests, bufs)?;
         self.forward_compact_batch(requests.len(), bufs)?;
         Ok(&bufs.normed)
@@ -77,10 +90,39 @@ impl DFlashDraftModel {
             bufs.max_batch_size
         );
         let config = self.config();
-        let noise_len = bufs.q_len * config.hidden_size;
-        let target_len = bufs.ctx_len * config.hidden_size * config.target_layer_count();
-        let position_len = bufs.ctx_len + bufs.q_len;
-        for req in requests {
+        let hidden = config.hidden_size;
+        let target_hidden_dim = config.hidden_size * config.target_layer_count();
+        // Derive the shared (q_len, ctx_len) from the first request, the same
+        // way forward_batch derives it from device tensors.
+        let first = &requests[0];
+        anyhow::ensure!(
+            first.noise_embedding.len() % hidden == 0,
+            "noise_embedding len {} is not a multiple of hidden_size {}",
+            first.noise_embedding.len(),
+            hidden,
+        );
+        let q_len = first.noise_embedding.len() / hidden;
+        anyhow::ensure!(
+            first.target_hidden.len() % target_hidden_dim == 0,
+            "target_hidden len {} is not a multiple of target_hidden_dim {}",
+            first.target_hidden.len(),
+            target_hidden_dim,
+        );
+        let ctx_len = first.target_hidden.len() / target_hidden_dim;
+        anyhow::ensure!(q_len > 0, "DFlash host batch q_len must be positive");
+        anyhow::ensure!(ctx_len > 0, "DFlash host batch ctx_len must be positive");
+        anyhow::ensure!(
+            q_len <= bufs.max_q_len && ctx_len <= bufs.max_ctx_len,
+            "DFlash host batch shape q_len={}, ctx_len={} exceeds buffer capacity q_len={}, ctx_len={}",
+            q_len,
+            ctx_len,
+            bufs.max_q_len,
+            bufs.max_ctx_len,
+        );
+        let noise_len = q_len * hidden;
+        let target_len = ctx_len * target_hidden_dim;
+        let position_len = ctx_len + q_len;
+        for req in &requests[1..] {
             anyhow::ensure!(
                 req.noise_embedding.len() == noise_len,
                 "noise_embedding len {} != {}",
@@ -100,7 +142,7 @@ impl DFlashDraftModel {
                 position_len
             );
         }
-        bufs.set_active_batch(requests.len());
+        bufs.set_active_shape(requests.len(), q_len, ctx_len);
         compact_host_inputs(self.device_context(), requests, bufs)?;
         self.forward_compact_batch(requests.len(), bufs)?;
         Ok(&bufs.normed)
@@ -323,27 +365,28 @@ fn compact_host_inputs(
 ) -> Result<()> {
     let hidden = bufs.noise.hidden_dim;
     let target_hidden = bufs.target_hidden.hidden_dim;
-    let mut pos_q = Vec::with_capacity(bufs.total_q_len);
-    let mut pos_ctx = Vec::with_capacity(bufs.total_ctx_len);
-    for (i, req) in requests.iter().enumerate() {
-        let noise_offset = i * bufs.q_len * hidden;
-        let mut noise_dst = bufs
-            .noise
-            .data
-            .slice_mut(noise_offset..noise_offset + req.noise_embedding.len());
-        ctx.stream
-            .memcpy_htod(req.noise_embedding, &mut noise_dst)?;
-
-        let target_offset = i * bufs.ctx_len * target_hidden;
-        let mut target_dst = bufs
-            .target_hidden
-            .data
-            .slice_mut(target_offset..target_offset + req.target_hidden.len());
-        ctx.stream.memcpy_htod(req.target_hidden, &mut target_dst)?;
+    let q_len = bufs.q_len;
+    let ctx_len = bufs.ctx_len;
+    let batch_size = requests.len();
 
-        pos_ctx.extend_from_slice(&req.position_ids[..bufs.ctx_len]);
-        pos_q.extend_from_slice(&req.position_ids[bufs.ctx_len..]);
+    // Stitch all requests into contiguous host slices, then upload each tensor
+    // in a single H2D copy — matches Qwen3's batch metadata upload pattern and
+    // avoids one launch per request per tensor.
+    let mut noise_flat = Vec::with_capacity(batch_size * q_len * hidden);
+    let mut target_flat = Vec::with_capacity(batch_size * ctx_len * target_hidden);
+    let mut pos_q = Vec::with_capacity(batch_size * q_len);
+    let mut pos_ctx = Vec::with_capacity(batch_size * ctx_len);
+    for req in requests {
+        noise_flat.extend_from_slice(req.noise_embedding);
+        target_flat.extend_from_slice(req.target_hidden);
+        pos_ctx.extend_from_slice(&req.position_ids[..ctx_len]);
+        pos_q.extend_from_slice(&req.position_ids[ctx_len..]);
     }
+
+    let mut noise_dst = bufs.noise.data.slice_mut(..noise_flat.len());
+    ctx.stream.memcpy_htod(&noise_flat, &mut noise_dst)?;
+    let mut target_dst = bufs.target_hidden.data.slice_mut(..target_flat.len());
+    ctx.stream.memcpy_htod(&target_flat, &mut target_dst)?;
     let mut dst_q = bufs.positions_q.slice_mut(..pos_q.len());
     ctx.stream.memcpy_htod(&pos_q, &mut dst_q)?;
     let mut dst_ctx = bufs.positions_ctx.slice_mut(..pos_ctx.len());
diff --git a/openinfer-qwen3-4b-dflash/src/executor.rs b/openinfer-qwen3-4b-dflash/src/executor.rs
index 818226c7..a879fc18 100644
--- a/openinfer-qwen3-4b-dflash/src/executor.rs
+++ b/openinfer-qwen3-4b-dflash/src/executor.rs
@@ -4,10 +4,10 @@ use std::time::{Duration, Instant};
 
 use anyhow::Result;
 use half::bf16;
-use openinfer_core::tensor::HiddenStates;
+use openinfer_core::tensor::{DeviceContext, HiddenStates};
 
 use crate::batch_buffers::DFlashBatchBuffers;
-use crate::batch_forward::{DFlashBatchInput, DFlashHostBatchInput, copy_hidden};
+use crate::batch_forward::{copy_hidden, DFlashBatchInput, DFlashHostBatchInput};
 use crate::forward::{DFlashDraftCache, DFlashTargetHidden};
 use crate::weights::DFlashDraftModel;
 
@@ -85,6 +85,10 @@ pub struct DFlashDraftBatchView<'a> {
 pub struct DFlashExecutorOptions {
     pub max_batch_size: usize,
     pub max_step_context_len: usize,
+    /// Largest draft length (`q_len`) the executor must serve. Batch buffers
+    /// are sized once for `max_batch_size × max_q_len`, so every shape at or
+    /// below it reuses the same allocation (mirrors Qwen3's `BatchDecodeBuffers`).
+    pub max_q_len: usize,
     pub max_seq_len: usize,
 }
 
@@ -93,6 +97,7 @@ impl Default for DFlashExecutorOptions {
         Self {
             max_batch_size: 32,
             max_step_context_len: 16,
+            max_q_len: 16,
             max_seq_len: 4096,
         }
     }
@@ -101,7 +106,10 @@ impl Default for DFlashExecutorOptions {
 pub struct DFlashExecutor {
     model: DFlashDraftModel,
     options: DFlashExecutorOptions,
-    buffers: HashMap<(usize, usize, usize), DFlashBatchBuffers>,
+    /// Single-instance batch buffer, sized for the worst case
+    /// (`max_batch_size × max_q_len × max_step_context_len`). Each forward
+    /// narrows the active shape via `set_active_shape` instead of reallocating.
+    buffers: DFlashBatchBuffers,
     caches: HashMap<DFlashRequestId, DFlashDraftCache>,
 }
 
@@ -112,10 +120,15 @@ impl DFlashExecutor {
         options: DFlashExecutorOptions,
     ) -> Result<Self> {
         let model = DFlashDraftModel::load(model_path, device_ordinal)?;
+        let buffers = model.create_batch_buffers(
+            options.max_batch_size,
+            options.max_q_len,
+            options.max_step_context_len,
+        )?;
         Ok(Self {
             model,
             options,
-            buffers: HashMap::new(),
+            buffers,
             caches: HashMap::new(),
         })
     }
@@ -212,22 +225,24 @@ impl DFlashExecutor {
         if key.cache_mode == DFlashCacheMode::DraftCache {
             return self.execute_cached_host_requests_serial_compact(requests, key);
         }
+        anyhow::ensure!(
+            key.q_len <= self.options.max_q_len,
+            "DFlash host q_len {} exceeds executor max_q_len {}",
+            key.q_len,
+            self.options.max_q_len
+        );
+        anyhow::ensure!(
+            key.ctx_len <= self.options.max_step_context_len,
+            "DFlash host ctx_len {} exceeds executor max_step_context_len {}",
+            key.ctx_len,
+            self.options.max_step_context_len
+        );
         let started = Instant::now();
         let batch_size = requests.len();
         let request_ids = requests
             .iter()
             .map(|request| request.request_id)
             .collect::<Vec<_>>();
-        let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len);
-        if !self.buffers.contains_key(&buffer_key) {
-            let bufs = self.model.create_batch_buffers(
-                self.options.max_batch_size,
-                key.q_len,
-                key.ctx_len,
-            )?;
-            self.buffers.insert(buffer_key, bufs);
-        }
-        let bufs = self.buffers.get_mut(&buffer_key).expect("buffer inserted");
         let inputs = requests
             .iter()
             .map(|req| DFlashHostBatchInput {
@@ -236,23 +251,12 @@ impl DFlashExecutor {
                 position_ids: &req.position_ids,
             })
             .collect::<Vec<_>>();
-        let batch_output = self.model.forward_host_batch(&inputs, bufs)?;
+        let batch_output = self.model.forward_host_batch(&inputs, &mut self.buffers)?;
         self.model.device_context().sync()?;
         let elapsed = started.elapsed();
-        let mut output = HiddenStates::zeros(
-            self.model.device_context(),
-            batch_output.hidden_dim,
-            batch_output.seq_len,
-        )?;
-        copy_hidden(
-            self.model.device_context(),
-            batch_output,
-            0,
-            &mut output,
-            0,
-            batch_output.hidden_dim,
-            batch_output.seq_len,
-        )?;
+        // forward returns a borrow into self.buffers; materialize an owned copy
+        // so the next batch can reuse the buffer without aliasing the response.
+        let output = clone_batch_output(self.model.device_context(), batch_output)?;
         Ok(DFlashDraftBatchResponse {
             request_ids,
             output,
@@ -302,22 +306,24 @@ impl DFlashExecutor {
             key.cache_mode == DFlashCacheMode::NoCache,
             "borrowed host batch view currently supports only NoCache mode"
         );
+        anyhow::ensure!(
+            key.q_len <= self.options.max_q_len,
+            "DFlash host q_len {} exceeds executor max_q_len {}",
+            key.q_len,
+            self.options.max_q_len
+        );
+        anyhow::ensure!(
+            key.ctx_len <= self.options.max_step_context_len,
+            "DFlash host ctx_len {} exceeds executor max_step_context_len {}",
+            key.ctx_len,
+            self.options.max_step_context_len
+        );
         let started = Instant::now();
         let batch_size = requests.len();
         let request_ids = requests
             .iter()
             .map(|request| request.request_id)
             .collect::<Vec<_>>();
-        let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len);
-        if !self.buffers.contains_key(&buffer_key) {
-            let bufs = self.model.create_batch_buffers(
-                self.options.max_batch_size,
-                key.q_len,
-                key.ctx_len,
-            )?;
-            self.buffers.insert(buffer_key, bufs);
-        }
-        let bufs = self.buffers.get_mut(&buffer_key).expect("buffer inserted");
         let inputs = requests
             .iter()
             .map(|req| DFlashHostBatchInput {
@@ -326,7 +332,7 @@ impl DFlashExecutor {
                 position_ids: &req.position_ids,
             })
             .collect::<Vec<_>>();
-        let output = self.model.forward_host_batch(&inputs, bufs)?;
+        let output = self.model.forward_host_batch(&inputs, &mut self.buffers)?;
         self.model.device_context().sync()?;
         Ok(DFlashDraftBatchView {
             request_ids,
@@ -393,22 +399,24 @@ impl DFlashExecutor {
         requests: Vec<DFlashDraftRequest>,
         key: DFlashBatchKey,
     ) -> Result<DFlashDraftBatchResponse> {
+        anyhow::ensure!(
+            key.q_len <= self.options.max_q_len,
+            "DFlash q_len {} exceeds executor max_q_len {}",
+            key.q_len,
+            self.options.max_q_len
+        );
+        anyhow::ensure!(
+            key.ctx_len <= self.options.max_step_context_len,
+            "DFlash ctx_len {} exceeds executor max_step_context_len {}",
+            key.ctx_len,
+            self.options.max_step_context_len
+        );
         let started = Instant::now();
         let batch_size = requests.len();
         let request_ids = requests
             .iter()
             .map(|request| request.request_id)
             .collect::<Vec<_>>();
-        let buffer_key = (self.options.max_batch_size, key.q_len, key.ctx_len);
-        if !self.buffers.contains_key(&buffer_key) {
-            let bufs = self.model.create_batch_buffers(
-                self.options.max_batch_size,
-                key.q_len,
-                key.ctx_len,
-            )?;
-            self.buffers.insert(buffer_key, bufs);
-        }
-        let bufs = self.buffers.get_mut(&buffer_key).expect("buffer inserted");
         let inputs = requests
             .iter()
             .map(|req| DFlashBatchInput {
@@ -419,23 +427,10 @@ impl DFlashExecutor {
                 position_ids: &req.position_ids,
             })
             .collect::<Vec<_>>();
-        let batch_output = self.model.forward_batch(&inputs, bufs)?;
+        let batch_output = self.model.forward_batch(&inputs, &mut self.buffers)?;
         self.model.device_context().sync()?;
         let elapsed = started.elapsed();
-        let mut output = HiddenStates::zeros(
-            self.model.device_context(),
-            self.model.config().hidden_size,
-            batch_size * key.q_len,
-        )?;
-        copy_hidden(
-            self.model.device_context(),
-            batch_output,
-            0,
-            &mut output,
-            0,
-            self.model.config().hidden_size,
-            batch_size * key.q_len,
-        )?;
+        let output = clone_batch_output(self.model.device_context(), batch_output)?;
         Ok(DFlashDraftBatchResponse {
             request_ids,
             output,
@@ -638,3 +633,15 @@ impl DFlashExecutor {
         Ok(responses)
     }
 }
+
+/// Materialize an owned snapshot of a batch forward's output (a borrow into
+/// the single-instance buffer). One allocation + one device-to-device copy of
+/// the active region; the next batch may overwrite the buffer immediately.
+fn clone_batch_output(ctx: &DeviceContext, src: &HiddenStates) -> Result<HiddenStates> {
+    let mut dst = HiddenStates::zeros(ctx, src.hidden_dim, src.seq_len)?;
+    let len = src.hidden_dim * src.seq_len;
+    let src_view = src.data.slice(..len);
+    let mut dst_view = dst.data.slice_mut(..len);
+    ctx.stream.memcpy_dtod(&src_view, &mut dst_view)?;
+    Ok(dst)
+}
diff --git a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
index 69ead63b..c61e26ab 100644
--- a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
+++ b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
@@ -265,6 +265,7 @@ fn dflash_executor_returns_request_tagged_batch_outputs() {
         DFlashExecutorOptions {
             max_batch_size: 2,
             max_step_context_len: 2,
+            max_q_len: 3,
             max_seq_len: 8,
         },
     )
@@ -350,6 +351,7 @@ fn dflash_scheduler_accepts_host_requests() {
             executor: DFlashExecutorOptions {
                 max_batch_size: 2,
                 max_step_context_len: 2,
+                max_q_len: 3,
                 max_seq_len: 8,
             },
             max_wait: std::time::Duration::from_millis(50),
@@ -437,6 +439,7 @@ fn dflash_scheduler_manages_draft_cache() {
             executor: DFlashExecutorOptions {
                 max_batch_size: 2,
                 max_step_context_len: 2,
+                max_q_len: 3,
                 max_seq_len: 8,
             },
             max_wait: std::time::Duration::from_millis(10),
@@ -503,6 +506,7 @@ fn dflash_scheduler_control_messages_are_fifo() {
             executor: DFlashExecutorOptions {
                 max_batch_size: 2,
                 max_step_context_len: 2,
+                max_q_len: 3,
                 max_seq_len: 8,
             },
             max_wait: std::time::Duration::from_millis(100),
@@ -555,6 +559,7 @@ fn dflash_cache_control_rejects_unknown_request_ids() {
         DFlashExecutorOptions {
             max_batch_size: 2,
             max_step_context_len: 2,
+            max_q_len: 3,
             max_seq_len: 8,
         },
     )

From 7fb89fcf026641f89248606c0f60cf680bb38a85 Mon Sep 17 00:00:00 2001
From: kitty <kitty.eu.org@gmail.com>
Date: Mon, 22 Jun 2026 15:18:06 +0800
Subject: [PATCH 3/6] fix(qwen3-dflash): align scheduler lifecycle and cache
 eviction with qwen3-4b
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The DFlash draft scheduler leaked two resources on long-running use: the
GPU-owner thread (no JoinHandle, no shutdown) and per-request draft caches
(grow-only HashMap, each carrying full ForwardBuffers + per-layer past K/V).
Both are now bounded, mirroring qwen3-4b's EngineHandle / drop_request patterns.

Scheduler shutdown: DFlashSchedulerHandle now wraps Arc<Inner> holding an
Option<JoinHandle>. The last clone's Drop closes the channel (the scheduler
loop drains pending requests via send_stopped) and joins the thread, matching
openinfer-engine EngineHandle::Drop. Dropping the handle without an explicit
shutdown no longer leaks the thread.

Cache eviction: DFlashExecutorOptions gains max_caches (default 64). A new
drop_cache(id) — exposed on both executor and scheduler — removes a request's
cache and lets RAII free the GPU buffers. It is idempotent (a missing cache is
not an error), matching qwen3's drop_request. Over-cap admission fails closed
until a retired request's cache is dropped.

Cleanup: remove submit_with_enqueued_ack, which sent its ack from the caller
thread (not the scheduler) and only proved the message entered the channel
buffer — unbounded-channel FIFO already guarantees the ordering it claimed to provide.
The batch exact-shape validator now fully checks the first request and only
shape-matches the rest, instead of re-running the full validator per request.

Gate: adds dflash_cache_drop_releases_and_capacity_fails_closed covering
drop_cache release + idempotency and max_caches fail-closed/reuse. HF golden
deltas unchanged (mean=0.034243, p99=0.125000, max=0.500000, n=7680); 8 tests
pass.
---
 docs/models/qwen3/dflash.md                   |  24 ++-
 .../src/batch_forward.rs                      |  26 +--
 openinfer-qwen3-4b-dflash/src/executor.rs     |  84 +++++++---
 openinfer-qwen3-4b-dflash/src/scheduler.rs    | 101 +++++++++---
 .../tests/hf_golden_gate.rs                   | 152 +++++++++++++++---
 5 files changed, 301 insertions(+), 86 deletions(-)

diff --git a/docs/models/qwen3/dflash.md b/docs/models/qwen3/dflash.md
index 493f5c21..0b7b5c37 100644
--- a/docs/models/qwen3/dflash.md
+++ b/docs/models/qwen3/dflash.md
@@ -1,6 +1,6 @@
 # Qwen3-4B-DFlash model
 
-**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope.
+**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. The scheduler thread now joins on handle drop (mirrors `EngineHandle`) and resident draft caches are bounded by `max_caches` with an explicit `drop_cache` retirement path (mirrors Qwen3 `drop_request`); over-cap admission fails closed. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope.
 
 Last touched: 2026-06
 
@@ -116,12 +116,21 @@ batching, a small `max_wait` coalescing window, and `max_total_tokens`
 admission over `(ctx_len + q_len + past_len)` for each candidate batch. Its
 public `submit` boundary uses host bf16 buffers and returns host bf16 output so
 CUDA device tensors do not cross thread/context ownership boundaries. It also
-owns per-request draft cache state through `reset_cache`, `crop_cache`, and
-`cache_seq_len`, and these calls now error on unknown request ids instead of
-silently treating them as empty state; `NoCache` requests use the real batched path, while host
-`DraftCache` requests run serially until compact past-K/V batching lands. The
-executor also exposes a borrowed compact batch view for same-thread controller
-experiments.
+owns per-request draft cache state through `reset_cache`, `crop_cache`,
+`cache_seq_len`, and `drop_cache`. The first three error on unknown request
+ids instead of silently treating them as empty state; `drop_cache` is
+idempotent (a missing cache is not an error) so callers can retire a request
+from any lifecycle state. Resident caches are bounded by `max_caches`
+(`DFlashExecutorOptions`, default 64); exceeding it fails closed until a
+retired request's cache is dropped — this mirrors Qwen3's per-request block
+accounting under the fixed `KvCacheManager` pool and prevents the unbounded
+GPU-memory leak the old grow-only `HashMap` had. The handle joins the scheduler
+thread on drop (the last clone closes the channel and joins, mirroring
+`EngineHandle`), so dropping the handle without an explicit shutdown no longer
+leaks the GPU-owner thread. `NoCache` requests use the real batched path, while
+host `DraftCache` requests run serially until compact past-K/V batching lands.
+The executor also exposes a borrowed compact batch view for same-thread
+controller experiments.
 
 ## Draft Cache
 
@@ -162,6 +171,7 @@ The accuracy bar is transformers parity. For the draft crate that means:
 | batch-vs-single parity | Compare two exact-shape batched rows against the bs1 forward output under the same DFlash tolerance |
 | executor smoke | Submit request-tagged exact-shape `NoCache` requests and assert output shape/request ids |
 | scheduler cache smoke | Submit host `DraftCache` request, then assert scheduler-owned `cache_seq_len`, `crop_cache`, and `reset_cache` behavior; also checks control messages preserve FIFO ordering behind pending submits |
+| cache control rejection | `reset_cache` / `crop_cache` / `cache_seq_len` fail closed on unknown request ids; `drop_cache` is idempotent (retiring an unknown id is not an error) |
 | drafter generation parity | Run a greedy bs1 transformers target loop twice, once with the HF drafter and once with the OpenInfer drafter, then compare generated token ids/text and acceptance lengths |
 
 Do not use `Qwen3-4B-Instruct-2507` as a correctness baseline for this model. The checkpoint is documented for `Qwen/Qwen3-4B`, but this task's gate is the DFlash draft model's own transformers forward, not target acceptance rate.
diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
index 1ca49050..b67ad94e 100644
--- a/openinfer-qwen3-4b-dflash/src/batch_forward.rs
+++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
@@ -56,19 +56,23 @@ impl DFlashDraftModel {
             bufs.max_q_len,
             bufs.max_ctx_len,
         );
+        // Exact-shape batch: the first request is fully validated above, so the
+        // rest only need to match the three lengths that fix (q_len, ctx_len)
+        // — re-running the full validator per request just repeats the same
+        // hidden_dim / positivity checks against the same config.
         for req in &requests[1..] {
-            let (actual_q, actual_ctx) = self.validate_forward_inputs(
-                req.noise_embedding,
-                &req.target_hidden,
-                req.position_ids,
-            )?;
             anyhow::ensure!(
-                actual_q == q_len && actual_ctx == ctx_len,
-                "DFlash exact-shape batch expected q_len={}, ctx_len={} but got q_len={}, ctx_len={}",
-                q_len,
-                ctx_len,
-                actual_q,
-                actual_ctx
+                req.noise_embedding.seq_len == q_len
+                    && req.noise_embedding.hidden_dim == requests[0].noise_embedding.hidden_dim,
+                "DFlash exact-shape batch noise_embedding shape mismatch"
+            );
+            anyhow::ensure!(
+                req.target_hidden.concatenated.seq_len == ctx_len,
+                "DFlash exact-shape batch target_hidden seq_len mismatch"
+            );
+            anyhow::ensure!(
+                req.position_ids.len() == ctx_len + q_len,
+                "DFlash exact-shape batch position_ids len mismatch"
             );
         }
         bufs.set_active_shape(requests.len(), q_len, ctx_len);
diff --git a/openinfer-qwen3-4b-dflash/src/executor.rs b/openinfer-qwen3-4b-dflash/src/executor.rs
index a879fc18..de23e08b 100644
--- a/openinfer-qwen3-4b-dflash/src/executor.rs
+++ b/openinfer-qwen3-4b-dflash/src/executor.rs
@@ -7,7 +7,7 @@ use half::bf16;
 use openinfer_core::tensor::{DeviceContext, HiddenStates};
 
 use crate::batch_buffers::DFlashBatchBuffers;
-use crate::batch_forward::{copy_hidden, DFlashBatchInput, DFlashHostBatchInput};
+use crate::batch_forward::{DFlashBatchInput, DFlashHostBatchInput, copy_hidden};
 use crate::forward::{DFlashDraftCache, DFlashTargetHidden};
 use crate::weights::DFlashDraftModel;
 
@@ -90,6 +90,13 @@ pub struct DFlashExecutorOptions {
     /// below it reuses the same allocation (mirrors Qwen3's `BatchDecodeBuffers`).
     pub max_q_len: usize,
     pub max_seq_len: usize,
+    /// Upper bound on resident draft caches. Each `DraftCache` request creates
+    /// a per-request `DFlashDraftCache` (full `ForwardBuffers` + per-layer past
+    /// K/V); without a cap they accumulate forever and leak GPU memory.
+    /// Admission fails closed when this is exceeded — callers must `drop_cache`
+    /// a retired request before submitting a new one. Mirrors Qwen3's per-
+    /// request block accounting under the fixed `KvCacheManager` pool.
+    pub max_caches: usize,
 }
 
 impl Default for DFlashExecutorOptions {
@@ -99,6 +106,7 @@ impl Default for DFlashExecutorOptions {
             max_step_context_len: 16,
             max_q_len: 16,
             max_seq_len: 4096,
+            max_caches: 64,
         }
     }
 }
@@ -394,6 +402,47 @@ impl DFlashExecutor {
             .ok_or_else(|| anyhow::anyhow!("unknown DFlash cache request_id {:?}", request_id))
     }
 
+    /// Release a request's draft cache. Mirrors Qwen3's `drop_request`
+    /// (`openinfer-qwen3-4b/src/executor.rs`): remove the entry and let RAII
+    /// drop the GPU buffers. Idempotent — a missing cache is not an error, so
+    /// callers can retire a request from any lifecycle state.
+    pub fn drop_cache(&mut self, request_id: DFlashRequestId) -> Result<()> {
+        self.caches.remove(&request_id);
+        Ok(())
+    }
+
+    /// Resident cache count, for admission diagnostics.
+    pub fn cache_count(&self) -> usize {
+        self.caches.len()
+    }
+
+    /// Ensure a draft cache exists for `request_id`, enforcing the
+    /// `max_caches` cap. Existing caches are reused (a re-submitted request
+    /// keeps its past state). Over-cap admission fails closed. Returns without
+    /// borrowing the cache so callers can then use disjoint `&self.model` and
+    /// `&mut self.caches` borrows in the same scope (NLL split borrow).
+    fn ensure_cache_entry(
+        &mut self,
+        request_id: DFlashRequestId,
+        key: &DFlashBatchKey,
+    ) -> Result<()> {
+        if !self.caches.contains_key(&request_id) {
+            anyhow::ensure!(
+                self.caches.len() < self.options.max_caches,
+                "DFlash cache pool full: {} resident caches, max_caches={}; drop_cache a retired request before submitting a new one",
+                self.caches.len(),
+                self.options.max_caches,
+            );
+            let cache = self.model.create_draft_cache(
+                key.q_len,
+                self.options.max_step_context_len,
+                self.options.max_seq_len,
+            )?;
+            self.caches.insert(request_id, cache);
+        }
+        Ok(())
+    }
+
     fn execute_uncached_batch_compact(
         &mut self,
         requests: Vec<DFlashDraftRequest>,
@@ -456,14 +505,7 @@ impl DFlashExecutor {
             batch_size * key.q_len,
         )?;
         for (i, req) in requests.into_iter().enumerate() {
-            if !self.caches.contains_key(&req.request_id) {
-                let cache = self.model.create_draft_cache(
-                    key.q_len,
-                    self.options.max_step_context_len,
-                    self.options.max_seq_len,
-                )?;
-                self.caches.insert(req.request_id, cache);
-            }
+            self.ensure_cache_entry(req.request_id, &key)?;
             let cache = self.caches.get_mut(&req.request_id).expect("cache exists");
             self.model.prepare_step_context(
                 DFlashTargetHidden {
@@ -508,29 +550,20 @@ impl DFlashExecutor {
         let started = Instant::now();
         let batch_size = requests.len();
         let config = self.model.config();
+        let hidden = config.hidden_size;
+        let target_hidden_dim = config.hidden_size * config.target_layer_count();
         let mut request_ids = Vec::with_capacity(batch_size);
         let mut cache_seq_lens = Vec::with_capacity(batch_size);
-        let mut output = HiddenStates::zeros(
-            self.model.device_context(),
-            config.hidden_size,
-            batch_size * key.q_len,
-        )?;
+        let mut output =
+            HiddenStates::zeros(self.model.device_context(), hidden, batch_size * key.q_len)?;
         for (i, req) in requests.into_iter().enumerate() {
-            if !self.caches.contains_key(&req.request_id) {
-                let cache = self.model.create_draft_cache(
-                    key.q_len,
-                    self.options.max_step_context_len,
-                    self.options.max_seq_len,
-                )?;
-                self.caches.insert(req.request_id, cache);
-            }
             let noise_embedding = HiddenStates {
                 data: self
                     .model
                     .device_context()
                     .stream
                     .clone_htod(&req.noise_embedding)?,
-                hidden_dim: config.hidden_size,
+                hidden_dim: hidden,
                 seq_len: key.q_len,
             };
             let target_hidden = HiddenStates {
@@ -539,9 +572,10 @@ impl DFlashExecutor {
                     .device_context()
                     .stream
                     .clone_htod(&req.target_hidden)?,
-                hidden_dim: config.hidden_size * config.target_layer_count(),
+                hidden_dim: target_hidden_dim,
                 seq_len: key.ctx_len,
             };
+            self.ensure_cache_entry(req.request_id, &key)?;
             let cache = self.caches.get_mut(&req.request_id).expect("cache exists");
             self.model.prepare_step_context(
                 DFlashTargetHidden {
@@ -560,7 +594,7 @@ impl DFlashExecutor {
                 0,
                 &mut output,
                 i * key.q_len,
-                config.hidden_size,
+                hidden,
                 key.q_len,
             )?;
             request_ids.push(req.request_id);
diff --git a/openinfer-qwen3-4b-dflash/src/scheduler.rs b/openinfer-qwen3-4b-dflash/src/scheduler.rs
index a803ad15..3eafc84c 100644
--- a/openinfer-qwen3-4b-dflash/src/scheduler.rs
+++ b/openinfer-qwen3-4b-dflash/src/scheduler.rs
@@ -1,6 +1,7 @@
 use std::collections::VecDeque;
 use std::path::{Path, PathBuf};
-use std::thread;
+use std::sync::Arc;
+use std::thread::{self, JoinHandle};
 use std::time::{Duration, Instant};
 
 use anyhow::Result;
@@ -27,9 +28,36 @@ impl Default for DFlashSchedulerOptions {
     }
 }
 
+/// Handle to the DFlash draft scheduler thread. Mirrors the `EngineHandle`
+/// pattern (`openinfer-engine::engine::EngineHandle`): the handle is cheaply
+/// cloneable (shared sender), and the last clone's `Drop` closes the channel
+/// and joins the scheduler thread, replying "stopped" to any in-flight
+/// requests. This prevents leaking the GPU-owner thread when a caller drops
+/// the handle without an explicit shutdown.
 #[derive(Clone)]
 pub struct DFlashSchedulerHandle {
-    submit_tx: channel::Sender<SchedulerMessage>,
+    inner: Arc<DFlashSchedulerInner>,
+}
+
+struct DFlashSchedulerInner {
+    submit_tx: Option<channel::Sender<SchedulerMessage>>, // taken in `Drop` to release the sender
+    join_handle: Option<JoinHandle<()>>, // taken in `Drop`; joined unless on that thread
+}
+
+impl Drop for DFlashSchedulerInner {
+    fn drop(&mut self) {
+        // Close our end of the channel before joining. When the last sender
+        // goes, the scheduler loop's `recv` returns `Err` and the thread
+        // flushes pending requests via `send_stopped` before exiting (mirrors
+        // EngineHandle::Drop in openinfer-engine/src/engine.rs).
+        self.submit_tx = None;
+        let worker = self.join_handle.take();
+        // Never join from inside the scheduler thread itself; in that case the
+        // handle is simply dropped without joining.
+        if let Some(worker) = worker.filter(|h| h.thread().id() != thread::current().id()) {
+            let _ = worker.join();
+        }
+    }
+}
 
 enum SchedulerMessage {
@@ -41,6 +69,10 @@ enum SchedulerMessage {
         request_id: DFlashRequestId,
         response_tx: channel::Sender<Result<()>>,
     },
+    DropCache {
+        request_id: DFlashRequestId,
+        response_tx: channel::Sender<Result<()>>,
+    },
     CropCache {
         request_id: DFlashRequestId,
         seq_len: usize,
@@ -68,6 +100,10 @@ enum SchedulerControl {
         request_id: DFlashRequestId,
         response_tx: channel::Sender<Result<()>>,
     },
+    DropCache {
+        request_id: DFlashRequestId,
+        response_tx: channel::Sender<Result<()>>,
+    },
     CropCache {
         request_id: DFlashRequestId,
         seq_len: usize,
@@ -90,7 +126,7 @@ impl DFlashSchedulerHandle {
         let model_path = PathBuf::from(model_path);
         let max_wait = options.max_wait;
         let max_total_tokens = options.max_total_tokens;
-        thread::Builder::new()
+        let join_handle = thread::Builder::new()
             .name("qwen3-dflash-scheduler".into())
             .spawn(move || {
                 let mut executor =
@@ -108,12 +144,24 @@ impl DFlashSchedulerHandle {
         init_rx
             .recv()
             .map_err(|_| anyhow::anyhow!("DFlash scheduler initialization channel closed"))??;
-        Ok(Self { submit_tx })
+        Ok(Self {
+            inner: Arc::new(DFlashSchedulerInner {
+                submit_tx: Some(submit_tx),
+                join_handle: Some(join_handle),
+            }),
+        })
+    }
+
+    /// Borrows the submit sender, failing with "DFlash scheduler is closed"
+    /// when the sender slot is empty.
+    fn submit_tx(&self) -> Result<&channel::Sender<SchedulerMessage>> {
+        let sender = self.inner.submit_tx.as_ref();
+        sender.ok_or_else(|| anyhow::anyhow!("DFlash scheduler is closed"))
+    }
 
     pub fn submit(&self, request: DFlashDraftHostRequest) -> Result<DFlashDraftHostResponse> {
         let (response_tx, response_rx) = channel::bounded(1);
-        self.submit_tx
+        self.submit_tx()?
             .send(SchedulerMessage::Submit {
                 request,
                 response_tx,
@@ -124,28 +172,29 @@ impl DFlashSchedulerHandle {
             .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))?
     }
 
-    pub fn submit_with_enqueued_ack(
-        &self,
-        request: DFlashDraftHostRequest,
-        ack_tx: channel::Sender<()>,
-    ) -> Result<DFlashDraftHostResponse> {
+    pub fn reset_cache(&self, request_id: DFlashRequestId) -> Result<()> {
         let (response_tx, response_rx) = channel::bounded(1);
-        self.submit_tx
-            .send(SchedulerMessage::Submit {
-                request,
+        self.submit_tx()?
+            .send(SchedulerMessage::ResetCache {
+                request_id,
                 response_tx,
             })
             .map_err(|_| anyhow::anyhow!("DFlash scheduler is closed"))?;
-        let _ = ack_tx.send(());
         response_rx
             .recv()
             .map_err(|_| anyhow::anyhow!("DFlash scheduler response channel closed"))?
     }
 
-    pub fn reset_cache(&self, request_id: DFlashRequestId) -> Result<()> {
+    /// Release a request's draft cache and reclaim its GPU buffers. Mirrors
+    /// Qwen3's `drop_request`: the executor removes the cache entry and RAII
+    /// frees the per-layer past K/V + scratch. Idempotent — retiring a
+    /// request that never created a cache is not an error. Callers should
+    /// invoke this once a draft request is verified or abandoned so the
+    /// `max_caches` pool does not fill with dead entries.
+    pub fn drop_cache(&self, request_id: DFlashRequestId) -> Result<()> {
         let (response_tx, response_rx) = channel::bounded(1);
-        self.submit_tx
-            .send(SchedulerMessage::ResetCache {
+        self.submit_tx()?
+            .send(SchedulerMessage::DropCache {
                 request_id,
                 response_tx,
             })
@@ -157,7 +206,7 @@ impl DFlashSchedulerHandle {
 
     pub fn crop_cache(&self, request_id: DFlashRequestId, seq_len: usize) -> Result<()> {
         let (response_tx, response_rx) = channel::bounded(1);
-        self.submit_tx
+        self.submit_tx()?
             .send(SchedulerMessage::CropCache {
                 request_id,
                 seq_len,
@@ -171,7 +220,7 @@ impl DFlashSchedulerHandle {
 
     pub fn cache_seq_len(&self, request_id: DFlashRequestId) -> Result<usize> {
         let (response_tx, response_rx) = channel::bounded(1);
-        self.submit_tx
+        self.submit_tx()?
             .send(SchedulerMessage::CacheSeqLen {
                 request_id,
                 response_tx,
@@ -240,6 +289,13 @@ fn handle_message_or_enqueue(msg: SchedulerMessage, pending: &mut VecDeque<Pendi
             request_id,
             response_tx,
         })),
+        SchedulerMessage::DropCache {
+            request_id,
+            response_tx,
+        } => pending.push_back(PendingItem::Control(SchedulerControl::DropCache {
+            request_id,
+            response_tx,
+        })),
         SchedulerMessage::CropCache {
             request_id,
             seq_len,
@@ -392,6 +448,12 @@ impl SchedulerControl {
             } => {
                 let _ = response_tx.send(executor.reset_cache(request_id));
             }
+            SchedulerControl::DropCache {
+                request_id,
+                response_tx,
+            } => {
+                let _ = response_tx.send(executor.drop_cache(request_id));
+            }
             SchedulerControl::CropCache {
                 request_id,
                 seq_len,
@@ -411,6 +473,7 @@ impl SchedulerControl {
     fn send_stopped(self) {
         match self {
             SchedulerControl::ResetCache { response_tx, .. }
+            | SchedulerControl::DropCache { response_tx, .. }
             | SchedulerControl::CropCache { response_tx, .. } => {
                 let _ = response_tx.send(Err(anyhow::anyhow!("DFlash scheduler stopped")));
             }
diff --git a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
index c61e26ab..54ee0903 100644
--- a/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
+++ b/openinfer-qwen3-4b-dflash/tests/hf_golden_gate.rs
@@ -267,6 +267,7 @@ fn dflash_executor_returns_request_tagged_batch_outputs() {
             max_step_context_len: 2,
             max_q_len: 3,
             max_seq_len: 8,
+            max_caches: 8,
         },
     )
     .expect("load executor");
@@ -353,6 +354,7 @@ fn dflash_scheduler_accepts_host_requests() {
                 max_step_context_len: 2,
                 max_q_len: 3,
                 max_seq_len: 8,
+                max_caches: 8,
             },
             max_wait: std::time::Duration::from_millis(50),
             max_total_tokens: 16,
@@ -441,6 +443,7 @@ fn dflash_scheduler_manages_draft_cache() {
                 max_step_context_len: 2,
                 max_q_len: 3,
                 max_seq_len: 8,
+                max_caches: 8,
             },
             max_wait: std::time::Duration::from_millis(10),
             max_total_tokens: 16,
@@ -508,6 +511,7 @@ fn dflash_scheduler_control_messages_are_fifo() {
                 max_step_context_len: 2,
                 max_q_len: 3,
                 max_seq_len: 8,
+                max_caches: 8,
             },
             max_wait: std::time::Duration::from_millis(100),
             max_total_tokens: 16,
@@ -515,32 +519,29 @@ fn dflash_scheduler_control_messages_are_fifo() {
     )
     .expect("start scheduler");
     let request_id = DFlashRequestId(123);
-    let submitter = scheduler.clone();
-    let (ack_tx, ack_rx) = crossbeam_channel::bounded(1);
-    let submit = std::thread::spawn(move || {
-        submitter.submit_with_enqueued_ack(
-            DFlashDraftHostRequest {
-                request_id,
-                noise_embedding: noise,
-                target_hidden: target,
-                position_ids: positions,
-                q_len: 3,
-                ctx_len: 2,
-                cache_mode: DFlashCacheMode::DraftCache,
-            },
-            ack_tx,
-        )
-    });
-    ack_rx.recv().expect("submit should be enqueued");
-    let seq_len = scheduler
-        .cache_seq_len(request_id)
-        .expect("cache seq len must follow pending submit");
-    let response = submit
-        .join()
-        .expect("join cached submit")
+    // The scheduler uses one unbounded channel for both submit and control
+    // messages, so FIFO ordering is guaranteed by construction: each call
+    // blocks until the scheduler thread has processed it. Submit the cached
+    // request first; when it returns the cache must exist, then the following
+    // control calls run strictly after it.
+    let response = scheduler
+        .submit(DFlashDraftHostRequest {
+            request_id,
+            noise_embedding: noise,
+            target_hidden: target,
+            position_ids: positions,
+            q_len: 3,
+            ctx_len: 2,
+            cache_mode: DFlashCacheMode::DraftCache,
+        })
         .expect("cached submit");
     assert_eq!(response.cache_seq_len, 5);
-    assert_eq!(seq_len, 5);
+    assert_eq!(
+        scheduler
+            .cache_seq_len(request_id)
+            .expect("cache seq len after submit"),
+        5
+    );
     scheduler.reset_cache(request_id).expect("reset cache");
     assert_eq!(
         scheduler.cache_seq_len(request_id).expect("cache seq len"),
@@ -561,6 +562,7 @@ fn dflash_cache_control_rejects_unknown_request_ids() {
             max_step_context_len: 2,
             max_q_len: 3,
             max_seq_len: 8,
+            max_caches: 8,
         },
     )
     .expect("load executor");
@@ -620,6 +622,108 @@ fn dflash_cache_control_rejects_unknown_request_ids() {
     );
 }
 
+#[test]
+fn dflash_cache_drop_releases_and_capacity_fails_closed() {
+    let Some(model_path) = model_path_or_skip("dflash cache drop gate") else {
+        return;
+    };
+    let golden_path = Path::new(GOLDEN);
+    if !golden_path.exists() {
+        eprintln!("skipping dflash cache drop gate: {GOLDEN} does not exist");
+        return;
+    }
+
+    let bytes = std::fs::read(golden_path).expect("read golden");
+    let st = SafeTensors::deserialize(&bytes).expect("parse golden");
+    let config =
+        openinfer_qwen3_4b_dflash::DFlashConfig::from_model_dir(&model_path).expect("load config");
+    let noise = bf16_tensor(&st, "noise_embedding", &[1, 3, config.hidden_size]);
+    let target = bf16_tensor(
+        &st,
+        "target_hidden",
+        &[1, 2, config.hidden_size * config.target_layer_count()],
+    );
+    let positions = i32_tensor(&st, "position_ids", &[1, 5]);
+
+    // Cap the pool at one cache so a second concurrent request must fail closed
+    // until the first is retired via drop_cache.
+    let scheduler = DFlashSchedulerHandle::start(
+        &model_path,
+        0,
+        DFlashSchedulerOptions {
+            executor: DFlashExecutorOptions {
+                max_batch_size: 2,
+                max_step_context_len: 2,
+                max_q_len: 3,
+                max_seq_len: 8,
+                max_caches: 1,
+            },
+            max_wait: std::time::Duration::from_millis(10),
+            max_total_tokens: 16,
+        },
+    )
+    .expect("start scheduler");
+
+    let first = DFlashRequestId(1);
+    let second = DFlashRequestId(2);
+    let submit = |id: DFlashRequestId| {
+        scheduler.submit(DFlashDraftHostRequest {
+            request_id: id,
+            noise_embedding: noise.clone(),
+            target_hidden: target.clone(),
+            position_ids: positions.clone(),
+            q_len: 3,
+            ctx_len: 2,
+            cache_mode: DFlashCacheMode::DraftCache,
+        })
+    };
+
+    submit(first).expect("first cached submit creates a cache");
+    assert_eq!(
+        scheduler.cache_seq_len(first).expect("first cache exists"),
+        5
+    );
+
+    // Pool is full (max_caches=1): a second distinct request must fail closed.
+    let overflow_err = match submit(second) {
+        Ok(_) => panic!("overflow submit must fail closed, but succeeded"),
+        Err(err) => err,
+    };
+    assert!(
+        overflow_err.to_string().contains("DFlash cache pool full"),
+        "unexpected overflow error: {overflow_err}"
+    );
+
+    // Retire the first request; drop_cache must release its slot for reuse.
+    scheduler.drop_cache(first).expect("drop first cache");
+    // Idempotent: dropping an already-removed (or never-seen) id is not an error.
+    scheduler
+        .drop_cache(first)
+        .expect("drop_cache is idempotent");
+    scheduler
+        .drop_cache(DFlashRequestId(999))
+        .expect("drop_cache unknown id is idempotent");
+    // The retired id's cache is gone, so reads fail closed.
+    let gone_err = scheduler
+        .cache_seq_len(first)
+        .expect_err("retired cache must be gone");
+    assert!(
+        gone_err
+            .to_string()
+            .contains("unknown DFlash cache request_id"),
+        "unexpected retired-cache error: {gone_err}"
+    );
+
+    // Slot is reclaimed: the second request now succeeds.
+    submit(second).expect("second submit after drop succeeds");
+    assert_eq!(
+        scheduler
+            .cache_seq_len(second)
+            .expect("second cache exists"),
+        5
+    );
+}
+
 fn assert_deltas(label: &str, actual: &[bf16], expected: &[bf16]) {
     assert_eq!(actual.len(), expected.len());
     let mut deltas = actual

From d48528a6616403e8f071edd31884c74969021dbc Mon Sep 17 00:00:00 2001
From: kitty <kitty.eu.org@gmail.com>
Date: Mon, 22 Jun 2026 17:35:18 +0800
Subject: [PATCH 4/6] perf(qwen3-dflash): fuse batch K/V concatenation into a
 single strided copy kernel

The batch forward path built the ragged-attention K/V layout [ctx | noise]
per request by looping memcpy_dtod over each request: 2 * batch_size copies
per K/V tensor per layer. At bs=32 that is 128 launches/layer (640 per
forward), and at ~5us CPU launch overhead each this dominated the bs32
latency budget.

Add strided_segment_copy_kernel (csrc/shared/elementwise.cu): one launch
copies an entire batch's segment (all requests' ctx rows, or all noise rows)
from a contiguous source into the strided per-request destination layout.
Each layer now issues 4 launches (k_ctx, k_noise, v_ctx, v_noise) instead of
2 * batch_size * 2, collapsing the bs32 per-layer count from 128 to 4.

Result (RTX 5070 Ti, WSL, ctx_len=2, q_len=16):
  bs 8:  4.36ms -> 3.34ms (1.31x)
  bs16:  6.85ms -> 4.70ms (1.46x)
  bs32: 12.17ms -> 8.18ms (1.49x)
  bs1->bs32 throughput: 5.3x -> 8.1x (7.7K -> 62.6K draft tok/s)

HF golden deltas unchanged (mean=0.034243, p99=0.125000, max=0.500000,
n=7680); batch-vs-single stays at mean=0.000000. 8 tests pass.
---
 docs/models/qwen3/dflash.md                   | 28 ++++----
 openinfer-core/src/ops.rs                     | 11 +--
 openinfer-kernels/csrc/shared/elementwise.cu  | 50 ++++++++++++++
 openinfer-kernels/src/ffi/shared.rs           | 15 ++++
 openinfer-kernels/src/ops.rs                  |  5 +-
 openinfer-kernels/src/ops/elementwise.rs      | 43 ++++++++++++
 .../src/batch_forward.rs                      | 69 ++++++++-----------
 7 files changed, 162 insertions(+), 59 deletions(-)

diff --git a/docs/models/qwen3/dflash.md b/docs/models/qwen3/dflash.md
index 0b7b5c37..01d82580 100644
--- a/docs/models/qwen3/dflash.md
+++ b/docs/models/qwen3/dflash.md
@@ -1,6 +1,6 @@
 # Qwen3-4B-DFlash model
 
-**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. The scheduler thread now joins on handle drop (mirrors `EngineHandle`) and resident draft caches are bounded by `max_caches` with an explicit `drop_cache` retirement path (mirrors Qwen3 `drop_request`); over-cap admission fails closed. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope.
+**TL;DR**: `openinfer-qwen3-4b-dflash` supports only the `z-lab/Qwen3-4B-DFlash-b16` model. It now has two draft-only execution surfaces: the original bs1 transformers-parity forward path, and an internal exact-shape batch runner/scheduler that batches already-prepared `noise_embedding`, selected target hidden states, and `position_ids`. The forward gate currently measures mean delta `0.034243`, p99 `0.125000`, max `0.500000` over 7,680 output values for uncached, unified-cache one-shot, and first-step draft-cache paths; batch-vs-single and executor request-tag smoke extend that gate. Cache control APIs are fail-closed for unknown request ids. The scheduler thread now joins on handle drop (mirrors `EngineHandle`) and resident draft caches are bounded by `max_caches` with an explicit `drop_cache` retirement path (mirrors Qwen3 `drop_request`); over-cap admission fails closed. The batch K/V concatenation now uses a fused `strided_segment_copy` kernel instead of a per-request `memcpy_dtod` loop, lifting bs32 draft throughput from ~42K to ~63K tok/s (1.5x) with zero accuracy drift. Target verification, acceptance, fallback token selection, and OpenAI serving remain out of scope.
 
 Last touched: 2026-06
 
@@ -237,17 +237,21 @@ Observed local batch runner sweep on the same WSL/CUDA `sm_120` setup,
 
 | Batch | mean ms | p50 ms | p90 ms | p99 ms | draft tok/s | req/s |
 | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
-| 1 | 2.175 | 2.175 | 2.230 | 2.308 | 7,358 | 460 |
-| 2 | 2.488 | 2.446 | 2.607 | 2.947 | 12,859 | 804 |
-| 4 | 3.790 | 3.794 | 3.928 | 4.014 | 16,886 | 1,055 |
-| 8 | 4.651 | 4.571 | 5.184 | 5.419 | 27,518 | 1,720 |
-| 16 | 7.260 | 7.223 | 7.582 | 8.302 | 35,264 | 2,204 |
-| 32 | 13.221 | 13.080 | 14.237 | 15.073 | 38,725 | 2,420 |
-
-The current batch path improves draft-token throughput by `5.3x` from bs1 to
-bs32 after moving the ragged attention plan into reusable batch buffers. This is
-draft-model throughput only; it does not include target hidden production,
-verification, acceptance, or fallback-token work.
+| 1 | 2.065 | — | — | — | 7,748 | — |
+| 2 | 2.154 | — | — | — | 14,856 | — |
+| 4 | 3.118 | — | — | — | 20,525 | — |
+| 8 | 3.335 | — | — | — | 38,382 | — |
+| 16 | 4.699 | — | — | — | 54,476 | — |
+| 32 | 8.178 | — | — | — | 62,611 | — |
+
+The batch path now improves draft-token throughput by `8.1x` from bs1 to bs32.
+The bs16 and bs32 steps each gained ~1.5x after replacing the per-request
+`compact_kv` memcpy loop (`2 * batch_size` `memcpy_dtod` calls per K/V tensor
+per layer) with a fused `strided_segment_copy` CUDA kernel — one launch copies
+the entire batch's ctx segment, another the noise segment, collapsing 128
+launches/layer at bs32 into 4. This is draft-model throughput only; it does not
+include target hidden production, verification, acceptance, or fallback-token
+work.
 
 On the local WSL setup used for the first run, the workspace-level vLLM git dependency and empty FlashInfer submodule required a narrower temporary workspace plus:
 
diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs
index df729753..31f7eb4f 100644
--- a/openinfer-core/src/ops.rs
+++ b/openinfer-core/src/ops.rs
@@ -14,17 +14,18 @@ pub use attention::{
     paged_attention_batch_decode_split_kv_into, prefill_attention_paged_into,
 };
 pub use openinfer_kernels::ops::{
-    GEMM_LT_MAX_N, LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into,
-    add_batch, add_batch_into, bf16_hidden_to_f32_into, embedding_decode_into, extract_vec,
-    extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into,
+    GEMM_LT_MAX_N, LoraDecodeGroupedProjection, RaggedPrefillPlan,
+    accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into,
+    batch_prefill_ragged_nhd_noncausal_into, bf16_hidden_to_f32_into, embedding_decode_into,
+    extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into,
     fused_add_rms_norm_into, gather_hidden_tokens_into, gemm, gemm_into_checked, gemm_lt_tune,
     gemm_per_token, gemv, linear, lora_decode_fused_delta_group3_into,
     lora_decode_fused_delta_into, pack_lora_b_rows_into,
     qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into,
     rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place,
     scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
-    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, RaggedPrefillPlan,
-    batch_prefill_ragged_nhd_noncausal_into, single_prefill_nhd_noncausal_into, write_vec_into,
+    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into,
+    single_prefill_nhd_noncausal_into, strided_segment_copy_into, write_vec_into,
 };
 #[cfg(not(feature = "kernel-call-trace"))]
 pub use openinfer_kernels::ops::{
diff --git a/openinfer-kernels/csrc/shared/elementwise.cu b/openinfer-kernels/csrc/shared/elementwise.cu
index 92de04eb..c486152f 100644
--- a/openinfer-kernels/csrc/shared/elementwise.cu
+++ b/openinfer-kernels/csrc/shared/elementwise.cu
@@ -427,4 +427,54 @@ CUresult embedding_batched_vocab_shard_cuda(
   return (CUresult)cudaGetLastError();
 }
 
+// ============================================================================
+// Strided segment copy for DFlash batch K/V concatenation.
+//
+// Copies one segment (ctx or noise) of every request in a batch from a
+// contiguous source layout to a strided destination layout in a single
+// kernel launch, replacing batch_size memcpy_dtod calls per segment.
+//
+//   src: [batch_size * src_seg_len, dim]  row-major, contiguous
+//   dst: [batch_size * dst_seg_total, dim] row-major, request r occupies
+//       rows [r * dst_seg_total + dst_row_offset,
+//             r * dst_seg_total + dst_row_offset + src_seg_len)
+//
+// Each thread copies one bf16 element per grid-stride iteration. The total
+// work is batch_size * src_seg_len * dim elements, indexed with 32-bit int.
+// ============================================================================
+
+__global__ void strided_segment_copy_kernel(
+    const __nv_bfloat16 *__restrict__ src,
+    __nv_bfloat16 *__restrict__ dst,
+    int dim, int src_seg_len, int dst_seg_total, int dst_row_offset,
+    int batch_size) {
+  int total = batch_size * src_seg_len * dim;
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < total;
+       idx += gridDim.x * blockDim.x) {
+    int element = idx % dim;
+    int row_in_seg = (idx / dim) % src_seg_len;
+    int req = idx / (dim * src_seg_len);
+    int src_row = req * src_seg_len + row_in_seg;
+    int dst_row = req * dst_seg_total + dst_row_offset + row_in_seg;
+    dst[dst_row * dim + element] = src[src_row * dim + element];
+  }
+}
+
+CUresult strided_segment_copy_cuda(
+    const __nv_bfloat16 *src, __nv_bfloat16 *dst,
+    int dim, int src_seg_len, int dst_seg_total, int dst_row_offset,
+    int batch_size, cudaStream_t stream) {
+  int total = batch_size * src_seg_len * dim;
+  int block = 256;
+  // The kernel uses a grid-stride loop, so any grid size >= 1 is correct
+  // (a smaller grid only makes each thread loop more). Size the grid to the
+  // work so every element is covered in the first pass.
+  int grid = (total + block - 1) / block;
+  if (grid < 1) grid = 1;
+  strided_segment_copy_kernel<<<grid, block, 0, stream>>>(
+      src, dst, dim, src_seg_len, dst_seg_total, dst_row_offset, batch_size);
+  return (CUresult)cudaGetLastError();
+}
+
 } // extern "C"
diff --git a/openinfer-kernels/src/ffi/shared.rs b/openinfer-kernels/src/ffi/shared.rs
index 4f7f554c..cfbfb815 100644
--- a/openinfer-kernels/src/ffi/shared.rs
+++ b/openinfer-kernels/src/ffi/shared.rs
@@ -168,6 +168,21 @@ unsafe extern "C" {
         stream: CUstream,
     );
 
+    /// Strided segment copy for DFlash batch K/V concatenation. Copies one
+    /// segment (ctx or noise) of every request from a contiguous source to a
+    /// strided destination in a single launch and returns the launch status.
+    /// See `strided_segment_copy_cuda` in `csrc/shared/elementwise.cu`.
+    pub fn strided_segment_copy_cuda(
+        src: *const Half,
+        dst: *mut Half,
+        dim: i32,
+        src_seg_len: i32,
+        dst_seg_total: i32,
+        dst_row_offset: i32,
+        batch_size: i32,
+        stream: CUstream,
+    ) -> CUresult;
+
     pub fn cublas_init();
     pub fn cublas_destroy();
     pub fn cuda_set_device(device_ordinal: i32) -> i32;
diff --git a/openinfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs
index bb3d8778..111f6019 100644
--- a/openinfer-kernels/src/ops.rs
+++ b/openinfer-kernels/src/ops.rs
@@ -23,8 +23,7 @@ pub use deepep::{
     DeepEp, DeepEpDispatchScratch, DeepEpPrefillCounts, deepep_info, deepep_unique_id,
 };
 pub use dense_attention::{
-    RaggedPrefillPlan, batch_prefill_ragged_nhd_noncausal_into,
-    single_prefill_nhd_noncausal_into,
+    RaggedPrefillPlan, batch_prefill_ragged_nhd_noncausal_into, single_prefill_nhd_noncausal_into,
 };
 pub use elementwise::{
     accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, bf16_hidden_to_f32_into,
@@ -32,7 +31,7 @@ pub use elementwise::{
     gather_hidden_tokens_into, repeat_f32_for_reduce_scatter_into, scale_f32_in_place,
     scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
     scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into,
-    silu_mul_fused_batch_into, write_vec_into,
+    silu_mul_fused_batch_into, strided_segment_copy_into, write_vec_into,
 };
 pub use embedding::{embedding_batch, embedding_batch_vocab_shard, embedding_decode_into};
 #[cfg(feature = "kimi-k2")]
diff --git a/openinfer-kernels/src/ops/elementwise.rs b/openinfer-kernels/src/ops/elementwise.rs
index 1e651e97..e9a5465d 100644
--- a/openinfer-kernels/src/ops/elementwise.rs
+++ b/openinfer-kernels/src/ops/elementwise.rs
@@ -472,6 +472,49 @@ pub fn silu_mul_fused_batch_into(
     }
 }
 
+/// Strided segment copy for DFlash batch K/V concatenation.
+///
+/// Copies `src_seg_len` rows of every request from a contiguous source
+/// (`[batch_size * src_seg_len, dim]`) into a strided destination
+/// (`[batch_size * dst_seg_total, dim]`) at row `dst_row_offset` of each
+/// request's block. One launch replaces `batch_size` `memcpy_dtod` calls when
+/// building the ragged `[ctx | noise]` K/V layout from `k_ctx`/`k_noise`.
+///
+/// Panics on inconsistent shapes, or when the destination span exceeds
+/// `i32::MAX / 4` elements (headroom for the kernel's 32-bit index math).
+pub fn strided_segment_copy_into(
+    ctx: &DeviceContext,
+    src: &HiddenStates,
+    dst: &mut HiddenStates,
+    src_seg_len: usize,
+    dst_seg_total: usize,
+    dst_row_offset: usize,
+    batch_size: usize,
+) -> Result<()> {
+    let dim = src.hidden_dim;
+    assert_eq!(dst.hidden_dim, dim);
+    assert_eq!(src.seq_len, batch_size * src_seg_len);
+    assert!(dst_row_offset + src_seg_len <= dst_seg_total);
+    assert!(batch_size * dst_seg_total <= dst.seq_len);
+    assert!(batch_size * dst_seg_total * dim <= i32::MAX as usize / 4, "exceeds i32 indexing");
+    let (src_ptr, _g0) = src.data.device_ptr(&ctx.stream);
+    let (dst_ptr, _g1) = dst.data.device_ptr_mut(&ctx.stream);
+    let result = unsafe {
+        ffi::strided_segment_copy_cuda(
+            src_ptr as *const ffi::Half,
+            dst_ptr as *mut ffi::Half,
+            dim as i32,
+            src_seg_len as i32,
+            dst_seg_total as i32,
+            dst_row_offset as i32,
+            batch_size as i32,
+            ctx.stream.cu_stream(),
+        )
+    };
+    result.result()?;
+    Ok(())
+}
+
 /// Extract a single token's vector from a HiddenStates batch (GPU copy)
 pub fn extract_vec(
     ctx: &DeviceContext,
diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
index b67ad94e..3150ad6c 100644
--- a/openinfer-qwen3-4b-dflash/src/batch_forward.rs
+++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
@@ -263,23 +263,47 @@ impl DFlashDraftModel {
             config.rms_norm_eps,
         );
 
-        compact_kv(
+        // Concatenate per-request [ctx | noise] K/V into the contiguous layout
+        // the ragged attention kernel expects. Two strided segment copies per
+        // tensor (ctx segment at offset 0, noise segment at offset ctx_len)
+        // replace the old 2 * batch_size memcpy_dtod loop (`compact_kv`):
+        // bs=32 dropped from 128 launches/layer to 4.
+        let kv_seg_total = bufs.ctx_len + bufs.q_len;
+        ops::strided_segment_copy_into(
             ctx,
             &bufs.k_ctx,
-            &bufs.k_noise,
             &mut bufs.k_all,
-            batch_size,
             bufs.ctx_len,
+            kv_seg_total,
+            0,
+            batch_size,
+        )?;
+        ops::strided_segment_copy_into(
+            ctx,
+            &bufs.k_noise,
+            &mut bufs.k_all,
             bufs.q_len,
+            kv_seg_total,
+            bufs.ctx_len,
+            batch_size,
         )?;
-        compact_kv(
+        ops::strided_segment_copy_into(
             ctx,
             &bufs.v_ctx,
-            &bufs.v_noise,
             &mut bufs.v_all,
-            batch_size,
             bufs.ctx_len,
+            kv_seg_total,
+            0,
+            batch_size,
+        )?;
+        ops::strided_segment_copy_into(
+            ctx,
+            &bufs.v_noise,
+            &mut bufs.v_all,
             bufs.q_len,
+            kv_seg_total,
+            bufs.ctx_len,
+            batch_size,
         )?;
         bufs.prepare_ragged_plan(self, batch_size)?;
         let cached_plan = bufs.ragged_plan.take().expect("ragged plan exists");
@@ -398,39 +422,6 @@ fn compact_host_inputs(
     Ok(())
 }
 
-fn compact_kv(
-    ctx: &DeviceContext,
-    ctx_part: &HiddenStates,
-    noise_part: &HiddenStates,
-    out: &mut HiddenStates,
-    batch_size: usize,
-    ctx_len: usize,
-    q_len: usize,
-) -> Result<()> {
-    let dim = ctx_part.hidden_dim;
-    for i in 0..batch_size {
-        copy_hidden(
-            ctx,
-            ctx_part,
-            i * ctx_len,
-            out,
-            i * (ctx_len + q_len),
-            dim,
-            ctx_len,
-        )?;
-        copy_hidden(
-            ctx,
-            noise_part,
-            i * q_len,
-            out,
-            i * (ctx_len + q_len) + ctx_len,
-            dim,
-            q_len,
-        )?;
-    }
-    Ok(())
-}
-
 pub(crate) fn copy_hidden(
     ctx: &DeviceContext,
     src: &HiddenStates,

From efb6bfc7e2cb1f4d085415efad1801a3d6fb96fb Mon Sep 17 00:00:00 2001
From: kitty <kitty.eu.org@gmail.com>
Date: Mon, 22 Jun 2026 18:41:23 +0800
Subject: [PATCH 5/6] perf(qwen3-dflash): use K-only norm+RoPE for batch
 context-K projection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The batch path's context-hidden K projection needs RMSNorm + RoPE, but it has
no corresponding Q — the draft Q comes only from the noise tokens. The code
reused the joint qk_norm_rope kernel with a scratch Q buffer whose result was
immediately discarded. For Qwen3-4B's 16:4 GQA ratio that wasted 80% of the
kernel's work (num_q_heads of every num_q_heads + num_kv_heads blocks) on a
dead Q branch.

Add k_norm_rope_batched_decode_cuda: same per-head RMSNorm + RoPE logic but
launches only num_kv_heads blocks per token, restricted to the K tensor. Wire
it into the batch context-K path.

HF golden deltas unchanged (mean=0.034243, p99=0.125000, max=0.500000,
n=7680); batch-vs-single stays at mean=0.000000. 8 tests pass.

ctx_len=32 bs32: 9.50ms -> 9.26ms (+2.6%); benefit scales with ctx_len
since the dead-Q work grows with context length.
---
 openinfer-core/src/ops.rs                     |   4 +-
 .../csrc/shared/prefill_attention.cu          | 108 ++++++++++++++++++
 openinfer-kernels/src/ffi/shared.rs           |  19 +++
 openinfer-kernels/src/ops.rs                  |   7 +-
 openinfer-kernels/src/ops/attention.rs        |  44 +++++++
 .../src/batch_forward.rs                      |   9 +-
 6 files changed, 182 insertions(+), 9 deletions(-)

diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs
index 31f7eb4f..9d3e0628 100644
--- a/openinfer-core/src/ops.rs
+++ b/openinfer-core/src/ops.rs
@@ -19,8 +19,8 @@ pub use openinfer_kernels::ops::{
     batch_prefill_ragged_nhd_noncausal_into, bf16_hidden_to_f32_into, embedding_decode_into,
     extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into,
     fused_add_rms_norm_into, gather_hidden_tokens_into, gemm, gemm_into_checked, gemm_lt_tune,
-    gemm_per_token, gemv, linear, lora_decode_fused_delta_group3_into,
-    lora_decode_fused_delta_into, pack_lora_b_rows_into,
+    gemm_per_token, gemv, k_norm_rope_batch_decode_into, linear,
+    lora_decode_fused_delta_group3_into, lora_decode_fused_delta_into, pack_lora_b_rows_into,
     qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into,
     rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place,
     scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
diff --git a/openinfer-kernels/csrc/shared/prefill_attention.cu b/openinfer-kernels/csrc/shared/prefill_attention.cu
index a7b24b66..086883d8 100644
--- a/openinfer-kernels/csrc/shared/prefill_attention.cu
+++ b/openinfer-kernels/csrc/shared/prefill_attention.cu
@@ -136,4 +136,123 @@ void qk_norm_rope_batched_decode_cuda(
     );
 }
 
+// ============================================================================
+// K-only norm + RoPE variant for the DFlash batch path.
+//
+// The context-hidden K projection needs RMSNorm + RoPE, but there is no
+// corresponding Q (the draft Q comes only from the noise tokens). Calling the
+// joint QK kernel on the context K would waste num_q_heads / (num_q_heads +
+// num_kv_heads) of the GPU work — 80% for Qwen3-4B's 16:4 GQA ratio — on a Q
+// buffer whose result is immediately discarded. This variant launches only
+// num_kv_heads blocks per token.
+//
+// It reuses the same in-place per-head RMSNorm + RoPE logic as the joint
+// kernel, restricted to the K tensor.
+//
+// Launch contract: grid = (seq_len, num_kv_heads), block = head_dim threads.
+// Tokens ride on grid.x because the token count can reach batch * ctx_len on
+// the context-K path, while grid.y is capped at 65535 blocks. head_dim must
+// equal HEAD_DIM and span exactly 4 warps (see warp_sums / smem below).
+// ============================================================================
+
+__global__ void k_norm_rope_kernel(
+    __nv_bfloat16* __restrict__ k,        // seq_len rows of kv_dim values, modified in-place
+    const __nv_bfloat16* __restrict__ k_norm_weight,  // [head_dim]
+    const __nv_bfloat16* __restrict__ cos_cache,      // [max_pos * head_dim]
+    const __nv_bfloat16* __restrict__ sin_cache,
+    int num_kv_heads, int head_dim,
+    int seq_len, int kv_dim,
+    const int* start_pos_d,  // if non-null, overrides start_pos per token
+    float eps,
+    int cos_max_pos
+) {
+    // One block per (token, head); one thread per element of the head.
+    int token = blockIdx.x;
+    int head_local = blockIdx.y;
+    int d = threadIdx.x;
+
+    int offset = head_local * head_dim + d + token * kv_dim;
+    float val = __bfloat162float(k[offset]);
+
+    // RMSNorm: sum of squares via warp reduction
+    float sq = val * val;
+    sq = warp_reduce_sum(sq);
+
+    int warp_id = d / WARP_SIZE;
+    int lane_id = d % WARP_SIZE;
+    __shared__ float warp_sums[4];  // head_dim/32 = 4 warps
+    if (lane_id == 0) warp_sums[warp_id] = sq;
+    __syncthreads();
+
+    __shared__ float s_inv_rms;
+    {
+        float v = (lane_id < 4) ? warp_sums[lane_id] : 0.0f;
+        float total = warp_reduce_sum(v);
+        if (lane_id == 0) s_inv_rms = rsqrtf(total / head_dim + eps);
+    }
+    __syncthreads();
+
+    __nv_bfloat16 normed = __float2bfloat16(val * s_inv_rms);
+    float normed_f = __bfloat162float(normed) * __bfloat162float(k_norm_weight[d]);
+
+    __shared__ __nv_bfloat16 smem[HEAD_DIM];
+    smem[d] = __float2bfloat16(normed_f);
+    __syncthreads();
+
+    int half = head_dim / 2;
+    int pos = start_pos_d ? __ldg(start_pos_d + token) : token;
+    if (pos < 0 || pos >= cos_max_pos) __trap();
+
+    __nv_bfloat16 result;
+    if (d < half) {
+        float lo = __bfloat162float(smem[d]);
+        float hi = __bfloat162float(smem[d + half]);
+        float c = __bfloat162float(cos_cache[pos * head_dim + d]);
+        float s = __bfloat162float(sin_cache[pos * head_dim + d]);
+        float lo_cos = __bfloat162float(__float2bfloat16(lo * c));
+        float hi_sin = __bfloat162float(__float2bfloat16(hi * s));
+        result = __float2bfloat16(lo_cos - hi_sin);
+    } else {
+        int pair_d = d - half;
+        float lo = __bfloat162float(smem[pair_d]);
+        float hi = __bfloat162float(smem[d]);
+        float c = __bfloat162float(cos_cache[pos * head_dim + pair_d]);
+        float s = __bfloat162float(sin_cache[pos * head_dim + pair_d]);
+        float lo_sin = __bfloat162float(__float2bfloat16(lo * s));
+        float hi_cos = __bfloat162float(__float2bfloat16(hi * c));
+        result = __float2bfloat16(lo_sin + hi_cos);
+    }
+
+    k[offset] = result;
+}
+
+void k_norm_rope_batched_decode_cuda(
+    __nv_bfloat16* k,                    // [kv_dim * batch_size] in-place
+    const __nv_bfloat16* k_norm_weight,
+    const __nv_bfloat16* cos_cache,
+    const __nv_bfloat16* sin_cache,
+    const int* positions,                // [batch_size] one RoPE position per token row
+    int num_kv_heads,
+    int head_dim,
+    int batch_size,
+    float rms_eps,
+    int cos_max_pos,
+    cudaStream_t stream
+) {
+    // Nothing to process; also avoids launching with an empty grid.
+    if (batch_size <= 0 || num_kv_heads <= 0) return;
+
+    int kv_dim = num_kv_heads * head_dim;
+    // Tokens go on grid.x: batch_size counts token rows and may exceed the
+    // 65535-block limit of grid.y.
+    dim3 grid(batch_size, num_kv_heads);
+    k_norm_rope_kernel<<<grid, head_dim, 0, stream>>>(
+        k, k_norm_weight, cos_cache, sin_cache,
+        num_kv_heads, head_dim,
+        /*seq_len=*/batch_size, kv_dim,
+        /*start_pos_d=*/positions,
+        rms_eps, cos_max_pos
+    );
+}
+
 } // extern "C"
diff --git a/openinfer-kernels/src/ffi/shared.rs b/openinfer-kernels/src/ffi/shared.rs
index cfbfb815..66f118df 100644
--- a/openinfer-kernels/src/ffi/shared.rs
+++ b/openinfer-kernels/src/ffi/shared.rs
@@ -246,6 +246,25 @@ unsafe extern "C" {
         stream: CUstream,
     );
 
+    /// K-only RMSNorm + RoPE, applied in place to `k` for the DFlash batch
+    /// context-K path. Same per-head math as `qk_norm_rope_batched_decode_cuda`
+    /// but launches only `num_kv_heads` blocks per token — the draft path has
+    /// no context Q, so the joint kernel wastes the Q work. `batch_size` counts
+    /// the token rows of `k`; see `csrc/shared/prefill_attention.cu`.
+    pub fn k_norm_rope_batched_decode_cuda(
+        k: *mut Half,
+        k_norm_weight: *const Half,
+        cos_cache: *const Half,
+        sin_cache: *const Half,
+        positions: *const i32,
+        num_kv_heads: i32,
+        head_dim: i32,
+        batch_size: i32,
+        rms_eps: f32,
+        cos_max_pos: i32,
+        stream: CUstream,
+    );
+
     // Scatter contiguous KV → paged layout (one layer, FlashInfer prefill append).
     pub fn paged_kv_scatter_cuda(
         kv_data: *const Half,
diff --git a/openinfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs
index 111f6019..135afd42 100644
--- a/openinfer-kernels/src/ops.rs
+++ b/openinfer-kernels/src/ops.rs
@@ -14,9 +14,10 @@ mod norm;
 mod sampling;
 
 pub use attention::{
-    PrefillPagedPlan, paged_attention_batch_decode_hd256_into, paged_attention_batch_decode_into,
-    paged_attention_batch_decode_split_kv_into, prefill_attention_paged_into,
-    qk_norm_partial_rope_batched_decode_hd256_into, qk_norm_rope_batch_decode_into,
+    PrefillPagedPlan, k_norm_rope_batch_decode_into, paged_attention_batch_decode_hd256_into,
+    paged_attention_batch_decode_into, paged_attention_batch_decode_split_kv_into,
+    prefill_attention_paged_into, qk_norm_partial_rope_batched_decode_hd256_into,
+    qk_norm_rope_batch_decode_into,
 };
 #[cfg(feature = "kimi-k2")]
 pub use deepep::{
diff --git a/openinfer-kernels/src/ops/attention.rs b/openinfer-kernels/src/ops/attention.rs
index e3ae955a..3351e8d1 100644
--- a/openinfer-kernels/src/ops/attention.rs
+++ b/openinfer-kernels/src/ops/attention.rs
@@ -497,6 +497,80 @@ pub fn qk_norm_rope_batch_decode_into(
     }
 }
 
+/// K-only norm + RoPE for the DFlash batch context-K path.
+///
+/// Applies in-place RMSNorm + RoPE to `k` only — the draft path's context K
+/// projection has no corresponding Q, so the joint `qk_norm_rope` kernel would
+/// waste `num_q_heads / (num_q_heads + num_kv_heads)` of its work on a Q buffer
+/// whose result is discarded (80% for Qwen3-4B's 16:4 GQA). This variant
+/// launches only `num_kv_heads` blocks per token.
+///
+/// `k` is processed as `k.seq_len` token rows of `num_kv_heads * head_dim`
+/// values, and `positions_d` supplies one RoPE position per row. An empty `k`
+/// returns without launching the kernel.
+///
+/// # Panics
+///
+/// Panics if `head_dim != 128` (the CUDA kernel hard-codes four warps per
+/// head), if `k.hidden_dim != num_kv_heads * head_dim`, if `k_norm_weight`,
+/// `sin_cache` or `positions_d` is too short, or if a size does not fit `i32`.
+// The argument list mirrors the FFI entry point.
+#[allow(clippy::too_many_arguments)]
+pub fn k_norm_rope_batch_decode_into(
+    ctx: &DeviceContext,
+    k: &mut HiddenStates,
+    k_norm_weight: &DeviceVec,
+    cos_cache: &DeviceVec,
+    sin_cache: &DeviceVec,
+    positions_d: &CudaSlice<i32>,
+    num_kv_heads: usize,
+    head_dim: usize,
+    rms_eps: f32,
+) {
+    let batch_size = k.seq_len;
+
+    // The kernel reduces exactly four warp sums with one thread per element, so
+    // any other head size would read uninitialized or out-of-bounds shared memory.
+    assert_eq!(head_dim, 128, "k_norm_rope kernel requires head_dim == 128");
+    assert_eq!(k.hidden_dim, num_kv_heads * head_dim);
+    assert!(k_norm_weight.data.len() >= head_dim);
+    // `cos_max_pos` below is derived from `cos_cache` and bounds both tables.
+    assert!(sin_cache.data.len() >= cos_cache.data.len());
+    assert!(positions_d.len() >= batch_size);
+    if batch_size == 0 {
+        return;
+    }
+
+    let to_i32 = |value: usize| i32::try_from(value).expect("kernel argument exceeds i32");
+    let cos_max_pos = to_i32(cos_cache.data.len() / head_dim);
+
+    let (k_ptr, _gk) = k.data.device_ptr_mut(&ctx.stream);
+    let (kn_ptr, _gkn) = k_norm_weight.data.device_ptr(&ctx.stream);
+    let (cos_ptr, _gc) = cos_cache.data.device_ptr(&ctx.stream);
+    let (sin_ptr, _gs) = sin_cache.data.device_ptr(&ctx.stream);
+    let (pos_ptr, _gp) = positions_d.device_ptr(&ctx.stream);
+
+    // SAFETY: every pointer comes from a live device buffer whose guard outlives
+    // the call. The asserts above cover what the kernel indexes: `head_dim`
+    // norm weights, both RoPE tables up to `cos_max_pos`, one position per row,
+    // and `k` rows of `k.hidden_dim` values (assuming `k.data` holds them all).
+    unsafe {
+        ffi::k_norm_rope_batched_decode_cuda(
+            k_ptr as *mut ffi::Half,
+            kn_ptr as *const ffi::Half,
+            cos_ptr as *const ffi::Half,
+            sin_ptr as *const ffi::Half,
+            pos_ptr as *const i32,
+            to_i32(num_kv_heads),
+            to_i32(head_dim),
+            to_i32(batch_size),
+            rms_eps,
+            cos_max_pos,
+            ctx.stream.cu_stream(),
+        );
+    }
+}
+
 /// Batched QK RMSNorm + partial RoPE for Qwen3.5 HD256 decode.
 ///
 /// Reads Q from interleaved `q_full` ([q, gate] per head), writes prepared Q into `q`,
diff --git a/openinfer-qwen3-4b-dflash/src/batch_forward.rs b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
index 3150ad6c..3ec0d96f 100644
--- a/openinfer-qwen3-4b-dflash/src/batch_forward.rs
+++ b/openinfer-qwen3-4b-dflash/src/batch_forward.rs
@@ -248,16 +248,17 @@ impl DFlashDraftModel {
             &bufs.target_normed,
             &mut bufs.v_ctx,
         )?;
-        ops::qk_norm_rope_batch_decode_into(
+        // Context-K needs norm + RoPE but has no corresponding Q. The K-only
+        // kernel launches num_kv_heads blocks per token instead of
+        // num_q_heads + num_kv_heads, dropping 80% of the joint kernel's work
+        // (the dead Q branch) for Qwen3-4B's 16:4 GQA ratio.
+        ops::k_norm_rope_batch_decode_into(
             ctx,
-            &mut bufs.q_ctx_scratch,
             &mut bufs.k_ctx,
-            &layer.attention.q_norm,
             &layer.attention.k_norm,
             &self.cos_cache,
             &self.sin_cache,
             &bufs.positions_ctx,
-            config.num_attention_heads,
             config.num_key_value_heads,
             config.head_dim,
             config.rms_norm_eps,

From 3d98d55b1dfe5bbd40b90bbcb8215dfb7dc6b7d4 Mon Sep 17 00:00:00 2001
From: kitty <kitty.eu.org@gmail.com>
Date: Tue, 23 Jun 2026 12:26:41 +0800
Subject: [PATCH 6/6] fix(qwen3-dflash): forward ctx-len/q-len to the forward
 bench binary

The Python forward bench script generated its fixture with the caller's
--ctx-len/--q-len but launched the Rust runner without forwarding them, so
the runner kept its defaults (2/16) and rejected the fixture shape for any
non-default dimension.

Pass --ctx-len/--q-len through to the runner, and make the Rust fixture path
derive ctx_len/q_len from the fixture's actual tensor shapes so the two sides
agree regardless of which flags the caller repeats.
---
 .../src/bin/qwen3_dflash_forward_bench.rs     | 23 ++++++++++++-------
 .../accuracy/bench_qwen3_4b_dflash_forward.py |  4 ++++
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs
index cb5bd0c9..dac9f374 100644
--- a/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs
+++ b/openinfer-qwen3-4b-dflash/src/bin/qwen3_dflash_forward_bench.rs
@@ -18,18 +18,25 @@ fn main() -> Result<()> {
         let bytes = std::fs::read(fixture)
             .with_context(|| format!("failed to read fixture {}", fixture.display()))?;
         let st = SafeTensors::deserialize(&bytes).context("parse fixture")?;
-        let noise = read_bf16(&st, "noise_embedding", &[1, args.q_len, config.hidden_size])?;
+        // Derive ctx_len/q_len from the fixture's actual tensor shapes so the
+        // bench works for any --ctx-len/--q-len the Python side used, rather
+        // than requiring the caller to repeat them on both sides.
+        let noise_view = st
+            .tensor("noise_embedding")
+            .context("missing tensor noise_embedding")?;
+        let q_len = *noise_view.shape().get(1).context("noise_embedding has no dim 1")?;
+        let target_view = st
+            .tensor("target_hidden")
+            .context("missing tensor target_hidden")?;
+        let ctx_len = *target_view.shape().get(1).context("target_hidden has no dim 1")?;
+        let noise = read_bf16(&st, "noise_embedding", &[1, q_len, config.hidden_size])?;
         let target_hidden = read_bf16(
             &st,
             "target_hidden",
-            &[
-                1,
-                args.ctx_len,
-                config.hidden_size * config.target_layer_count(),
-            ],
+            &[1, ctx_len, config.hidden_size * config.target_layer_count()],
         )?;
-        let positions = read_i32(&st, "position_ids", &[1, args.ctx_len + args.q_len])?;
-        (noise, target_hidden, positions, args.ctx_len, args.q_len)
+        let positions = read_i32(&st, "position_ids", &[1, ctx_len + q_len])?;
+        (noise, target_hidden, positions, ctx_len, q_len)
     } else {
         let noise = deterministic_bf16(args.q_len * config.hidden_size, 0xD4A5_4B16);
         let target_hidden = deterministic_bf16(
diff --git a/tools/accuracy/bench_qwen3_4b_dflash_forward.py b/tools/accuracy/bench_qwen3_4b_dflash_forward.py
index fb232e4c..34fe05b2 100644
--- a/tools/accuracy/bench_qwen3_4b_dflash_forward.py
+++ b/tools/accuracy/bench_qwen3_4b_dflash_forward.py
@@ -129,6 +129,10 @@ def main() -> int:
             str(fixture_path),
             "--device",
             str(args.device),
+            "--ctx-len",
+            str(args.ctx_len),
+            "--q-len",
+            str(args.q_len),
             "--warmup",
             str(args.warmup),
             "--iters",