openinfer-project · kitty-eu-org · Jun 18, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -16,6 +16,7 @@ members = [
     "openinfer-deepseek-v2-lite",
     "openinfer-kimi-k2",
     "openinfer-qwen3-4b",
+    "openinfer-qwen3-4b-dflash",
     "openinfer-qwen35-4b",
     "openinfer-sample",
     "openinfer-kv-cache",
@@ -129,6 +130,7 @@ openinfer-engine = { path = "openinfer-engine" }
 openinfer-kernels = { path = "openinfer-kernels" }
 openinfer-kimi-k2 = { path = "openinfer-kimi-k2" }
 openinfer-qwen3-4b = { path = "openinfer-qwen3-4b" }
+openinfer-qwen3-4b-dflash = { path = "openinfer-qwen3-4b-dflash" }
 openinfer-qwen35-4b = { path = "openinfer-qwen35-4b" }
 openinfer-sample = { path = "openinfer-sample" }
 openinfer-deepseek-v2-lite = { path = "openinfer-deepseek-v2-lite" }

diff --git a/docs/index.md b/docs/index.md
@@ -29,6 +29,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l
 | `models/qwen3/green-ctx-sm-partition.md` | Green Context SM partition (`OPENINFER_SM_PARTITION=20`) runs prefill/decode on disjoint SMs so decode stops stalling behind co-scheduled prefill: 5090 mid-band ITL p99 ~halved, TPOT down (−22% @QPS12), but TTFT 2–4× worse (prefill deferred + fewer SMs) — a TTFT↔ITL/TPOT trade, not a free win. Two-graph change (decode CUDA graph captured on the green decode stream) adds ~5% ITL p99 / 1–4% TPOT on top. Mechanism, A/B table, Xid-31/gemm_lt pitfalls. |
 | `models/qwen3/roadmap.md` | Qwen3-4B roadmap (2026-06 review): line is the maturity bar; #220 RoPE OOB, batched greedy sampling (#307), mixed greedy/non-greedy sampling (#284), and pegaflow KV offload (#316) are landed; open set is zero TP coverage, zero-adapter-only LoRA gate, dropped prefix-cache observability, stale docs, and YaRN #8 follow-up. |
 | `models/qwen3/model-crate.md` | `openinfer-qwen3-4b` owns Qwen3 config/weights/executor/scheduler/tests/kernel plan; root sees generic `EngineHandle`; split-K retuned to `256/64`, with 4k/64 serving TPOT p50 at `6.46ms` on RTX 5090. |
+| `models/qwen3/dflash.md` | `openinfer-qwen3-4b-dflash` supports only `z-lab/Qwen3-4B-DFlash-b16`: standalone model config/weights/forward plus transformers remote-code parity, with no generic DFlash framework or Qwen3 server/controller changes in this task. |
 | `models/qwen3/prefix-cache.md` | Prefix caching on by default for Qwen3-4B: full-block kvbm radix matching at the executor, suffix-only prefill. Repeated ~1900-token prompt TTFT 141.8 → 16.3ms p50 (8.7×); warm TTFT ≈ TPOT + ~5ms setup. Includes the RoPE scalar-path corruption fix and the drain-the-stream TTFT measurement pitfall. |
 | `models/qwen3/accuracy-gate.md` | Qwen3-4B instance of the logits golden gate (`tests/hf_golden_gate.rs`): 48 teacher-forced sequences / 816 positions vs a stored HF bf16 golden, replayed over bs=1 / batched eager / CUDA-graph. Strict guards: regret check + mean ≤ 0.06 + p99 ≤ 0.20; absolute max printed but not asserted (coverage-unstable). Methodology in `subsystems/correctness/`. |
 | `models/qwen3/kernels-crate.md` | Phase 1 split implemented and 5090-verified: Qwen3-4B kernel surface lives in `openinfer-kernels`; release build, test-target compile, accuracy gate, and bench snapshot pass. |

diff --git a/docs/models/qwen3/dflash.md b/docs/models/qwen3/dflash.md
diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs
@@ -13,17 +13,19 @@ pub use attention::{
     paged_attention_batch_decode_split_kv_into, prefill_attention_paged_into,
 };
 pub use openinfer_kernels::ops::{
-    GEMM_LT_MAX_N, LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into,
-    add_batch, add_batch_into, argmax, argmax_batch_bf16_into, bf16_hidden_to_f32_into,
+    GEMM_LT_MAX_N, LoraDecodeGroupedProjection, RaggedPrefillPlan,
+    accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, argmax,
+    argmax_batch_bf16_into, batch_prefill_ragged_nhd_noncausal_into, bf16_hidden_to_f32_into,
     embedding_decode_into, extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into,
     f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm,
     gemm_graphsafe_into_checked, gemm_graphsafe_ref_into_checked, gemm_into_checked, gemm_lt_tune,
-    gemm_per_token, gemv, linear, lora_decode_fused_delta_group3_into,
-    lora_decode_fused_delta_into, pack_lora_b_rows_into,
+    gemm_per_token, gemv, k_norm_rope_batch_decode_into, linear,
+    lora_decode_fused_delta_group3_into, lora_decode_fused_delta_into, pack_lora_b_rows_into,
     qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into,
     rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place,
     scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
-    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, write_vec_into,
+    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into,
+    single_prefill_nhd_noncausal_into, strided_segment_copy_into, write_vec_into,
 };
 #[cfg(not(feature = "kernel-call-trace"))]
 pub use openinfer_kernels::ops::{

diff --git a/openinfer-kernels/csrc/shared/elementwise.cu b/openinfer-kernels/csrc/shared/elementwise.cu
@@ -427,4 +427,54 @@ CUresult embedding_batched_vocab_shard_cuda(
   return (CUresult)cudaGetLastError();
 }
 
+// ============================================================================
+// Strided segment copy for DFlash batch K/V concatenation.
+//
+// Copies one segment (ctx or noise) of every request in a batch from a
+// contiguous source layout to a strided destination layout in a single
+// kernel launch, replacing 2 * batch_size memcpy_dtod calls per K/V tensor.
+//
+//   src: [batch_size * src_seg_len, dim]  row-major, contiguous
+//   dst: [batch_size * dst_seg_total, dim] row-major, request r occupies
+//       rows [r * dst_seg_total + dst_row_offset,
+//             r * dst_seg_total + dst_row_offset + src_seg_len)
+//
+// Each thread copies one bf16 element. The total work is
+// batch_size * src_seg_len * dim.
+// ============================================================================
+
+__global__ void strided_segment_copy_kernel(
+    const __nv_bfloat16 *__restrict__ src,
+    __nv_bfloat16 *__restrict__ dst,
+    int dim, int src_seg_len, int dst_seg_total, int dst_row_offset,
+    int batch_size) {
+  int total = batch_size * src_seg_len * dim;
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < total;
+       idx += gridDim.x * blockDim.x) {
+    int element = idx % dim;
+    int row_in_seg = (idx / dim) % src_seg_len;
+    int req = idx / (dim * src_seg_len);
+    int src_row = req * src_seg_len + row_in_seg;
+    int dst_row = req * dst_seg_total + dst_row_offset + row_in_seg;
+    dst[dst_row * dim + element] = src[src_row * dim + element];
+  }
+}
+
+CUresult strided_segment_copy_cuda(
+    const __nv_bfloat16 *src, __nv_bfloat16 *dst,
+    int dim, int src_seg_len, int dst_seg_total, int dst_row_offset,
+    int batch_size, cudaStream_t stream) {
+  int total = batch_size * src_seg_len * dim;
+  int block = 256;
+  // The kernel uses a grid-stride loop, so any grid size >= 1 is correct.
+  // Size the grid to the work so every element is covered in the first pass
+  // (no upper cap — a cap would silently drop elements for large copies).
+  int grid = (total + block - 1) / block;
+  if (grid < 1) grid = 1;
+  strided_segment_copy_kernel<<<grid, block, 0, stream>>>(
+      src, dst, dim, src_seg_len, dst_seg_total, dst_row_offset, batch_size);
+  return (CUresult)cudaGetLastError();
+}
+
 } // extern "C"
diff --git a/openinfer-kernels/csrc/shared/paged_attention.cu b/openinfer-kernels/csrc/shared/paged_attention.cu
@@ -22,6 +22,7 @@ using namespace flashinfer;
 using DType  = __nv_bfloat16;
 using IdType = int32_t;
 using ParamsT = BatchDecodeParams<DType, DType, DType, IdType>;
+using BatchPrefillRaggedParamsT = BatchPrefillRaggedParams<DType, DType, DType, IdType>;
 using Variant = DefaultAttention</*custom_mask=*/false,
                                  /*sliding_window=*/false,
                                  /*logits_soft_cap=*/false,
@@ -607,6 +608,157 @@ int single_prefill_cuda(
             reinterpret_cast<cudaStream_t>(stream)));
 }
 
+// ---------------------------------------------------------------------------
+// Single-request non-causal prefill over contiguous NHD K/V.
+//
+// DFlash draft attention materializes K/V as token-major HiddenStates:
+//   q: [q_len, num_qo_heads, head_dim]
+//   k/v: [kv_len, num_kv_heads, head_dim]
+// This wrapper mirrors vLLM's non-causal FlashAttention/FlashInfer semantics:
+// no causal mask, no sliding window, and GQA handled by FlashInfer.
+// ---------------------------------------------------------------------------
+int single_prefill_nhd_noncausal_cuda(
+    void*    q,
+    void*    output,
+    void*    k,
+    void*    v,
+    int32_t  num_qo_heads,
+    int32_t  num_kv_heads,
+    int32_t  head_dim,
+    int32_t  q_len,
+    int32_t  kv_len,
+    float    sm_scale,
+    void*    stream)
+{
+    uint32_t q_stride_n = num_qo_heads * head_dim;
+    uint32_t q_stride_h = head_dim;
+    uint32_t kv_stride_n = num_kv_heads * head_dim;
+    uint32_t kv_stride_h = head_dim;
+
+    PrefillParamsT params(
+        reinterpret_cast<DType*>(q),
+        reinterpret_cast<DType*>(k),
+        reinterpret_cast<DType*>(v),
+        /*maybe_custom_mask=*/nullptr,
+        reinterpret_cast<DType*>(output),
+        /*lse=*/nullptr,
+        /*maybe_alibi_slopes=*/nullptr,
+        num_qo_heads,
+        num_kv_heads,
+        static_cast<uint32_t>(q_len),
+        static_cast<uint32_t>(kv_len),
+        q_stride_n,
+        q_stride_h,
+        kv_stride_n,
+        kv_stride_h,
+        static_cast<uint32_t>(head_dim),
+        /*window_left=*/-1,
+        /*logits_soft_cap=*/0.0f,
+        sm_scale,
+        /*rope_scale=*/1.0f,
+        /*rope_theta=*/1e6f);
+
+    return static_cast<int>(
+        SinglePrefillWithKVCacheDispatched<
+            /*HEAD_DIM_QK=*/128,
+            /*HEAD_DIM_VO=*/128,
+            PosEncodingMode::kNone,
+            /*USE_FP16_QK_REDUCTION=*/false,
+            MaskMode::kNone,
+            Variant,
+            PrefillParamsT>(
+            params,
+            /*tmp=*/nullptr,
+            reinterpret_cast<cudaStream_t>(stream)));
+}
+
+// ---------------------------------------------------------------------------
+// Batched non-causal prefill over compact ragged NHD K/V.
+//
+// DFlash groups exact-shape draft requests into compact token-major tensors:
+//   q: [sum(q_len), num_qo_heads, head_dim]
+//   k/v: [sum(kv_len), num_kv_heads, head_dim]
+// with q_indptr/kv_indptr separating requests. This maps directly to
+// FlashInfer BatchPrefillWithRaggedKVCache with MaskMode::kNone.
+// ---------------------------------------------------------------------------
+int batch_prefill_ragged_nhd_noncausal_cuda(
+    void*    q,
+    void*    output,
+    void*    k,
+    void*    v,
+    int32_t* q_indptr,
+    int32_t* kv_indptr,
+    int32_t* request_indices,
+    int32_t* qo_tile_indices,
+    int32_t* kv_tile_indices,
+    int32_t* kv_chunk_size_ptr,
+    uint32_t* total_num_rows,
+    int32_t  num_qo_heads,
+    int32_t  num_kv_heads,
+    int32_t  head_dim,
+    int32_t  total_q_len,
+    int32_t  batch_size,
+    int32_t  padded_batch_size,
+    float    sm_scale,
+    void*    stream)
+{
+    uint32_t q_stride_n = num_qo_heads * head_dim;
+    uint32_t q_stride_h = head_dim;
+    uint32_t kv_stride_n = num_kv_heads * head_dim;
+    uint32_t kv_stride_h = head_dim;
+
+    BatchPrefillRaggedParamsT params(
+        reinterpret_cast<DType*>(q),
+        reinterpret_cast<DType*>(k),
+        reinterpret_cast<DType*>(v),
+        /*maybe_custom_mask=*/nullptr,
+        q_indptr,
+        kv_indptr,
+        /*maybe_mask_indptr=*/nullptr,
+        /*maybe_q_rope_offset=*/nullptr,
+        /*maybe_k_rope_offset=*/nullptr,
+        reinterpret_cast<DType*>(output),
+        /*lse=*/nullptr,
+        /*maybe_alibi_slopes=*/nullptr,
+        num_qo_heads,
+        num_kv_heads,
+        q_stride_n,
+        q_stride_h,
+        kv_stride_n,
+        kv_stride_h,
+        /*window_left=*/-1,
+        /*logits_soft_cap=*/0.0f,
+        sm_scale,
+        /*rope_scale=*/1.0f,
+        /*rope_theta=*/1e6f);
+
+    params.request_indices = request_indices;
+    params.qo_tile_indices = qo_tile_indices;
+    params.kv_tile_indices = kv_tile_indices;
+    params.o_indptr = q_indptr;
+    params.kv_chunk_size_ptr = kv_chunk_size_ptr;
+    params.total_num_rows = total_num_rows;
+    params.max_total_num_rows = static_cast<uint32_t>(total_q_len);
+    params.padded_batch_size = static_cast<uint32_t>(padded_batch_size);
+    params.partition_kv = false;
+
+    return static_cast<int>(
+        BatchPrefillWithRaggedKVCacheDispatched<
+            /*CTA_TILE_Q=*/16,
+            /*HEAD_DIM_QK=*/128,
+            /*HEAD_DIM_VO=*/128,
+            PosEncodingMode::kNone,
+            /*USE_FP16_QK_REDUCTION=*/false,
+            MaskMode::kNone,
+            Variant,
+            BatchPrefillRaggedParamsT>(
+            params,
+            /*tmp_v=*/nullptr,
+            /*tmp_s=*/nullptr,
+            /*enable_pdl=*/false,
+            reinterpret_cast<cudaStream_t>(stream)));
+}
+
 // ---------------------------------------------------------------------------
 // Single-request prefill for HEAD_DIM=256 — wraps FlashInfer SinglePrefillWithKVCache.
 //