Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ members = [
"openinfer-deepseek-v2-lite",
"openinfer-kimi-k2",
"openinfer-qwen3-4b",
"openinfer-qwen3-4b-dflash",
"openinfer-qwen35-4b",
"openinfer-sample",
"openinfer-kv-cache",
Expand Down Expand Up @@ -129,6 +130,7 @@ openinfer-engine = { path = "openinfer-engine" }
openinfer-kernels = { path = "openinfer-kernels" }
openinfer-kimi-k2 = { path = "openinfer-kimi-k2" }
openinfer-qwen3-4b = { path = "openinfer-qwen3-4b" }
openinfer-qwen3-4b-dflash = { path = "openinfer-qwen3-4b-dflash" }
openinfer-qwen35-4b = { path = "openinfer-qwen35-4b" }
openinfer-sample = { path = "openinfer-sample" }
openinfer-deepseek-v2-lite = { path = "openinfer-deepseek-v2-lite" }
Expand Down
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l
| `models/qwen3/green-ctx-sm-partition.md` | Green Context SM partition (`OPENINFER_SM_PARTITION=20`) runs prefill/decode on disjoint SMs so decode stops stalling behind co-scheduled prefill: 5090 mid-band ITL p99 ~halved, TPOT down (−22% @QPS12), but TTFT 2–4× worse (prefill deferred + fewer SMs) — a TTFT↔ITL/TPOT trade, not a free win. Two-graph change (decode CUDA graph captured on the green decode stream) adds ~5% ITL p99 / 1–4% TPOT on top. Mechanism, A/B table, Xid-31/gemm_lt pitfalls. |
| `models/qwen3/roadmap.md` | Qwen3-4B roadmap (2026-06 review): line is the maturity bar; #220 RoPE OOB, batched greedy sampling (#307), mixed greedy/non-greedy sampling (#284), and pegaflow KV offload (#316) are landed; open set is zero TP coverage, zero-adapter-only LoRA gate, dropped prefix-cache observability, stale docs, and YaRN #8 follow-up. |
| `models/qwen3/model-crate.md` | `openinfer-qwen3-4b` owns Qwen3 config/weights/executor/scheduler/tests/kernel plan; root sees generic `EngineHandle`; split-K retuned to `256/64`, with 4k/64 serving TPOT p50 at `6.46ms` on RTX 5090. |
| `models/qwen3/dflash.md` | `openinfer-qwen3-4b-dflash` supports only `z-lab/Qwen3-4B-DFlash-b16`: standalone model config/weights/forward plus transformers remote-code parity, with no generic DFlash framework or Qwen3 server/controller changes in this task. |
| `models/qwen3/prefix-cache.md` | Prefix caching on by default for Qwen3-4B: full-block kvbm radix matching at the executor, suffix-only prefill. Repeated ~1900-token prompt TTFT 141.8 → 16.3ms p50 (8.7×); warm TTFT ≈ TPOT + ~5ms setup. Includes the RoPE scalar-path corruption fix and the drain-the-stream TTFT measurement pitfall. |
| `models/qwen3/accuracy-gate.md` | Qwen3-4B instance of the logits golden gate (`tests/hf_golden_gate.rs`): 48 teacher-forced sequences / 816 positions vs a stored HF bf16 golden, replayed over bs=1 / batched eager / CUDA-graph. Strict guards: regret check + mean ≤ 0.06 + p99 ≤ 0.20; absolute max printed but not asserted (coverage-unstable). Methodology in `subsystems/correctness/`. |
| `models/qwen3/kernels-crate.md` | Phase 1 split implemented and 5090-verified: Qwen3-4B kernel surface lives in `openinfer-kernels`; release build, test-target compile, accuracy gate, and bench snapshot pass. |
Expand Down
448 changes: 448 additions & 0 deletions docs/models/qwen3/dflash.md

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions openinfer-core/src/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,19 @@ pub use attention::{
paged_attention_batch_decode_split_kv_into, prefill_attention_paged_into,
};
pub use openinfer_kernels::ops::{
GEMM_LT_MAX_N, LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into,
add_batch, add_batch_into, argmax, argmax_batch_bf16_into, bf16_hidden_to_f32_into,
GEMM_LT_MAX_N, LoraDecodeGroupedProjection, RaggedPrefillPlan,
accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, argmax,
argmax_batch_bf16_into, batch_prefill_ragged_nhd_noncausal_into, bf16_hidden_to_f32_into,
embedding_decode_into, extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into,
f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm,
gemm_graphsafe_into_checked, gemm_graphsafe_ref_into_checked, gemm_into_checked, gemm_lt_tune,
gemm_per_token, gemv, linear, lora_decode_fused_delta_group3_into,
lora_decode_fused_delta_into, pack_lora_b_rows_into,
gemm_per_token, gemv, k_norm_rope_batch_decode_into, linear,
lora_decode_fused_delta_group3_into, lora_decode_fused_delta_into, pack_lora_b_rows_into,
qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into,
rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place,
scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, write_vec_into,
scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into,
single_prefill_nhd_noncausal_into, strided_segment_copy_into, write_vec_into,
};
#[cfg(not(feature = "kernel-call-trace"))]
pub use openinfer_kernels::ops::{
Expand Down
50 changes: 50 additions & 0 deletions openinfer-kernels/csrc/shared/elementwise.cu
Original file line number Diff line number Diff line change
Expand Up @@ -427,4 +427,54 @@ CUresult embedding_batched_vocab_shard_cuda(
return (CUresult)cudaGetLastError();
}

// ============================================================================
// Strided segment copy for DFlash batch K/V concatenation.
//
// Copies one segment (ctx or noise) of every request in a batch from a
// contiguous source layout to a strided destination layout in a single
// kernel launch, replacing 2 * batch_size memcpy_dtod calls per K/V tensor.
//
// src: [batch_size * src_seg_len, dim] row-major, contiguous
// dst: [batch_size * dst_seg_total, dim] row-major, request r occupies
// rows [r * dst_seg_total + dst_row_offset,
// r * dst_seg_total + dst_row_offset + src_seg_len)
//
// Each thread copies one bf16 element. The total work is
// batch_size * src_seg_len * dim.
// ============================================================================

__global__ void strided_segment_copy_kernel(
const __nv_bfloat16 *__restrict__ src,
__nv_bfloat16 *__restrict__ dst,
int dim, int src_seg_len, int dst_seg_total, int dst_row_offset,
int batch_size) {
int total = batch_size * src_seg_len * dim;
for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < total;
idx += gridDim.x * blockDim.x) {
int element = idx % dim;
int row_in_seg = (idx / dim) % src_seg_len;
int req = idx / (dim * src_seg_len);
int src_row = req * src_seg_len + row_in_seg;
int dst_row = req * dst_seg_total + dst_row_offset + row_in_seg;
dst[dst_row * dim + element] = src[src_row * dim + element];
}
}

CUresult strided_segment_copy_cuda(
const __nv_bfloat16 *src, __nv_bfloat16 *dst,
int dim, int src_seg_len, int dst_seg_total, int dst_row_offset,
int batch_size, cudaStream_t stream) {
int total = batch_size * src_seg_len * dim;
int block = 256;
// The kernel uses a grid-stride loop, so any grid size >= 1 is correct.
// Size the grid to the work so every element is covered in the first pass
// (no upper cap — a cap would silently drop elements for large copies).
int grid = (total + block - 1) / block;
if (grid < 1) grid = 1;
strided_segment_copy_kernel<<<grid, block, 0, stream>>>(
src, dst, dim, src_seg_len, dst_seg_total, dst_row_offset, batch_size);
return (CUresult)cudaGetLastError();
}

} // extern "C"
152 changes: 152 additions & 0 deletions openinfer-kernels/csrc/shared/paged_attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ using namespace flashinfer;
using DType = __nv_bfloat16;
using IdType = int32_t;
using ParamsT = BatchDecodeParams<DType, DType, DType, IdType>;
using BatchPrefillRaggedParamsT = BatchPrefillRaggedParams<DType, DType, DType, IdType>;
using Variant = DefaultAttention</*custom_mask=*/false,
/*sliding_window=*/false,
/*logits_soft_cap=*/false,
Expand Down Expand Up @@ -607,6 +608,157 @@ int single_prefill_cuda(
reinterpret_cast<cudaStream_t>(stream)));
}

// ---------------------------------------------------------------------------
// Single-request non-causal prefill over contiguous NHD K/V.
//
// DFlash draft attention materializes K/V as token-major HiddenStates:
// q: [q_len, num_qo_heads, head_dim]
// k/v: [kv_len, num_kv_heads, head_dim]
// This wrapper mirrors vLLM's non-causal FlashAttention/FlashInfer semantics:
// no causal mask, no sliding window, and GQA handled by FlashInfer.
// ---------------------------------------------------------------------------
int single_prefill_nhd_noncausal_cuda(
void* q,
void* output,
void* k,
void* v,
int32_t num_qo_heads,
int32_t num_kv_heads,
int32_t head_dim,
int32_t q_len,
int32_t kv_len,
float sm_scale,
void* stream)
{
uint32_t q_stride_n = num_qo_heads * head_dim;
uint32_t q_stride_h = head_dim;
uint32_t kv_stride_n = num_kv_heads * head_dim;
uint32_t kv_stride_h = head_dim;

PrefillParamsT params(
reinterpret_cast<DType*>(q),
reinterpret_cast<DType*>(k),
reinterpret_cast<DType*>(v),
/*maybe_custom_mask=*/nullptr,
reinterpret_cast<DType*>(output),
/*lse=*/nullptr,
/*maybe_alibi_slopes=*/nullptr,
num_qo_heads,
num_kv_heads,
static_cast<uint32_t>(q_len),
static_cast<uint32_t>(kv_len),
q_stride_n,
q_stride_h,
kv_stride_n,
kv_stride_h,
static_cast<uint32_t>(head_dim),
/*window_left=*/-1,
/*logits_soft_cap=*/0.0f,
sm_scale,
/*rope_scale=*/1.0f,
/*rope_theta=*/1e6f);

return static_cast<int>(
SinglePrefillWithKVCacheDispatched<
/*HEAD_DIM_QK=*/128,
/*HEAD_DIM_VO=*/128,
PosEncodingMode::kNone,
/*USE_FP16_QK_REDUCTION=*/false,
MaskMode::kNone,
Variant,
PrefillParamsT>(
params,
/*tmp=*/nullptr,
reinterpret_cast<cudaStream_t>(stream)));
}

// ---------------------------------------------------------------------------
// Batched non-causal prefill over compact ragged NHD K/V.
//
// DFlash groups exact-shape draft requests into compact token-major tensors:
// q: [sum(q_len), num_qo_heads, head_dim]
// k/v: [sum(kv_len), num_kv_heads, head_dim]
// with q_indptr/kv_indptr separating requests. This maps directly to
// FlashInfer BatchPrefillWithRaggedKVCache with MaskMode::kNone.
// ---------------------------------------------------------------------------
int batch_prefill_ragged_nhd_noncausal_cuda(
void* q,
void* output,
void* k,
void* v,
int32_t* q_indptr,
int32_t* kv_indptr,
int32_t* request_indices,
int32_t* qo_tile_indices,
int32_t* kv_tile_indices,
int32_t* kv_chunk_size_ptr,
uint32_t* total_num_rows,
int32_t num_qo_heads,
int32_t num_kv_heads,
int32_t head_dim,
int32_t total_q_len,
int32_t batch_size,
int32_t padded_batch_size,
float sm_scale,
void* stream)
{
uint32_t q_stride_n = num_qo_heads * head_dim;
uint32_t q_stride_h = head_dim;
uint32_t kv_stride_n = num_kv_heads * head_dim;
uint32_t kv_stride_h = head_dim;

BatchPrefillRaggedParamsT params(
reinterpret_cast<DType*>(q),
reinterpret_cast<DType*>(k),
reinterpret_cast<DType*>(v),
/*maybe_custom_mask=*/nullptr,
q_indptr,
kv_indptr,
/*maybe_mask_indptr=*/nullptr,
/*maybe_q_rope_offset=*/nullptr,
/*maybe_k_rope_offset=*/nullptr,
reinterpret_cast<DType*>(output),
/*lse=*/nullptr,
/*maybe_alibi_slopes=*/nullptr,
num_qo_heads,
num_kv_heads,
q_stride_n,
q_stride_h,
kv_stride_n,
kv_stride_h,
/*window_left=*/-1,
/*logits_soft_cap=*/0.0f,
sm_scale,
/*rope_scale=*/1.0f,
/*rope_theta=*/1e6f);

params.request_indices = request_indices;
params.qo_tile_indices = qo_tile_indices;
params.kv_tile_indices = kv_tile_indices;
params.o_indptr = q_indptr;
params.kv_chunk_size_ptr = kv_chunk_size_ptr;
params.total_num_rows = total_num_rows;
params.max_total_num_rows = static_cast<uint32_t>(total_q_len);
params.padded_batch_size = static_cast<uint32_t>(padded_batch_size);
params.partition_kv = false;

return static_cast<int>(
BatchPrefillWithRaggedKVCacheDispatched<
/*CTA_TILE_Q=*/16,
/*HEAD_DIM_QK=*/128,
/*HEAD_DIM_VO=*/128,
PosEncodingMode::kNone,
/*USE_FP16_QK_REDUCTION=*/false,
MaskMode::kNone,
Variant,
BatchPrefillRaggedParamsT>(
params,
/*tmp_v=*/nullptr,
/*tmp_s=*/nullptr,
/*enable_pdl=*/false,
reinterpret_cast<cudaStream_t>(stream)));
}

// ---------------------------------------------------------------------------
// Single-request prefill for HEAD_DIM=256 — wraps FlashInfer SinglePrefillWithKVCache.
//
Expand Down
Loading