From 3b32d7ebb6879da4357cc7f52c64a640ba241ec6 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Thu, 28 May 2026 17:03:44 +0800 Subject: [PATCH] fix: pad ids_dst allocation to prevent MMQ stream-k OOB reads The stream-k kernel loads a full mmq_x-wide tile from ids_dst without bounds-checking the load (only the write-back is bounded). When the number of get_rows is small, the unpadded allocation caused OOB reads. Pad by get_mmq_x_max_host(cc) elements to ensure the tile load is always within allocated memory. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ggml/src/ggml-cuda/mmq.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index caa88c8112d..124a879ff5b 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -175,7 +175,11 @@ void ggml_cuda_mul_mat_q( GGML_ASSERT(ne1 == n_expert_used); ggml_cuda_pool_alloc ids_src1(ctx.pool(), ne_get_rows); - ggml_cuda_pool_alloc ids_dst(ctx.pool(), ne_get_rows); + // Pad ids_dst by get_mmq_x_max_host(cc) elements to prevent OOB reads in the stream-k kernel. + // The kernel loads a full mmq_x-wide tile from ids_dst (mmq.cuh, around line 3709 at the time of writing) + // without bounds-checking the load; only the write-back is bounded. + const int64_t mmq_x_pad = (int64_t)get_mmq_x_max_host(cc); + ggml_cuda_pool_alloc ids_dst(ctx.pool(), ne_get_rows + mmq_x_pad); ggml_cuda_pool_alloc expert_bounds(ctx.pool(), ne02 + 1); {