From 3b32d7ebb6879da4357cc7f52c64a640ba241ec6 Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Thu, 28 May 2026 17:03:44 +0800
Subject: [PATCH] fix: pad ids_dst allocation to prevent MMQ stream-k OOB reads

The stream-k kernel loads a full mmq_x-wide tile from ids_dst without
bounds-checking the load (only the write-back is bounded). When
ne_get_rows is small, the unpadded allocation can be shorter than that
tile, so the load reads out of bounds.

Pad by get_mmq_x_max_host(cc) elements to ensure the tile load is
always within allocated memory.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ggml/src/ggml-cuda/mmq.cu | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index caa88c8112d..124a879ff5b 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -175,7 +175,11 @@ void ggml_cuda_mul_mat_q(
     GGML_ASSERT(ne1 == n_expert_used);
 
     ggml_cuda_pool_alloc<int32_t> ids_src1(ctx.pool(), ne_get_rows);
-    ggml_cuda_pool_alloc<int32_t> ids_dst(ctx.pool(), ne_get_rows);
+    // Pad ids_dst by mmq_x to prevent OOB reads in the stream-k kernel.
+    // The kernel loads a full mmq_x-wide tile from ids_dst (see mmq.cuh)
+    // without bounds-checking the load, only the write-back is bounded.
+    const int64_t mmq_x_pad = (int64_t)get_mmq_x_max_host(cc);
+    ggml_cuda_pool_alloc<int32_t> ids_dst(ctx.pool(), ne_get_rows + mmq_x_pad);
     ggml_cuda_pool_alloc<int32_t> expert_bounds(ctx.pool(), ne02 + 1);
 
     {