diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 02743e89d5f..6adb40d5d3d 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1493,6 +1493,16 @@ static void ggml_cuda_op_mul_mat_cublas( // ldc == nrows of the matrix that cuBLAS writes into int64_t ldc = id == ctx.device ? ne0 : row_diff; + // Guard: cuBLAS requires m >= 1, n >= 1, k >= 1 for Sgemm/GemmEx. + // During speculative decoding (DFlash/copyspec), draft verification can + // produce non-consecutive token positions which result in zero-size + // sub-matrices. cuBLAS treats these as invalid parameters and aborts + // with CUBLAS_STATUS_INVALID_VALUE. Zero-size GEMMs are defined as + // no-ops (no output written), matching OpenBLAS and MKL behavior. + if (row_diff == 0 || src1_ncols == 0 || ne10 == 0 || ne00 == 0 || ldc == 0) { + return; + } + const int cc = ggml_cuda_info().devices[id].cc; const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||