From ee7da448afcd6a68c58de5125a41a5f1462b2974 Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 16 Oct 2025 17:17:15 -0700 Subject: [PATCH 01/11] feat: initial attempt at circular generation --- include/ggml.h | 20 +++ src/ggml-cpu/ops.cpp | 158 ++++++++++++++---- src/ggml-cuda/conv2d-dw.cu | 57 +++++-- src/ggml-cuda/conv2d.cu | 41 ++++- src/ggml-cuda/pad.cu | 89 ++++++++-- src/ggml-vulkan/ggml-vulkan.cpp | 8 + src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp | 41 ++++- src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 67 ++++++-- src/ggml-vulkan/vulkan-shaders/pad.comp | 38 ++++- src/ggml.c | 29 +++- 10 files changed, 465 insertions(+), 83 deletions(-) diff --git a/include/ggml.h b/include/ggml.h index 60c6b63d05..0e2b35d7a2 100644 --- a/include/ggml.h +++ b/include/ggml.h @@ -2102,6 +2102,13 @@ extern "C" { int64_t ne3, uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...] + enum ggml_pad_mode { + GGML_PAD_MODE_ZERO = 0, + GGML_PAD_MODE_CIRCULAR = 1, + }; + + GGML_API void ggml_set_pad_mode(struct ggml_tensor * tensor, enum ggml_pad_mode mode); + // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] GGML_API struct ggml_tensor * ggml_pad( struct ggml_context * ctx, @@ -2124,6 +2131,19 @@ extern "C" { int rp3 ); + GGML_API struct ggml_tensor * ggml_pad_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3 + ); + // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] GGML_API struct ggml_tensor * ggml_pad_reflect_1d( struct ggml_context * ctx, diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp index 1c43865ff6..7a7c5c671b 100644 --- a/src/ggml-cpu/ops.cpp +++ b/src/ggml-cpu/ops.cpp @@ -10,6 +10,14 @@ #include #include +static inline int64_t ggml_wrap_coord(int64_t coord, int64_t size) { + if (size <= 0) { + return 0; + } + int64_t mod = coord % size; + return mod < 0 ? 
mod + size : mod; +} + // ggml_compute_forward_dup static void ggml_compute_forward_dup_same_cont( @@ -6680,6 +6688,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int32_t pad_y = dst->op_params[3]; const int32_t dilation_x = dst->op_params[4]; const int32_t dilation_y = dst->op_params[5]; + const bool circular = ggml_get_op_params_i32(dst, 6) != 0; const int64_t c_in = src->ne[2]; const int64_t c_out = kernel->ne[3]; @@ -6736,10 +6745,15 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; - float src_val; - if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { - src_val = 0.0f; - } else { + float src_val = 0.0f; + if (circular) { + if (src_h > 0 && src_w > 0) { + const int64_t sy_wrapped = ggml_wrap_coord(sy, src_h); + const int64_t sx_wrapped = ggml_wrap_coord(sx, src_w); + const float * src_ptr = (const float *)((const char *)src_base + sx_wrapped * src->nb[0] + sy_wrapped * src->nb[1] + ic * src->nb[2]); + src_val = *src_ptr; + } + } else if (sy >= 0 && sy < src_h && sx >= 0 && sx < src_w) { const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); src_val = *src_ptr; } @@ -7052,6 +7066,7 @@ struct ggml_conv_2d_dw_params { int pad_y; int dilation_x; int dilation_y; + int circular; }; static void ggml_compute_forward_conv_2d_dw_cwhn( @@ -7063,6 +7078,7 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( const int64_t c = p.channels; const float * knl_data = (const float *)kernel->data; + const bool circular = p.circular != 0; const int64_t rows_total = p.dst_h * p.batch; const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; @@ -7090,13 +7106,23 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { GGML_F32_VEC sum = GGML_F32_VEC_ZERO; for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = src_y_base + knl_y * p.dilation_y; - if (src_y < 0 || src_y >= p.src_h) { + int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (circular) { + if (p.src_h == 0) { + continue; + } + src_y = ggml_wrap_coord(src_y, p.src_h); + } else if (src_y < 0 || src_y >= p.src_h) { continue; } for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = src_x_base + knl_x * p.dilation_x; - if (src_x < 0 || src_x >= p.src_w) { + int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (circular) { + if (p.src_w == 0) { + continue; + } + src_x = ggml_wrap_coord(src_x, p.src_w); + } else if (src_x < 0 || src_x >= p.src_w) { continue; } GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); @@ -7111,13 +7137,23 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { float sum = 0.0f; for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = src_y_base + knl_y * p.dilation_y; - if (src_y < 0 || src_y >= p.src_h) { + int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (circular) { + if (p.src_h == 0) { + continue; + } + src_y = ggml_wrap_coord(src_y, p.src_h); + } else if (src_y < 0 || src_y >= p.src_h) { continue; } for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = src_x_base + knl_x * p.dilation_x; - if (src_x < 0 || src_x >= p.src_w) { + int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (circular) { + if (p.src_w == 0) { + continue; + } + src_x = ggml_wrap_coord(src_x, p.src_w); + } 
else if (src_x < 0 || src_x >= p.src_w) { continue; } sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] @@ -7138,6 +7174,7 @@ static void ggml_compute_forward_conv_2d_dw_whcn( const ggml_conv_2d_dw_params & p) { const int64_t n = p.channels * p.batch; + const bool circular = p.circular != 0; const int64_t per_thread = (n + params->nth - 1) / params->nth; const int64_t start = params->ith * per_thread; const int64_t end = MIN(start + per_thread, n); @@ -7152,13 +7189,23 @@ static void ggml_compute_forward_conv_2d_dw_whcn( float sum = 0.0f; for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y < 0 || src_y >= p.src_h) { + int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (circular) { + if (p.src_h == 0) { + continue; + } + src_y = ggml_wrap_coord(src_y, p.src_h); + } else if (src_y < 0 || src_y >= p.src_h) { continue; } for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x < 0 || src_x >= p.src_w) { + int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (circular) { + if (p.src_w == 0) { + continue; + } + src_x = ggml_wrap_coord(src_x, p.src_w); + } else if (src_x < 0 || src_x >= p.src_w) { continue; } sum += knl_data[knl_y * p.knl_w + knl_x] @@ -7192,6 +7239,7 @@ void ggml_compute_forward_conv_2d_dw( p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = ggml_get_op_params_i32(dst, 6); GGML_ASSERT(kernel->ne[3] == p.channels); GGML_ASSERT(dst->ne[3] == p.batch); @@ -7612,24 +7660,76 @@ static void ggml_compute_forward_pad_f32( const int32_t rp2 = ggml_get_op_params_i32(dst, 5); const int32_t lp3 = ggml_get_op_params_i32(dst, 6); const int32_t rp3 = ggml_get_op_params_i32(dst, 7); + const int32_t mode = ggml_get_op_params_i32(dst, 8); + const bool circular = mode == GGML_PAD_MODE_CIRCULAR; // TODO: optimize - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) \ - && (i1 >= lp1 && i1 < ne1 - rp1) \ - && (i2 >= lp2 && i2 < ne2 - rp2) \ - && (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; - const float * src_ptr = (const float *)((char *) src0->data + src_idx); + if (!circular) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + if ((i0 >= lp0 && i0 < ne0 - rp0) \ + && (i1 >= lp1 && i1 < ne1 - rp1) \ + && (i2 >= lp2 && i2 < ne2 - rp2) \ + && (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + const float * src_ptr = (const float *)((char *) src0->data + src_idx); + dst_ptr[dst_idx] = *src_ptr; + } else { + dst_ptr[dst_idx] = 0; + } + } + } + } + } + } else { + const int64_t src_ne0 = ne00; + const int64_t src_ne1 = ne01; + const int64_t src_ne2 = ne02; + const int64_t src_ne3 = ne03; + + const bool valid_extents = src_ne0 > 0 && src_ne1 > 0 && src_ne2 > 0 && src_ne3 > 0; + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; 
i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + + if (!valid_extents) { + dst_ptr[dst_idx] = 0; + continue; + } + + int64_t ci0 = i0 - lp0; + int64_t ci1 = i1 - lp1; + int64_t ci2 = i2 - lp2; + int64_t ci3 = i3 - lp3; + + ci0 %= src_ne0; + if (ci0 < 0) { + ci0 += src_ne0; + } + ci1 %= src_ne1; + if (ci1 < 0) { + ci1 += src_ne1; + } + ci2 %= src_ne2; + if (ci2 < 0) { + ci2 += src_ne2; + } + ci3 %= src_ne3; + if (ci3 < 0) { + ci3 += src_ne3; + } + + const size_t src_idx = (size_t)ci3*nb03 + (size_t)ci2*nb02 + (size_t)ci1*nb01 + (size_t)ci0*nb00; + const float * src_ptr = (const float *)((const char *) src0->data + src_idx); dst_ptr[dst_idx] = *src_ptr; - } else { - dst_ptr[dst_idx] = 0; } } } diff --git a/src/ggml-cuda/conv2d-dw.cu b/src/ggml-cuda/conv2d-dw.cu index 7583233b1b..352d61ef67 100644 --- a/src/ggml-cuda/conv2d-dw.cu +++ b/src/ggml-cuda/conv2d-dw.cu @@ -8,6 +8,7 @@ struct conv_params { int padding_x, padding_y; int dilation_x, dilation_y; int channels, batches; + int circular; }; struct kernel_bounds { @@ -15,16 +16,31 @@ struct kernel_bounds { int x_min, x_max; }; +__device__ __forceinline__ int wrap_coord(int coord, int size) { + if (size == 0) { + return 0; + } + int mod = coord % size; + return mod < 0 ? mod + size : mod; +} + __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) { kernel_bounds bounds; - bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.y_max = - min(params.kernel_h, - (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); - bounds.x_max = - min(params.kernel_w, - (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + if (params.circular) { + bounds.y_min = 0; + bounds.y_max = params.kernel_h; + bounds.x_min = 0; + bounds.x_max = params.kernel_w; + } else { + bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.y_max = + min(params.kernel_h, + (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + bounds.x_max = + min(params.kernel_w, + (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + } return bounds; } @@ -83,7 +99,7 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr const int in_w, const int in_h, const int out_w, const int out_h, const int kernel_w, const int kernel_h, const int stride_x, const int stride_y, const int padding_x, const int padding_y, const int dilation_x, const int dilation_y, - const int channels, const int batches) { + const int channels, const int batches, const int circular) { const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = batches * channels * out_h * out_w; @@ -92,7 +108,7 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr } conv_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, - stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches 
}; + stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches, circular }; int batch_idx, channel_idx, out_y_idx, out_x_idx; Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx); @@ -102,9 +118,25 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) { int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y); + if (params.circular) { + if (params.in_h == 0) { + continue; + } + in_y_idx = wrap_coord(in_y_idx, params.in_h); + } else if (in_y_idx < 0 || in_y_idx >= params.in_h) { + continue; + } for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) { int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x); + if (params.circular) { + if (params.in_w == 0) { + continue; + } + in_x_idx = wrap_coord(in_x_idx, params.in_w); + } else if (in_x_idx < 0 || in_x_idx >= params.in_w) { + continue; + } const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)]; const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)]; @@ -132,6 +164,7 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const int padding_y = p[3]; const int dilation_x = p[4]; const int dilation_y = p[5]; + const int circular = ggml_get_op_params_i32(dst, 6); const int in_w = input->ne[0]; const int in_h = input->ne[1]; @@ -150,11 +183,11 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) if (ggml_is_contiguous(input)) { conv2d_dw_kernel<<>>( x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, - dilation_x, dilation_y, channels, batches); + dilation_x, dilation_y, channels, batches, circular); } else if (ggml_is_contiguous_channels(input)) { conv2d_dw_kernel<<>>( x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, - dilation_x, dilation_y, channels, batches); + dilation_x, dilation_y, channels, batches, circular); } else { GGML_ABORT("Unsupported memory layout for conv_2d_dw"); } diff --git a/src/ggml-cuda/conv2d.cu b/src/ggml-cuda/conv2d.cu index 142dd66903..9484119fdb 100644 --- a/src/ggml-cuda/conv2d.cu +++ b/src/ggml-cuda/conv2d.cu @@ -10,6 +10,7 @@ struct conv_params { const int64_t DL_X, DL_Y; const int64_t IC, OC; const int64_t B; + const int64_t circular; const int64_t TOTAL; }; @@ -18,6 +19,14 @@ struct kernel_bounds { int64_t x_min, x_max; }; +__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) { + if (size == 0) { + return 0; + } + int64_t mod = coord % size; + return mod < 0 ? mod + size : mod; +} + __device__ __forceinline__ int64_t max64(int64_t a, int64_t b) { return (a > b) ? 
a : b; } @@ -28,6 +37,13 @@ __device__ __forceinline__ int64_t min64(int64_t a, int64_t b) { __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv_params & P) { kernel_bounds bounds; + if (P.circular) { + bounds.y_min = 0; + bounds.y_max = P.KH; + bounds.x_min = 0; + bounds.x_max = P.KW; + return bounds; + } bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); @@ -89,10 +105,26 @@ static __global__ void conv2d_kernel(const float * __restrict__ input, kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { - const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + if (P.circular) { + if (P.IH == 0) { + continue; + } + in_y = wrap_coord(in_y, P.IH); + } else if (in_y < 0 || in_y >= P.IH) { + continue; + } for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { - const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + if (P.circular) { + if (P.IW == 0) { + continue; + } + in_x = wrap_coord(in_x, P.IW); + } else if (in_x < 0 || in_x >= P.IW) { + continue; + } const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; @@ -142,8 +174,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int DL_X = p[4]; // dilation_x const int DL_Y = p[5]; // dilation_y - // No cwhn - GGML_ASSERT(p[6] == false); + const int circular = ggml_get_op_params_i32(dst, 6); const int IW = input->ne[0]; // input_w const int IH = input->ne[1]; // input_h @@ -156,7 +187,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int B = input->ne[3]; // n_batches const int64_t total = B * OC * OH * OW; - conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total }; + conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, circular, total }; if (kernel->type == GGML_TYPE_F16) { conv2d_cuda_f16(X_D, (half *) K_D, Y_D, params, st); diff --git a/src/ggml-cuda/pad.cu b/src/ggml-cuda/pad.cu index 29aef33c1a..c432b3ad32 100644 --- a/src/ggml-cuda/pad.cu +++ b/src/ggml-cuda/pad.cu @@ -3,7 +3,9 @@ static __global__ void pad_f32(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int src_ne0, const int src_ne1, const int src_ne2, const int src_ne3, + const int mode) { // blockIdx.z: i3*ne2+i2 // blockIdx.y: i1 // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE @@ -18,33 +20,78 @@ static __global__ void pad_f32(const float * src, float * dst, // operation const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) && + const bool in_src = + (i0 >= lp0 && i0 < ne0 - rp0) && (i1 >= lp1 && i1 < ne1 - rp1) && (i2 >= lp2 && i2 < ne2 - rp2) && - (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t i00 = i0 - lp0; - const int64_t i01 = i1 - lp1; - const 
int64_t i02 = i2 - lp2; - const int64_t i03 = i3 - lp3; - const int64_t ne02 = ne2 - lp2 - rp2; - const int64_t ne01 = ne1 - lp1 - rp1; - const int64_t ne00 = ne0 - lp0 - rp0; + (i3 >= lp3 && i3 < ne3 - rp3); - const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; + if (mode == GGML_PAD_MODE_ZERO) { + if (in_src) { + const int64_t i00 = i0 - lp0; + const int64_t i01 = i1 - lp1; + const int64_t i02 = i2 - lp2; + const int64_t i03 = i3 - lp3; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne00 = ne0 - lp0 - rp0; - dst[dst_idx] = src[src_idx]; - } else { + const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; + + dst[dst_idx] = src[src_idx]; + } else { + dst[dst_idx] = 0.0f; + } + return; + } + + if (src_ne0 <= 0 || src_ne1 <= 0 || src_ne2 <= 0 || src_ne3 <= 0) { dst[dst_idx] = 0.0f; + return; } + + int ci0 = i0 - lp0; + int ci1 = i1 - lp1; + int ci2 = i2 - lp2; + int ci3 = i3 - lp3; + + ci0 %= src_ne0; + if (ci0 < 0) { + ci0 += src_ne0; + } + ci1 %= src_ne1; + if (ci1 < 0) { + ci1 += src_ne1; + } + ci2 %= src_ne2; + if (ci2 < 0) { + ci2 += src_ne2; + } + ci3 %= src_ne3; + if (ci3 < 0) { + ci3 += src_ne3; + } + + const int64_t src_idx = ((int64_t)ci3 * src_ne2 * src_ne1 * src_ne0) + + ((int64_t)ci2 * src_ne1 * src_ne0) + + ((int64_t)ci1 * src_ne0) + + ci0; + dst[dst_idx] = src[src_idx]; } static void pad_f32_cuda(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int src_ne0, const int src_ne1, const int src_ne2, const int src_ne3, + const int mode, cudaStream_t stream) { int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; dim3 gridDim(num_blocks, ne1, ne2*ne3); - pad_f32<<>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3); + pad_f32<<>>(src, dst, + lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, + ne0, ne1, ne2, ne3, + src_ne0, src_ne1, src_ne2, src_ne3, + mode); } void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -63,10 +110,18 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int32_t rp1 = ((const int32_t*)(dst->op_params))[3]; const int32_t lp2 = ((const int32_t*)(dst->op_params))[4]; const int32_t rp2 = ((const int32_t*)(dst->op_params))[5]; - const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; + const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; const int32_t rp3 = ((const int32_t*)(dst->op_params))[7]; + const int32_t mode = ((const int32_t*)(dst->op_params))[8]; + + const int src_ne0 = (int) src0->ne[0]; + const int src_ne1 = (int) src0->ne[1]; + const int src_ne2 = (int) src0->ne[2]; + const int src_ne3 = (int) src0->ne[3]; pad_f32_cuda(src0_d, dst_d, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + src_ne0, src_ne1, src_ne2, src_ne3, + mode, stream); } diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp index 1674dc66ab..641d2a0506 100644 --- a/src/ggml-vulkan/ggml-vulkan.cpp +++ b/src/ggml-vulkan/ggml-vulkan.cpp @@ -838,6 +838,7 @@ struct vk_op_pad_push_constants { uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t 
ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; + uint32_t padding_mode; uint32_t lp0; uint32_t rp0; uint32_t lp1; uint32_t rp1; @@ -880,6 +881,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor p.rp2 = dst->op_params[5]; p.lp3 = dst->op_params[6]; p.rp3 = dst->op_params[7]; + p.padding_mode = (uint32_t) ggml_get_op_params_i32(dst, 8); return p; // fastdiv values and offsets are initialized later in ggml_vk_op } @@ -1124,6 +1126,7 @@ struct vk_op_conv2d_push_constants { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { @@ -1172,6 +1175,7 @@ struct vk_op_conv_transpose_2d_push_constants { uint32_t OWOHmp; uint32_t OWOHL; uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv_transpose_2d_push_constants &p) { @@ -1200,6 +1204,7 @@ struct vk_op_conv2d_dw_push_constants { int32_t pad_y; int32_t dilation_x; int32_t dilation_y; + int32_t circular; }; struct vk_op_upscale_push_constants { @@ -9718,6 +9723,7 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = static_cast(ggml_get_op_params_i32(dst, 6)); GGML_ASSERT(ne03 == ne2); GGML_ASSERT(ne02 == ne12); @@ -9767,6 +9773,7 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = static_cast(ggml_get_op_params_i32(dst, 6)); GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne12); @@ -9791,6 +9798,7 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = ggml_get_op_params_i32(dst, 6); GGML_ASSERT(src0->ne[3] == p.channels); GGML_ASSERT(src1->ne[3] == p.batches); diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp index 70a301488e..bc5ce29580 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp @@ -19,6 +19,7 @@ layout (push_constant) uniform parameter int pad_y; int dilation_x; int dilation_y; + int circular; } p; layout (binding = 0) readonly buffer A {A_TYPE knl_data[];}; @@ -27,6 +28,15 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; +uint wrap_coord(int coord, uint size) { + if (size == 0u) { + return 0u; + } + int isize = int(size); + int m = coord % isize; + return uint(m < 0 ? 
m + isize : m); +} + FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint i0 = idx / p.dst_w; uint dst_x = idx - i0 * p.dst_w; @@ -39,6 +49,21 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint knl_i = c * p.knl_h * p.knl_w; FLOAT_TYPE sum = 0.0; + if (p.circular != 0 || true) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } + } + return sum; + } + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int @@ -70,6 +95,21 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) { uint knl_row = p.knl_w * p.channels; FLOAT_TYPE sum = 0.0; + if (p.circular != 0 || true) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } + } + return sum; + } + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int @@ -102,4 +142,3 @@ void main() { #endif dst_data[idx] = D_TYPE(result); } - diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 0367e80bbf..1745a90bca 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -70,6 +70,7 @@ layout(push_constant) uniform parameter { uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; #endif + uint32_t circular; } p; @@ -276,26 +277,58 @@ void main() { #endif #ifdef TRANSPOSE - uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1; - uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0; - uint32_t H_idx = fastdiv(H_idx_x_s1, p.s1mp, p.s1L); - uint32_t W_idx = fastdiv(W_idx_x_s0, p.s0mp, p.s0L); + float val = 0.0; + if (p.circular == 0u && false) { + uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1; + uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0; + uint32_t H_idx = fastdiv(H_idx_x_s1, p.s1mp, p.s1L); + uint32_t W_idx = fastdiv(W_idx_x_s0, p.s0mp, p.s0L); + uint32_t src_idx = min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0u), p.Cin * p.N * p.W * p.H - 1u); + val = src_data[src_idx]; + if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx >= p.H || W_idx >= p.W || + (H_idx_x_s1 - H_idx * p.s1 != 0u) || (W_idx_x_s0 - W_idx * p.s0 != 0u)) { + val = 0.0; + } + } else { + int stride_y = int(p.s1); + int stride_x = int(p.s0); + bool aligned = (stride_y != 0) && (stride_x != 0); + int H_raw = int(OH_idx) * stride_y + int(KH_idx_b) * int(p.d1) - int(p.p1); + int W_raw = int(OW_idx) * stride_x + int(KW_idx_b) * int(p.d0) - int(p.p0); + if (aligned) { + aligned 
= (H_raw % stride_y == 0) && (W_raw % stride_x == 0); + } + if (aligned && p.H > 0u && p.W > 0u) { + int H_idx = H_raw / stride_y; + int W_idx = W_raw / stride_x; + int sizeH = int(p.H); + int sizeW = int(p.W); + H_idx = (H_idx % sizeH + sizeH) % sizeH; + W_idx = (W_idx % sizeW + sizeW) % sizeW; + uint32_t src_idx = uint32_t(W_idx) + uint32_t(H_idx) * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13; + val = src_data[src_idx]; + } + } + Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val); #else - uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; - uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; -#endif - uint32_t src_idx = - min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); - float val = src_data[src_idx]; - if (CRS_idx_b >= CRS || NPQ_idx >= NPQ - || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case) -#ifdef TRANSPOSE - || (H_idx_x_s1 - H_idx * p.s1 != 0) || (W_idx_x_s0 - W_idx * p.s0 != 0) -#endif - ) { - val = 0.0; + int H_idx = int(OH_idx) * int(p.s1) + int(KH_idx_b) * int(p.d1) - int(p.p1); + int W_idx = int(OW_idx) * int(p.s0) + int(KW_idx_b) * int(p.d0) - int(p.p0); + if (p.circular != 0u || true) { + if (p.H > 0u) { + H_idx = (H_idx % int(p.H) + int(p.H)) % int(p.H); + } + if (p.W > 0u) { + W_idx = (W_idx % int(p.W) + int(p.W)) % int(p.W); + } + } + float val = 0.0; + if (CRS_idx_b < CRS && NPQ_idx < NPQ && Cin_idx_b < p.Cin && + H_idx >= 0 && H_idx < int(p.H) && W_idx >= 0 && W_idx < int(p.W)) { + uint32_t src_idx = uint32_t(W_idx) + uint32_t(H_idx) * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13; + val = src_data[src_idx]; } Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val); +#endif } barrier(); #ifdef COOPMAT2 diff --git a/src/ggml-vulkan/vulkan-shaders/pad.comp b/src/ggml-vulkan/vulkan-shaders/pad.comp index f3c8176872..45c65a41e8 100644 --- a/src/ggml-vulkan/vulkan-shaders/pad.comp +++ b/src/ggml-vulkan/vulkan-shaders/pad.comp @@ -9,6 +9,8 @@ layout (push_constant) uniform parameter uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint misalign_offsets; + uint padding_mode; + uint lp0; uint rp0; uint lp1; uint rp1; uint lp2; uint rp2; @@ -23,6 +25,18 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; +const uint PAD_MODE_ZERO = 0u; +const uint PAD_MODE_CIRCULAR = 1u; + +uint wrap_coord(int coord, uint size) { + if (size == 0u) { + return 0u; + } + int isize = int(size); + int m = coord % isize; + return uint(m < 0 ? m + isize : m); +} + void main() { const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; @@ -45,5 +59,27 @@ void main() { i2 >= p.lp2 && i2 < p.ne12 - p.rp2 && i3 >= p.lp3 && i3 < p.ne13 - p.rp3; - data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f); + if (p.padding_mode == PAD_MODE_ZERO) { + data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? 
data_a[get_aoffset() + src0_idx] : 0.0f); + } + else if(p.padding_mode == PAD_MODE_CIRCULAR) { + // Circular padding + const uint src_ne0 = p.ne00; + const uint src_ne1 = p.ne01; + const uint src_ne2 = p.ne02; + const uint src_ne3 = p.ne03; + + if (src_ne0 == 0u || src_ne1 == 0u || src_ne2 == 0u || src_ne3 == 0u) { + data_d[get_doffset() + dst_idx] = D_TYPE(0.0f); + return; + } + + const uint ci0 = wrap_coord(int(i0) - int(p.lp0), src_ne0); + const uint ci1 = wrap_coord(int(i1) - int(p.lp1), src_ne1); + const uint ci2 = wrap_coord(int(i2) - int(p.lp2), src_ne2); + const uint ci3 = wrap_coord(int(i3) - int(p.lp3), src_ne3); + + const uint circular_src_idx = ci3*p.nb03 + ci2*p.nb02 + ci1*p.nb01 + ci0*p.nb00; + data_d[get_doffset() + dst_idx] = D_TYPE(data_a[get_aoffset() + circular_src_idx]); + } } diff --git a/src/ggml.c b/src/ggml.c index 2bce1375ba..4d57ff30a8 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4558,6 +4558,7 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 }; ggml_set_op_params(result, params, sizeof(params)); + ggml_set_op_params_i32(result, 6, 0); result->op = GGML_OP_CONV_2D_DW; result->src[0] = a; @@ -4595,6 +4596,7 @@ struct ggml_tensor * ggml_conv_2d_direct( ggml_set_op_params_i32(result, 3, p1); ggml_set_op_params_i32(result, 4, d0); ggml_set_op_params_i32(result, 5, d1); + ggml_set_op_params_i32(result, 6, 0); result->op = GGML_OP_CONV_2D; result->src[0] = a; @@ -4822,6 +4824,13 @@ struct ggml_tensor * ggml_interpolate( return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode); } +GGML_API void ggml_set_pad_mode(struct ggml_tensor * tensor, enum ggml_pad_mode mode) { + GGML_ASSERT(tensor != NULL); + GGML_ASSERT(tensor->op == GGML_OP_PAD); + GGML_ASSERT(mode == GGML_PAD_MODE_ZERO || mode == GGML_PAD_MODE_CIRCULAR); + ggml_set_op_params_i32(tensor, 8, (int32_t) mode); +} + // ggml_pad struct ggml_tensor * ggml_pad( @@ -4861,10 +4870,28 @@ struct ggml_tensor * ggml_pad_ext( ggml_set_op_params_i32(result, 6, lp3); ggml_set_op_params_i32(result, 7, rp3); - result->op = GGML_OP_PAD; result->src[0] = a; + ggml_set_pad_mode(result, GGML_PAD_MODE_ZERO); + + return result; +} + +struct ggml_tensor * ggml_pad_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3 + ) { + struct ggml_tensor * result = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + ggml_set_pad_mode(result, GGML_PAD_MODE_CIRCULAR); return result; } From 2d2e9346b6a87d90b2d48bbc83e5ca2b0d02a68f Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 15:01:58 -0700 Subject: [PATCH 02/11] clean up part 1 --- src/ggml-cpu/ops.cpp | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp index 7a7c5c671b..d0a2504d51 100644 --- a/src/ggml-cpu/ops.cpp +++ b/src/ggml-cpu/ops.cpp @@ -10,14 +10,6 @@ #include #include -static inline int64_t ggml_wrap_coord(int64_t coord, int64_t size) { - if (size <= 0) { - return 0; - } - int64_t mod = coord % size; - return mod < 0 ? 
mod + size : mod; -} - // ggml_compute_forward_dup static void ggml_compute_forward_dup_same_cont( @@ -6668,6 +6660,12 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params ggml_compute_forward_mul_mat(params, &dst); } +// ggml_wrap_coord + +static inline int64_t ggml_wrap_coord(int64_t coord, int64_t size) { + return (coord + size) % size; // adding size avoids negative number weirdness +} + // ggml_compute_forward_conv_2d static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params, @@ -6747,12 +6745,10 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params float src_val = 0.0f; if (circular) { - if (src_h > 0 && src_w > 0) { - const int64_t sy_wrapped = ggml_wrap_coord(sy, src_h); - const int64_t sx_wrapped = ggml_wrap_coord(sx, src_w); - const float * src_ptr = (const float *)((const char *)src_base + sx_wrapped * src->nb[0] + sy_wrapped * src->nb[1] + ic * src->nb[2]); - src_val = *src_ptr; - } + const int64_t sy_wrapped = ggml_wrap_coord(sy, src_h); + const int64_t sx_wrapped = ggml_wrap_coord(sx, src_w); + const float * src_ptr = (const float *)((const char *)src_base + sx_wrapped * src->nb[0] + sy_wrapped * src->nb[1] + ic * src->nb[2]); + src_val = *src_ptr; } else if (sy >= 0 && sy < src_h && sx >= 0 && sx < src_w) { const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); src_val = *src_ptr; @@ -7108,9 +7104,6 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { int64_t src_y = src_y_base + knl_y * p.dilation_y; if (circular) { - if (p.src_h == 0) { - continue; - } src_y = ggml_wrap_coord(src_y, p.src_h); } else if (src_y < 0 || src_y >= p.src_h) { continue; From fc3783990e9456697dbcc56b8211b99694ccc8a0 Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 15:22:52 -0700 Subject: [PATCH 03/11] clean up part 2 --- src/ggml-cpu/ops.cpp | 67 ++++++++++---------------------------------- 1 file changed, 15 insertions(+), 52 deletions(-) diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp index d0a2504d51..6da40b22ec 100644 --- a/src/ggml-cpu/ops.cpp +++ b/src/ggml-cpu/ops.cpp @@ -6753,6 +6753,9 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); src_val = *src_ptr; } + else { + src_val = 1.0f; + } char * element_ptr = dst_row + dst_idx * traits->type_size; if (kernel_type == GGML_TYPE_F32) { @@ -7111,9 +7114,6 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { int64_t src_x = src_x_base + knl_x * p.dilation_x; if (circular) { - if (p.src_w == 0) { - continue; - } src_x = ggml_wrap_coord(src_x, p.src_w); } else if (src_x < 0 || src_x >= p.src_w) { continue; @@ -7132,9 +7132,6 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { int64_t src_y = src_y_base + knl_y * p.dilation_y; if (circular) { - if (p.src_h == 0) { - continue; - } src_y = ggml_wrap_coord(src_y, p.src_h); } else if (src_y < 0 || src_y >= p.src_h) { continue; @@ -7142,9 +7139,6 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { int64_t src_x = src_x_base + knl_x * p.dilation_x; if (circular) { - if (p.src_w == 0) { - continue; - } src_x = ggml_wrap_coord(src_x, p.src_w); } else if (src_x < 0 || src_x >= p.src_w) { 
continue; @@ -7184,9 +7178,6 @@ static void ggml_compute_forward_conv_2d_dw_whcn( for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; if (circular) { - if (p.src_h == 0) { - continue; - } src_y = ggml_wrap_coord(src_y, p.src_h); } else if (src_y < 0 || src_y >= p.src_h) { continue; @@ -7194,9 +7185,6 @@ static void ggml_compute_forward_conv_2d_dw_whcn( for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; if (circular) { - if (p.src_w == 0) { - continue; - } src_x = ggml_wrap_coord(src_x, p.src_w); } else if (src_x < 0 || src_x >= p.src_w) { continue; @@ -7680,48 +7668,23 @@ static void ggml_compute_forward_pad_f32( } } } else { - const int64_t src_ne0 = ne00; - const int64_t src_ne1 = ne01; - const int64_t src_ne2 = ne02; - const int64_t src_ne3 = ne03; - - const bool valid_extents = src_ne0 > 0 && src_ne1 > 0 && src_ne2 > 0 && src_ne3 > 0; - for (int64_t i2 = 0; i2 < ne2; ++i2) { for (int64_t i1 = ith; i1 < ne1; i1 += nth) { for (int64_t i0 = 0; i0 < ne0; ++i0) { for (int64_t i3 = 0; i3 < ne3; ++i3) { const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - - if (!valid_extents) { - dst_ptr[dst_idx] = 0; - continue; - } - - int64_t ci0 = i0 - lp0; - int64_t ci1 = i1 - lp1; - int64_t ci2 = i2 - lp2; - int64_t ci3 = i3 - lp3; - - ci0 %= src_ne0; - if (ci0 < 0) { - ci0 += src_ne0; - } - ci1 %= src_ne1; - if (ci1 < 0) { - ci1 += src_ne1; - } - ci2 %= src_ne2; - if (ci2 < 0) { - ci2 += src_ne2; - } - ci3 %= src_ne3; - if (ci3 < 0) { - ci3 += src_ne3; - } - - const size_t src_idx = (size_t)ci3*nb03 + (size_t)ci2*nb02 + (size_t)ci1*nb01 + (size_t)ci0*nb00; - const float * src_ptr = (const float *)((const char *) src0->data + src_idx); + const int64_t src_i0 = ggml_wrap_coord(i0 - lp0, ne00); + const int64_t src_i1 = ggml_wrap_coord(i1 - lp1, ne01); + const int64_t src_i2 = ggml_wrap_coord(i2 - lp2, ne02); + const int64_t src_i3 = ggml_wrap_coord(i3 - lp3, ne03); + + const int64_t src_idx = + src_i3*nb03 + + src_i2*nb02 + + src_i1*nb01 + + src_i0*nb00; + + const float * src_ptr = (const float *)((char *) src0->data + src_idx); dst_ptr[dst_idx] = *src_ptr; } } From cce4d3f05356f87735e007d9fe619cee717b1c56 Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 15:42:20 -0700 Subject: [PATCH 04/11] rework pad.cu --- src/ggml-cuda/conv2d-dw.cu | 15 ++---- src/ggml-cuda/conv2d.cu | 33 ++++--------- src/ggml-cuda/pad.cu | 96 +++++++++++--------------------------- 3 files changed, 38 insertions(+), 106 deletions(-) diff --git a/src/ggml-cuda/conv2d-dw.cu b/src/ggml-cuda/conv2d-dw.cu index 352d61ef67..a49202dee7 100644 --- a/src/ggml-cuda/conv2d-dw.cu +++ b/src/ggml-cuda/conv2d-dw.cu @@ -17,11 +17,7 @@ struct kernel_bounds { }; __device__ __forceinline__ int wrap_coord(int coord, int size) { - if (size == 0) { - return 0; - } - int mod = coord % size; - return mod < 0 ? 
mod + size : mod; + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod } __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) { @@ -115,13 +111,11 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr T accumulator = 0; kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params); + for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) { int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y); if (params.circular) { - if (params.in_h == 0) { - continue; - } in_y_idx = wrap_coord(in_y_idx, params.in_h); } else if (in_y_idx < 0 || in_y_idx >= params.in_h) { continue; @@ -130,9 +124,6 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) { int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x); if (params.circular) { - if (params.in_w == 0) { - continue; - } in_x_idx = wrap_coord(in_x_idx, params.in_w); } else if (in_x_idx < 0 || in_x_idx >= params.in_w) { continue; @@ -164,7 +155,7 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const int padding_y = p[3]; const int dilation_x = p[4]; const int dilation_y = p[5]; - const int circular = ggml_get_op_params_i32(dst, 6); + const int circular = p[6]; const int in_w = input->ne[0]; const int in_h = input->ne[1]; diff --git a/src/ggml-cuda/conv2d.cu b/src/ggml-cuda/conv2d.cu index 9484119fdb..c8e49fa877 100644 --- a/src/ggml-cuda/conv2d.cu +++ b/src/ggml-cuda/conv2d.cu @@ -19,12 +19,8 @@ struct kernel_bounds { int64_t x_min, x_max; }; -__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) { - if (size == 0) { - return 0; - } - int64_t mod = coord % size; - return mod < 0 ? 
mod + size : mod; +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod } __device__ __forceinline__ int64_t max64(int64_t a, int64_t b) { @@ -42,12 +38,13 @@ __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int64_t out_x, bounds.y_max = P.KH; bounds.x_min = 0; bounds.x_max = P.KW; - return bounds; } - bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); - bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + else { + bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + } return bounds; } @@ -107,25 +104,14 @@ static __global__ void conv2d_kernel(const float * __restrict__ input, for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); if (P.circular) { - if (P.IH == 0) { - continue; - } in_y = wrap_coord(in_y, P.IH); - } else if (in_y < 0 || in_y >= P.IH) { - continue; } for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); if (P.circular) { - if (P.IW == 0) { - continue; - } in_x = wrap_coord(in_x, P.IW); - } else if (in_x < 0 || in_x >= P.IW) { - continue; } - const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; acc += (input_val * ggml_cuda_cast(kernel_val)); @@ -173,8 +159,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int PD_Y = p[3]; // padding_y const int DL_X = p[4]; // dilation_x const int DL_Y = p[5]; // dilation_y - - const int circular = ggml_get_op_params_i32(dst, 6); + const int circular = p[6]; // circular const int IW = input->ne[0]; // input_w const int IH = input->ne[1]; // input_h diff --git a/src/ggml-cuda/pad.cu b/src/ggml-cuda/pad.cu index c432b3ad32..066a536d2b 100644 --- a/src/ggml-cuda/pad.cu +++ b/src/ggml-cuda/pad.cu @@ -4,8 +4,7 @@ static __global__ void pad_f32(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, const int ne0, const int ne1, const int ne2, const int ne3, - const int src_ne0, const int src_ne1, const int src_ne2, const int src_ne3, - const int mode) { + const int circular) { // blockIdx.z: i3*ne2+i2 // blockIdx.y: i1 // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE @@ -20,78 +19,39 @@ static __global__ void pad_f32(const float * src, float * dst, // operation const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - const bool in_src = - (i0 >= lp0 && i0 < ne0 - rp0) && + if ((i0 >= lp0 && i0 < ne0 - rp0) && (i1 >= lp1 && i1 < ne1 - rp1) && (i2 >= lp2 && i2 < ne2 - rp2) && - (i3 >= lp3 && i3 < ne3 - rp3); + (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t i00 = i0 - lp0; + const int64_t i01 = i1 - lp1; + const int64_t i02 = i2 - lp2; + const int64_t i03 = i3 - lp3; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne00 = 
ne0 - lp0 - rp0; - if (mode == GGML_PAD_MODE_ZERO) { - if (in_src) { - const int64_t i00 = i0 - lp0; - const int64_t i01 = i1 - lp1; - const int64_t i02 = i2 - lp2; - const int64_t i03 = i3 - lp3; - const int64_t ne02 = ne2 - lp2 - rp2; - const int64_t ne01 = ne1 - lp1 - rp1; - const int64_t ne00 = ne0 - lp0 - rp0; + const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; - const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; - - dst[dst_idx] = src[src_idx]; - } else { - dst[dst_idx] = 0.0f; - } - return; - } - - if (src_ne0 <= 0 || src_ne1 <= 0 || src_ne2 <= 0 || src_ne3 <= 0) { + dst[dst_idx] = src[src_idx]; + } else { dst[dst_idx] = 0.0f; - return; } - - int ci0 = i0 - lp0; - int ci1 = i1 - lp1; - int ci2 = i2 - lp2; - int ci3 = i3 - lp3; - - ci0 %= src_ne0; - if (ci0 < 0) { - ci0 += src_ne0; - } - ci1 %= src_ne1; - if (ci1 < 0) { - ci1 += src_ne1; - } - ci2 %= src_ne2; - if (ci2 < 0) { - ci2 += src_ne2; - } - ci3 %= src_ne3; - if (ci3 < 0) { - ci3 += src_ne3; - } - - const int64_t src_idx = ((int64_t)ci3 * src_ne2 * src_ne1 * src_ne0) + - ((int64_t)ci2 * src_ne1 * src_ne0) + - ((int64_t)ci1 * src_ne0) + - ci0; - dst[dst_idx] = src[src_idx]; } static void pad_f32_cuda(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, const int ne0, const int ne1, const int ne2, const int ne3, - const int src_ne0, const int src_ne1, const int src_ne2, const int src_ne3, - const int mode, cudaStream_t stream) { + const int circular, cudaStream_t stream) { int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; dim3 gridDim(num_blocks, ne1, ne2*ne3); - pad_f32<<>>(src, dst, - lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, - ne0, ne1, ne2, ne3, - src_ne0, src_ne1, src_ne2, src_ne3, - mode); + pad_f32<<>>( + src, dst, + lp0, rp0, lp1, rp1, + lp2, rp2, lp3, rp3, + ne0, ne1, ne2, ne3 + circular); } void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -110,18 +70,14 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int32_t rp1 = ((const int32_t*)(dst->op_params))[3]; const int32_t lp2 = ((const int32_t*)(dst->op_params))[4]; const int32_t rp2 = ((const int32_t*)(dst->op_params))[5]; - const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; + const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; const int32_t rp3 = ((const int32_t*)(dst->op_params))[7]; - const int32_t mode = ((const int32_t*)(dst->op_params))[8]; - - const int src_ne0 = (int) src0->ne[0]; - const int src_ne1 = (int) src0->ne[1]; - const int src_ne2 = (int) src0->ne[2]; - const int src_ne3 = (int) src0->ne[3]; + const int32_t circular = ((const int32_t*)(dst->op_params))[8]; pad_f32_cuda(src0_d, dst_d, - lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, + lp0, rp0, lp1, rp1, + lp2, rp2, lp3, rp3, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - src_ne0, src_ne1, src_ne2, src_ne3, - mode, stream); + circular, + stream); } From 2ca02ea39e8b0346072ae6ed8d95743382045e64 Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 16:06:38 -0700 Subject: [PATCH 05/11] reword vulkan conv 1 --- src/ggml-cuda/pad.cu | 48 +++++++++----- src/ggml-vulkan/ggml-vulkan.cpp | 12 ++-- src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp | 66 +++++++++---------- 3 files changed, 71 insertions(+), 55 deletions(-) diff --git a/src/ggml-cuda/pad.cu b/src/ggml-cuda/pad.cu index 066a536d2b..eb31af6ea6 100644 --- a/src/ggml-cuda/pad.cu +++ 
b/src/ggml-cuda/pad.cu @@ -1,5 +1,9 @@ #include "pad.cuh" +__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) { + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod +} + static __global__ void pad_f32(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, @@ -19,24 +23,38 @@ static __global__ void pad_f32(const float * src, float * dst, // operation const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) && - (i1 >= lp1 && i1 < ne1 - rp1) && - (i2 >= lp2 && i2 < ne2 - rp2) && - (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t i00 = i0 - lp0; - const int64_t i01 = i1 - lp1; - const int64_t i02 = i2 - lp2; - const int64_t i03 = i3 - lp3; - const int64_t ne02 = ne2 - lp2 - rp2; - const int64_t ne01 = ne1 - lp1 - rp1; - const int64_t ne00 = ne0 - lp0 - rp0; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne00 = ne0 - lp0 - rp0; + if (circular) { + const int64_t ne03 = ne3 - lp3 - rp3; + const int64_t i00 = wrap_coord(i0 - lp0, ne00); + const int64_t i01 = wrap_coord(i1 - lp1, ne01); + const int64_t i02 = wrap_coord(i2 - lp2, ne02); + const int64_t i03 = wrap_coord(i3 - lp3, ne03); const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; - + dst[dst_idx] = src[src_idx]; - } else { - dst[dst_idx] = 0.0f; } + else { + if ((i0 >= lp0 && i0 < ne0 - rp0) && + (i1 >= lp1 && i1 < ne1 - rp1) && + (i2 >= lp2 && i2 < ne2 - rp2) && + (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t i00 = i0 - lp0; + const int64_t i01 = i1 - lp1; + const int64_t i02 = i2 - lp2; + const int64_t i03 = i3 - lp3; + + const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; + + dst[dst_idx] = src[src_idx]; + } else { + dst[dst_idx] = 0.0f; + } + } + } static void pad_f32_cuda(const float * src, float * dst, @@ -50,7 +68,7 @@ static void pad_f32_cuda(const float * src, float * dst, src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, - ne0, ne1, ne2, ne3 + ne0, ne1, ne2, ne3, circular); } diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp index 641d2a0506..fd55acf6a8 100644 --- a/src/ggml-vulkan/ggml-vulkan.cpp +++ b/src/ggml-vulkan/ggml-vulkan.cpp @@ -838,7 +838,7 @@ struct vk_op_pad_push_constants { uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; - uint32_t padding_mode; + uint32_t circular; uint32_t lp0; uint32_t rp0; uint32_t lp1; uint32_t rp1; @@ -881,7 +881,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor p.rp2 = dst->op_params[5]; p.lp3 = dst->op_params[6]; p.rp3 = dst->op_params[7]; - p.padding_mode = (uint32_t) ggml_get_op_params_i32(dst, 8); + p.padding_mode = dst->op_params[8]; return p; // fastdiv values and offsets are initialized later in ggml_vk_op } @@ -1126,6 +1126,7 @@ struct vk_op_conv2d_push_constants { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; + uint32_t circular; }; @@ -1175,6 +1176,7 @@ struct vk_op_conv_transpose_2d_push_constants { uint32_t OWOHmp; uint32_t OWOHL; uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; + uint32_t circular; }; @@ -9723,7 +9725,7 @@ static void 
ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); - p.circular = static_cast(ggml_get_op_params_i32(dst, 6)); + p.circular = static_cast(dst->op_params[6]); GGML_ASSERT(ne03 == ne2); GGML_ASSERT(ne02 == ne12); @@ -9773,7 +9775,7 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); - p.circular = static_cast(ggml_get_op_params_i32(dst, 6)); + p.circular = static_cast(dst->op_params[6]); GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne12); @@ -9798,7 +9800,7 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; - p.circular = ggml_get_op_params_i32(dst, 6); + p.circular = dst->op_params[6]; GGML_ASSERT(src0->ne[3] == p.channels); GGML_ASSERT(src1->ne[3] == p.batches); diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp index bc5ce29580..f06c5699fa 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp @@ -29,12 +29,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; uint wrap_coord(int coord, uint size) { - if (size == 0u) { - return 0u; - } - int isize = int(size); - int m = coord % isize; - return uint(m < 0 ? m + isize : m); + return ((uint)(coord + (int)size)) % size; // add size to avoid issues with negative } FLOAT_TYPE conv_2d_dw_whcn(uint idx) { @@ -49,7 +44,7 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint knl_i = c * p.knl_h * p.knl_w; FLOAT_TYPE sum = 0.0; - if (p.circular != 0 || true) { + if (p.circular) { for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; uint src_y = wrap_coord(raw_y, p.src_h); @@ -61,22 +56,22 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) { sum = fma(v, k, sum); } } - return sum; } - - for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; - } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); - sum = fma(v, k, sum); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } } return sum; @@ -95,7 +90,7 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) { uint knl_row = p.knl_w * p.channels; FLOAT_TYPE sum = 0.0; - if (p.circular != 0 || true) { + if (p.circular) { for (uint knl_y = 0; knl_y < p.knl_h; 
++knl_y) { int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; uint src_y = wrap_coord(raw_y, p.src_h); @@ -107,24 +102,25 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) { sum = fma(v, k, sum); } } - return sum; } - - for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; - } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); - sum = fma(v, k, sum); - } + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } + } + return sum; } From aa7dcc5e57d5d1d088dbc89725e15e1553f6ddc6 Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 16:13:17 -0700 Subject: [PATCH 06/11] reset conv2d back to most recent version --- src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 71 +++---------------- 1 file changed, 9 insertions(+), 62 deletions(-) diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 1745a90bca..86bafba4a4 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -11,12 +11,12 @@ # extension GL_KHR_shader_subgroup_shuffle : enable #endif -#include "types.glsl" +#include "types.comp" // shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j layout(binding = 0) readonly buffer A { A_TYPE knl_data[]; -}; // src0 - kernel: [KW, KH, Cin, Cout] for conv_2d, [KW, KH, Cout, Cin] for conv_transposed_2d +}; // src0 - kernel: [KW, KH, Cin, Cout] layout(binding = 1) readonly buffer B { B_TYPE src_data[]; @@ -66,11 +66,6 @@ layout(push_constant) uniform parameter { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; -#ifdef TRANSPOSE - uint32_t s0mp; uint32_t s0L; - uint32_t s1mp; uint32_t s1L; -#endif - uint32_t circular; } p; @@ -230,11 +225,7 @@ void main() { uint32_t B_ly = r_offset + Ar; uint32_t B_lx = Ac; uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/ -#ifdef TRANSPOSE - uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03, K * CRS - 1); -#else uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1); -#endif float val = knl_data[knl_idx]; if (K_idx >= K || CRS_idx_a >= CRS) { val = 0.0; @@ -276,59 +267,15 @@ void main() { KW_idx_b = CRS_remainder - KH_idx_b * p.KW; #endif -#ifdef TRANSPOSE - float val = 0.0; - if (p.circular == 0u && false) { - uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1; - uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0; - uint32_t H_idx = 
fastdiv(H_idx_x_s1, p.s1mp, p.s1L); - uint32_t W_idx = fastdiv(W_idx_x_s0, p.s0mp, p.s0L); - uint32_t src_idx = min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0u), p.Cin * p.N * p.W * p.H - 1u); - val = src_data[src_idx]; - if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx >= p.H || W_idx >= p.W || - (H_idx_x_s1 - H_idx * p.s1 != 0u) || (W_idx_x_s0 - W_idx * p.s0 != 0u)) { - val = 0.0; - } - } else { - int stride_y = int(p.s1); - int stride_x = int(p.s0); - bool aligned = (stride_y != 0) && (stride_x != 0); - int H_raw = int(OH_idx) * stride_y + int(KH_idx_b) * int(p.d1) - int(p.p1); - int W_raw = int(OW_idx) * stride_x + int(KW_idx_b) * int(p.d0) - int(p.p0); - if (aligned) { - aligned = (H_raw % stride_y == 0) && (W_raw % stride_x == 0); - } - if (aligned && p.H > 0u && p.W > 0u) { - int H_idx = H_raw / stride_y; - int W_idx = W_raw / stride_x; - int sizeH = int(p.H); - int sizeW = int(p.W); - H_idx = (H_idx % sizeH + sizeH) % sizeH; - W_idx = (W_idx % sizeW + sizeW) % sizeW; - uint32_t src_idx = uint32_t(W_idx) + uint32_t(H_idx) * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13; - val = src_data[src_idx]; - } - } - Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val); -#else - int H_idx = int(OH_idx) * int(p.s1) + int(KH_idx_b) * int(p.d1) - int(p.p1); - int W_idx = int(OW_idx) * int(p.s0) + int(KW_idx_b) * int(p.d0) - int(p.p0); - if (p.circular != 0u || true) { - if (p.H > 0u) { - H_idx = (H_idx % int(p.H) + int(p.H)) % int(p.H); - } - if (p.W > 0u) { - W_idx = (W_idx % int(p.W) + int(p.W)) % int(p.W); - } - } - float val = 0.0; - if (CRS_idx_b < CRS && NPQ_idx < NPQ && Cin_idx_b < p.Cin && - H_idx >= 0 && H_idx < int(p.H) && W_idx >= 0 && W_idx < int(p.W)) { - uint32_t src_idx = uint32_t(W_idx) + uint32_t(H_idx) * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13; - val = src_data[src_idx]; + uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; + uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; + uint32_t src_idx = + min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + float val = src_data[src_idx]; + if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) { + val = 0.0; } Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val); -#endif } barrier(); #ifdef COOPMAT2 From d29330b1d6c210b776a56b1cc53c4e62e5f8c28f Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 16:36:15 -0700 Subject: [PATCH 07/11] clean up pad.comp --- src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 30 ++++++-- src/ggml-vulkan/vulkan-shaders/pad.comp | 71 ++++--------------- 2 files changed, 37 insertions(+), 64 deletions(-) diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 86bafba4a4..2bd1afad35 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -66,6 +66,8 @@ layout(push_constant) uniform parameter { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; + + uint32_t circular; } p; @@ -152,6 +154,10 @@ uint fastdiv(uint n, uint mp, uint L) { return (msbs + n) >> L; } +uint wrap_coord(int coord, uint size) { + return ((uint)(coord + (int)size)) % size; // add size to avoid issues with negative +} + #ifdef COOPMAT2 #define ACC_TYPE float16_t @@ -267,13 +273,23 @@ void main() { KW_idx_b = CRS_remainder - KH_idx_b * p.KW; #endif - uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; - uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - 
p.p0; - uint32_t src_idx = - min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); - float val = src_data[src_idx]; - if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) { - val = 0.0; + float val; + if (p.circular == 1u) { + int H_raw = int(OH_idx) * int(p.s1) + int(KH_idx_b) * int(p.d1) - int(p.p1); + int W_raw = int(OW_idx) * int(p.s0) + int(KW_idx_b) * int(p.d0) - int(p.p0); + uint32_t H_idx = wrap_coord(H_raw, p.H); + uint32_t W_idx = wrap_coord(W_raw, p.W); + uint32_t src_idx = W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13; + val = src_data[src_idx]; + } else { + uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; + uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; + uint32_t src_idx = + min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + float val = src_data[src_idx]; + if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) { + val = 0.0; + } } Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val); } diff --git a/src/ggml-vulkan/vulkan-shaders/pad.comp b/src/ggml-vulkan/vulkan-shaders/pad.comp index 45c65a41e8..65e14935d9 100644 --- a/src/ggml-vulkan/vulkan-shaders/pad.comp +++ b/src/ggml-vulkan/vulkan-shaders/pad.comp @@ -1,40 +1,12 @@ #version 450 -#include "types.glsl" - -layout (push_constant) uniform parameter -{ - uint ne; - uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; - uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; - uint misalign_offsets; - - uint padding_mode; - - uint lp0; uint rp0; - uint lp1; uint rp1; - uint lp2; uint rp2; - uint lp3; uint rp3; -} p; - -uint get_aoffset() { return p.misalign_offsets >> 16; } -uint get_doffset() { return p.misalign_offsets & 0xFFFF; } - -layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; +#include "types.comp" +#include "generic_unary_head.comp" layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; -const uint PAD_MODE_ZERO = 0u; -const uint PAD_MODE_CIRCULAR = 1u; - uint wrap_coord(int coord, uint size) { - if (size == 0u) { - return 0u; - } - int isize = int(size); - int m = coord % isize; - return uint(m < 0 ? m + isize : m); + return ((uint)(coord + (int)size)) % size; // add size to avoid issues with negative } void main() { @@ -51,35 +23,20 @@ void main() { const uint i1 = (idx - i3_offset - i2_offset) / p.ne10; const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; - const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00; + const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; - const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 && - i1 >= p.lp1 && i1 < p.ne11 - p.rp1 && - i2 >= p.lp2 && i2 < p.ne12 - p.rp2 && - i3 >= p.lp3 && i3 < p.ne13 - p.rp3; - - if (p.padding_mode == PAD_MODE_ZERO) { - data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? 
data_a[get_aoffset() + src0_idx] : 0.0f); - } - else if(p.padding_mode == PAD_MODE_CIRCULAR) { - // Circular padding - const uint src_ne0 = p.ne00; - const uint src_ne1 = p.ne01; - const uint src_ne2 = p.ne02; - const uint src_ne3 = p.ne03; - - if (src_ne0 == 0u || src_ne1 == 0u || src_ne2 == 0u || src_ne3 == 0u) { - data_d[get_doffset() + dst_idx] = D_TYPE(0.0f); - return; - } - - const uint ci0 = wrap_coord(int(i0) - int(p.lp0), src_ne0); - const uint ci1 = wrap_coord(int(i1) - int(p.lp1), src_ne1); - const uint ci2 = wrap_coord(int(i2) - int(p.lp2), src_ne2); - const uint ci3 = wrap_coord(int(i3) - int(p.lp3), src_ne3); - + if (p.circular) { + const uint ci0 = wrap_coord(int(i0) - int(p.lp0), p.ne00); + const uint ci1 = wrap_coord(int(i1) - int(p.lp1), p.ne01); + const uint ci2 = wrap_coord(int(i2) - int(p.lp2), p.ne02); + const uint ci3 = wrap_coord(int(i3) - int(p.lp3), p.ne03); const uint circular_src_idx = ci3*p.nb03 + ci2*p.nb02 + ci1*p.nb01 + ci0*p.nb00; data_d[get_doffset() + dst_idx] = D_TYPE(data_a[get_aoffset() + circular_src_idx]); } + else { + const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; + data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f); + } + } From 25d358c627186901b6506ee70faed598613eff05 Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 17:08:27 -0700 Subject: [PATCH 08/11] clean up a bit more --- include/ggml.h | 26 ++++++++++++++++++-- src/ggml-vulkan/ggml-vulkan.cpp | 2 +- src/ggml.c | 43 +++++++++++++++++++++++++++------ 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/include/ggml.h b/include/ggml.h index 0e2b35d7a2..9cda308e4b 100644 --- a/include/ggml.h +++ b/include/ggml.h @@ -1987,11 +1987,22 @@ extern "C" { int dilation0, int dilation1); - GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct_circular( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - int stride); + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + +GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride); GGML_API struct ggml_tensor * ggml_conv_2d_direct( struct ggml_context * ctx, @@ -2004,6 +2015,17 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + GGML_API struct ggml_tensor * ggml_conv_2d_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + GGML_API struct ggml_tensor * ggml_conv_3d_direct( struct ggml_context * ctx, struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC] diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp index fd55acf6a8..93b4405937 100644 --- a/src/ggml-vulkan/ggml-vulkan.cpp +++ b/src/ggml-vulkan/ggml-vulkan.cpp @@ -881,7 +881,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor p.rp2 = dst->op_params[5]; p.lp3 = dst->op_params[6]; p.rp3 = dst->op_params[7]; - p.padding_mode = dst->op_params[8]; + p.circular = dst->op_params[8]; return p; // fastdiv values and offsets are initialized later in ggml_vk_op } diff --git a/src/ggml.c b/src/ggml.c index 4d57ff30a8..745ed10cde 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4558,7 +4558,7 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 }; 
ggml_set_op_params(result, params, sizeof(params)); - ggml_set_op_params_i32(result, 6, 0); + ggml_set_op_params_i32(result, 6, 0); // default not circular result->op = GGML_OP_CONV_2D_DW; result->src[0] = a; @@ -4566,6 +4566,22 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( return result; } +struct ggml_tensor * ggml_conv_2d_dw_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1) { + struct ggml_tensor * result = + ggml_conv_2d_dw_direct(ctx, a, b, stride0, stride1, pad0, pad1, dilation0, dilation1); + ggml_set_op_params_i32(result, 6, 1); + return result; +} + // ggml_conv_2d_direct struct ggml_tensor * ggml_conv_2d_direct( @@ -4577,8 +4593,7 @@ struct ggml_tensor * ggml_conv_2d_direct( int p0, // padding dimension 0 int p1, // padding dimension 1 int d0, // dilation dimension 0 - int d1) {// dilation dimension 1 - + int d1) { // dilation dimension 1 GGML_ASSERT(a->ne[2] == b->ne[2]); //GGML_ASSERT(a->type == b->type); @@ -4596,7 +4611,7 @@ struct ggml_tensor * ggml_conv_2d_direct( ggml_set_op_params_i32(result, 3, p1); ggml_set_op_params_i32(result, 4, d0); ggml_set_op_params_i32(result, 5, d1); - ggml_set_op_params_i32(result, 6, 0); + ggml_set_op_params_i32(result, 6, 0); // default not circular result->op = GGML_OP_CONV_2D; result->src[0] = a; @@ -4605,6 +4620,21 @@ struct ggml_tensor * ggml_conv_2d_direct( return result; } +struct ggml_tensor * ggml_conv_2d_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_tensor * result = ggml_conv_2d_direct(ctx, a, b, s0, s1, p0, p1, d0, d1); + ggml_set_op_params_i32(result, 6, 1); + return result; +} + // ggml_conv_3d_direct struct ggml_tensor * ggml_conv_3d_direct( @@ -4869,12 +4899,11 @@ struct ggml_tensor * ggml_pad_ext( ggml_set_op_params_i32(result, 5, rp2); ggml_set_op_params_i32(result, 6, lp3); ggml_set_op_params_i32(result, 7, rp3); + ggml_set_op_params_i32(result, 8, 0); // not circular by default result->op = GGML_OP_PAD; result->src[0] = a; - ggml_set_pad_mode(result, GGML_PAD_MODE_ZERO); - return result; } @@ -4891,7 +4920,7 @@ struct ggml_tensor * ggml_pad_circular( int rp3 ) { struct ggml_tensor * result = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); - ggml_set_pad_mode(result, GGML_PAD_MODE_CIRCULAR); + ggml_set_op_params_i32(result, 8, 1); return result; } From 6eb26b3c74ed06f600e61f48d62dc39f9c1166c0 Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 13:26:36 -0700 Subject: [PATCH 09/11] cleanup shaders --- include/ggml.h | 11 +++++++ src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp | 6 ++-- src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 4 +-- src/ggml-vulkan/vulkan-shaders/pad.comp | 32 +++++++++++++++---- src/ggml.c | 18 +++++++++++ 5 files changed, 60 insertions(+), 11 deletions(-) diff --git a/include/ggml.h b/include/ggml.h index 9cda308e4b..fa3d657002 100644 --- a/include/ggml.h +++ b/include/ggml.h @@ -1987,6 +1987,17 @@ extern "C" { int dilation0, int dilation1); + GGML_API struct ggml_tensor * ggml_conv_2d_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct_circular( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp 
b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp index f06c5699fa..77af6873a0 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp @@ -29,7 +29,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; uint wrap_coord(int coord, uint size) { - return ((uint)(coord + (int)size)) % size; // add size to avoid issues with negative + return (uint(coord + int(size))) % size; // add size to avoid issues with negative } FLOAT_TYPE conv_2d_dw_whcn(uint idx) { @@ -44,7 +44,7 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint knl_i = c * p.knl_h * p.knl_w; FLOAT_TYPE sum = 0.0; - if (p.circular) { + if (p.circular != 0) { for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; uint src_y = wrap_coord(raw_y, p.src_h); @@ -90,7 +90,7 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) { uint knl_row = p.knl_w * p.channels; FLOAT_TYPE sum = 0.0; - if (p.circular) { + if (p.circular != 0) { for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; uint src_y = wrap_coord(raw_y, p.src_h); diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 2bd1afad35..0f744b529b 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -11,7 +11,7 @@ # extension GL_KHR_shader_subgroup_shuffle : enable #endif -#include "types.comp" +#include "types.glsl" // shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j layout(binding = 0) readonly buffer A { @@ -155,7 +155,7 @@ uint fastdiv(uint n, uint mp, uint L) { } uint wrap_coord(int coord, uint size) { - return ((uint)(coord + (int)size)) % size; // add size to avoid issues with negative + return (uint(coord + int(size))) % size; // add size to avoid issues with negative } #ifdef COOPMAT2 diff --git a/src/ggml-vulkan/vulkan-shaders/pad.comp b/src/ggml-vulkan/vulkan-shaders/pad.comp index 65e14935d9..45c61f8708 100644 --- a/src/ggml-vulkan/vulkan-shaders/pad.comp +++ b/src/ggml-vulkan/vulkan-shaders/pad.comp @@ -1,12 +1,32 @@ #version 450 -#include "types.comp" -#include "generic_unary_head.comp" +#include "types.glsl" + +layout (push_constant) uniform parameter +{ + uint ne; + uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; + uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; + uint misalign_offsets; + + uint lp0; uint rp0; + uint lp1; uint rp1; + uint lp2; uint rp2; + uint lp3; uint rp3; + + uint circular; +} p; + +uint get_aoffset() { return p.misalign_offsets >> 16; } +uint get_doffset() { return p.misalign_offsets & 0xFFFF; } + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; uint wrap_coord(int coord, uint size) { - return ((uint)(coord + (int)size)) % size; // add size to avoid issues with negative + return (uint(coord + int(size))) % size; // add size to avoid issues with negative } void main() { @@ -23,10 +43,10 @@ void main() { const uint i1 = (idx - i3_offset - i2_offset) / p.ne10; const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; - const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; + const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - 
p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00; const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; - if (p.circular) { + if (p.circular != 0u) { const uint ci0 = wrap_coord(int(i0) - int(p.lp0), p.ne00); const uint ci1 = wrap_coord(int(i1) - int(p.lp1), p.ne01); const uint ci2 = wrap_coord(int(i2) - int(p.lp2), p.ne02); @@ -38,5 +58,5 @@ void main() { const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f); } - } + diff --git a/src/ggml.c b/src/ggml.c index 745ed10cde..0f04ba896c 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4397,6 +4397,24 @@ struct ggml_tensor * ggml_conv_2d( return result; } +struct ggml_tensor * ggml_conv_2d_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + if (p0 == 0 && p1 == 0) { + return ggml_conv_2d(ctx, a, b, s0, s1, p0, p1, d0, d1); + } + + struct ggml_tensor * b_padded = ggml_pad_circular(ctx, b, p0, p0, p1, p1, 0, 0, 0, 0); + return ggml_conv_2d(ctx, a, b_padded, s0, s1, 0, 0, d0, d1); +} + // a: [OC*IC, KD, KH, KW] // b: [N*IC, ID, IH, IW] // result: [N*OD, OH, OW, IC * KD * KH * KW] From c78d545209ed283f3a6cbafe9f58a26b7ca2f129 Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 15:34:29 -0700 Subject: [PATCH 10/11] Finish circular patch --- src/ggml-vulkan/ggml-vulkan.cpp | 2 +- src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 60 ++++++++++++------- src/ggml-vulkan/vulkan-shaders/pad.comp | 6 +- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp index 93b4405937..8d1e69a508 100644 --- a/src/ggml-vulkan/ggml-vulkan.cpp +++ b/src/ggml-vulkan/ggml-vulkan.cpp @@ -9775,7 +9775,7 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); - p.circular = static_cast(dst->op_params[6]); + p.circular = 0; GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne12); diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 0f744b529b..098c0178d2 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -16,7 +16,7 @@ // shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j layout(binding = 0) readonly buffer A { A_TYPE knl_data[]; -}; // src0 - kernel: [KW, KH, Cin, Cout] +}; // src0 - kernel: [KW, KH, Cin, Cout] for conv_2d, [KW, KH, Cout, Cin] for conv_transposed_2d layout(binding = 1) readonly buffer B { B_TYPE src_data[]; @@ -66,6 +66,10 @@ layout(push_constant) uniform parameter { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; +#ifdef TRANSPOSE + uint32_t s0mp; uint32_t s0L; + uint32_t s1mp; uint32_t s1L; +#endif uint32_t circular; } @@ -95,6 +99,10 @@ uint32_t NPQ = p.N * p.OH * p.OW; uint32_t n_elems_out = K * NPQ; +uint32_t wrap_coord(int coord, uint32_t size) { + return uint32_t((uint(coord + int(size))) % size); +} + // Number of blocktiles per input uint32_t NB_CRS = splitWork(CRS, BS_CRS); @@ -154,10 +162,6 @@ uint fastdiv(uint n, uint mp, uint L) { return (msbs + n) >> L; } -uint wrap_coord(int coord, uint size) { - return (uint(coord + int(size))) % size; // add size to avoid issues with negative -} - #ifdef COOPMAT2 #define ACC_TYPE 
float16_t @@ -231,7 +235,11 @@ void main() { uint32_t B_ly = r_offset + Ar; uint32_t B_lx = Ac; uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/ +#ifdef TRANSPOSE + uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03, K * CRS - 1); +#else uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1); +#endif float val = knl_data[knl_idx]; if (K_idx >= K || CRS_idx_a >= CRS) { val = 0.0; @@ -273,23 +281,31 @@ void main() { KW_idx_b = CRS_remainder - KH_idx_b * p.KW; #endif - float val; - if (p.circular == 1u) { - int H_raw = int(OH_idx) * int(p.s1) + int(KH_idx_b) * int(p.d1) - int(p.p1); - int W_raw = int(OW_idx) * int(p.s0) + int(KW_idx_b) * int(p.d0) - int(p.p0); - uint32_t H_idx = wrap_coord(H_raw, p.H); - uint32_t W_idx = wrap_coord(W_raw, p.W); - uint32_t src_idx = W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13; - val = src_data[src_idx]; - } else { - uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; - uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; - uint32_t src_idx = - min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); - float val = src_data[src_idx]; - if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) { - val = 0.0; - } + uint32_t H_pos; + uint32_t W_pos; +#ifdef TRANSPOSE + uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1; + uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0; + uint32_t H_idx = fastdiv(H_idx_x_s1, p.s1mp, p.s1L); + uint32_t W_idx = fastdiv(W_idx_x_s0, p.s0mp, p.s0L); + H_pos = (p.circular != 0) ? wrap_coord(int(H_idx), p.H) : H_idx; + W_pos = (p.circular != 0) ? wrap_coord(int(W_idx), p.W) : W_idx; +#else + int H_raw = int(OH_idx) * int(p.s1) + int(KH_idx_b) * int(p.d1) - int(p.p1); + int W_raw = int(OW_idx) * int(p.s0) + int(KW_idx_b) * int(p.d0) - int(p.p0); + H_pos = (p.circular != 0) ? wrap_coord(H_raw, p.H) : uint32_t(H_raw); + W_pos = (p.circular != 0) ? wrap_coord(W_raw, p.W) : uint32_t(W_raw); +#endif + uint32_t src_idx = + min(max(W_pos + H_pos * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + float val = src_data[src_idx]; + if (CRS_idx_b >= CRS || NPQ_idx >= NPQ + || H_pos >= p.H || W_pos >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case) +#ifdef TRANSPOSE + || (H_idx_x_s1 - H_idx * p.s1 != 0) || (W_idx_x_s0 - W_idx * p.s0 != 0) +#endif + ) { + val = 0.0; } Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val); } diff --git a/src/ggml-vulkan/vulkan-shaders/pad.comp b/src/ggml-vulkan/vulkan-shaders/pad.comp index 45c61f8708..b6a0cf933e 100644 --- a/src/ggml-vulkan/vulkan-shaders/pad.comp +++ b/src/ggml-vulkan/vulkan-shaders/pad.comp @@ -8,13 +8,12 @@ layout (push_constant) uniform parameter uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint misalign_offsets; + uint circular; uint lp0; uint rp0; uint lp1; uint rp1; uint lp2; uint rp2; uint lp3; uint rp3; - - uint circular; } p; uint get_aoffset() { return p.misalign_offsets >> 16; } @@ -58,5 +57,4 @@ void main() { const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? 
data_a[get_aoffset() + src0_idx] : 0.0f); } -} - +} \ No newline at end of file From 8369d950d4049b57c655ef5ceb868a9abfd54b2f Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 16:52:36 -0700 Subject: [PATCH 11/11] remove unneeded pad mode stuff --- include/ggml.h | 7 ------- src/ggml.c | 7 ------- 2 files changed, 14 deletions(-) diff --git a/include/ggml.h b/include/ggml.h index 4f140d2b13..6e4c867f84 100644 --- a/include/ggml.h +++ b/include/ggml.h @@ -2179,13 +2179,6 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( int64_t ne3, uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...] - enum ggml_pad_mode { - GGML_PAD_MODE_ZERO = 0, - GGML_PAD_MODE_CIRCULAR = 1, - }; - - GGML_API void ggml_set_pad_mode(struct ggml_tensor * tensor, enum ggml_pad_mode mode); - // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] GGML_API struct ggml_tensor * ggml_pad( struct ggml_context * ctx, diff --git a/src/ggml.c b/src/ggml.c index d56ad065ca..e7397405bc 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4932,13 +4932,6 @@ struct ggml_tensor * ggml_interpolate( return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode); } -GGML_API void ggml_set_pad_mode(struct ggml_tensor * tensor, enum ggml_pad_mode mode) { - GGML_ASSERT(tensor != NULL); - GGML_ASSERT(tensor->op == GGML_OP_PAD); - GGML_ASSERT(mode == GGML_PAD_MODE_ZERO || mode == GGML_PAD_MODE_CIRCULAR); - ggml_set_op_params_i32(tensor, 8, (int32_t) mode); -} - // ggml_pad struct ggml_tensor * ggml_pad(