diff --git a/include/ggml.h b/include/ggml.h
index d948b00cc..6e4c867f8 100644
--- a/include/ggml.h
+++ b/include/ggml.h
@@ -2031,11 +2031,33 @@ extern "C" {
         int dilation0,
         int dilation1);
 
-    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
+    GGML_API struct ggml_tensor * ggml_conv_2d_circular(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
-        int stride);
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1);
+
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride0,
+        int stride1,
+        int pad0,
+        int pad1,
+        int dilation0,
+        int dilation1);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride);
 
     GGML_API struct ggml_tensor * ggml_conv_2d_direct(
         struct ggml_context * ctx,
@@ -2048,6 +2070,17 @@ extern "C" {
         int d0, // dilation dimension 0
         int d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1);
+
     GGML_API struct ggml_tensor * ggml_conv_3d_direct(
         struct ggml_context * ctx,
         struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
@@ -2168,6 +2201,19 @@ extern "C" {
         int rp3
     );
 
+    GGML_API struct ggml_tensor * ggml_pad_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int lp0,
+        int rp0,
+        int lp1,
+        int rp1,
+        int lp2,
+        int rp2,
+        int lp3,
+        int rp3
+    );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
         struct ggml_context * ctx,
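Usage sketch (illustrative only, not part of the patch): the declarations above are the whole public surface of this change. The snippet below only builds graph nodes with the new entry points — the tensor sizes, the helper name toy_circular_conv, and the 64 MiB scratch buffer are arbitrary, and nothing is dispatched to a backend. Wrap-padding the input with ggml_pad_circular and then running a zero-padded ggml_conv_2d_direct should give the same result as the fused ggml_conv_2d_direct_circular, which mirrors how ggml_conv_2d_circular is implemented in src/ggml.c further down (circular pad, then a conv with zero padding).

#include "ggml.h"

// hypothetical helper, only to show how the new API composes
static struct ggml_tensor * toy_circular_conv(struct ggml_context * ctx) {
    // input [W, H, C_in, N] and kernel [KW, KH, C_in, C_out], both F32
    struct ggml_tensor * inp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 3, 1);
    struct ggml_tensor * ker = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3, 3, 8);

    // (1) wrap-pad W and H by one element on each side, then a plain direct
    //     conv with zero padding ...
    struct ggml_tensor * padded = ggml_pad_circular(ctx, inp, 1, 1, 1, 1, 0, 0, 0, 0);
    struct ggml_tensor * out_a  = ggml_conv_2d_direct(ctx, ker, padded, 1, 1, 0, 0, 1, 1);

    // (2) ... or the fused form, which wraps source coordinates inside the
    //     backend kernels; both build a [16, 16, 8, 1] output node
    struct ggml_tensor * out_b = ggml_conv_2d_direct_circular(ctx, ker, inp, 1, 1, 1, 1, 1, 1);

    (void) out_a;
    return out_b;
}

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 64*1024*1024, // arbitrary size for this toy example
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);
    (void) toy_circular_conv(ctx);
    ggml_free(ctx);
    return 0;
}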
diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp
index b52f0f847..3db28021d 100644
--- a/src/ggml-cpu/ops.cpp
+++ b/src/ggml-cpu/ops.cpp
@@ -6660,6 +6660,12 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
     ggml_compute_forward_mul_mat(params, &dst);
 }
 
+// ggml_wrap_coord
+
+static inline int64_t ggml_wrap_coord(int64_t coord, int64_t size) {
+    return (coord + size) % size; // add size so negative coordinates wrap correctly (assumes coord >= -size)
+}
+
 // ggml_compute_forward_conv_2d
 
 static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
@@ -6680,6 +6686,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
     const int32_t pad_y = dst->op_params[3];
     const int32_t dilation_x = dst->op_params[4];
     const int32_t dilation_y = dst->op_params[5];
+    const bool circular = ggml_get_op_params_i32(dst, 6) != 0;
 
     const int64_t c_in = src->ne[2];
     const int64_t c_out = kernel->ne[3];
@@ -6736,13 +6743,19 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
 
                     int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
 
-                    float src_val;
-                    if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
-                        src_val = 0.0f;
-                    } else {
+                    float src_val = 0.0f;
+                    if (circular) {
+                        const int64_t sy_wrapped = ggml_wrap_coord(sy, src_h);
+                        const int64_t sx_wrapped = ggml_wrap_coord(sx, src_w);
+                        const float * src_ptr = (const float *)((const char *)src_base + sx_wrapped * src->nb[0] + sy_wrapped * src->nb[1] + ic * src->nb[2]);
+                        src_val = *src_ptr;
+                    } else if (sy >= 0 && sy < src_h && sx >= 0 && sx < src_w) {
                         const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
                         src_val = *src_ptr;
                     }
+                    else {
+                        src_val = 0.0f; // out of bounds: zero padding
+                    }
 
                     char * element_ptr = dst_row + dst_idx * traits->type_size;
                     if (kernel_type == GGML_TYPE_F32) {
@@ -7052,6 +7065,7 @@ struct ggml_conv_2d_dw_params {
     int pad_y;
     int dilation_x;
     int dilation_y;
+    int circular;
 };
 
 static void ggml_compute_forward_conv_2d_dw_cwhn(
@@ -7063,6 +7077,7 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
 
     const int64_t c = p.channels;
     const float * knl_data = (const float *)kernel->data;
+    const bool circular = p.circular != 0;
 
     const int64_t rows_total = p.dst_h * p.batch;
     const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
@@ -7090,13 +7105,17 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
         for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
             GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
             for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
-                const int64_t src_y = src_y_base + knl_y * p.dilation_y;
-                if (src_y < 0 || src_y >= p.src_h) {
+                int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                if (circular) {
+                    src_y = ggml_wrap_coord(src_y, p.src_h);
+                } else if (src_y < 0 || src_y >= p.src_h) {
                     continue;
                 }
                 for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
-                    const int64_t src_x = src_x_base + knl_x * p.dilation_x;
-                    if (src_x < 0 || src_x >= p.src_w) {
+                    int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                    if (circular) {
+                        src_x = ggml_wrap_coord(src_x, p.src_w);
+                    } else if (src_x < 0 || src_x >= p.src_w) {
                         continue;
                     }
                     GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
@@ -7111,13 +7130,17 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
         for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
             float sum = 0.0f;
             for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
-                const int64_t src_y = src_y_base + knl_y * p.dilation_y;
-                if (src_y < 0 || src_y >= p.src_h) {
+                int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                if (circular) {
+                    src_y = ggml_wrap_coord(src_y, p.src_h);
+                } else if (src_y < 0 || src_y >= p.src_h) {
                     continue;
                 }
                 for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
-                    const int64_t src_x = src_x_base + knl_x * p.dilation_x;
-                    if (src_x < 0 || src_x >= p.src_w) {
+                    int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                    if (circular) {
+                        src_x = ggml_wrap_coord(src_x, p.src_w);
+                    } else if (src_x < 0 || src_x >= p.src_w) {
                         continue;
                     }
                     sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
@@ -7138,6 +7161,7 @@ static void ggml_compute_forward_conv_2d_dw_whcn(
     const ggml_conv_2d_dw_params & p) {
 
     const int64_t n = p.channels * p.batch;
+    const bool circular = p.circular != 0;
     const int64_t per_thread = (n + params->nth - 1) / params->nth;
     const int64_t start = params->ith * per_thread;
     const int64_t end = MIN(start + per_thread, n);
@@ -7152,13 +7176,17 @@ static void ggml_compute_forward_conv_2d_dw_whcn(
 
             float sum = 0.0f;
             for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
-                const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
-                if (src_y < 0 || src_y >= p.src_h) {
+                int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+                if (circular) {
+                    src_y = ggml_wrap_coord(src_y, p.src_h);
+                } else if (src_y < 0 || src_y >= p.src_h) {
                     continue;
                 }
                 for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
-                    const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
-                    if (src_x < 0 || src_x >= p.src_w) {
+                    int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+                    if (circular) {
+                        src_x = ggml_wrap_coord(src_x, p.src_w);
+                    } else if (src_x < 0 || src_x >= p.src_w) {
                         continue;
                     }
                     sum += knl_data[knl_y * p.knl_w + knl_x]
@@ -7192,6 +7220,7 @@ void ggml_compute_forward_conv_2d_dw(
p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = ggml_get_op_params_i32(dst, 6); GGML_ASSERT(kernel->ne[3] == p.channels); GGML_ASSERT(dst->ne[3] == p.batch); @@ -7612,24 +7641,51 @@ static void ggml_compute_forward_pad_f32( const int32_t rp2 = ggml_get_op_params_i32(dst, 5); const int32_t lp3 = ggml_get_op_params_i32(dst, 6); const int32_t rp3 = ggml_get_op_params_i32(dst, 7); + const int32_t mode = ggml_get_op_params_i32(dst, 8); + const bool circular = mode == GGML_PAD_MODE_CIRCULAR; // TODO: optimize - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) \ - && (i1 >= lp1 && i1 < ne1 - rp1) \ - && (i2 >= lp2 && i2 < ne2 - rp2) \ - && (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + if (!circular) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + if ((i0 >= lp0 && i0 < ne0 - rp0) \ + && (i1 >= lp1 && i1 < ne1 - rp1) \ + && (i2 >= lp2 && i2 < ne2 - rp2) \ + && (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + const float * src_ptr = (const float *)((char *) src0->data + src_idx); + dst_ptr[dst_idx] = *src_ptr; + } else { + dst_ptr[dst_idx] = 0; + } + } + } + } + } + } else { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + const int64_t src_i0 = ggml_wrap_coord(i0 - lp0, ne00); + const int64_t src_i1 = ggml_wrap_coord(i1 - lp1, ne01); + const int64_t src_i2 = ggml_wrap_coord(i2 - lp2, ne02); + const int64_t src_i3 = ggml_wrap_coord(i3 - lp3, ne03); + + const int64_t src_idx = + src_i3*nb03 + + src_i2*nb02 + + src_i1*nb01 + + src_i0*nb00; + const float * src_ptr = (const float *)((char *) src0->data + src_idx); dst_ptr[dst_idx] = *src_ptr; - } else { - dst_ptr[dst_idx] = 0; } } } diff --git a/src/ggml-cuda/conv2d-dw.cu b/src/ggml-cuda/conv2d-dw.cu index 7583233b1..a49202dee 100644 --- a/src/ggml-cuda/conv2d-dw.cu +++ b/src/ggml-cuda/conv2d-dw.cu @@ -8,6 +8,7 @@ struct conv_params { int padding_x, padding_y; int dilation_x, dilation_y; int channels, batches; + int circular; }; struct kernel_bounds { @@ -15,16 +16,27 @@ struct kernel_bounds { int x_min, x_max; }; +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod +} + __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) { kernel_bounds bounds; - bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.y_max = - min(params.kernel_h, - (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); - bounds.x_max = - 
min(params.kernel_w, - (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + if (params.circular) { + bounds.y_min = 0; + bounds.y_max = params.kernel_h; + bounds.x_min = 0; + bounds.x_max = params.kernel_w; + } else { + bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.y_max = + min(params.kernel_h, + (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + bounds.x_max = + min(params.kernel_w, + (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + } return bounds; } @@ -83,7 +95,7 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr const int in_w, const int in_h, const int out_w, const int out_h, const int kernel_w, const int kernel_h, const int stride_x, const int stride_y, const int padding_x, const int padding_y, const int dilation_x, const int dilation_y, - const int channels, const int batches) { + const int channels, const int batches, const int circular) { const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = batches * channels * out_h * out_w; @@ -92,19 +104,30 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr } conv_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, - stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches }; + stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches, circular }; int batch_idx, channel_idx, out_y_idx, out_x_idx; Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx); T accumulator = 0; kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params); + for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) { int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y); + if (params.circular) { + in_y_idx = wrap_coord(in_y_idx, params.in_h); + } else if (in_y_idx < 0 || in_y_idx >= params.in_h) { + continue; + } for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) { int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x); + if (params.circular) { + in_x_idx = wrap_coord(in_x_idx, params.in_w); + } else if (in_x_idx < 0 || in_x_idx >= params.in_w) { + continue; + } const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)]; const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)]; @@ -132,6 +155,7 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const int padding_y = p[3]; const int dilation_x = p[4]; const int dilation_y = p[5]; + const int circular = p[6]; const int in_w = input->ne[0]; const int in_h = input->ne[1]; @@ -150,11 +174,11 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) if (ggml_is_contiguous(input)) { conv2d_dw_kernel<<>>( x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, - dilation_x, dilation_y, channels, batches); + dilation_x, dilation_y, channels, batches, circular); } else if (ggml_is_contiguous_channels(input)) { conv2d_dw_kernel<<>>( x_d, w_d, y_d, in_w, in_h, out_w, out_h, 
kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, - dilation_x, dilation_y, channels, batches); + dilation_x, dilation_y, channels, batches, circular); } else { GGML_ABORT("Unsupported memory layout for conv_2d_dw"); } diff --git a/src/ggml-cuda/conv2d.cu b/src/ggml-cuda/conv2d.cu index 142dd6690..c8e49fa87 100644 --- a/src/ggml-cuda/conv2d.cu +++ b/src/ggml-cuda/conv2d.cu @@ -10,6 +10,7 @@ struct conv_params { const int64_t DL_X, DL_Y; const int64_t IC, OC; const int64_t B; + const int64_t circular; const int64_t TOTAL; }; @@ -18,6 +19,10 @@ struct kernel_bounds { int64_t x_min, x_max; }; +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod +} + __device__ __forceinline__ int64_t max64(int64_t a, int64_t b) { return (a > b) ? a : b; } @@ -28,10 +33,18 @@ __device__ __forceinline__ int64_t min64(int64_t a, int64_t b) { __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv_params & P) { kernel_bounds bounds; - bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); - bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + if (P.circular) { + bounds.y_min = 0; + bounds.y_max = P.KH; + bounds.x_min = 0; + bounds.x_max = P.KW; + } + else { + bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + } return bounds; } @@ -89,11 +102,16 @@ static __global__ void conv2d_kernel(const float * __restrict__ input, kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { - const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + if (P.circular) { + in_y = wrap_coord(in_y, P.IH); + } for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { - const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); - + int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + if (P.circular) { + in_x = wrap_coord(in_x, P.IW); + } const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; acc += (input_val * ggml_cuda_cast(kernel_val)); @@ -141,9 +159,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int PD_Y = p[3]; // padding_y const int DL_X = p[4]; // dilation_x const int DL_Y = p[5]; // dilation_y - - // No cwhn - GGML_ASSERT(p[6] == false); + const int circular = p[6]; // circular const int IW = input->ne[0]; // input_w const int IH = input->ne[1]; // input_h @@ -156,7 +172,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int B = input->ne[3]; // n_batches const int64_t total = B * OC * OH * OW; - conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total }; + conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, circular, total }; if 
(kernel->type == GGML_TYPE_F16) { conv2d_cuda_f16(X_D, (half *) K_D, Y_D, params, st); diff --git a/src/ggml-cuda/pad.cu b/src/ggml-cuda/pad.cu index 29aef33c1..eb31af6ea 100644 --- a/src/ggml-cuda/pad.cu +++ b/src/ggml-cuda/pad.cu @@ -1,9 +1,14 @@ #include "pad.cuh" +__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) { + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod +} + static __global__ void pad_f32(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int circular) { // blockIdx.z: i3*ne2+i2 // blockIdx.y: i1 // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE @@ -18,33 +23,53 @@ static __global__ void pad_f32(const float * src, float * dst, // operation const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) && - (i1 >= lp1 && i1 < ne1 - rp1) && - (i2 >= lp2 && i2 < ne2 - rp2) && - (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t i00 = i0 - lp0; - const int64_t i01 = i1 - lp1; - const int64_t i02 = i2 - lp2; - const int64_t i03 = i3 - lp3; - const int64_t ne02 = ne2 - lp2 - rp2; - const int64_t ne01 = ne1 - lp1 - rp1; - const int64_t ne00 = ne0 - lp0 - rp0; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne00 = ne0 - lp0 - rp0; + if (circular) { + const int64_t ne03 = ne3 - lp3 - rp3; + const int64_t i00 = wrap_coord(i0 - lp0, ne00); + const int64_t i01 = wrap_coord(i1 - lp1, ne01); + const int64_t i02 = wrap_coord(i2 - lp2, ne02); + const int64_t i03 = wrap_coord(i3 - lp3, ne03); const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; - + dst[dst_idx] = src[src_idx]; - } else { - dst[dst_idx] = 0.0f; } + else { + if ((i0 >= lp0 && i0 < ne0 - rp0) && + (i1 >= lp1 && i1 < ne1 - rp1) && + (i2 >= lp2 && i2 < ne2 - rp2) && + (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t i00 = i0 - lp0; + const int64_t i01 = i1 - lp1; + const int64_t i02 = i2 - lp2; + const int64_t i03 = i3 - lp3; + + const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; + + dst[dst_idx] = src[src_idx]; + } else { + dst[dst_idx] = 0.0f; + } + } + } static void pad_f32_cuda(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int circular, cudaStream_t stream) { int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; dim3 gridDim(num_blocks, ne1, ne2*ne3); - pad_f32<<>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3); + pad_f32<<>>( + src, dst, + lp0, rp0, lp1, rp1, + lp2, rp2, lp3, rp3, + ne0, ne1, ne2, ne3, + circular); } void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -65,8 +90,12 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int32_t rp2 = ((const int32_t*)(dst->op_params))[5]; const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; const int32_t rp3 = ((const int32_t*)(dst->op_params))[7]; + const int32_t circular = ((const int32_t*)(dst->op_params))[8]; pad_f32_cuda(src0_d, dst_d, - lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, - dst->ne[0], 
dst->ne[1], dst->ne[2], dst->ne[3], stream); + lp0, rp0, lp1, rp1, + lp2, rp2, lp3, rp3, + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + circular, + stream); } diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp index 21bd05225..353d8b2a2 100644 --- a/src/ggml-vulkan/ggml-vulkan.cpp +++ b/src/ggml-vulkan/ggml-vulkan.cpp @@ -852,6 +852,7 @@ struct vk_op_pad_push_constants { uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; + uint32_t circular; uint32_t lp0; uint32_t rp0; uint32_t lp1; uint32_t rp1; @@ -894,6 +895,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor p.rp2 = dst->op_params[5]; p.lp3 = dst->op_params[6]; p.rp3 = dst->op_params[7]; + p.circular = dst->op_params[8]; return p; // fastdiv values and offsets are initialized later in ggml_vk_op } @@ -1156,6 +1158,8 @@ struct vk_op_conv2d_push_constants { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; + + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { @@ -1204,6 +1208,8 @@ struct vk_op_conv_transpose_2d_push_constants { uint32_t OWOHmp; uint32_t OWOHL; uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; + + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv_transpose_2d_push_constants &p) { @@ -1232,6 +1238,7 @@ struct vk_op_conv2d_dw_push_constants { int32_t pad_y; int32_t dilation_x; int32_t dilation_y; + int32_t circular; }; struct vk_op_upscale_push_constants { @@ -9982,6 +9989,7 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = static_cast(dst->op_params[6]); GGML_ASSERT(ne03 == ne2); GGML_ASSERT(ne02 == ne12); @@ -10031,6 +10039,7 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = 0; GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne12); @@ -10055,6 +10064,7 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = dst->op_params[6]; GGML_ASSERT(src0->ne[3] == p.channels); GGML_ASSERT(src1->ne[3] == p.batches); diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp index 70a301488..77af6873a 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp @@ -19,6 +19,7 @@ layout (push_constant) uniform parameter int pad_y; int dilation_x; int dilation_y; + int circular; } p; layout (binding = 0) readonly buffer A {A_TYPE knl_data[];}; @@ -27,6 +28,10 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; +uint wrap_coord(int coord, uint size) { + return (uint(coord + int(size))) % size; // add size to avoid issues with negative +} + FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint i0 = idx / p.dst_w; uint dst_x = idx - i0 * p.dst_w; @@ -39,19 +44,34 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint knl_i = c * p.knl_h * p.knl_w; FLOAT_TYPE sum = 0.0; 
- for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; + if (p.circular != 0) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + } + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); - sum = fma(v, k, sum); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } } return sum; @@ -70,21 +90,37 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) { uint knl_row = p.knl_w * p.channels; FLOAT_TYPE sum = 0.0; - for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; + if (p.circular != 0) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + } + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); - sum = fma(v, k, sum); - } + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ 
knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } + } + return sum; } @@ -102,4 +138,3 @@ void main() { #endif dst_data[idx] = D_TYPE(result); } - diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 0367e80bb..098c0178d 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -70,6 +70,8 @@ layout(push_constant) uniform parameter { uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; #endif + + uint32_t circular; } p; @@ -97,6 +99,10 @@ uint32_t NPQ = p.N * p.OH * p.OW; uint32_t n_elems_out = K * NPQ; +uint32_t wrap_coord(int coord, uint32_t size) { + return uint32_t((uint(coord + int(size))) % size); +} + // Number of blocktiles per input uint32_t NB_CRS = splitWork(CRS, BS_CRS); @@ -275,20 +281,26 @@ void main() { KW_idx_b = CRS_remainder - KH_idx_b * p.KW; #endif + uint32_t H_pos; + uint32_t W_pos; #ifdef TRANSPOSE uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1; uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0; uint32_t H_idx = fastdiv(H_idx_x_s1, p.s1mp, p.s1L); uint32_t W_idx = fastdiv(W_idx_x_s0, p.s0mp, p.s0L); + H_pos = (p.circular != 0) ? wrap_coord(int(H_idx), p.H) : H_idx; + W_pos = (p.circular != 0) ? wrap_coord(int(W_idx), p.W) : W_idx; #else - uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; - uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; + int H_raw = int(OH_idx) * int(p.s1) + int(KH_idx_b) * int(p.d1) - int(p.p1); + int W_raw = int(OW_idx) * int(p.s0) + int(KW_idx_b) * int(p.d0) - int(p.p0); + H_pos = (p.circular != 0) ? wrap_coord(H_raw, p.H) : uint32_t(H_raw); + W_pos = (p.circular != 0) ? wrap_coord(W_raw, p.W) : uint32_t(W_raw); #endif uint32_t src_idx = - min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + min(max(W_pos + H_pos * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); float val = src_data[src_idx]; if (CRS_idx_b >= CRS || NPQ_idx >= NPQ - || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case) + || H_pos >= p.H || W_pos >= p.W // Lower bound checks aren't necessary. 
(idx >= 0x80000000 for such case)
 #ifdef TRANSPOSE
             || (H_idx_x_s1 - H_idx * p.s1 != 0) || (W_idx_x_s0 - W_idx * p.s0 != 0)
 #endif
diff --git a/src/ggml-vulkan/vulkan-shaders/pad.comp b/src/ggml-vulkan/vulkan-shaders/pad.comp
index f3c817687..b6a0cf933 100644
--- a/src/ggml-vulkan/vulkan-shaders/pad.comp
+++ b/src/ggml-vulkan/vulkan-shaders/pad.comp
@@ -8,6 +8,7 @@ layout (push_constant) uniform parameter
     uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint misalign_offsets;
+    uint circular;
 
     uint lp0; uint rp0;
     uint lp1; uint rp1;
@@ -23,6 +24,10 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 
+uint wrap_coord(int coord, uint size) {
+    return (uint(coord + int(size))) % size; // add size so negative coordinates wrap correctly
+}
+
 void main() {
     const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 
@@ -40,10 +45,19 @@ void main() {
     const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00;
     const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
 
-    const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 &&
-                         i1 >= p.lp1 && i1 < p.ne11 - p.rp1 &&
-                         i2 >= p.lp2 && i2 < p.ne12 - p.rp2 &&
-                         i3 >= p.lp3 && i3 < p.ne13 - p.rp3;
-
-    data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
-}
+    if (p.circular != 0u) {
+        const uint ci0 = wrap_coord(int(i0) - int(p.lp0), p.ne00);
+        const uint ci1 = wrap_coord(int(i1) - int(p.lp1), p.ne01);
+        const uint ci2 = wrap_coord(int(i2) - int(p.lp2), p.ne02);
+        const uint ci3 = wrap_coord(int(i3) - int(p.lp3), p.ne03);
+        const uint circular_src_idx = ci3*p.nb03 + ci2*p.nb02 + ci1*p.nb01 + ci0*p.nb00;
+        data_d[get_doffset() + dst_idx] = D_TYPE(data_a[get_aoffset() + circular_src_idx]);
+    }
+    else {
+        const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 &&
+                             i1 >= p.lp1 && i1 < p.ne11 - p.rp1 &&
+                             i2 >= p.lp2 && i2 < p.ne12 - p.rp2 &&
+                             i3 >= p.lp3 && i3 < p.ne13 - p.rp3;
+        data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
+    }
+}
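Note on the wrap helpers (illustrative, not part of the patch): ggml_wrap_coord in ops.cpp and the wrap_coord helpers in the CUDA and Vulkan code above all use the same (coord + size) % size form, which is only correct while coord >= -size, i.e. while the left padding and the dilated kernel reach stay within one input extent. The standalone sketch below (hypothetical names, plain C) spells out the arithmetic:

#include <assert.h>
#include <stdint.h>

// same formula as the helpers above, standalone for illustration
static int64_t wrap_demo(int64_t coord, int64_t size) {
    return (coord + size) % size;
}

int main(void) {
    // in-range and right-of-range coordinates reduce like a plain modulo
    assert(wrap_demo(3, 5) == 3);
    assert(wrap_demo(7, 5) == 2);
    // coordinates down to -size wrap around to the other side of the axis
    assert(wrap_demo(-1, 5) == 4);
    assert(wrap_demo(-5, 5) == 0);
    // below -size the single +size offset is no longer enough:
    // (-6 + 5) % 5 == -1 in C, so padding must stay within the input extent
    assert(wrap_demo(-6, 5) == -1);
    return 0;
}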
diff --git a/src/ggml.c b/src/ggml.c
index 9be35c1be..e7397405b 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -4457,6 +4457,24 @@ struct ggml_tensor * ggml_conv_2d(
     return result;
 }
 
+struct ggml_tensor * ggml_conv_2d_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1) {
+    if (p0 == 0 && p1 == 0) {
+        return ggml_conv_2d(ctx, a, b, s0, s1, p0, p1, d0, d1);
+    }
+
+    struct ggml_tensor * b_padded = ggml_pad_circular(ctx, b, p0, p0, p1, p1, 0, 0, 0, 0);
+    return ggml_conv_2d(ctx, a, b_padded, s0, s1, 0, 0, d0, d1);
+}
+
 // a: [OC*IC, KD, KH, KW]
 // b: [N*IC, ID, IH, IW]
 // result: [N*OD, OH, OW, IC * KD * KH * KW]
@@ -4618,6 +4636,7 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
 
     int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
     ggml_set_op_params(result, params, sizeof(params));
+    ggml_set_op_params_i32(result, 6, 0); // default not circular
 
     result->op     = GGML_OP_CONV_2D_DW;
     result->src[0] = a;
@@ -4625,6 +4644,22 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
     return result;
 }
 
+struct ggml_tensor * ggml_conv_2d_dw_direct_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride0,
+        int stride1,
+        int pad0,
+        int pad1,
+        int dilation0,
+        int dilation1) {
+    struct ggml_tensor * result =
+        ggml_conv_2d_dw_direct(ctx, a, b, stride0, stride1, pad0, pad1, dilation0, dilation1);
+    ggml_set_op_params_i32(result, 6, 1);
+    return result;
+}
+
 // ggml_conv_2d_direct
 
 struct ggml_tensor * ggml_conv_2d_direct(
@@ -4636,8 +4671,7 @@ struct ggml_tensor * ggml_conv_2d_direct(
         int p0, // padding dimension 0
         int p1, // padding dimension 1
         int d0, // dilation dimension 0
-        int d1) {// dilation dimension 1
-
+        int d1) { // dilation dimension 1
     GGML_ASSERT(a->ne[2] == b->ne[2]);
     //GGML_ASSERT(a->type == b->type);
 
@@ -4655,6 +4689,7 @@ struct ggml_tensor * ggml_conv_2d_direct(
     ggml_set_op_params_i32(result, 3, p1);
     ggml_set_op_params_i32(result, 4, d0);
     ggml_set_op_params_i32(result, 5, d1);
+    ggml_set_op_params_i32(result, 6, 0); // default not circular
 
     result->op     = GGML_OP_CONV_2D;
     result->src[0] = a;
@@ -4663,6 +4698,21 @@ struct ggml_tensor * ggml_conv_2d_direct(
     return result;
 }
 
+struct ggml_tensor * ggml_conv_2d_direct_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1) {
+    struct ggml_tensor * result = ggml_conv_2d_direct(ctx, a, b, s0, s1, p0, p1, d0, d1);
+    ggml_set_op_params_i32(result, 6, 1);
+    return result;
+}
+
 // ggml_conv_3d_direct
 
 struct ggml_tensor * ggml_conv_3d_direct(
@@ -4920,7 +4970,7 @@ struct ggml_tensor * ggml_pad_ext(
     ggml_set_op_params_i32(result, 5, rp2);
     ggml_set_op_params_i32(result, 6, lp3);
     ggml_set_op_params_i32(result, 7, rp3);
-
+    ggml_set_op_params_i32(result, 8, 0); // not circular by default
     result->op     = GGML_OP_PAD;
     result->src[0] = a;
 
@@ -4928,6 +4978,23 @@ struct ggml_tensor * ggml_pad_ext(
     return result;
 }
 
+struct ggml_tensor * ggml_pad_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int lp0,
+        int rp0,
+        int lp1,
+        int rp1,
+        int lp2,
+        int rp2,
+        int lp3,
+        int rp3
+    ) {
+    struct ggml_tensor * result = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
+    ggml_set_op_params_i32(result, 8, 1);
+    return result;
+}
+
 // ggml_pad_reflect_1d
 
 struct ggml_tensor * ggml_pad_reflect_1d(