diff --git a/include/ggml.h b/include/ggml.h
index d948b00cc..6e4c867f8 100644
--- a/include/ggml.h
+++ b/include/ggml.h
@@ -2031,11 +2031,33 @@ extern "C" {
         int dilation0,
         int dilation1);
 
-    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
+    GGML_API struct ggml_tensor * ggml_conv_2d_circular(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
-        int stride);
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1);
+
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride0,
+        int stride1,
+        int pad0,
+        int pad1,
+        int dilation0,
+        int dilation1);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride);
 
     GGML_API struct ggml_tensor * ggml_conv_2d_direct(
         struct ggml_context * ctx,
@@ -2048,6 +2070,17 @@ extern "C" {
         int d0, // dilation dimension 0
         int d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1);
+
     GGML_API struct ggml_tensor * ggml_conv_3d_direct(
         struct ggml_context * ctx,
         struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
@@ -2168,6 +2201,19 @@ extern "C" {
         int rp3
     );
 
+    GGML_API struct ggml_tensor * ggml_pad_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int lp0,
+        int rp0,
+        int lp1,
+        int rp1,
+        int lp2,
+        int rp2,
+        int lp3,
+        int rp3
+    );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
         struct ggml_context * ctx,
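Usage sketch (illustrative only, not part of the patch): the declarations above are the whole public surface of this change. The snippet below only builds graph nodes with the new entry points — the tensor sizes, the helper name toy_circular_conv, and the 64 MiB scratch buffer are arbitrary, and nothing is dispatched to a backend. Wrap-padding the input with ggml_pad_circular and then running a zero-padded ggml_conv_2d_direct should give the same result as the fused ggml_conv_2d_direct_circular, which mirrors how ggml_conv_2d_circular is implemented in src/ggml.c further down (circular pad, then a conv with zero padding).

#include "ggml.h"

// hypothetical helper, only to show how the new API composes
static struct ggml_tensor * toy_circular_conv(struct ggml_context * ctx) {
    // input [W, H, C_in, N] and kernel [KW, KH, C_in, C_out], both F32
    struct ggml_tensor * inp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 3, 1);
    struct ggml_tensor * ker = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3, 3, 8);

    // (1) wrap-pad W and H by one element on each side, then a plain direct
    //     conv with zero padding ...
    struct ggml_tensor * padded = ggml_pad_circular(ctx, inp, 1, 1, 1, 1, 0, 0, 0, 0);
    struct ggml_tensor * out_a  = ggml_conv_2d_direct(ctx, ker, padded, 1, 1, 0, 0, 1, 1);

    // (2) ... or the fused form, which wraps source coordinates inside the
    //     backend kernels; both build a [16, 16, 8, 1] output node
    struct ggml_tensor * out_b = ggml_conv_2d_direct_circular(ctx, ker, inp, 1, 1, 1, 1, 1, 1);

    (void) out_a;
    return out_b;
}

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 64*1024*1024, // arbitrary size for this toy example
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);
    (void) toy_circular_conv(ctx);
    ggml_free(ctx);
    return 0;
}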
diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp
index b52f0f847..3db28021d 100644
--- a/src/ggml-cpu/ops.cpp
+++ b/src/ggml-cpu/ops.cpp
@@ -6660,6 +6660,12 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
     ggml_compute_forward_mul_mat(params, &dst);
 }
 
+// ggml_wrap_coord
+
+static inline int64_t ggml_wrap_coord(int64_t coord, int64_t size) {
+    return (coord + size) % size; // add size so negative coordinates wrap correctly (assumes coord >= -size)
+}
+
 // ggml_compute_forward_conv_2d
 
 static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
@@ -6680,6 +6686,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
     const int32_t pad_y = dst->op_params[3];
     const int32_t dilation_x = dst->op_params[4];
     const int32_t dilation_y = dst->op_params[5];
+    const bool circular = ggml_get_op_params_i32(dst, 6) != 0;
 
     const int64_t c_in = src->ne[2];
     const int64_t c_out = kernel->ne[3];
@@ -6736,13 +6743,19 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
 
                     int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
 
-                    float src_val;
-                    if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
-                        src_val = 0.0f;
-                    } else {
+                    float src_val = 0.0f;
+                    if (circular) {
+                        const int64_t sy_wrapped = ggml_wrap_coord(sy, src_h);
+                        const int64_t sx_wrapped = ggml_wrap_coord(sx, src_w);
+                        const float * src_ptr = (const float *)((const char *)src_base + sx_wrapped * src->nb[0] + sy_wrapped * src->nb[1] + ic * src->nb[2]);
+                        src_val = *src_ptr;
+                    } else if (sy >= 0 && sy < src_h && sx >= 0 && sx < src_w) {
                         const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
                         src_val = *src_ptr;
                     }
+                    else {
+                        src_val = 0.0f; // out of bounds: zero padding
+                    }
 
                     char * element_ptr = dst_row + dst_idx * traits->type_size;
                     if (kernel_type == GGML_TYPE_F32) {
@@ -7052,6 +7065,7 @@ struct ggml_conv_2d_dw_params {
     int pad_y;
     int dilation_x;
     int dilation_y;
+    int circular;
 };
 
 static void ggml_compute_forward_conv_2d_dw_cwhn(
@@ -7063,6 +7077,7 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
 
     const int64_t c = p.channels;
     const float * knl_data = (const float *)kernel->data;
+    const bool circular = p.circular != 0;
 
     const int64_t rows_total = p.dst_h * p.batch;
     const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
@@ -7090,13 +7105,17 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
         for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
             GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
             for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
-                const int64_t src_y = src_y_base + knl_y * p.dilation_y;
-                if (src_y < 0 || src_y >= p.src_h) {
+                int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                if (circular) {
+                    src_y = ggml_wrap_coord(src_y, p.src_h);
+                } else if (src_y < 0 || src_y >= p.src_h) {
                     continue;
                 }
                 for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
-                    const int64_t src_x = src_x_base + knl_x * p.dilation_x;
-                    if (src_x < 0 || src_x >= p.src_w) {
+                    int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                    if (circular) {
+                        src_x = ggml_wrap_coord(src_x, p.src_w);
+                    } else if (src_x < 0 || src_x >= p.src_w) {
                         continue;
                     }
                     GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
@@ -7111,13 +7130,17 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
         for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
             float sum = 0.0f;
             for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
-                const int64_t src_y = src_y_base + knl_y * p.dilation_y;
-                if (src_y < 0 || src_y >= p.src_h) {
+                int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                if (circular) {
+                    src_y = ggml_wrap_coord(src_y, p.src_h);
+                } else if (src_y < 0 || src_y >= p.src_h) {
                     continue;
                 }
                 for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
-                    const int64_t src_x = src_x_base + knl_x * p.dilation_x;
-                    if (src_x < 0 || src_x >= p.src_w) {
+                    int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                    if (circular) {
+                        src_x = ggml_wrap_coord(src_x, p.src_w);
+                    } else if (src_x < 0 || src_x >= p.src_w) {
                         continue;
                     }
                     sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
@@ -7138,6 +7161,7 @@ static void ggml_compute_forward_conv_2d_dw_whcn(
     const ggml_conv_2d_dw_params & p) {
 
     const int64_t n = p.channels * p.batch;
+    const bool circular = p.circular != 0;
     const int64_t per_thread = (n + params->nth - 1) / params->nth;
     const int64_t start = params->ith * per_thread;
     const int64_t end = MIN(start + per_thread, n);
@@ -7152,13 +7176,17 @@ static void ggml_compute_forward_conv_2d_dw_whcn(
 
             float sum = 0.0f;
             for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
-                const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
-                if (src_y < 0 || src_y >= p.src_h) {
+                int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+                if (circular) {
+                    src_y = ggml_wrap_coord(src_y, p.src_h);
+                } else if (src_y < 0 || src_y >= p.src_h) {
                     continue;
                 }
                 for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
-                    const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
-                    if (src_x < 0 || src_x >= p.src_w) {
+                    int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+                    if (circular) {
+                        src_x = ggml_wrap_coord(src_x, p.src_w);
+                    } else if (src_x < 0 || src_x >= p.src_w) {
                         continue;
                     }
                     sum += knl_data[knl_y * p.knl_w + knl_x]
@@ -7192,6 +7220,7 @@ void ggml_compute_forward_conv_2d_dw(
p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = ggml_get_op_params_i32(dst, 6); GGML_ASSERT(kernel->ne[3] == p.channels); GGML_ASSERT(dst->ne[3] == p.batch); @@ -7612,24 +7641,51 @@ static void ggml_compute_forward_pad_f32( const int32_t rp2 = ggml_get_op_params_i32(dst, 5); const int32_t lp3 = ggml_get_op_params_i32(dst, 6); const int32_t rp3 = ggml_get_op_params_i32(dst, 7); + const int32_t mode = ggml_get_op_params_i32(dst, 8); + const bool circular = mode == GGML_PAD_MODE_CIRCULAR; // TODO: optimize - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) \ - && (i1 >= lp1 && i1 < ne1 - rp1) \ - && (i2 >= lp2 && i2 < ne2 - rp2) \ - && (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + if (!circular) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + if ((i0 >= lp0 && i0 < ne0 - rp0) \ + && (i1 >= lp1 && i1 < ne1 - rp1) \ + && (i2 >= lp2 && i2 < ne2 - rp2) \ + && (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + const float * src_ptr = (const float *)((char *) src0->data + src_idx); + dst_ptr[dst_idx] = *src_ptr; + } else { + dst_ptr[dst_idx] = 0; + } + } + } + } + } + } else { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + const int64_t src_i0 = ggml_wrap_coord(i0 - lp0, ne00); + const int64_t src_i1 = ggml_wrap_coord(i1 - lp1, ne01); + const int64_t src_i2 = ggml_wrap_coord(i2 - lp2, ne02); + const int64_t src_i3 = ggml_wrap_coord(i3 - lp3, ne03); + + const int64_t src_idx = + src_i3*nb03 + + src_i2*nb02 + + src_i1*nb01 + + src_i0*nb00; + const float * src_ptr = (const float *)((char *) src0->data + src_idx); dst_ptr[dst_idx] = *src_ptr; - } else { - dst_ptr[dst_idx] = 0; } } } diff --git a/src/ggml-cuda/conv2d-dw.cu b/src/ggml-cuda/conv2d-dw.cu index 7583233b1..a49202dee 100644 --- a/src/ggml-cuda/conv2d-dw.cu +++ b/src/ggml-cuda/conv2d-dw.cu @@ -8,6 +8,7 @@ struct conv_params { int padding_x, padding_y; int dilation_x, dilation_y; int channels, batches; + int circular; }; struct kernel_bounds { @@ -15,16 +16,27 @@ struct kernel_bounds { int x_min, x_max; }; +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod +} + __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) { kernel_bounds bounds; - bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.y_max = - min(params.kernel_h, - (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); - bounds.x_max = - 
min(params.kernel_w, - (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + if (params.circular) { + bounds.y_min = 0; + bounds.y_max = params.kernel_h; + bounds.x_min = 0; + bounds.x_max = params.kernel_w; + } else { + bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.y_max = + min(params.kernel_h, + (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + bounds.x_max = + min(params.kernel_w, + (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + } return bounds; } @@ -83,7 +95,7 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr const int in_w, const int in_h, const int out_w, const int out_h, const int kernel_w, const int kernel_h, const int stride_x, const int stride_y, const int padding_x, const int padding_y, const int dilation_x, const int dilation_y, - const int channels, const int batches) { + const int channels, const int batches, const int circular) { const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = batches * channels * out_h * out_w; @@ -92,19 +104,30 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr } conv_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, - stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches }; + stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches, circular }; int batch_idx, channel_idx, out_y_idx, out_x_idx; Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx); T accumulator = 0; kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params); + for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) { int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y); + if (params.circular) { + in_y_idx = wrap_coord(in_y_idx, params.in_h); + } else if (in_y_idx < 0 || in_y_idx >= params.in_h) { + continue; + } for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) { int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x); + if (params.circular) { + in_x_idx = wrap_coord(in_x_idx, params.in_w); + } else if (in_x_idx < 0 || in_x_idx >= params.in_w) { + continue; + } const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)]; const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)]; @@ -132,6 +155,7 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const int padding_y = p[3]; const int dilation_x = p[4]; const int dilation_y = p[5]; + const int circular = p[6]; const int in_w = input->ne[0]; const int in_h = input->ne[1]; @@ -150,11 +174,11 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) if (ggml_is_contiguous(input)) { conv2d_dw_kernel<<>>( x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, - dilation_x, dilation_y, channels, batches); + dilation_x, dilation_y, channels, batches, circular); } else if (ggml_is_contiguous_channels(input)) { conv2d_dw_kernel<<>>( x_d, w_d, y_d, in_w, in_h, out_w, out_h, 
kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, - dilation_x, dilation_y, channels, batches); + dilation_x, dilation_y, channels, batches, circular); } else { GGML_ABORT("Unsupported memory layout for conv_2d_dw"); } diff --git a/src/ggml-cuda/conv2d.cu b/src/ggml-cuda/conv2d.cu index 142dd6690..c8e49fa87 100644 --- a/src/ggml-cuda/conv2d.cu +++ b/src/ggml-cuda/conv2d.cu @@ -10,6 +10,7 @@ struct conv_params { const int64_t DL_X, DL_Y; const int64_t IC, OC; const int64_t B; + const int64_t circular; const int64_t TOTAL; }; @@ -18,6 +19,10 @@ struct kernel_bounds { int64_t x_min, x_max; }; +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod +} + __device__ __forceinline__ int64_t max64(int64_t a, int64_t b) { return (a > b) ? a : b; } @@ -28,10 +33,18 @@ __device__ __forceinline__ int64_t min64(int64_t a, int64_t b) { __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv_params & P) { kernel_bounds bounds; - bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); - bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + if (P.circular) { + bounds.y_min = 0; + bounds.y_max = P.KH; + bounds.x_min = 0; + bounds.x_max = P.KW; + } + else { + bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + } return bounds; } @@ -89,11 +102,16 @@ static __global__ void conv2d_kernel(const float * __restrict__ input, kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { - const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + if (P.circular) { + in_y = wrap_coord(in_y, P.IH); + } for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { - const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); - + int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + if (P.circular) { + in_x = wrap_coord(in_x, P.IW); + } const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; acc += (input_val * ggml_cuda_cast(kernel_val)); @@ -141,9 +159,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int PD_Y = p[3]; // padding_y const int DL_X = p[4]; // dilation_x const int DL_Y = p[5]; // dilation_y - - // No cwhn - GGML_ASSERT(p[6] == false); + const int circular = p[6]; // circular const int IW = input->ne[0]; // input_w const int IH = input->ne[1]; // input_h @@ -156,7 +172,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int B = input->ne[3]; // n_batches const int64_t total = B * OC * OH * OW; - conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total }; + conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, circular, total }; if 
(kernel->type == GGML_TYPE_F16) { conv2d_cuda_f16(X_D, (half *) K_D, Y_D, params, st); diff --git a/src/ggml-cuda/pad.cu b/src/ggml-cuda/pad.cu index 29aef33c1..eb31af6ea 100644 --- a/src/ggml-cuda/pad.cu +++ b/src/ggml-cuda/pad.cu @@ -1,9 +1,14 @@ #include "pad.cuh" +__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) { + return (coord+size) % size; // +size to fix negative numbers giving incorrect mod +} + static __global__ void pad_f32(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int circular) { // blockIdx.z: i3*ne2+i2 // blockIdx.y: i1 // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE @@ -18,33 +23,53 @@ static __global__ void pad_f32(const float * src, float * dst, // operation const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) && - (i1 >= lp1 && i1 < ne1 - rp1) && - (i2 >= lp2 && i2 < ne2 - rp2) && - (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t i00 = i0 - lp0; - const int64_t i01 = i1 - lp1; - const int64_t i02 = i2 - lp2; - const int64_t i03 = i3 - lp3; - const int64_t ne02 = ne2 - lp2 - rp2; - const int64_t ne01 = ne1 - lp1 - rp1; - const int64_t ne00 = ne0 - lp0 - rp0; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne00 = ne0 - lp0 - rp0; + if (circular) { + const int64_t ne03 = ne3 - lp3 - rp3; + const int64_t i00 = wrap_coord(i0 - lp0, ne00); + const int64_t i01 = wrap_coord(i1 - lp1, ne01); + const int64_t i02 = wrap_coord(i2 - lp2, ne02); + const int64_t i03 = wrap_coord(i3 - lp3, ne03); const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; - + dst[dst_idx] = src[src_idx]; - } else { - dst[dst_idx] = 0.0f; } + else { + if ((i0 >= lp0 && i0 < ne0 - rp0) && + (i1 >= lp1 && i1 < ne1 - rp1) && + (i2 >= lp2 && i2 < ne2 - rp2) && + (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t i00 = i0 - lp0; + const int64_t i01 = i1 - lp1; + const int64_t i02 = i2 - lp2; + const int64_t i03 = i3 - lp3; + + const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; + + dst[dst_idx] = src[src_idx]; + } else { + dst[dst_idx] = 0.0f; + } + } + } static void pad_f32_cuda(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int circular, cudaStream_t stream) { int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; dim3 gridDim(num_blocks, ne1, ne2*ne3); - pad_f32<<>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3); + pad_f32<<>>( + src, dst, + lp0, rp0, lp1, rp1, + lp2, rp2, lp3, rp3, + ne0, ne1, ne2, ne3, + circular); } void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -65,8 +90,12 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int32_t rp2 = ((const int32_t*)(dst->op_params))[5]; const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; const int32_t rp3 = ((const int32_t*)(dst->op_params))[7]; + const int32_t circular = ((const int32_t*)(dst->op_params))[8]; pad_f32_cuda(src0_d, dst_d, - lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, - dst->ne[0], 
dst->ne[1], dst->ne[2], dst->ne[3], stream); + lp0, rp0, lp1, rp1, + lp2, rp2, lp3, rp3, + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + circular, + stream); } diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp index 21bd05225..353d8b2a2 100644 --- a/src/ggml-vulkan/ggml-vulkan.cpp +++ b/src/ggml-vulkan/ggml-vulkan.cpp @@ -852,6 +852,7 @@ struct vk_op_pad_push_constants { uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; + uint32_t circular; uint32_t lp0; uint32_t rp0; uint32_t lp1; uint32_t rp1; @@ -894,6 +895,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor p.rp2 = dst->op_params[5]; p.lp3 = dst->op_params[6]; p.rp3 = dst->op_params[7]; + p.circular = dst->op_params[8]; return p; // fastdiv values and offsets are initialized later in ggml_vk_op } @@ -1156,6 +1158,8 @@ struct vk_op_conv2d_push_constants { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; + + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { @@ -1204,6 +1208,8 @@ struct vk_op_conv_transpose_2d_push_constants { uint32_t OWOHmp; uint32_t OWOHL; uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; + + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv_transpose_2d_push_constants &p) { @@ -1232,6 +1238,7 @@ struct vk_op_conv2d_dw_push_constants { int32_t pad_y; int32_t dilation_x; int32_t dilation_y; + int32_t circular; }; struct vk_op_upscale_push_constants { @@ -9982,6 +9989,7 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = static_cast(dst->op_params[6]); GGML_ASSERT(ne03 == ne2); GGML_ASSERT(ne02 == ne12); @@ -10031,6 +10039,7 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = 0; GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne12); @@ -10055,6 +10064,7 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = dst->op_params[6]; GGML_ASSERT(src0->ne[3] == p.channels); GGML_ASSERT(src1->ne[3] == p.batches); diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp index 70a301488..77af6873a 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp @@ -19,6 +19,7 @@ layout (push_constant) uniform parameter int pad_y; int dilation_x; int dilation_y; + int circular; } p; layout (binding = 0) readonly buffer A {A_TYPE knl_data[];}; @@ -27,6 +28,10 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; +uint wrap_coord(int coord, uint size) { + return (uint(coord + int(size))) % size; // add size to avoid issues with negative +} + FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint i0 = idx / p.dst_w; uint dst_x = idx - i0 * p.dst_w; @@ -39,19 +44,34 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint knl_i = c * p.knl_h * p.knl_w; FLOAT_TYPE sum = 0.0; 
- for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; + if (p.circular != 0) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + } + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); - sum = fma(v, k, sum); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } } return sum; @@ -70,21 +90,37 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) { uint knl_row = p.knl_w * p.channels; FLOAT_TYPE sum = 0.0; - for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; + if (p.circular != 0) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + } + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); - sum = fma(v, k, sum); - } + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ 
knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } + } + return sum; } @@ -102,4 +138,3 @@ void main() { #endif dst_data[idx] = D_TYPE(result); } - diff --git a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 0367e80bb..098c0178d 100644 --- a/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -70,6 +70,8 @@ layout(push_constant) uniform parameter { uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; #endif + + uint32_t circular; } p; @@ -97,6 +99,10 @@ uint32_t NPQ = p.N * p.OH * p.OW; uint32_t n_elems_out = K * NPQ; +uint32_t wrap_coord(int coord, uint32_t size) { + return uint32_t((uint(coord + int(size))) % size); +} + // Number of blocktiles per input uint32_t NB_CRS = splitWork(CRS, BS_CRS); @@ -275,20 +281,26 @@ void main() { KW_idx_b = CRS_remainder - KH_idx_b * p.KW; #endif + uint32_t H_pos; + uint32_t W_pos; #ifdef TRANSPOSE uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1; uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0; uint32_t H_idx = fastdiv(H_idx_x_s1, p.s1mp, p.s1L); uint32_t W_idx = fastdiv(W_idx_x_s0, p.s0mp, p.s0L); + H_pos = (p.circular != 0) ? wrap_coord(int(H_idx), p.H) : H_idx; + W_pos = (p.circular != 0) ? wrap_coord(int(W_idx), p.W) : W_idx; #else - uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; - uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; + int H_raw = int(OH_idx) * int(p.s1) + int(KH_idx_b) * int(p.d1) - int(p.p1); + int W_raw = int(OW_idx) * int(p.s0) + int(KW_idx_b) * int(p.d0) - int(p.p0); + H_pos = (p.circular != 0) ? wrap_coord(H_raw, p.H) : uint32_t(H_raw); + W_pos = (p.circular != 0) ? wrap_coord(W_raw, p.W) : uint32_t(W_raw); #endif uint32_t src_idx = - min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + min(max(W_pos + H_pos * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); float val = src_data[src_idx]; if (CRS_idx_b >= CRS || NPQ_idx >= NPQ - || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case) + || H_pos >= p.H || W_pos >= p.W // Lower bound checks aren't necessary. 
(idx >= 0x80000000 for such case)
 #ifdef TRANSPOSE
             || (H_idx_x_s1 - H_idx * p.s1 != 0) || (W_idx_x_s0 - W_idx * p.s0 != 0)
 #endif
diff --git a/src/ggml-vulkan/vulkan-shaders/pad.comp b/src/ggml-vulkan/vulkan-shaders/pad.comp
index f3c817687..b6a0cf933 100644
--- a/src/ggml-vulkan/vulkan-shaders/pad.comp
+++ b/src/ggml-vulkan/vulkan-shaders/pad.comp
@@ -8,6 +8,7 @@ layout (push_constant) uniform parameter
     uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint misalign_offsets;
+    uint circular;
 
     uint lp0; uint rp0;
     uint lp1; uint rp1;
@@ -23,6 +24,10 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 
+uint wrap_coord(int coord, uint size) {
+    return (uint(coord + int(size))) % size; // add size so negative coordinates wrap correctly
+}
+
 void main() {
     const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 
@@ -40,10 +45,19 @@ void main() {
     const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00;
     const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
 
-    const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 &&
-                         i1 >= p.lp1 && i1 < p.ne11 - p.rp1 &&
-                         i2 >= p.lp2 && i2 < p.ne12 - p.rp2 &&
-                         i3 >= p.lp3 && i3 < p.ne13 - p.rp3;
-
-    data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
-}
+    if (p.circular != 0u) {
+        const uint ci0 = wrap_coord(int(i0) - int(p.lp0), p.ne00);
+        const uint ci1 = wrap_coord(int(i1) - int(p.lp1), p.ne01);
+        const uint ci2 = wrap_coord(int(i2) - int(p.lp2), p.ne02);
+        const uint ci3 = wrap_coord(int(i3) - int(p.lp3), p.ne03);
+        const uint circular_src_idx = ci3*p.nb03 + ci2*p.nb02 + ci1*p.nb01 + ci0*p.nb00;
+        data_d[get_doffset() + dst_idx] = D_TYPE(data_a[get_aoffset() + circular_src_idx]);
+    }
+    else {
+        const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 &&
+                             i1 >= p.lp1 && i1 < p.ne11 - p.rp1 &&
+                             i2 >= p.lp2 && i2 < p.ne12 - p.rp2 &&
+                             i3 >= p.lp3 && i3 < p.ne13 - p.rp3;
+        data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
+    }
+}
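Note on the wrap helpers (illustrative, not part of the patch): ggml_wrap_coord in ops.cpp and the wrap_coord helpers in the CUDA and Vulkan code above all use the same (coord + size) % size form, which is only correct while coord >= -size, i.e. while the left padding and the dilated kernel reach stay within one input extent. The standalone sketch below (hypothetical names, plain C) spells out the arithmetic:

#include <assert.h>
#include <stdint.h>

// same formula as the helpers above, standalone for illustration
static int64_t wrap_demo(int64_t coord, int64_t size) {
    return (coord + size) % size;
}

int main(void) {
    // in-range and right-of-range coordinates reduce like a plain modulo
    assert(wrap_demo(3, 5) == 3);
    assert(wrap_demo(7, 5) == 2);
    // coordinates down to -size wrap around to the other side of the axis
    assert(wrap_demo(-1, 5) == 4);
    assert(wrap_demo(-5, 5) == 0);
    // below -size the single +size offset is no longer enough:
    // (-6 + 5) % 5 == -1 in C, so padding must stay within the input extent
    assert(wrap_demo(-6, 5) == -1);
    return 0;
}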
diff --git a/src/ggml.c b/src/ggml.c
index 9be35c1be..e7397405b 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -4457,6 +4457,24 @@ struct ggml_tensor * ggml_conv_2d(
     return result;
 }
 
+struct ggml_tensor * ggml_conv_2d_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1) {
+    if (p0 == 0 && p1 == 0) {
+        return ggml_conv_2d(ctx, a, b, s0, s1, p0, p1, d0, d1);
+    }
+
+    struct ggml_tensor * b_padded = ggml_pad_circular(ctx, b, p0, p0, p1, p1, 0, 0, 0, 0);
+    return ggml_conv_2d(ctx, a, b_padded, s0, s1, 0, 0, d0, d1);
+}
+
 // a: [OC*IC, KD, KH, KW]
 // b: [N*IC, ID, IH, IW]
 // result: [N*OD, OH, OW, IC * KD * KH * KW]
@@ -4618,6 +4636,7 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
 
     int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
     ggml_set_op_params(result, params, sizeof(params));
+    ggml_set_op_params_i32(result, 6, 0); // default not circular
 
     result->op     = GGML_OP_CONV_2D_DW;
     result->src[0] = a;
@@ -4625,6 +4644,22 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
     return result;
 }
 
+struct ggml_tensor * ggml_conv_2d_dw_direct_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride0,
+        int stride1,
+        int pad0,
+        int pad1,
+        int dilation0,
+        int dilation1) {
+    struct ggml_tensor * result =
+        ggml_conv_2d_dw_direct(ctx, a, b, stride0, stride1, pad0, pad1, dilation0, dilation1);
+    ggml_set_op_params_i32(result, 6, 1);
+    return result;
+}
+
 // ggml_conv_2d_direct
 
 struct ggml_tensor * ggml_conv_2d_direct(
@@ -4636,8 +4671,7 @@ struct ggml_tensor * ggml_conv_2d_direct(
         int p0, // padding dimension 0
         int p1, // padding dimension 1
         int d0, // dilation dimension 0
-        int d1) {// dilation dimension 1
-
+        int d1) { // dilation dimension 1
     GGML_ASSERT(a->ne[2] == b->ne[2]);
     //GGML_ASSERT(a->type == b->type);
 
@@ -4655,6 +4689,7 @@ struct ggml_tensor * ggml_conv_2d_direct(
     ggml_set_op_params_i32(result, 3, p1);
     ggml_set_op_params_i32(result, 4, d0);
     ggml_set_op_params_i32(result, 5, d1);
+    ggml_set_op_params_i32(result, 6, 0); // default not circular
 
     result->op     = GGML_OP_CONV_2D;
     result->src[0] = a;
@@ -4663,6 +4698,21 @@ struct ggml_tensor * ggml_conv_2d_direct(
     return result;
 }
 
+struct ggml_tensor * ggml_conv_2d_direct_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1) {
+    struct ggml_tensor * result = ggml_conv_2d_direct(ctx, a, b, s0, s1, p0, p1, d0, d1);
+    ggml_set_op_params_i32(result, 6, 1);
+    return result;
+}
+
 // ggml_conv_3d_direct
 
 struct ggml_tensor * ggml_conv_3d_direct(
@@ -4920,7 +4970,7 @@ struct ggml_tensor * ggml_pad_ext(
     ggml_set_op_params_i32(result, 5, rp2);
     ggml_set_op_params_i32(result, 6, lp3);
     ggml_set_op_params_i32(result, 7, rp3);
-
+    ggml_set_op_params_i32(result, 8, 0); // not circular by default
     result->op     = GGML_OP_PAD;
     result->src[0] = a;
 
@@ -4928,6 +4978,23 @@ struct ggml_tensor * ggml_pad_ext(
     return result;
 }
 
+struct ggml_tensor * ggml_pad_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int lp0,
+        int rp0,
+        int lp1,
+        int rp1,
+        int lp2,
+        int rp2,
+        int lp3,
+        int rp3
+    ) {
+    struct ggml_tensor * result = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
+    ggml_set_op_params_i32(result, 8, 1);
+    return result;
+}
+
 // ggml_pad_reflect_1d
 
 struct ggml_tensor * ggml_pad_reflect_1d(