-
Notifications
You must be signed in to change notification settings - Fork 371
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
-- bc650ef by Mihai.Olinovici <[email protected]>: Add qs8/qu8 gavgpool kernels, configs and tests. -- b0cfeb2 by Mihai.Olinovici <[email protected]>: Adhere to standard RVV naming. -- 8c941c0 by Mihai.Olinovici <[email protected]>: Remove old unipass reminants. FUTURE_COPYBARA_INTEGRATE_REVIEW=#7031 from imaginationtech:img_patch25_qs8_gavgpool 8c941c0 PiperOrigin-RevId: 672712922
- Loading branch information
1 parent
33f3487
commit 40ff7e2
Showing
20 changed files
with
3,116 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
175 changes: 175 additions & 0 deletions
175
src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-rvv-u1v.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
// Auto-generated file. Do not edit! | ||
// Template: src/qs8-gavgpool/multipass-rvv.c.in | ||
// Generator: tools/xngen | ||
// | ||
// Copyright 2024 Imagination Technologies, inc. | ||
// | ||
// This source code is licensed under the BSD-style license found in the | ||
// LICENSE file in the root directory of this source tree. | ||
|
||
#include <assert.h> | ||
|
||
#include <riscv_vector.h> | ||
|
||
#include "xnnpack/gavgpool.h" | ||
#include "xnnpack/math.h" | ||
|
||
|
||
void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__rvv_u1v( | ||
size_t rows, | ||
size_t channels, | ||
const int8_t* input, | ||
size_t input_stride, | ||
const int8_t* zero, | ||
int32_t* buffer, | ||
int8_t* output, | ||
const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) | ||
{ | ||
assert(rows > 7); | ||
assert(channels != 0); | ||
|
||
const int8_t* i0 = input; | ||
const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); | ||
if XNN_UNPREDICTABLE(rows < 2) { | ||
i1 = zero; | ||
} | ||
const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); | ||
if XNN_UNPREDICTABLE(rows <= 2) { | ||
i2 = zero; | ||
} | ||
const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); | ||
if XNN_UNPREDICTABLE(rows < 4) { | ||
i3 = zero; | ||
} | ||
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); | ||
if XNN_UNPREDICTABLE(rows <= 4) { | ||
i4 = zero; | ||
} | ||
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); | ||
if XNN_UNPREDICTABLE(rows < 6) { | ||
i5 = zero; | ||
} | ||
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); | ||
if XNN_UNPREDICTABLE(rows <= 6) { | ||
i6 = zero; | ||
} | ||
const size_t input_increment = 7 * input_stride - channels; | ||
|
||
const int32_t init_bias = params->fp32_scalar_fmagic.init_bias; | ||
int32_t* b = buffer; | ||
int32_t c = (int32_t) channels; | ||
do { | ||
int32_t n = __riscv_vsetvl_e8m1(c); c -= n; | ||
vint8m1_t i0_i8v = __riscv_vle8_v_i8m1(i0, n); i0 += n; | ||
vint8m1_t i1_i8v = __riscv_vle8_v_i8m1(i1, n); i1 += n; | ||
vint16m2_t acc_i16v = __riscv_vwadd_vv_i16m2(i0_i8v, i1_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i2, n); i2 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i3, n); i3 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i4, n); i4 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i5, n); i5 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i6, n); i6 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
|
||
vint32m4_t acc_i32v = __riscv_vwadd_vx_i32m4(acc_i16v, init_bias, n); | ||
__riscv_vse32_v_i32m4(b, acc_i32v, n); b += n; | ||
} while (c > 0); | ||
|
||
for (rows -= 7; rows > 7; rows -= 7) { | ||
i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); | ||
i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); | ||
i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); | ||
i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); | ||
i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); | ||
i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); | ||
i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); | ||
|
||
int32_t* b = buffer; | ||
int32_t c = (int32_t) channels; | ||
do { | ||
int32_t n = __riscv_vsetvl_e8m1(c); c -= n; | ||
vint8m1_t i0_i8v = __riscv_vle8_v_i8m1(i0, n); i0 += n; | ||
vint8m1_t i1_i8v = __riscv_vle8_v_i8m1(i1, n); i1 += n; | ||
vint16m2_t acc_i16v = __riscv_vwadd_vv_i16m2(i0_i8v, i1_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i2, n); i2 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i3, n); i3 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i4, n); i4 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i5, n); i5 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i6, n); i6 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
|
||
vint32m4_t acc_i32v = __riscv_vle32_v_i32m4(b, n); | ||
acc_i32v = __riscv_vwadd_wv_i32m4(acc_i32v, acc_i16v, n); | ||
__riscv_vse32_v_i32m4(b, acc_i32v, n); b += n; | ||
} while (c > 0); | ||
} | ||
|
||
i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); | ||
i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); | ||
if XNN_UNPREDICTABLE(rows < 2) { | ||
i1 = zero; | ||
} | ||
i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); | ||
if XNN_UNPREDICTABLE(rows <= 2) { | ||
i2 = zero; | ||
} | ||
i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); | ||
if XNN_UNPREDICTABLE(rows < 4) { | ||
i3 = zero; | ||
} | ||
i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); | ||
if XNN_UNPREDICTABLE(rows <= 4) { | ||
i4 = zero; | ||
} | ||
i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); | ||
if XNN_UNPREDICTABLE(rows < 6) { | ||
i5 = zero; | ||
} | ||
i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); | ||
if XNN_UNPREDICTABLE(rows <= 6) { | ||
i6 = zero; | ||
} | ||
|
||
const float scale = params->fp32_scalar_fmagic.scale; | ||
const float output_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; | ||
const float output_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; | ||
const float magic_bias = params->fp32_scalar_fmagic.magic_bias; | ||
const int32_t magic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; | ||
|
||
do { | ||
int32_t n = __riscv_vsetvl_e8m1(channels); channels -= n; | ||
vint8m1_t i0_i8v = __riscv_vle8_v_i8m1(i0, n); i0 += n; | ||
vint8m1_t i1_i8v = __riscv_vle8_v_i8m1(i1, n); i1 += n; | ||
vint16m2_t acc_i16v = __riscv_vwadd_vv_i16m2(i0_i8v, i1_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i2, n); i2 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i3, n); i3 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i4, n); i4 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i5, n); i5 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
i0_i8v = __riscv_vle8_v_i8m1(i6, n); i6 += n; | ||
acc_i16v = __riscv_vwadd_wv_i16m2(acc_i16v, i0_i8v, n); | ||
|
||
vint32m4_t acc_i32v = __riscv_vle32_v_i32m4(buffer, n); buffer += n; | ||
acc_i32v = __riscv_vwadd_wv_i32m4(acc_i32v, acc_i16v, n); | ||
vfloat32m4_t acc_f32v = __riscv_vfcvt_f_x_v_f32m4(acc_i32v, n); | ||
acc_f32v = __riscv_vfmul_vf_f32m4(acc_f32v, scale, n); | ||
acc_f32v = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc_f32v, output_min_less_zero_point, n), output_max_less_zero_point, n); | ||
acc_f32v = __riscv_vfadd_vf_f32m4(acc_f32v, magic_bias, n); | ||
|
||
vint32m4_t out_i32v = __riscv_vfcvt_x_f_v_i32m4(acc_f32v, n); | ||
vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); | ||
out_i16v = __riscv_vsub_vx_i16m2(out_i16v, magic_bias_less_output_zero_point, n); | ||
vint8m1_t out_i8v = __riscv_vncvt_x_x_w_i8m1(out_i16v, n); | ||
__riscv_vse8_v_i8m1(output, out_i8v, n); output += n; | ||
} while (channels != 0); | ||
} |
Oops, something went wrong.