From 4c608060db5136e81846d868f0640ab95443f4df Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Fri, 23 May 2025 22:26:39 +0800
Subject: [PATCH 1/2] Add Neon
 mld_polyvecl_pointwise_acc_montgomery_l{4,5,7}_native

These are basically written from scratch inspired by the same functions
in mlkem-native.

Resolves https://github.com/pq-code-package/mldsa-native/issues/257

Signed-off-by: Matthias J. Kannwischer
---
 mldsa/native/aarch64/meta.h                   |  22 +++
 .../native/aarch64/src/arith_native_aarch64.h |  15 ++
 ...mld_polyvecl_pointwise_acc_montgomery_l4.S | 126 +++++++++++++++
 ...mld_polyvecl_pointwise_acc_montgomery_l5.S | 132 ++++++++++++++++
 ...mld_polyvecl_pointwise_acc_montgomery_l7.S | 144 ++++++++++++++++++
 mldsa/native/api.h                            |  67 ++++++++
 mldsa/polyvec.c                               |  19 ++-
 7 files changed, 523 insertions(+), 2 deletions(-)
 create mode 100644 mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
 create mode 100644 mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
 create mode 100644 mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S

diff --git a/mldsa/native/aarch64/meta.h b/mldsa/native/aarch64/meta.h
index 6b044a93..e7992376 100644
--- a/mldsa/native/aarch64/meta.h
+++ b/mldsa/native/aarch64/meta.h
@@ -10,6 +10,7 @@
 /* Set of primitives that this backend replaces */
 #define MLD_USE_NATIVE_NTT
 #define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY
 
 /* Identifier for this backend so that source and assembly files
  * in the build can be appropriately guarded. */
@@ -31,6 +32,27 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
                mld_aarch64_intt_zetas_layer123456);
 }
 
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+    int32_t w[MLDSA_N], const int32_t u[4 * MLDSA_N],
+    const int32_t v[4 * MLDSA_N])
+{
+  mld_polyvecl_pointwise_acc_montgomery_l4_asm(w, u, v);
+}
+
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+    int32_t w[MLDSA_N], const int32_t u[5 * MLDSA_N],
+    const int32_t v[5 * MLDSA_N])
+{
+  mld_polyvecl_pointwise_acc_montgomery_l5_asm(w, u, v);
+}
+
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+    int32_t w[MLDSA_N], const int32_t u[7 * MLDSA_N],
+    const int32_t v[7 * MLDSA_N])
+{
+  mld_polyvecl_pointwise_acc_montgomery_l7_asm(w, u, v);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_AARCH64_META_H */

diff --git a/mldsa/native/aarch64/src/arith_native_aarch64.h b/mldsa/native/aarch64/src/arith_native_aarch64.h
index d3528e6f..b745c5d2 100644
--- a/mldsa/native/aarch64/src/arith_native_aarch64.h
+++ b/mldsa/native/aarch64/src/arith_native_aarch64.h
@@ -32,4 +32,19 @@ void mld_ntt_asm(int32_t *, const int32_t *, const int32_t *);
 #define mld_intt_asm MLD_NAMESPACE(intt_asm)
 void mld_intt_asm(int32_t *, const int32_t *, const int32_t *);
 
+#define mld_polyvecl_pointwise_acc_montgomery_l4_asm \
+  MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l4_asm)
+void mld_polyvecl_pointwise_acc_montgomery_l4_asm(int32_t *, const int32_t *,
+                                                  const int32_t *);
+
+#define mld_polyvecl_pointwise_acc_montgomery_l5_asm \
+  MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l5_asm)
+void mld_polyvecl_pointwise_acc_montgomery_l5_asm(int32_t *, const int32_t *,
+                                                  const int32_t *);
+
+#define mld_polyvecl_pointwise_acc_montgomery_l7_asm \
+  MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l7_asm)
+void mld_polyvecl_pointwise_acc_montgomery_l7_asm(int32_t *, const int32_t *,
+                                                  const int32_t *);
+
 #endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */

diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
new file mode 100644
index 00000000..32a8a9a5
--- /dev/null
+++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
@@ -0,0 +1,126 @@
+/* Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64)
+
+.macro montgomery_reduce_long res, inl, inh
+        uzp1    t0.4s, \inl\().4s, \inh\().4s
+        mul     t0.4s, t0.4s, modulus_twisted.4s
+        smlal   \inl\().2d, t0.2s, modulus.2s
+        smlal2  \inh\().2d, t0.4s, modulus.4s
+        uzp2    \res\().4s, \inl\().4s, \inh\().4s
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr
+        ldr     q_\()\a, [\a_ptr], #16
+        ldr     q_\()\b, [\b_ptr], #16
+.endm
+
+.macro pmull dl, dh, a, b
+        smull   \dl\().2d, \a\().2s, \b\().2s
+        smull2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro pmlal dl, dh, a, b
+        smlal   \dl\().2d, \a\().2s, \b\().2s
+        smlal2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro save_vregs
+        sub     sp, sp, #(16*4)
+        stp     d8, d9, [sp, #16*0]
+        stp     d10, d11, [sp, #16*1]
+        stp     d12, d13, [sp, #16*2]
+        stp     d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+        ldp     d8, d9, [sp, #16*0]
+        ldp     d10, d11, [sp, #16*1]
+        ldp     d12, d13, [sp, #16*2]
+        ldp     d14, d15, [sp, #16*3]
+        add     sp, sp, #(16*4)
+.endm
+
+.macro push_stack
+        save_vregs
+.endm
+
+.macro pop_stack
+        restore_vregs
+.endm
+
+out_ptr         .req x0
+a0_ptr          .req x1
+b0_ptr          .req x2
+a1_ptr          .req x3
+b1_ptr          .req x4
+a2_ptr          .req x5
+b2_ptr          .req x6
+a3_ptr          .req x7
+b3_ptr          .req x8
+count           .req x9
+wtmp            .req w9
+
+modulus         .req v0
+modulus_twisted .req v1
+
+aa              .req v2
+bb              .req v3
+res             .req v4
+resl            .req v5
+resh            .req v6
+t0              .req v7
+
+q_aa            .req q2
+q_bb            .req q3
+q_res           .req q4
+
+.text
+.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l4_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l4_asm)
+        push_stack
+
+        // load q = 8380417
+        movz    wtmp, #57345
+        movk    wtmp, #127, lsl #16
+        dup     modulus.4s, wtmp
+
+        // load -q^-1 = 4236238847
+        movz    wtmp, #57343
+        movk    wtmp, #64639, lsl #16
+        dup     modulus_twisted.4s, wtmp
+
+        // Computed bases of vector entries
+        add     a1_ptr, a0_ptr, #(1 * 1024)
+        add     a2_ptr, a0_ptr, #(2 * 1024)
+        add     a3_ptr, a0_ptr, #(3 * 1024)
+
+        add     b1_ptr, b0_ptr, #(1 * 1024)
+        add     b2_ptr, b0_ptr, #(2 * 1024)
+        add     b3_ptr, b0_ptr, #(3 * 1024)
+
+        mov     count, #(MLDSA_N / 4)
+l4_loop_start:
+        load_polys aa, bb, a0_ptr, b0_ptr
+        pmull   resl, resh, aa, bb
+        load_polys aa, bb, a1_ptr, b1_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a2_ptr, b2_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a3_ptr, b3_ptr
+        pmlal   resl, resh, aa, bb
+
+        montgomery_reduce_long res, resl, resh
+
+        str     q_res, [out_ptr], #16
+
+        subs    count, count, #1
+        cbnz    count, l4_loop_start
+
+        pop_stack
+        ret
+#endif /* MLD_ARITH_BACKEND_AARCH64 */

diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
new file mode 100644
index 00000000..eea407e0
--- /dev/null
+++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
@@ -0,0 +1,132 @@
+/* Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64)
+
+.macro montgomery_reduce_long res, inl, inh
+        uzp1    t0.4s, \inl\().4s, \inh\().4s
+        mul     t0.4s, t0.4s, modulus_twisted.4s
+        smlal   \inl\().2d, t0.2s, modulus.2s
+        smlal2  \inh\().2d, t0.4s, modulus.4s
+        uzp2    \res\().4s, \inl\().4s, \inh\().4s
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr
+        ldr     q_\()\a, [\a_ptr], #16
+        ldr     q_\()\b, [\b_ptr], #16
+.endm
+
+.macro pmull dl, dh, a, b
+        smull   \dl\().2d, \a\().2s, \b\().2s
+        smull2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro pmlal dl, dh, a, b
+        smlal   \dl\().2d, \a\().2s, \b\().2s
+        smlal2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro save_vregs
+        sub     sp, sp, #(16*4)
+        stp     d8, d9, [sp, #16*0]
+        stp     d10, d11, [sp, #16*1]
+        stp     d12, d13, [sp, #16*2]
+        stp     d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+        ldp     d8, d9, [sp, #16*0]
+        ldp     d10, d11, [sp, #16*1]
+        ldp     d12, d13, [sp, #16*2]
+        ldp     d14, d15, [sp, #16*3]
+        add     sp, sp, #(16*4)
+.endm
+
+.macro push_stack
+        save_vregs
+.endm
+
+.macro pop_stack
+        restore_vregs
+.endm
+
+out_ptr         .req x0
+a0_ptr          .req x1
+b0_ptr          .req x2
+a1_ptr          .req x3
+b1_ptr          .req x4
+a2_ptr          .req x5
+b2_ptr          .req x6
+a3_ptr          .req x7
+b3_ptr          .req x8
+a4_ptr          .req x9
+b4_ptr          .req x10
+count           .req x11
+wtmp            .req w11
+
+modulus         .req v0
+modulus_twisted .req v1
+
+aa              .req v2
+bb              .req v3
+res             .req v4
+resl            .req v5
+resh            .req v6
+t0              .req v7
+
+q_aa            .req q2
+q_bb            .req q3
+q_res           .req q4
+
+.text
+.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l5_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l5_asm)
+        push_stack
+
+        // load q = 8380417
+        movz    wtmp, #57345
+        movk    wtmp, #127, lsl #16
+        dup     modulus.4s, wtmp
+
+        // load -q^-1 = 4236238847
+        movz    wtmp, #57343
+        movk    wtmp, #64639, lsl #16
+        dup     modulus_twisted.4s, wtmp
+
+        // Computed bases of vector entries
+        add     a1_ptr, a0_ptr, #(1 * 1024)
+        add     a2_ptr, a0_ptr, #(2 * 1024)
+        add     a3_ptr, a0_ptr, #(3 * 1024)
+        add     a4_ptr, a0_ptr, #(4 * 1024)
+
+        add     b1_ptr, b0_ptr, #(1 * 1024)
+        add     b2_ptr, b0_ptr, #(2 * 1024)
+        add     b3_ptr, b0_ptr, #(3 * 1024)
+        add     b4_ptr, b0_ptr, #(4 * 1024)
+
+        mov     count, #(MLDSA_N / 4)
+l5_loop_start:
+        load_polys aa, bb, a0_ptr, b0_ptr
+        pmull   resl, resh, aa, bb
+        load_polys aa, bb, a1_ptr, b1_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a2_ptr, b2_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a3_ptr, b3_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a4_ptr, b4_ptr
+        pmlal   resl, resh, aa, bb
+
+        montgomery_reduce_long res, resl, resh
+
+        str     q_res, [out_ptr], #16
+
+        subs    count, count, #1
+        cbnz    count, l5_loop_start
+
+        pop_stack
+        ret
+#endif /* MLD_ARITH_BACKEND_AARCH64 */

diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S
new file mode 100644
index 00000000..e3984082
--- /dev/null
+++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S
@@ -0,0 +1,144 @@
+/* Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64)
+
+.macro montgomery_reduce_long res, inl, inh
+        uzp1    t0.4s, \inl\().4s, \inh\().4s
+        mul     t0.4s, t0.4s, modulus_twisted.4s
+        smlal   \inl\().2d, t0.2s, modulus.2s
+        smlal2  \inh\().2d, t0.4s, modulus.4s
+        uzp2    \res\().4s, \inl\().4s, \inh\().4s
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr
+        ldr     q_\()\a, [\a_ptr], #16
+        ldr     q_\()\b, [\b_ptr], #16
+.endm
+
+.macro pmull dl, dh, a, b
+        smull   \dl\().2d, \a\().2s, \b\().2s
+        smull2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro pmlal dl, dh, a, b
+        smlal   \dl\().2d, \a\().2s, \b\().2s
+        smlal2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro save_vregs
+        sub     sp, sp, #(16*4)
+        stp     d8, d9, [sp, #16*0]
+        stp     d10, d11, [sp, #16*1]
+        stp     d12, d13, [sp, #16*2]
+        stp     d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+        ldp     d8, d9, [sp, #16*0]
+        ldp     d10, d11, [sp, #16*1]
+        ldp     d12, d13, [sp, #16*2]
+        ldp     d14, d15, [sp, #16*3]
+        add     sp, sp, #(16*4)
+.endm
+
+.macro push_stack
+        save_vregs
+.endm
+
+.macro pop_stack
+        restore_vregs
+.endm
+
+out_ptr         .req x0
+a0_ptr          .req x1
+b0_ptr          .req x2
+a1_ptr          .req x3
+b1_ptr          .req x4
+a2_ptr          .req x5
+b2_ptr          .req x6
+a3_ptr          .req x7
+b3_ptr          .req x8
+a4_ptr          .req x9
+b4_ptr          .req x10
+a5_ptr          .req x11
+b5_ptr          .req x12
+a6_ptr          .req x13
+b6_ptr          .req x14
+count           .req x15
+wtmp            .req w15
+
+modulus         .req v0
+modulus_twisted .req v1
+
+aa              .req v2
+bb              .req v3
+res             .req v4
+resl            .req v5
+resh            .req v6
+t0              .req v7
+
+q_aa            .req q2
+q_bb            .req q3
+q_res           .req q4
+
+.text
+.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l7_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l7_asm)
+        push_stack
+
+        // load q = 8380417
+        movz    wtmp, #57345
+        movk    wtmp, #127, lsl #16
+        dup     modulus.4s, wtmp
+
+        // load -q^-1 = 4236238847
+        movz    wtmp, #57343
+        movk    wtmp, #64639, lsl #16
+        dup     modulus_twisted.4s, wtmp
+
+        // Computed bases of vector entries
+        add     a1_ptr, a0_ptr, #(1 * 1024)
+        add     a2_ptr, a0_ptr, #(2 * 1024)
+        add     a3_ptr, a0_ptr, #(3 * 1024)
+        add     a4_ptr, a0_ptr, #(4 * 1024)
+        add     a5_ptr, a4_ptr, #(1 * 1024)
+        add     a6_ptr, a5_ptr, #(1 * 1024)
+
+        add     b1_ptr, b0_ptr, #(1 * 1024)
+        add     b2_ptr, b0_ptr, #(2 * 1024)
+        add     b3_ptr, b0_ptr, #(3 * 1024)
+        add     b4_ptr, b0_ptr, #(4 * 1024)
+        add     b5_ptr, b4_ptr, #(1 * 1024)
+        add     b6_ptr, b5_ptr, #(1 * 1024)
+
+        mov     count, #(MLDSA_N / 4)
+l7_loop_start:
+        load_polys aa, bb, a0_ptr, b0_ptr
+        pmull   resl, resh, aa, bb
+        load_polys aa, bb, a1_ptr, b1_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a2_ptr, b2_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a3_ptr, b3_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a4_ptr, b4_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a5_ptr, b5_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a6_ptr, b6_ptr
+        pmlal   resl, resh, aa, bb
+
+        montgomery_reduce_long res, resl, resh
+
+        str     q_res, [out_ptr], #16
+
+        subs    count, count, #1
+        cbnz    count, l7_loop_start
+
+        pop_stack
+        ret
+#endif /* MLD_ARITH_BACKEND_AARCH64 */

diff --git a/mldsa/native/api.h b/mldsa/native/api.h
index 3795291b..ccaa655c 100644
--- a/mldsa/native/api.h
+++ b/mldsa/native/api.h
@@ -71,4 +71,71 @@ static MLD_INLINE void mld_ntt_native(int32_t p[MLDSA_N]);
 static MLD_INLINE void mld_intt_native(int16_t p[MLKEM_N])
 #endif /* MLD_USE_NATIVE_INTT */
 
+#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY)
+
+#if MLDSA_L == 4
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery_l4_native
+ *
+ * Description: Pointwise multiply vectors of polynomials of length MLDSA_L,
+ *              multiply resulting vector by 2^{-32} and add (accumulate)
+ *              polynomials in it.
+ *              Input/output vectors are in NTT domain representation.
+ *              The second input is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by (-9q, +9q).
+ *
+ *
+ * Arguments:   - int32_t *w: output polynomial
+ *              - const int32_t *u: pointer to first input vector
+ *              - const int32_t *v: pointer to second input vector
+ **************************************************/
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+    int32_t w[MLDSA_N], const int32_t u[4 * MLDSA_N],
+    const int32_t v[4 * MLDSA_N]);
+#endif /* MLDSA_L == 4 */
+
+#if MLDSA_L == 5
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery_l5_native
+ *
+ * Description: Pointwise multiply vectors of polynomials of length MLDSA_L,
+ *              multiply resulting vector by 2^{-32} and add (accumulate)
+ *              polynomials in it.
+ *              Input/output vectors are in NTT domain representation.
+ *              The second input is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by (-9q, +9q).
+ *
+ *
+ * Arguments:   - int32_t *w: output polynomial
+ *              - const int32_t *u: pointer to first input vector
+ *              - const int32_t *v: pointer to second input vector
+ **************************************************/
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+    int32_t w[MLDSA_N], const int32_t u[5 * MLDSA_N],
+    const int32_t v[5 * MLDSA_N]);
+#endif /* MLDSA_L == 5 */
+
+#if MLDSA_L == 7
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery_l7_native
+ *
+ * Description: Pointwise multiply vectors of polynomials of length MLDSA_L,
+ *              multiply resulting vector by 2^{-32} and add (accumulate)
+ *              polynomials in it.
+ *              Input/output vectors are in NTT domain representation.
+ *              The second input is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by (-9q, +9q).
+ *
+ *
+ * Arguments:   - int32_t *w: output polynomial
+ *              - const int32_t *u: pointer to first input vector
+ *              - const int32_t *v: pointer to second input vector
+ **************************************************/
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+    int32_t w[MLDSA_N], const int32_t u[7 * MLDSA_N],
+    const int32_t v[7 * MLDSA_N]);
+#endif /* MLDSA_L == 7 */
+
+#endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY */
+
 #endif /* !MLD_NATIVE_API_H */

diff --git a/mldsa/polyvec.c b/mldsa/polyvec.c
index bdcb2ae9..20286e73 100644
--- a/mldsa/polyvec.c
+++ b/mldsa/polyvec.c
@@ -231,7 +231,7 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a,
     poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
   }
 }
-
+#if !defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY)
 void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u,
                                        const polyvecl *v)
 {
@@ -263,7 +263,22 @@ void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u,
     w->coeffs[i] = montgomery_reduce(t);
   }
 }
-
+#else /* !MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY */
+void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u,
+                                       const polyvecl *v)
+{
+#if MLDSA_L == 4
+  mld_polyvecl_pointwise_acc_montgomery_l4_native(w->coeffs, (const int32_t *)u,
+                                                  (const int32_t *)v);
+#elif MLDSA_L == 5
+  mld_polyvecl_pointwise_acc_montgomery_l5_native(w->coeffs, (const int32_t *)u,
+                                                  (const int32_t *)v);
+#elif MLDSA_L == 7
+  mld_polyvecl_pointwise_acc_montgomery_l7_native(w->coeffs, (const int32_t *)u,
+                                                  (const int32_t *)v);
+#endif
+}
+#endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY */
 
 int polyvecl_chknorm(const polyvecl *v, int32_t bound)
 {

From e9c3850bff2296b579162627cc47510acbe02791 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Fri, 23 May 2025 23:39:54 +0800
Subject: [PATCH 2/2] Add polyvecl_pointwise_acc_montgomery to component
 benchmarks

Signed-off-by: Matthias J. Kannwischer
---
 test/bench_components_mldsa.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/test/bench_components_mldsa.c b/test/bench_components_mldsa.c
index ebc46bcf..1171b52f 100644
--- a/test/bench_components_mldsa.c
+++ b/test/bench_components_mldsa.c
@@ -10,6 +10,7 @@
 #include
 #include "../mldsa/ntt.h"
 #include "../mldsa/poly.h"
+#include "../mldsa/polyvec.h"
 #include "../mldsa/randombytes.h"
 #include "hal.h"
 
@@ -26,6 +27,8 @@ static int cmp_uint64_t(const void *a, const void *b)
   for (i = 0; i < NTESTS; i++)                                        \
   {                                                                   \
     randombytes((uint8_t *)data0, sizeof(data0));                     \
+    randombytes((uint8_t *)data1, sizeof(data1));                     \
+    randombytes((uint8_t *)data2, sizeof(data2));                     \
     for (j = 0; j < NWARMUP; j++)                                     \
     {                                                                 \
       code;                                                           \
@@ -44,15 +47,20 @@ static int cmp_uint64_t(const void *a, const void *b)
 
 static int bench(void)
 {
-  int32_t data0[256];
+  MLD_ALIGN int32_t data0[256];
+  MLD_ALIGN int32_t data1[MLDSA_K * 256];
+  MLD_ALIGN int32_t data2[MLDSA_K * 256];
   uint64_t cyc[NTESTS];
   unsigned i, j;
   uint64_t t0, t1;
 
-  /* ntt */
   BENCH("poly_ntt", poly_ntt((poly *)data0))
+  BENCH("poly_invntt_tomont", poly_invntt_tomont((poly *)data0))
+  BENCH("polyvecl_pointwise_acc_montgomery",
+        polyvecl_pointwise_acc_montgomery(
+            (poly *)data0, (const polyvecl *)data1, (const polyvecl *)data2))
 
   return 0;
 }