From 4c608060db5136e81846d868f0640ab95443f4df Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Fri, 23 May 2025 22:26:39 +0800
Subject: [PATCH 1/2] Add Neon
 mld_polyvecl_pointwise_acc_montgomery_l{4,5,7}_native

These are basically written from scratch inspired by the same functions
in mlkem-native.

Resolves https://github.com/pq-code-package/mldsa-native/issues/257

Signed-off-by: Matthias J. Kannwischer
---
 mldsa/native/aarch64/meta.h                   |  22 +++
 .../native/aarch64/src/arith_native_aarch64.h |  15 ++
 ...mld_polyvecl_pointwise_acc_montgomery_l4.S | 126 +++++++++++++++
 ...mld_polyvecl_pointwise_acc_montgomery_l5.S | 132 ++++++++++++++++
 ...mld_polyvecl_pointwise_acc_montgomery_l7.S | 144 ++++++++++++++++++
 mldsa/native/api.h                            |  67 ++++++++
 mldsa/polyvec.c                               |  19 ++-
 7 files changed, 523 insertions(+), 2 deletions(-)
 create mode 100644 mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
 create mode 100644 mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
 create mode 100644 mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S

diff --git a/mldsa/native/aarch64/meta.h b/mldsa/native/aarch64/meta.h
index 6b044a93..e7992376 100644
--- a/mldsa/native/aarch64/meta.h
+++ b/mldsa/native/aarch64/meta.h
@@ -10,6 +10,7 @@
 /* Set of primitives that this backend replaces */
 #define MLD_USE_NATIVE_NTT
 #define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY
 
 /* Identifier for this backend so that source and assembly files
  * in the build can be appropriately guarded. */
@@ -31,6 +32,27 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
                mld_aarch64_intt_zetas_layer123456);
 }
 
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+    int32_t w[MLDSA_N], const int32_t u[4 * MLDSA_N],
+    const int32_t v[4 * MLDSA_N])
+{
+  mld_polyvecl_pointwise_acc_montgomery_l4_asm(w, u, v);
+}
+
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+    int32_t w[MLDSA_N], const int32_t u[5 * MLDSA_N],
+    const int32_t v[5 * MLDSA_N])
+{
+  mld_polyvecl_pointwise_acc_montgomery_l5_asm(w, u, v);
+}
+
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+    int32_t w[MLDSA_N], const int32_t u[7 * MLDSA_N],
+    const int32_t v[7 * MLDSA_N])
+{
+  mld_polyvecl_pointwise_acc_montgomery_l7_asm(w, u, v);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_AARCH64_META_H */

diff --git a/mldsa/native/aarch64/src/arith_native_aarch64.h b/mldsa/native/aarch64/src/arith_native_aarch64.h
index d3528e6f..b745c5d2 100644
--- a/mldsa/native/aarch64/src/arith_native_aarch64.h
+++ b/mldsa/native/aarch64/src/arith_native_aarch64.h
@@ -32,4 +32,19 @@ void mld_ntt_asm(int32_t *, const int32_t *, const int32_t *);
 #define mld_intt_asm MLD_NAMESPACE(intt_asm)
 void mld_intt_asm(int32_t *, const int32_t *, const int32_t *);
 
+#define mld_polyvecl_pointwise_acc_montgomery_l4_asm \
+  MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l4_asm)
+void mld_polyvecl_pointwise_acc_montgomery_l4_asm(int32_t *, const int32_t *,
+                                                  const int32_t *);
+
+#define mld_polyvecl_pointwise_acc_montgomery_l5_asm \
+  MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l5_asm)
+void mld_polyvecl_pointwise_acc_montgomery_l5_asm(int32_t *, const int32_t *,
+                                                  const int32_t *);
+
+#define mld_polyvecl_pointwise_acc_montgomery_l7_asm \
+  MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l7_asm)
+void mld_polyvecl_pointwise_acc_montgomery_l7_asm(int32_t *, const int32_t *,
+                                                  const int32_t *);
+
 #endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */

diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
new file mode 100644
index 00000000..32a8a9a5
--- /dev/null
+++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
@@ -0,0 +1,126 @@
+/* Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64)
+
+.macro montgomery_reduce_long res, inl, inh
+        uzp1    t0.4s, \inl\().4s, \inh\().4s
+        mul     t0.4s, t0.4s, modulus_twisted.4s
+        smlal   \inl\().2d, t0.2s, modulus.2s
+        smlal2  \inh\().2d, t0.4s, modulus.4s
+        uzp2    \res\().4s, \inl\().4s, \inh\().4s
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr
+        ldr     q_\()\a, [\a_ptr], #16
+        ldr     q_\()\b, [\b_ptr], #16
+.endm
+
+.macro pmull dl, dh, a, b
+        smull   \dl\().2d, \a\().2s, \b\().2s
+        smull2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro pmlal dl, dh, a, b
+        smlal   \dl\().2d, \a\().2s, \b\().2s
+        smlal2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro save_vregs
+        sub     sp, sp, #(16*4)
+        stp     d8, d9, [sp, #16*0]
+        stp     d10, d11, [sp, #16*1]
+        stp     d12, d13, [sp, #16*2]
+        stp     d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+        ldp     d8, d9, [sp, #16*0]
+        ldp     d10, d11, [sp, #16*1]
+        ldp     d12, d13, [sp, #16*2]
+        ldp     d14, d15, [sp, #16*3]
+        add     sp, sp, #(16*4)
+.endm
+
+.macro push_stack
+        save_vregs
+.endm
+
+.macro pop_stack
+        restore_vregs
+.endm
+
+out_ptr         .req x0
+a0_ptr          .req x1
+b0_ptr          .req x2
+a1_ptr          .req x3
+b1_ptr          .req x4
+a2_ptr          .req x5
+b2_ptr          .req x6
+a3_ptr          .req x7
+b3_ptr          .req x8
+count           .req x9
+wtmp            .req w9
+
+modulus         .req v0
+modulus_twisted .req v1
+
+aa              .req v2
+bb              .req v3
+res             .req v4
+resl            .req v5
+resh            .req v6
+t0              .req v7
+
+q_aa            .req q2
+q_bb            .req q3
+q_res           .req q4
+
+.text
+.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l4_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l4_asm)
+        push_stack
+
+        // load q = 8380417
+        movz    wtmp, #57345
+        movk    wtmp, #127, lsl #16
+        dup     modulus.4s, wtmp
+
+        // load -q^-1 = 4236238847
+        movz    wtmp, #57343
+        movk    wtmp, #64639, lsl #16
+        dup     modulus_twisted.4s, wtmp
+
+        // Computed bases of vector entries
+        add     a1_ptr, a0_ptr, #(1 * 1024)
+        add     a2_ptr, a0_ptr, #(2 * 1024)
+        add     a3_ptr, a0_ptr, #(3 * 1024)
+
+        add     b1_ptr, b0_ptr, #(1 * 1024)
+        add     b2_ptr, b0_ptr, #(2 * 1024)
+        add     b3_ptr, b0_ptr, #(3 * 1024)
+
+        mov     count, #(MLDSA_N / 4)
+l4_loop_start:
+        load_polys aa, bb, a0_ptr, b0_ptr
+        pmull   resl, resh, aa, bb
+        load_polys aa, bb, a1_ptr, b1_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a2_ptr, b2_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a3_ptr, b3_ptr
+        pmlal   resl, resh, aa, bb
+
+        montgomery_reduce_long res, resl, resh
+
+        str     q_res, [out_ptr], #16
+
+        subs    count, count, #1
+        cbnz    count, l4_loop_start
+
+        pop_stack
+        ret
+#endif /* MLD_ARITH_BACKEND_AARCH64 */

diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
new file mode 100644
index 00000000..eea407e0
--- /dev/null
+++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
@@ -0,0 +1,132 @@
+/* Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64)
+
+.macro montgomery_reduce_long res, inl, inh
+        uzp1    t0.4s, \inl\().4s, \inh\().4s
+        mul     t0.4s, t0.4s, modulus_twisted.4s
+        smlal   \inl\().2d, t0.2s, modulus.2s
+        smlal2  \inh\().2d, t0.4s, modulus.4s
+        uzp2    \res\().4s, \inl\().4s, \inh\().4s
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr
+        ldr     q_\()\a, [\a_ptr], #16
+        ldr     q_\()\b, [\b_ptr], #16
+.endm
+
+.macro pmull dl, dh, a, b
+        smull   \dl\().2d, \a\().2s, \b\().2s
+        smull2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro pmlal dl, dh, a, b
+        smlal   \dl\().2d, \a\().2s, \b\().2s
+        smlal2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro save_vregs
+        sub     sp, sp, #(16*4)
+        stp     d8, d9, [sp, #16*0]
+        stp     d10, d11, [sp, #16*1]
+        stp     d12, d13, [sp, #16*2]
+        stp     d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+        ldp     d8, d9, [sp, #16*0]
+        ldp     d10, d11, [sp, #16*1]
+        ldp     d12, d13, [sp, #16*2]
+        ldp     d14, d15, [sp, #16*3]
+        add     sp, sp, #(16*4)
+.endm
+
+.macro push_stack
+        save_vregs
+.endm
+
+.macro pop_stack
+        restore_vregs
+.endm
+
+out_ptr         .req x0
+a0_ptr          .req x1
+b0_ptr          .req x2
+a1_ptr          .req x3
+b1_ptr          .req x4
+a2_ptr          .req x5
+b2_ptr          .req x6
+a3_ptr          .req x7
+b3_ptr          .req x8
+a4_ptr          .req x9
+b4_ptr          .req x10
+count           .req x11
+wtmp            .req w11
+
+modulus         .req v0
+modulus_twisted .req v1
+
+aa              .req v2
+bb              .req v3
+res             .req v4
+resl            .req v5
+resh            .req v6
+t0              .req v7
+
+q_aa            .req q2
+q_bb            .req q3
+q_res           .req q4
+
+.text
+.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l5_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l5_asm)
+        push_stack
+
+        // load q = 8380417
+        movz    wtmp, #57345
+        movk    wtmp, #127, lsl #16
+        dup     modulus.4s, wtmp
+
+        // load -q^-1 = 4236238847
+        movz    wtmp, #57343
+        movk    wtmp, #64639, lsl #16
+        dup     modulus_twisted.4s, wtmp
+
+        // Computed bases of vector entries
+        add     a1_ptr, a0_ptr, #(1 * 1024)
+        add     a2_ptr, a0_ptr, #(2 * 1024)
+        add     a3_ptr, a0_ptr, #(3 * 1024)
+        add     a4_ptr, a0_ptr, #(4 * 1024)
+
+        add     b1_ptr, b0_ptr, #(1 * 1024)
+        add     b2_ptr, b0_ptr, #(2 * 1024)
+        add     b3_ptr, b0_ptr, #(3 * 1024)
+        add     b4_ptr, b0_ptr, #(4 * 1024)
+
+        mov     count, #(MLDSA_N / 4)
+l5_loop_start:
+        load_polys aa, bb, a0_ptr, b0_ptr
+        pmull   resl, resh, aa, bb
+        load_polys aa, bb, a1_ptr, b1_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a2_ptr, b2_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a3_ptr, b3_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a4_ptr, b4_ptr
+        pmlal   resl, resh, aa, bb
+
+        montgomery_reduce_long res, resl, resh
+
+        str     q_res, [out_ptr], #16
+
+        subs    count, count, #1
+        cbnz    count, l5_loop_start
+
+        pop_stack
+        ret
+#endif /* MLD_ARITH_BACKEND_AARCH64 */

diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S
new file mode 100644
index 00000000..e3984082
--- /dev/null
+++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S
@@ -0,0 +1,144 @@
+/* Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64)
+
+.macro montgomery_reduce_long res, inl, inh
+        uzp1    t0.4s, \inl\().4s, \inh\().4s
+        mul     t0.4s, t0.4s, modulus_twisted.4s
+        smlal   \inl\().2d, t0.2s, modulus.2s
+        smlal2  \inh\().2d, t0.4s, modulus.4s
+        uzp2    \res\().4s, \inl\().4s, \inh\().4s
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr
+        ldr     q_\()\a, [\a_ptr], #16
+        ldr     q_\()\b, [\b_ptr], #16
+.endm
+
+.macro pmull dl, dh, a, b
+        smull   \dl\().2d, \a\().2s, \b\().2s
+        smull2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro pmlal dl, dh, a, b
+        smlal   \dl\().2d, \a\().2s, \b\().2s
+        smlal2  \dh\().2d, \a\().4s, \b\().4s
+.endm
+
+.macro save_vregs
+        sub     sp, sp, #(16*4)
+        stp     d8, d9, [sp, #16*0]
+        stp     d10, d11, [sp, #16*1]
+        stp     d12, d13, [sp, #16*2]
+        stp     d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+        ldp     d8, d9, [sp, #16*0]
+        ldp     d10, d11, [sp, #16*1]
+        ldp     d12, d13, [sp, #16*2]
+        ldp     d14, d15, [sp, #16*3]
+        add     sp, sp, #(16*4)
+.endm
+
+.macro push_stack
+        save_vregs
+.endm
+
+.macro pop_stack
+        restore_vregs
+.endm
+
+out_ptr         .req x0
+a0_ptr          .req x1
+b0_ptr          .req x2
+a1_ptr          .req x3
+b1_ptr          .req x4
+a2_ptr          .req x5
+b2_ptr          .req x6
+a3_ptr          .req x7
+b3_ptr          .req x8
+a4_ptr          .req x9
+b4_ptr          .req x10
+a5_ptr          .req x11
+b5_ptr          .req x12
+a6_ptr          .req x13
+b6_ptr          .req x14
+count           .req x15
+wtmp            .req w15
+
+modulus         .req v0
+modulus_twisted .req v1
+
+aa              .req v2
+bb              .req v3
+res             .req v4
+resl            .req v5
+resh            .req v6
+t0              .req v7
+
+q_aa            .req q2
+q_bb            .req q3
+q_res           .req q4
+
+.text
+.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l7_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l7_asm)
+        push_stack
+
+        // load q = 8380417
+        movz    wtmp, #57345
+        movk    wtmp, #127, lsl #16
+        dup     modulus.4s, wtmp
+
+        // load -q^-1 = 4236238847
+        movz    wtmp, #57343
+        movk    wtmp, #64639, lsl #16
+        dup     modulus_twisted.4s, wtmp
+
+        // Computed bases of vector entries
+        add     a1_ptr, a0_ptr, #(1 * 1024)
+        add     a2_ptr, a0_ptr, #(2 * 1024)
+        add     a3_ptr, a0_ptr, #(3 * 1024)
+        add     a4_ptr, a0_ptr, #(4 * 1024)
+        add     a5_ptr, a4_ptr, #(1 * 1024)
+        add     a6_ptr, a5_ptr, #(1 * 1024)
+
+        add     b1_ptr, b0_ptr, #(1 * 1024)
+        add     b2_ptr, b0_ptr, #(2 * 1024)
+        add     b3_ptr, b0_ptr, #(3 * 1024)
+        add     b4_ptr, b0_ptr, #(4 * 1024)
+        add     b5_ptr, b4_ptr, #(1 * 1024)
+        add     b6_ptr, b5_ptr, #(1 * 1024)
+
+        mov     count, #(MLDSA_N / 4)
+l7_loop_start:
+        load_polys aa, bb, a0_ptr, b0_ptr
+        pmull   resl, resh, aa, bb
+        load_polys aa, bb, a1_ptr, b1_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a2_ptr, b2_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a3_ptr, b3_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a4_ptr, b4_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a5_ptr, b5_ptr
+        pmlal   resl, resh, aa, bb
+        load_polys aa, bb, a6_ptr, b6_ptr
+        pmlal   resl, resh, aa, bb
+
+        montgomery_reduce_long res, resl, resh
+
+        str     q_res, [out_ptr], #16
+
+        subs    count, count, #1
+        cbnz    count, l7_loop_start
+
+        pop_stack
+        ret
+#endif /* MLD_ARITH_BACKEND_AARCH64 */

diff --git a/mldsa/native/api.h b/mldsa/native/api.h
index 3795291b..ccaa655c 100644
--- a/mldsa/native/api.h
+++ b/mldsa/native/api.h
@@ -71,4 +71,71 @@ static MLD_INLINE void mld_ntt_native(int32_t p[MLDSA_N]);
 static MLD_INLINE void mld_intt_native(int16_t p[MLKEM_N])
 #endif /* MLD_USE_NATIVE_INTT */
 
+#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY)
+
+#if MLDSA_L == 4
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery_l4_native
+ *
+ * Description: Pointwise multiply vectors of polynomials of length MLDSA_L,
+ *              multiply resulting vector by 2^{-32} and add (accumulate)
+ *              polynomials in it.
+ *              Input/output vectors are in NTT domain representation.
+ *              The second input is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by (-9q, +9q).
+ *
+ *
+ * Arguments:   - int32_t *w: output polynomial
+ *              - const int32_t *u: pointer to first input vector
+ *              - const int32_t *v: pointer to second input vector
+ **************************************************/
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+    int32_t w[MLDSA_N], const int32_t u[4 * MLDSA_N],
+    const int32_t v[4 * MLDSA_N]);
+#endif /* MLDSA_L == 4 */
+
+#if MLDSA_L == 5
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery_l5_native
+ *
+ * Description: Pointwise multiply vectors of polynomials of length MLDSA_L,
+ *              multiply resulting vector by 2^{-32} and add (accumulate)
+ *              polynomials in it.
+ *              Input/output vectors are in NTT domain representation.
+ *              The second input is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by (-9q, +9q).
+ *
+ *
+ * Arguments:   - int32_t *w: output polynomial
+ *              - const int32_t *u: pointer to first input vector
+ *              - const int32_t *v: pointer to second input vector
+ **************************************************/
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+    int32_t w[MLDSA_N], const int32_t u[5 * MLDSA_N],
+    const int32_t v[5 * MLDSA_N]);
+#endif /* MLDSA_L == 5 */
+
+#if MLDSA_L == 7
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery_l7_native
+ *
+ * Description: Pointwise multiply vectors of polynomials of length MLDSA_L,
+ *              multiply resulting vector by 2^{-32} and add (accumulate)
+ *              polynomials in it.
+ *              Input/output vectors are in NTT domain representation.
+ *              The second input is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by (-9q, +9q).
+ *
+ *
+ * Arguments:   - int32_t *w: output polynomial
+ *              - const int32_t *u: pointer to first input vector
+ *              - const int32_t *v: pointer to second input vector
+ **************************************************/
+static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+    int32_t w[MLDSA_N], const int32_t u[7 * MLDSA_N],
+    const int32_t v[7 * MLDSA_N]);
+#endif /* MLDSA_L == 7 */
+
+#endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY */
+
 #endif /* !MLD_NATIVE_API_H */

diff --git a/mldsa/polyvec.c b/mldsa/polyvec.c
index bdcb2ae9..20286e73 100644
--- a/mldsa/polyvec.c
+++ b/mldsa/polyvec.c
@@ -231,7 +231,7 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a,
     poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
   }
 }
-
+#if !defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY)
 void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u,
                                        const polyvecl *v)
 {
@@ -263,7 +263,22 @@ void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u,
     w->coeffs[i] = montgomery_reduce(t);
   }
 }
-
+#else /* !MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY */
+void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u,
+                                       const polyvecl *v)
+{
+#if MLDSA_L == 4
+  mld_polyvecl_pointwise_acc_montgomery_l4_native(w->coeffs, (const int32_t *)u,
+                                                  (const int32_t *)v);
+#elif MLDSA_L == 5
+  mld_polyvecl_pointwise_acc_montgomery_l5_native(w->coeffs, (const int32_t *)u,
+                                                  (const int32_t *)v);
+#elif MLDSA_L == 7
+  mld_polyvecl_pointwise_acc_montgomery_l7_native(w->coeffs, (const int32_t *)u,
+                                                  (const int32_t *)v);
+#endif
+}
+#endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY */
 
 int polyvecl_chknorm(const polyvecl *v, int32_t bound)
 {

From e9c3850bff2296b579162627cc47510acbe02791 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Fri, 23 May 2025 23:39:54 +0800
Subject: [PATCH 2/2] Add polyvecl_pointwise_acc_montgomery to component
 benchmarks

Signed-off-by: Matthias J. Kannwischer
---
 test/bench_components_mldsa.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/test/bench_components_mldsa.c b/test/bench_components_mldsa.c
index ebc46bcf..1171b52f 100644
--- a/test/bench_components_mldsa.c
+++ b/test/bench_components_mldsa.c
@@ -10,6 +10,7 @@
 #include
 #include "../mldsa/ntt.h"
 #include "../mldsa/poly.h"
+#include "../mldsa/polyvec.h"
 #include "../mldsa/randombytes.h"
 #include "hal.h"
 
@@ -26,6 +27,8 @@ static int cmp_uint64_t(const void *a, const void *b)
   for (i = 0; i < NTESTS; i++)                                        \
   {                                                                   \
     randombytes((uint8_t *)data0, sizeof(data0));                     \
+    randombytes((uint8_t *)data1, sizeof(data1));                     \
+    randombytes((uint8_t *)data2, sizeof(data2));                     \
     for (j = 0; j < NWARMUP; j++)                                     \
     {                                                                 \
       code;                                                           \
@@ -44,15 +47,20 @@ static int cmp_uint64_t(const void *a, const void *b)
 
 static int bench(void)
 {
-  int32_t data0[256];
+  MLD_ALIGN int32_t data0[256];
+  MLD_ALIGN int32_t data1[MLDSA_K * 256];
+  MLD_ALIGN int32_t data2[MLDSA_K * 256];
   uint64_t cyc[NTESTS];
   unsigned i, j;
   uint64_t t0, t1;
 
-  /* ntt */
   BENCH("poly_ntt", poly_ntt((poly *)data0))
+  BENCH("poly_invntt_tomont", poly_invntt_tomont((poly *)data0))
+  BENCH("polyvecl_pointwise_acc_montgomery",
+        polyvecl_pointwise_acc_montgomery(
+            (poly *)data0, (const polyvecl *)data1, (const polyvecl *)data2))
 
   return 0;
 }