diff --git a/dev/aarch64_clean/src/poly_decompose_32_asm.S b/dev/aarch64_clean/src/poly_decompose_32_asm.S
new file mode 100644
index 00000000..bddfef2c
--- /dev/null
+++ b/dev/aarch64_clean/src/poly_decompose_32_asm.S
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+// decompose32 a1, a, temp: a1 := rounded quotient of a by 2*GAMMA2; a is overwritten in place with the remainder a0; temp is clobbered (mask).
+.macro decompose32 a1, a, temp
+        // Compute a1 = round-(a / 523776) ≈ round(a * 1074791425 /
+        // 2^49), where round-() denotes "round half down". This is
+        // exact for 0 <= a < Q. Note that half is rounded down since
+        // 1074791425 / 2^49 ≲ 1 / 523776.
+        sqdmulh \a1\().4s, \a\().4s, barrett_const.4s   // high half of the doubled product, i.e. (a * barrett_const) >> 31
+        srshr \a1\().4s, \a1\().4s, #18                 // rounding shift by a further 18 bits (2^31 * 2^18 = 2^49)
+
+        // If a1 = 16, i.e. a > 31*GAMMA2, proceed as if a' = a - Q was
+        // given instead. (For a = 31*GAMMA2 + 1 thus a' = -GAMMA2, we
+        // still round it to 0 like other "wrapped around" cases.)
+
+        // Check for wrap-around: temp = all-ones (-1) in lanes where a > 31*GAMMA2, 0 elsewhere
+        cmgt \temp\().4s, \a\().4s, q_bound.4s
+
+        // Compute remainder a0 = a - a1 * (2*GAMMA2), in place; uses a1 before it is cleared below
+        mls \a\().4s, \a1\().4s, gamma2_2x.4s
+
+        // If wrap-around is required, set a1 = 0 and a0 -= 1 (adding the -1 mask subtracts 1)
+        bic \a1\().16b, \a1\().16b, \temp\().16b
+        add \a\().4s, \a\().4s, \temp\().4s
+.endm
+
+    /* Parameters */
+    a1_ptr .req x0 // Output polynomial with coefficients c1 (the a1 results of decompose32)
+    a0_ptr .req x1 // Output polynomial with coefficients c0 (the a0 results of decompose32)
+    a_ptr .req x2 // Input polynomial
+
+    count .req x3 // Remaining loop iterations; each iteration handles 4 vectors of 4 x 32-bit lanes
+
+    /* Constant register assignments */
+    q .req v20 // Q = 8380417 (NOTE(review): loaded below but not referenced by decompose32 or the loop)
+    q_bound .req v21 // 31*GAMMA2 = 8118528
+    gamma2_2x .req v22 // 2*GAMMA2 = 523776
+    barrett_const .req v23 // Barrett constant = 1074791425
+
+
+.text
+.global MLD_ASM_NAMESPACE(poly_decompose_32_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(poly_decompose_32_asm)
+        // Load constants into SIMD registers (each 32-bit value is built with movz/movk, then broadcast to all 4 lanes)
+        movz w4, #57345
+        movk w4, #127, lsl #16          // w4 = (127 << 16) | 57345 = 8380417
+        dup q.4s, w4
+
+        movz w5, #0xe100
+        movk w5, #0x7b, lsl #16         // w5 = 0x007be100 = 8118528
+        dup q_bound.4s, w5
+
+        movz w7, #0xfe00
+        movk w7, #7, lsl #16            // w7 = 0x0007fe00 = 523776
+        dup gamma2_2x.4s, w7
+
+        movz w11, #0x0401
+        movk w11, #0x4010, lsl #16      // w11 = 0x40100401 = 1074791425
+        dup barrett_const.4s, w11
+
+        mov count, #(64/4)              // 16 iterations x 4 vectors x 4 lanes = 256 coefficients
+
+poly_decompose_32_loop:
+        ldr q1, [a_ptr, #1*16]
+        ldr q2, [a_ptr, #2*16]
+        ldr q3, [a_ptr, #3*16]
+        ldr q0, [a_ptr], #4*16          // vector 0 is loaded last: the post-increment advances a_ptr by 64 bytes
+
+        decompose32 v5, v1, v24         // v5 = a1, v1 = a0 (in place); v24 is the scratch mask shared by all four calls
+        decompose32 v6, v2, v24
+        decompose32 v7, v3, v24
+        decompose32 v4, v0, v24
+
+        str q5, [a1_ptr, #1*16]
+        str q6, [a1_ptr, #2*16]
+        str q7, [a1_ptr, #3*16]
+        str q4, [a1_ptr], #4*16         // post-increment: advance a1_ptr by 64 bytes
+        str q1, [a0_ptr, #1*16]
+        str q2, [a0_ptr, #2*16]
+        str q3, [a0_ptr, #3*16]
+        str q0, [a0_ptr], #4*16         // post-increment: advance a0_ptr by 64 bytes
+
+        subs count, count, #1
+        bne poly_decompose_32_loop
+
+        ret
+
+    .unreq a1_ptr
+    .unreq a0_ptr
+    .unreq a_ptr
+    .unreq count
+    .unreq q
+    .unreq q_bound
+    .unreq gamma2_2x
+    .unreq barrett_const
+
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mldsa/native/aarch64/src/Makefile b/mldsa/native/aarch64/src/Makefile
new file mode 100644
index 00000000..e0323c8a
--- /dev/null
+++ b/mldsa/native/aarch64/src/Makefile
@@ -0,0 +1,42 @@
+# Copyright (c) The mldsa-native project authors
+# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+######
+# To run, see the README.md file
+######
+.PHONY: all clean
+
+# ISA to optimize for
+TARGET_ISA=Arm_AArch64
+
+# MicroArch target to optimize for
+TARGET_MICROARCH=Arm_Cortex_A55
+
+SLOTHY_EXTRA_FLAGS ?=
+
+SLOTHY_FLAGS=-c sw_pipelining.enabled=true \
+             -c inputs_are_outputs \
+             -c sw_pipelining.minimize_overlapping=False \
+             -c sw_pipelining.allow_post \
+             -c variable_size \
+             -c constraints.stalls_first_attempt=64 \
+             $(SLOTHY_EXTRA_FLAGS)
+
+# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30.
+# Allow SLOTHY to use all V-registers, but only caller-saved GPRs.
+RESERVE_X_ONLY_FLAG=-c reserved_regs="[x18--x30,sp]"
+
+# Used for kernels which don't stash callee-saved registers.
+# Restrict SLOTHY to caller-saved registers.
+RESERVE_ALL_FLAG=-c reserved_regs="[x18--x30,sp,v8--v15]"
+
+all: poly_decompose_32_asm.S
+
+# This unit does not save or restore callee-saved registers, so SLOTHY is restricted
+# to caller-saved registers (RESERVE_ALL_FLAG keeps x18-x30, sp and v8-v15 reserved).
+poly_decompose_32_asm.S: ../../../../dev/aarch64_clean/src/poly_decompose_32_asm.S
+	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_decompose_32_loop $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG)
+
+
+clean:
+	-$(RM) -rf poly_decompose_32_asm.S
diff --git a/mldsa/native/aarch64/src/poly_decompose_32_asm.S b/mldsa/native/aarch64/src/poly_decompose_32_asm.S
index bddfef2c..433d5878 100644
--- a/mldsa/native/aarch64/src/poly_decompose_32_asm.S
+++ b/mldsa/native/aarch64/src/poly_decompose_32_asm.S
@@ -1,7 +1,7 @@
-/*
- * Copyright (c) The mldsa-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
 #include "../../../common.h"
 
 #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
@@ -67,28 +67,280 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm)
 
         mov count, #(64/4)
 
+        // Instructions: 38
+        // Expected cycles: 44
+        // Expected IPC: 0.86
+        //
+        // Cycle bound: 44.0
+        // IPC bound: 0.86
+        //
+        // Wall time: 0.16s
+        // User time: 0.16s
+        //
+        //                                   // ------------ cycle (expected) ------------->
+        //                                   // 0                        25
+        //                                   // |------------------------|------------------
+        ldr q18, [x2, #48]                // *...........................................
+        ldr q24, [x2, #16]                // ..*.........................................
+        sqdmulh v25.4S, v18.4S, v23.4S    // ....*.......................................
+        cmgt v3.4S, v18.4S, v21.4S        // .....*......................................
+        sqdmulh v7.4S, v24.4S, v23.4S     // ......*.....................................
+        cmgt v1.4S, v24.4S, v21.4S        // .......*....................................
+        srshr v25.4S, v25.4S, #18         // ........*...................................
+        ldr q17, [x2], #4*16              // .........*..................................
+        mls v18.4S, v25.4S, v22.4S        // ...........*................................
+        srshr v7.4S, v7.4S, #18           // ............*...............................
+        sqdmulh v31.4S, v17.4S, v23.4S    // .............*..............................
+        bic v25.16B, v25.16B, v3.16B      // ..............*.............................
+        mls v24.4S, v7.4S, v22.4S         // ...............*............................
+        add v20.4S, v18.4S, v3.4S         // ................*...........................
+        srshr v18.4S, v31.4S, #18         // .................*..........................
+        bic v3.16B, v7.16B, v1.16B        // ..................*.........................
+        add v5.4S, v24.4S, v1.4S          // ...................*........................
+        cmgt v24.4S, v17.4S, v21.4S       // ....................*.......................
+        mls v17.4S, v18.4S, v22.4S        // .....................*......................
+        bic v28.16B, v18.16B, v24.16B     // ......................*.....................
+        str q3, [x0, #16]                 // .......................*....................
+        ldr q18, [x2, #-32]               // ........................*...................
+        add v24.4S, v17.4S, v24.4S        // ..........................*.................
+        str q25, [x0, #48]                // ...........................*................
+        sqdmulh v25.4S, v18.4S, v23.4S    // ............................*...............
+        cmgt v3.4S, v18.4S, v21.4S        // .............................*..............
+        ldr q31, [x2, #48]                // ..............................*.............
+        srshr v25.4S, v25.4S, #18         // ................................*...........
+        str q24, [x1], #4*16              // .................................*..........
+        sqdmulh v6.4S, v31.4S, v23.4S     // ..................................*.........
+        mls v18.4S, v25.4S, v22.4S        // ...................................*........
+        bic v25.16B, v25.16B, v3.16B      // ....................................*.......
+        ldr q24, [x2, #16]                // .....................................*......
+        add v18.4S, v18.4S, v3.4S         // .......................................*....
+        str q25, [x0, #32]                // ........................................*...
+        sqdmulh v0.4S, v24.4S, v23.4S     // .........................................*..
+        str q18, [x1, #-32]               // ..........................................*.
+        ldr q18, [x2], #4*16              // ...........................................*
+
+        //                                   // ------------ cycle (expected) ------------->
+        //                                   // 0                        25
+        //                                   // |------------------------|------------------
+        // ldr q31, [x2, #48]                // *...........................................
+        // sqdmulh v6.4S, v31.4S, v23.4S     // ....*.......................................
+        // ldr q24, [x2, #16]                // ..*.........................................
+        // ldr q18, [x2], #4*16              // .........*..................................
+        // sqdmulh v0.4S, v24.4S, v23.4S     // ......*.....................................
+        // cmgt v16.4S, v24.4S, v21.4S       // .......*....................................
+        // srshr v4.4S, v6.4S, #18           // ........*...................................
+        // cmgt v6.4S, v31.4S, v21.4S        // .....*......................................
+        // sqdmulh v1.4S, v18.4S, v23.4S     // .............*..............................
+        // srshr v7.4S, v0.4S, #18           // ............*...............................
+        // srshr v19.4S, v1.4S, #18          // .................*..........................
+        // cmgt v25.4S, v18.4S, v21.4S       // ....................*.......................
+        // bic v28.16B, v19.16B, v25.16B     // ......................*.....................
+        // ldr q2, [x2, #-32]                // ........................*...................
+        // mls v31.4S, v4.4S, v22.4S         // ...........*................................
+        // mls v18.4S, v19.4S, v22.4S        // .....................*......................
+        // sqdmulh v1.4S, v2.4S, v23.4S      // ............................*...............
+        // cmgt v29.4S, v2.4S, v21.4S        // .............................*..............
+        // add v20.4S, v31.4S, v6.4S         // ................*...........................
+        // ldr q31, [x2, #48]                // ..............................*.............
+        // srshr v17.4S, v1.4S, #18          // ................................*...........
+        // add v3.4S, v18.4S, v25.4S         // ..........................*.................
+        // bic v27.16B, v4.16B, v6.16B       // ..............*.............................
+        // bic v26.16B, v17.16B, v29.16B     // ....................................*.......
+        // str q26, [x0, #32]                // ........................................*...
+        // mls v2.4S, v17.4S, v22.4S         // ...................................*........
+        // str q3, [x1], #4*16               // .................................*..........
+        // bic v0.16B, v7.16B, v16.16B       // ..................*.........................
+        // mls v24.4S, v7.4S, v22.4S         // ...............*............................
+        // str q0, [x0, #16]                 // .......................*....................
+        // add v18.4S, v2.4S, v29.4S         // .......................................*....
+        // str q27, [x0, #48]                // ...........................*................
+        // sqdmulh v6.4S, v31.4S, v23.4S     // ..................................*.........
+        // str q18, [x1, #-32]               // ..........................................*.
+        // add v5.4S, v24.4S, v16.4S         // ...................*........................
+        // ldr q24, [x2, #16]                // .....................................*......
+        // ldr q18, [x2], #4*16              // ...........................................*
+        // sqdmulh v0.4S, v24.4S, v23.4S     // .........................................*..
+
+        sub count, count, #2
 poly_decompose_32_loop:
-        ldr q1, [a_ptr, #1*16]
-        ldr q2, [a_ptr, #2*16]
-        ldr q3, [a_ptr, #3*16]
-        ldr q0, [a_ptr], #4*16
-
-        decompose32 v5, v1, v24
-        decompose32 v6, v2, v24
-        decompose32 v7, v3, v24
-        decompose32 v4, v0, v24
-
-        str q5, [a1_ptr, #1*16]
-        str q6, [a1_ptr, #2*16]
-        str q7, [a1_ptr, #3*16]
-        str q4, [a1_ptr], #4*16
-        str q1, [a0_ptr, #1*16]
-        str q2, [a0_ptr, #2*16]
-        str q3, [a0_ptr, #3*16]
-        str q0, [a0_ptr], #4*16
+        // Instructions: 36
+        // Expected cycles: 40
+        // Expected IPC: 0.90
+        //
+        // Cycle bound: 40.0
+        // IPC bound: 0.90
+        //
+        // Wall time: 5.18s
+        // User time: 5.18s
+        //
+        //                                   // ---------- cycle (expected) ----------->
+        //                                   // 0                        25
+        //                                   // |------------------------|--------------
+        cmgt v16.4S, v24.4S, v21.4S       // *.......................................
+        srshr v4.4S, v6.4S, #18           // .*......................................
+        cmgt v6.4S, v31.4S, v21.4S        // ..*.....................................
+        sqdmulh v1.4S, v18.4S, v23.4S     // ...*....................................
+        str q5, [x1, #-48]                // ....l...................................
+        srshr v7.4S, v0.4S, #18           // .....*..................................
+        str q20, [x1, #-16]               // ......l.................................
+        srshr v19.4S, v1.4S, #18          // .......*................................
+        cmgt v25.4S, v18.4S, v21.4S       // ........*...............................
+        str q28, [x0], #4*16              // .........l..............................
+        bic v28.16B, v19.16B, v25.16B     // ..........*.............................
+        ldr q2, [x2, #-32]                // ...........*............................
+        mls v31.4S, v4.4S, v22.4S         // .............*..........................
+        mls v18.4S, v19.4S, v22.4S        // ..............*.........................
+        sqdmulh v1.4S, v2.4S, v23.4S      // ...............*........................
+        cmgt v29.4S, v2.4S, v21.4S        // ................*.......................
+        add v20.4S, v31.4S, v6.4S         // .................*......................
+        ldr q31, [x2, #48]                // ..................e.....................
+        srshr v17.4S, v1.4S, #18          // ....................*...................
+        add v3.4S, v18.4S, v25.4S         // .....................*..................
+        bic v27.16B, v4.16B, v6.16B       // ......................*.................
+        bic v26.16B, v17.16B, v29.16B     // .......................*................
+        str q26, [x0, #32]                // ........................*...............
+        mls v2.4S, v17.4S, v22.4S         // .........................*..............
+        str q3, [x1], #4*16               // ..........................*.............
+        bic v0.16B, v7.16B, v16.16B       // ...........................*............
+        mls v24.4S, v7.4S, v22.4S         // ............................*...........
+        str q0, [x0, #16]                 // .............................*..........
+        add v18.4S, v2.4S, v29.4S         // ..............................*.........
+        str q27, [x0, #48]                // ...............................*........
+        sqdmulh v6.4S, v31.4S, v23.4S     // ................................e.......
+        str q18, [x1, #-32]               // .................................*......
+        add v5.4S, v24.4S, v16.4S         // ..................................*.....
+        ldr q24, [x2, #16]                // ...................................e....
+        ldr q18, [x2], #4*16              // .....................................e..
+        sqdmulh v0.4S, v24.4S, v23.4S     // .......................................e
+
+        //                                   // -------------------------- cycle (expected) --------------------------->
+        //                                   // 0                        25                       50
+        //                                   // |------------------------|------------------------|---------------------
+        // ldr q1, [x2, #1*16]               // .................e....'..................................~....'.........
+        // ldr q2, [x2, #2*16]               // ......................'..........*............................'.........
+        // ldr q3, [x2, #3*16]               // e.....................'.................~.....................'.........
+        // ldr q0, [x2], #4*16               // ...................e..'....................................~..'.........
+        // sqdmulh v5.4s, v1.4s, v23.4s      // .....................e'......................................~'.........
+        // srshr v5.4s, v5.4s, #18           // ......................'....*..................................'....~....
+        // cmgt v24.4s, v1.4s, v21.4s        // ......................*.......................................~.........
+        // mls v1.4s, v5.4s, v22.4s          // ..........~...........'...........................*...........'.........
+        // bic v5.16b, v5.16b, v24.16b       // .........~............'..........................*............'.........
+        // add v1.4s, v1.4s, v24.4s          // ................~.....'.................................*.....'.........
+        // sqdmulh v6.4s, v2.4s, v23.4s      // ......................'..............*........................'.........
+        // srshr v6.4s, v6.4s, #18           // ..~...................'...................*...................'.........
+        // cmgt v24.4s, v2.4s, v21.4s        // ......................'...............*.......................'.........
+        // mls v2.4s, v6.4s, v22.4s          // .......~..............'........................*..............'.........
+        // bic v6.16b, v6.16b, v24.16b       // .....~................'......................*................'.........
+        // add v2.4s, v2.4s, v24.4s          // ............~.........'.............................*.........'.........
+        // sqdmulh v7.4s, v3.4s, v23.4s      // ..............e.......'...............................~.......'.........
+        // srshr v7.4s, v7.4s, #18           // ......................'*......................................'~........
+        // cmgt v24.4s, v3.4s, v21.4s        // ......................'.*.....................................'.~.......
+        // mls v3.4s, v7.4s, v22.4s          // ......................'............*..........................'.........
+        // bic v7.16b, v7.16b, v24.16b       // ....~.................'.....................*.................'.........
+        // add v3.4s, v3.4s, v24.4s          // ......................'................*......................'.........
+        // sqdmulh v4.4s, v0.4s, v23.4s      // ......................'..*....................................'..~......
+        // srshr v4.4s, v4.4s, #18           // ......................'......*................................'......~..
+        // cmgt v24.4s, v0.4s, v21.4s        // ......................'.......*...............................'.......~.
+        // mls v0.4s, v4.4s, v22.4s          // ......................'.............*.........................'.........
+        // bic v4.16b, v4.16b, v24.16b       // ......................'.........*.............................'.........
+        // add v0.4s, v0.4s, v24.4s          // ...~..................'....................*..................'.........
+        // str q5, [x0, #1*16]               // ...........~..........'............................*..........'.........
+        // str q6, [x0, #2*16]               // ......~...............'.......................*...............'.........
+        // str q7, [x0, #3*16]               // .............~........'..............................*........'.........
+        // str q4, [x0], #4*16               // ......................'........~..............................'........l
+        // str q1, [x1, #1*16]               // ......................'...~...................................'...l.....
+        // str q2, [x1, #2*16]               // ...............~......'................................*......'.........
+        // str q3, [x1, #3*16]               // ......................'.....~.................................'.....l...
+        // str q0, [x1], #4*16               // ........~.............'.........................*.............'.........
 
         subs count, count, #1
         bne poly_decompose_32_loop
+        // Instructions: 34
+        // Expected cycles: 36
+        // Expected IPC: 0.94
+        //
+        // Cycle bound: 36.0
+        // IPC bound: 0.94
+        //
+        // Wall time: 6.58s
+        // User time: 6.58s
+        //
+        //                                   // -------- cycle (expected) --------->
+        //                                   // 0                        25
+        //                                   // |------------------------|----------
+        srshr v0.4S, v0.4S, #18           // *...................................
+        sqdmulh v25.4S, v18.4S, v23.4S    // .*..................................
+        str q5, [x1, #-48]                // ..*.................................
+        cmgt v17.4S, v18.4S, v21.4S       // ...*................................
+        str q20, [x1, #-16]               // ....*...............................
+        srshr v16.4S, v25.4S, #18         // .....*..............................
+        str q28, [x0], #4*16              // ......*.............................
+        srshr v28.4S, v6.4S, #18          // .......*............................
+        mls v18.4S, v16.4S, v22.4S        // ........*...........................
+        cmgt v3.4S, v31.4S, v21.4S        // .........*..........................
+        mls v31.4S, v28.4S, v22.4S        // ..........*.........................
+        cmgt v30.4S, v24.4S, v21.4S       // ...........*........................
+        add v1.4S, v18.4S, v17.4S         // ............*.......................
+        ldr q18, [x2, #-32]               // .............*......................
+        mls v24.4S, v0.4S, v22.4S         // ...............*....................
+        add v29.4S, v31.4S, v3.4S         // ................*...................
+        sqdmulh v25.4S, v18.4S, v23.4S    // .................*..................
+        cmgt v6.4S, v18.4S, v21.4S        // ..................*.................
+        bic v7.16B, v28.16B, v3.16B       // ...................*................
+        str q29, [x1, #48]                // ....................*...............
+        srshr v31.4S, v25.4S, #18         // .....................*..............
+        bic v25.16B, v16.16B, v17.16B     // ......................*.............
+        str q25, [x0], #4*16              // .......................*............
+        bic v25.16B, v31.16B, v6.16B      // ........................*...........
+        str q25, [x0, #-32]               // .........................*..........
+        mls v18.4S, v31.4S, v22.4S        // ..........................*.........
+        str q1, [x1], #4*16               // ...........................*........
+        bic v25.16B, v0.16B, v30.16B      // ............................*.......
+        str q25, [x0, #-48]               // .............................*......
+        add v25.4S, v18.4S, v6.4S         // ..............................*.....
+        str q7, [x0, #-16]                // ...............................*....
+        add v18.4S, v24.4S, v30.4S        // ................................*...
+        str q25, [x1, #-32]               // .................................*..
+        str q18, [x1, #-48]               // ...................................*
+
+        //                                   // -------- cycle (expected) --------->
+        //                                   // 0                        25
+        //                                   // |------------------------|----------
+        // cmgt v16.4S, v24.4S, v21.4S       // ...........*........................
+        // srshr v4.4S, v6.4S, #18           // .......*............................
+        // cmgt v6.4S, v31.4S, v21.4S        // .........*..........................
+        // sqdmulh v1.4S, v18.4S, v23.4S     // .*..................................
+        // str q5, [x1, #-48]                // ..*.................................
+        // srshr v7.4S, v0.4S, #18           // *...................................
+        // str q20, [x1, #-16]               // ....*...............................
+        // srshr v19.4S, v1.4S, #18          // .....*..............................
+        // cmgt v25.4S, v18.4S, v21.4S       // ...*................................
+        // str q28, [x0], #4*16              // ......*.............................
+        // bic v28.16B, v19.16B, v25.16B     // ......................*.............
+        // ldr q2, [x2, #-32]                // .............*......................
+        // mls v31.4S, v4.4S, v22.4S         // ..........*.........................
+        // mls v18.4S, v19.4S, v22.4S        // ........*...........................
+        // sqdmulh v1.4S, v2.4S, v23.4S      // .................*..................
+        // cmgt v29.4S, v2.4S, v21.4S        // ..................*.................
+        // add v20.4S, v31.4S, v6.4S         // ................*...................
+        // srshr v17.4S, v1.4S, #18          // .....................*..............
+        // add v3.4S, v18.4S, v25.4S         // ............*.......................
+        // bic v27.16B, v4.16B, v6.16B       // ...................*................
+        // bic v26.16B, v17.16B, v29.16B     // ........................*...........
+        // str q26, [x0, #32]                // .........................*..........
+        // mls v2.4S, v17.4S, v22.4S         // ..........................*.........
+        // str q3, [x1], #4*16               // ...........................*........
+        // bic v0.16B, v7.16B, v16.16B       // ............................*.......
+        // mls v24.4S, v7.4S, v22.4S         // ...............*....................
+        // str q0, [x0, #16]                 // .............................*......
+        // add v18.4S, v2.4S, v29.4S         // ..............................*.....
+        // str q27, [x0, #48]                // ...............................*....
+        // str q18, [x1, #-32]               // .................................*..
+        // add v5.4S, v24.4S, v16.4S         // ................................*...
+        // str q5, [x1, #-48]                // ...................................*
+        // str q20, [x1, #-16]               // ....................*...............
+        // str q28, [x0], #4*16              // .......................*............
+
 
         ret
 