From 46913f60f1638b55f8a7c42159f171b8fe4d9a81 Mon Sep 17 00:00:00 2001 From: willieyz Date: Fri, 3 Oct 2025 18:05:45 +0800 Subject: [PATCH 1/2] SLOTHY-OPT: Run poly_decompose_32_asm.S through SLOTHY with neoverse_n1 - This commit runs poly_decompose_32_asm.S through SLOTHY with `TARGET_MICROARCH=Arm_Neoverse_N1_experimental` Signed-off-by: willieyz --- dev/aarch64_clean/src/poly_decompose_32_asm.S | 104 ++++++ mldsa/native/aarch64/src/Makefile | 42 +++ .../aarch64/src/poly_decompose_32_asm.S | 296 ++++++++++++++++-- 3 files changed, 420 insertions(+), 22 deletions(-) create mode 100644 dev/aarch64_clean/src/poly_decompose_32_asm.S create mode 100644 mldsa/native/aarch64/src/Makefile diff --git a/dev/aarch64_clean/src/poly_decompose_32_asm.S b/dev/aarch64_clean/src/poly_decompose_32_asm.S new file mode 100644 index 000000000..bddfef2c2 --- /dev/null +++ b/dev/aarch64_clean/src/poly_decompose_32_asm.S @@ -0,0 +1,104 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +// a aliased with a0 +.macro decompose32 a1, a, temp + // Compute a1 = round-(a / 523776) ≈ round(a * 1074791425 / + // 2^49), where round-() denotes "round half down". This is + // exact for 0 <= a < Q. Note that half is rounded down since + // 1074791425 / 2^49 ≲ 1 / 523776. + sqdmulh \a1\().4s, \a\().4s, barrett_const.4s + srshr \a1\().4s, \a1\().4s, #18 + + // If a1 = 16, i.e. a > 31*GAMMA2, proceed as if a' = a - Q was + // given instead. (For a = 31*GAMMA2 + 1 thus a' = -GAMMA2, we + // still round it to 0 like other "wrapped around" cases.) 
+ + // Check for wrap-around + cmgt \temp\().4s, \a\().4s, q_bound.4s + + // Compute remainder a0 + mls \a\().4s, \a1\().4s, gamma2_2x.4s + + // If wrap-around is required, set a1 = 0 and a0 -= 1 + bic \a1\().16b, \a1\().16b, \temp\().16b + add \a\().4s, \a\().4s, \temp\().4s +.endm + + /* Parameters */ + a1_ptr .req x0 // Output polynomial with coefficients c1 + a0_ptr .req x1 // Output polynomial with coefficients c0 + a_ptr .req x2 // Input polynomial + + count .req x3 + + /* Constant register assignments */ + q .req v20 // Q = 8380417 + q_bound .req v21 // 31*GAMMA2 = 8118528 + gamma2_2x .req v22 // 2*GAMMA2 = 523776 + barrett_const .req v23 // Barrett constant = 1074791425 + + +.text +.global MLD_ASM_NAMESPACE(poly_decompose_32_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) + // Load constants into SIMD registers + movz w4, #57345 + movk w4, #127, lsl #16 + dup q.4s, w4 + + movz w5, #0xe100 + movk w5, #0x7b, lsl #16 + dup q_bound.4s, w5 + + movz w7, #0xfe00 + movk w7, #7, lsl #16 + dup gamma2_2x.4s, w7 + + movz w11, #0x0401 + movk w11, #0x4010, lsl #16 + dup barrett_const.4s, w11 + + mov count, #(64/4) + +poly_decompose_32_loop: + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + ldr q0, [a_ptr], #4*16 + + decompose32 v5, v1, v24 + decompose32 v6, v2, v24 + decompose32 v7, v3, v24 + decompose32 v4, v0, v24 + + str q5, [a1_ptr, #1*16] + str q6, [a1_ptr, #2*16] + str q7, [a1_ptr, #3*16] + str q4, [a1_ptr], #4*16 + str q1, [a0_ptr, #1*16] + str q2, [a0_ptr, #2*16] + str q3, [a0_ptr, #3*16] + str q0, [a0_ptr], #4*16 + + subs count, count, #1 + bne poly_decompose_32_loop + + ret + + .unreq a1_ptr + .unreq a0_ptr + .unreq a_ptr + .unreq count + .unreq q + .unreq q_bound + .unreq gamma2_2x + .unreq barrett_const + +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/Makefile b/mldsa/native/aarch64/src/Makefile new file mode 100644 index 000000000..51fc96bd4 --- /dev/null 
+++ b/mldsa/native/aarch64/src/Makefile @@ -0,0 +1,42 @@ +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +###### +# To run, see the README.md file +###### +.PHONY: all clean + +# ISA to optimize for +TARGET_ISA=Arm_AArch64 + +# MicroArch target to optimize for +TARGET_MICROARCH=Arm_Neoverse_N1_experimental + +SLOTHY_EXTRA_FLAGS ?= + +SLOTHY_FLAGS=-c sw_pipelining.enabled=true \ + -c inputs_are_outputs \ + -c sw_pipelining.minimize_overlapping=False \ + -c sw_pipelining.allow_post \ + -c variable_size \ + -c constraints.stalls_first_attempt=64 \ + $(SLOTHY_EXTRA_FLAGS) + +# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30. +# Allow SLOTHY to use all V-registers, but only caller-saved GPRs. +RESERVE_X_ONLY_FLAG=-c reserved_regs="[x18--x30,sp]" + +# Used for kernels which don't stash callee-saved registers. +# Restrict SLOTHY to caller-saved registers. +RESERVE_ALL_FLAG=-c reserved_regs="[x18--x30,sp,v8--v15]" + +all: poly_decompose_32_asm.S + +# This unit does not save or restore callee-saved registers, so SLOTHY is restricted +# to caller-saved registers (RESERVE_ALL_FLAG keeps x18-x30, sp and v8-v15 reserved). 
+poly_decompose_32_asm.S: ../../../../dev/aarch64_clean/src/poly_decompose_32_asm.S + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_decompose_32_loop $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) + + +clean: + -$(RM) -rf poly_decompose_32_asm.S diff --git a/mldsa/native/aarch64/src/poly_decompose_32_asm.S b/mldsa/native/aarch64/src/poly_decompose_32_asm.S index bddfef2c2..041a5aabc 100644 --- a/mldsa/native/aarch64/src/poly_decompose_32_asm.S +++ b/mldsa/native/aarch64/src/poly_decompose_32_asm.S @@ -1,7 +1,7 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) @@ -67,28 +67,280 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) mov count, #(64/4) + // Instructions: 26 + // Expected cycles: 21 + // Expected IPC: 1.24 + // + // Cycle bound: 21.0 + // IPC bound: 1.24 + // + // Wall time: 0.08s + // User time: 0.08s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q17, [x2], #4*16 // *............................. + ldr q7, [x2], #4*16 // *............................. + ldr q25, [x2, #-80] // .*............................ + ldr q29, [x2, #-112] // .*............................ + sqdmulh v6.4S, v17.4S, v23.4S // ....*......................... + cmgt v31.4S, v17.4S, v21.4S // ....*......................... + cmgt v0.4S, v29.4S, v21.4S // .....*........................ + sqdmulh v20.4S, v7.4S, v23.4S // ......*....................... + cmgt v27.4S, v25.4S, v21.4S // ......*....................... + cmgt v30.4S, v7.4S, v21.4S // .......*...................... + sqdmulh v24.4S, v25.4S, v23.4S // ........*..................... + srshr v6.4S, v6.4S, #18 // .........*.................... 
+ sqdmulh v4.4S, v29.4S, v23.4S // ..........*................... + srshr v20.4S, v20.4S, #18 // ...........*.................. + mls v17.4S, v6.4S, v22.4S // .............*................ + srshr v26.4S, v24.4S, #18 // .............*................ + bic v6.16B, v6.16B, v31.16B // ..............*............... + mls v7.4S, v20.4S, v22.4S // ...............*.............. + srshr v3.4S, v4.4S, #18 // ...............*.............. + bic v1.16B, v20.16B, v30.16B // ................*............. + str q6, [x0], #4*16 // ................*............. + mls v25.4S, v26.4S, v22.4S // .................*............ + add v6.4S, v17.4S, v31.4S // ..................*........... + ldr q17, [x2, #-96] // ..................*........... + mls v29.4S, v3.4S, v22.4S // ...................*.......... + str q6, [x1], #4*16 // ....................*......... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q7, [x2], #4*16 // *.............................. + // sqdmulh v28.4S, v7.4S, v23.4S // ....*.......................... + // srshr v4.4S, v28.4S, #18 // .........*..................... + // cmgt v30.4S, v7.4S, v21.4S // ....*.......................... + // mls v7.4S, v4.4S, v22.4S // .............*................. + // bic v1.16B, v4.16B, v30.16B // ..............*................ + // add v24.4S, v7.4S, v30.4S // ..................*............ + // str q1, [x0], #4*16 // ................*.............. + // ldr q7, [x2], #4*16 // *.............................. + // ldr q25, [x2, #-80] // .*............................. + // ldr q29, [x2, #-112] // .*............................. + // sqdmulh v28.4S, v7.4S, v23.4S // ......*........................ + // sqdmulh v20.4S, v25.4S, v23.4S // ........*...................... + // cmgt v0.4S, v29.4S, v21.4S // .....*......................... + // sqdmulh v6.4S, v29.4S, v23.4S // ..........*.................... + // cmgt v27.4S, v25.4S, v21.4S // ......*........................ 
+ // srshr v4.4S, v28.4S, #18 // ...........*................... + // srshr v26.4S, v20.4S, #18 // .............*................. + // str q24, [x1], #4*16 // ....................*.......... + // cmgt v30.4S, v7.4S, v21.4S // .......*....................... + // srshr v3.4S, v6.4S, #18 // ...............*............... + // mls v7.4S, v4.4S, v22.4S // ...............*............... + // mls v25.4S, v26.4S, v22.4S // .................*............. + // ldr q17, [x2, #-96] // ..................*............ + // bic v1.16B, v4.16B, v30.16B // ................*.............. + // mls v29.4S, v3.4S, v22.4S // ...................*........... + + sub count, count, #2 poly_decompose_32_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 - - decompose32 v5, v1, v24 - decompose32 v6, v2, v24 - decompose32 v7, v3, v24 - decompose32 v4, v0, v24 - - str q5, [a1_ptr, #1*16] - str q6, [a1_ptr, #2*16] - str q7, [a1_ptr, #3*16] - str q4, [a1_ptr], #4*16 - str q1, [a0_ptr, #1*16] - str q2, [a0_ptr, #2*16] - str q3, [a0_ptr, #3*16] - str q0, [a0_ptr], #4*16 + // Instructions: 36 + // Expected cycles: 19 + // Expected IPC: 1.89 + // + // Cycle bound: 19.0 + // IPC bound: 1.89 + // + // Wall time: 210.08s + // User time: 210.08s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + add v24.4S, v7.4S, v30.4S // *............................. + str q1, [x0], #4*16 // *............................. + bic v18.16B, v26.16B, v27.16B // .l............................ + ldr q7, [x2], #4*16 // .e............................ + sqdmulh v31.4S, v17.4S, v23.4S // ..l........................... + add v19.4S, v25.4S, v27.4S // ..l........................... + ldr q25, [x2, #-80] // ...*.......................... + bic v5.16B, v3.16B, v0.16B // ...l.......................... + add v26.4S, v29.4S, v0.4S // ....l......................... + ldr q29, [x2, #-112] // ....*......................... 
+ sqdmulh v28.4S, v7.4S, v23.4S // .....e........................ + str q18, [x0, #-80] // .....l........................ + cmgt v2.4S, v17.4S, v21.4S // ......l....................... + str q19, [x1, #-16] // ......l....................... + srshr v18.4S, v31.4S, #18 // .......l...................... + sqdmulh v20.4S, v25.4S, v23.4S // .......*...................... + cmgt v0.4S, v29.4S, v21.4S // ........*..................... + str q5, [x0, #-112] // ........l..................... + sqdmulh v6.4S, v29.4S, v23.4S // .........*.................... + cmgt v27.4S, v25.4S, v21.4S // .........*.................... + srshr v4.4S, v28.4S, #18 // ..........e................... + str q26, [x1, #-48] // ..........l................... + mls v17.4S, v18.4S, v22.4S // ...........l.................. + bic v16.16B, v18.16B, v2.16B // ...........l.................. + srshr v26.4S, v20.4S, #18 // ............*................. + str q24, [x1], #4*16 // ............*................. + cmgt v30.4S, v7.4S, v21.4S // .............e................ + str q16, [x0, #-96] // .............l................ + srshr v3.4S, v6.4S, #18 // ..............*............... + mls v7.4S, v4.4S, v22.4S // ..............e............... + add v5.4S, v17.4S, v2.4S // ................l............. + mls v25.4S, v26.4S, v22.4S // ................*............. + ldr q17, [x2, #-96] // .................*............ + bic v1.16B, v4.16B, v30.16B // .................e............ + mls v29.4S, v3.4S, v22.4S // ..................*........... + str q5, [x1, #-96] // ..................l........... + + // ------------------ cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|----- + // ldr q1, [x2, #1*16] // ...~..............'...*..............'...~.............. + // ldr q2, [x2, #2*16] // ................~.'................*.'................~. + // ldr q3, [x2, #3*16] // ..~...............'..*...............'..~............... 
+ // ldr q0, [x2], #4*16 // e.................'~.................'~................. + // sqdmulh v5.4s, v1.4s, v23.4s // ........~.........'........*.........'........~......... + // srshr v5.4s, v5.4s, #18 // .............~....'.............*....'.............~.... + // cmgt v24.4s, v1.4s, v21.4s // .......~..........'.......*..........'.......~.......... + // mls v1.4s, v5.4s, v22.4s // .................~'.................*'.................. + // bic v5.16b, v5.16b, v24.16b // ..~...............'..~...............'..l............... + // add v1.4s, v1.4s, v24.4s // ...~..............'...~..............'...l.............. + // sqdmulh v6.4s, v2.4s, v23.4s // .~................'.~................'.l................ + // srshr v6.4s, v6.4s, #18 // ......~...........'......~...........'......l........... + // cmgt v24.4s, v2.4s, v21.4s // .....~............'.....~............'.....l............ + // mls v2.4s, v6.4s, v22.4s // ..........~.......'..........~.......'..........l....... + // bic v6.16b, v6.16b, v24.16b // ..........~.......'..........~.......'..........l....... + // add v2.4s, v2.4s, v24.4s // ...............~..'...............~..'...............l.. + // sqdmulh v7.4s, v3.4s, v23.4s // ......~...........'......*...........'......~........... + // srshr v7.4s, v7.4s, #18 // ...........~......'...........*......'...........~...... + // cmgt v24.4s, v3.4s, v21.4s // ........~.........'........*.........'........~......... + // mls v3.4s, v7.4s, v22.4s // ...............~..'...............*..'...............~.. + // bic v7.16b, v7.16b, v24.16b // ~.................'~.................'l................. + // add v3.4s, v3.4s, v24.4s // .~................'.~................'.l................ + // sqdmulh v4.4s, v0.4s, v23.4s // ....e.............'....~.............'....~............. + // srshr v4.4s, v4.4s, #18 // .........e........'.........~........'.........~........ 
+ // cmgt v24.4s, v0.4s, v21.4s // ............e.....'............~.....'............~..... + // mls v0.4s, v4.4s, v22.4s // .............e....'.............~....'.............~.... + // bic v4.16b, v4.16b, v24.16b // ................e.'................~.'................~. + // add v0.4s, v0.4s, v24.4s // ..................*..................~.................. + // str q5, [x0, #1*16] // .......~..........'.......~..........'.......l.......... + // str q6, [x0, #2*16] // ............~.....'............~.....'............l..... + // str q7, [x0, #3*16] // ....~.............'....~.............'....l............. + // str q4, [x0], #4*16 // ..................*..................~.................. + // str q1, [x1, #1*16] // .........~........'.........~........'.........l........ + // str q2, [x1, #2*16] // .................~'.................~'.................l + // str q3, [x1, #3*16] // .....~............'.....~............'.....l............ + // str q0, [x1], #4*16 // ...........~......'...........*......'...........~...... subs count, count, #1 bne poly_decompose_32_loop + // Instructions: 46 + // Expected cycles: 25 + // Expected IPC: 1.84 + // + // Cycle bound: 25.0 + // IPC bound: 1.84 + // + // Wall time: 0.51s + // User time: 0.51s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + add v18.4S, v7.4S, v30.4S // *............................. + ldr q20, [x2, #-16] // *............................. + ldr q19, [x2, #-48] // .*............................ + sqdmulh v6.4S, v17.4S, v23.4S // .*............................ + str q18, [x1], #4*16 // ..*........................... + add v18.4S, v29.4S, v0.4S // ..*........................... + cmgt v4.4S, v17.4S, v21.4S // ...*.......................... + ldr q16, [x2, #-32] // ...*.......................... + sqdmulh v7.4S, v20.4S, v23.4S // ....*......................... + str q18, [x1, #-112] // ....*......................... 
+ str q1, [x0], #4*16 // .....*........................ + cmgt v2.4S, v19.4S, v21.4S // .....*........................ + srshr v6.4S, v6.4S, #18 // ......*....................... + sqdmulh v28.4S, v19.4S, v23.4S // ......*....................... + bic v29.16B, v3.16B, v0.16B // .......*...................... + add v24.4S, v25.4S, v27.4S // ........*..................... + sqdmulh v30.4S, v16.4S, v23.4S // ........*..................... + srshr v1.4S, v7.4S, #18 // .........*.................... + str q29, [x0, #-112] // .........*.................... + mls v17.4S, v6.4S, v22.4S // ..........*................... + cmgt v7.4S, v20.4S, v21.4S // ..........*................... + srshr v5.4S, v28.4S, #18 // ...........*.................. + str q24, [x1, #-80] // ...........*.................. + cmgt v24.4S, v16.4S, v21.4S // ............*................. + mls v20.4S, v1.4S, v22.4S // .............*................ + srshr v31.4S, v30.4S, #18 // .............*................ + bic v1.16B, v1.16B, v7.16B // ..............*............... + mls v19.4S, v5.4S, v22.4S // ...............*.............. + add v0.4S, v17.4S, v4.4S // ...............*.............. + bic v17.16B, v26.16B, v27.16B // ................*............. + str q1, [x0, #-16] // ................*............. + mls v16.4S, v31.4S, v22.4S // .................*............ + bic v5.16B, v5.16B, v2.16B // .................*............ + add v25.4S, v20.4S, v7.4S // ..................*........... + str q17, [x0, #-80] // ..................*........... + bic v18.16B, v31.16B, v24.16B // ...................*.......... + str q0, [x1, #-96] // ...................*.......... + add v31.4S, v19.4S, v2.4S // ....................*......... + bic v2.16B, v6.16B, v4.16B // ....................*......... + str q5, [x0, #-48] // .....................*........ + str q25, [x1, #-16] // .....................*........ + add v20.4S, v16.4S, v24.4S // ......................*....... 
+ str q2, [x0, #-96] // ......................*....... + str q18, [x0, #-32] // .......................*...... + str q31, [x1, #-48] // .......................*...... + str q20, [x1, #-32] // ........................*..... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // add v24.4S, v7.4S, v30.4S // *.............................. + // str q1, [x0], #4*16 // .....*......................... + // bic v18.16B, v26.16B, v27.16B // ................*.............. + // sqdmulh v31.4S, v17.4S, v23.4S // .*............................. + // add v19.4S, v25.4S, v27.4S // ........*...................... + // ldr q25, [x2, #-16] // *.............................. + // bic v5.16B, v3.16B, v0.16B // .......*....................... + // add v26.4S, v29.4S, v0.4S // ..*............................ + // ldr q29, [x2, #-48] // .*............................. + // str q18, [x0, #-80] // ..................*............ + // cmgt v2.4S, v17.4S, v21.4S // ...*........................... + // str q19, [x1, #-16] // ...........*................... + // srshr v18.4S, v31.4S, #18 // ......*........................ + // sqdmulh v20.4S, v25.4S, v23.4S // ....*.......................... + // cmgt v0.4S, v29.4S, v21.4S // .....*......................... + // str q5, [x0, #-112] // .........*..................... + // sqdmulh v6.4S, v29.4S, v23.4S // ......*........................ + // cmgt v27.4S, v25.4S, v21.4S // ..........*.................... + // str q26, [x1, #-48] // ....*.......................... + // mls v17.4S, v18.4S, v22.4S // ..........*.................... + // bic v16.16B, v18.16B, v2.16B // ....................*.......... + // srshr v26.4S, v20.4S, #18 // .........*..................... + // str q24, [x1], #4*16 // ..*............................ + // str q16, [x0, #-96] // ......................*........ + // srshr v3.4S, v6.4S, #18 // ...........*................... + // add v5.4S, v17.4S, v2.4S // ...............*............... 
+ // mls v25.4S, v26.4S, v22.4S // .............*................. + // ldr q17, [x2, #-32] // ...*........................... + // mls v29.4S, v3.4S, v22.4S // ...............*............... + // str q5, [x1, #-96] // ...................*........... + // bic v18.16B, v26.16B, v27.16B // ..............*................ + // sqdmulh v31.4S, v17.4S, v23.4S // ........*...................... + // add v19.4S, v25.4S, v27.4S // ..................*............ + // bic v5.16B, v3.16B, v0.16B // .................*............. + // add v26.4S, v29.4S, v0.4S // ....................*.......... + // str q18, [x0, #-16] // ................*.............. + // cmgt v2.4S, v17.4S, v21.4S // ............*.................. + // str q19, [x1, #-16] // .....................*......... + // srshr v18.4S, v31.4S, #18 // .............*................. + // str q5, [x0, #-48] // .....................*......... + // str q26, [x1, #-48] // .......................*....... + // mls v17.4S, v18.4S, v22.4S // .................*............. + // bic v16.16B, v18.16B, v2.16B // ...................*........... + // str q16, [x0, #-32] // .......................*....... + // add v5.4S, v17.4S, v2.4S // ......................*........ + // str q5, [x1, #-32] // ........................*...... 
+ ret From db477520ee9b947dcc1430ad90333e8eaf49abb4 Mon Sep 17 00:00:00 2001 From: willieyz Date: Thu, 9 Oct 2025 11:26:17 +0800 Subject: [PATCH 2/2] SLOTHY-OPT: Run poly_decompose_32_asm.S through SLOTHY with cortex-a55 - This commit runs poly_decompose_32_asm.S through SLOTHY with `TARGET_MICROARCH=Arm_Cortex_A55` Signed-off-by: willieyz --- mldsa/native/aarch64/src/Makefile | 2 +- .../aarch64/src/poly_decompose_32_asm.S | 510 +++++++++--------- 2 files changed, 256 insertions(+), 256 deletions(-) diff --git a/mldsa/native/aarch64/src/Makefile b/mldsa/native/aarch64/src/Makefile index 51fc96bd4..e0323c8ad 100644 --- a/mldsa/native/aarch64/src/Makefile +++ b/mldsa/native/aarch64/src/Makefile @@ -10,7 +10,7 @@ TARGET_ISA=Arm_AArch64 # MicroArch target to optimize for -TARGET_MICROARCH=Arm_Neoverse_N1_experimental +TARGET_MICROARCH=Arm_Cortex_A55 SLOTHY_EXTRA_FLAGS ?= diff --git a/mldsa/native/aarch64/src/poly_decompose_32_asm.S b/mldsa/native/aarch64/src/poly_decompose_32_asm.S index 041a5aabc..433d5878c 100644 --- a/mldsa/native/aarch64/src/poly_decompose_32_asm.S +++ b/mldsa/native/aarch64/src/poly_decompose_32_asm.S @@ -67,279 +67,279 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) mov count, #(64/4) - // Instructions: 26 - // Expected cycles: 21 - // Expected IPC: 1.24 + // Instructions: 38 + // Expected cycles: 44 + // Expected IPC: 0.86 // - // Cycle bound: 21.0 - // IPC bound: 1.24 + // Cycle bound: 44.0 + // IPC bound: 0.86 // - // Wall time: 0.08s - // User time: 0.08s + // Wall time: 0.16s + // User time: 0.16s // - // ----- cycle (expected) ------> + // ------------ cycle (expected) -------------> // 0 25 - // |------------------------|---- - ldr q17, [x2], #4*16 // *............................. - ldr q7, [x2], #4*16 // *............................. - ldr q25, [x2, #-80] // .*............................ - ldr q29, [x2, #-112] // .*............................ - sqdmulh v6.4S, v17.4S, v23.4S // ....*......................... 
- cmgt v31.4S, v17.4S, v21.4S // ....*......................... - cmgt v0.4S, v29.4S, v21.4S // .....*........................ - sqdmulh v20.4S, v7.4S, v23.4S // ......*....................... - cmgt v27.4S, v25.4S, v21.4S // ......*....................... - cmgt v30.4S, v7.4S, v21.4S // .......*...................... - sqdmulh v24.4S, v25.4S, v23.4S // ........*..................... - srshr v6.4S, v6.4S, #18 // .........*.................... - sqdmulh v4.4S, v29.4S, v23.4S // ..........*................... - srshr v20.4S, v20.4S, #18 // ...........*.................. - mls v17.4S, v6.4S, v22.4S // .............*................ - srshr v26.4S, v24.4S, #18 // .............*................ - bic v6.16B, v6.16B, v31.16B // ..............*............... - mls v7.4S, v20.4S, v22.4S // ...............*.............. - srshr v3.4S, v4.4S, #18 // ...............*.............. - bic v1.16B, v20.16B, v30.16B // ................*............. - str q6, [x0], #4*16 // ................*............. - mls v25.4S, v26.4S, v22.4S // .................*............ - add v6.4S, v17.4S, v31.4S // ..................*........... - ldr q17, [x2, #-96] // ..................*........... - mls v29.4S, v3.4S, v22.4S // ...................*.......... - str q6, [x1], #4*16 // ....................*......... + // |------------------------|------------------ + ldr q18, [x2, #48] // *........................................... + ldr q24, [x2, #16] // ..*......................................... + sqdmulh v25.4S, v18.4S, v23.4S // ....*....................................... + cmgt v3.4S, v18.4S, v21.4S // .....*...................................... + sqdmulh v7.4S, v24.4S, v23.4S // ......*..................................... + cmgt v1.4S, v24.4S, v21.4S // .......*.................................... + srshr v25.4S, v25.4S, #18 // ........*................................... + ldr q17, [x2], #4*16 // .........*.................................. 
+ mls v18.4S, v25.4S, v22.4S // ...........*................................ + srshr v7.4S, v7.4S, #18 // ............*............................... + sqdmulh v31.4S, v17.4S, v23.4S // .............*.............................. + bic v25.16B, v25.16B, v3.16B // ..............*............................. + mls v24.4S, v7.4S, v22.4S // ...............*............................ + add v20.4S, v18.4S, v3.4S // ................*........................... + srshr v18.4S, v31.4S, #18 // .................*.......................... + bic v3.16B, v7.16B, v1.16B // ..................*......................... + add v5.4S, v24.4S, v1.4S // ...................*........................ + cmgt v24.4S, v17.4S, v21.4S // ....................*....................... + mls v17.4S, v18.4S, v22.4S // .....................*...................... + bic v28.16B, v18.16B, v24.16B // ......................*..................... + str q3, [x0, #16] // .......................*.................... + ldr q18, [x2, #-32] // ........................*................... + add v24.4S, v17.4S, v24.4S // ..........................*................. + str q25, [x0, #48] // ...........................*................ + sqdmulh v25.4S, v18.4S, v23.4S // ............................*............... + cmgt v3.4S, v18.4S, v21.4S // .............................*.............. + ldr q31, [x2, #48] // ..............................*............. + srshr v25.4S, v25.4S, #18 // ................................*........... + str q24, [x1], #4*16 // .................................*.......... + sqdmulh v6.4S, v31.4S, v23.4S // ..................................*......... + mls v18.4S, v25.4S, v22.4S // ...................................*........ + bic v25.16B, v25.16B, v3.16B // ....................................*....... + ldr q24, [x2, #16] // .....................................*...... + add v18.4S, v18.4S, v3.4S // .......................................*.... 
+ str q25, [x0, #32] // ........................................*... + sqdmulh v0.4S, v24.4S, v23.4S // .........................................*.. + str q18, [x1, #-32] // ..........................................*. + ldr q18, [x2], #4*16 // ...........................................* - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q7, [x2], #4*16 // *.............................. - // sqdmulh v28.4S, v7.4S, v23.4S // ....*.......................... - // srshr v4.4S, v28.4S, #18 // .........*..................... - // cmgt v30.4S, v7.4S, v21.4S // ....*.......................... - // mls v7.4S, v4.4S, v22.4S // .............*................. - // bic v1.16B, v4.16B, v30.16B // ..............*................ - // add v24.4S, v7.4S, v30.4S // ..................*............ - // str q1, [x0], #4*16 // ................*.............. - // ldr q7, [x2], #4*16 // *.............................. - // ldr q25, [x2, #-80] // .*............................. - // ldr q29, [x2, #-112] // .*............................. - // sqdmulh v28.4S, v7.4S, v23.4S // ......*........................ - // sqdmulh v20.4S, v25.4S, v23.4S // ........*...................... - // cmgt v0.4S, v29.4S, v21.4S // .....*......................... - // sqdmulh v6.4S, v29.4S, v23.4S // ..........*.................... - // cmgt v27.4S, v25.4S, v21.4S // ......*........................ - // srshr v4.4S, v28.4S, #18 // ...........*................... - // srshr v26.4S, v20.4S, #18 // .............*................. - // str q24, [x1], #4*16 // ....................*.......... - // cmgt v30.4S, v7.4S, v21.4S // .......*....................... - // srshr v3.4S, v6.4S, #18 // ...............*............... - // mls v7.4S, v4.4S, v22.4S // ...............*............... - // mls v25.4S, v26.4S, v22.4S // .................*............. - // ldr q17, [x2, #-96] // ..................*............ 
- // bic v1.16B, v4.16B, v30.16B // ................*.............. - // mls v29.4S, v3.4S, v22.4S // ...................*........... + // ------------ cycle (expected) -------------> + // 0 25 + // |------------------------|------------------ + // ldr q31, [x2, #48] // *........................................... + // sqdmulh v6.4S, v31.4S, v23.4S // ....*....................................... + // ldr q24, [x2, #16] // ..*......................................... + // ldr q18, [x2], #4*16 // .........*.................................. + // sqdmulh v0.4S, v24.4S, v23.4S // ......*..................................... + // cmgt v16.4S, v24.4S, v21.4S // .......*.................................... + // srshr v4.4S, v6.4S, #18 // ........*................................... + // cmgt v6.4S, v31.4S, v21.4S // .....*...................................... + // sqdmulh v1.4S, v18.4S, v23.4S // .............*.............................. + // srshr v7.4S, v0.4S, #18 // ............*............................... + // srshr v19.4S, v1.4S, #18 // .................*.......................... + // cmgt v25.4S, v18.4S, v21.4S // ....................*....................... + // bic v28.16B, v19.16B, v25.16B // ......................*..................... + // ldr q2, [x2, #-32] // ........................*................... + // mls v31.4S, v4.4S, v22.4S // ...........*................................ + // mls v18.4S, v19.4S, v22.4S // .....................*...................... + // sqdmulh v1.4S, v2.4S, v23.4S // ............................*............... + // cmgt v29.4S, v2.4S, v21.4S // .............................*.............. + // add v20.4S, v31.4S, v6.4S // ................*........................... + // ldr q31, [x2, #48] // ..............................*............. + // srshr v17.4S, v1.4S, #18 // ................................*........... + // add v3.4S, v18.4S, v25.4S // ..........................*................. 
+ // bic v27.16B, v4.16B, v6.16B // ..............*............................. + // bic v26.16B, v17.16B, v29.16B // ....................................*....... + // str q26, [x0, #32] // ........................................*... + // mls v2.4S, v17.4S, v22.4S // ...................................*........ + // str q3, [x1], #4*16 // .................................*.......... + // bic v0.16B, v7.16B, v16.16B // ..................*......................... + // mls v24.4S, v7.4S, v22.4S // ...............*............................ + // str q0, [x0, #16] // .......................*.................... + // add v18.4S, v2.4S, v29.4S // .......................................*.... + // str q27, [x0, #48] // ...........................*................ + // sqdmulh v6.4S, v31.4S, v23.4S // ..................................*......... + // str q18, [x1, #-32] // ..........................................*. + // add v5.4S, v24.4S, v16.4S // ...................*........................ + // ldr q24, [x2, #16] // .....................................*...... + // ldr q18, [x2], #4*16 // ...........................................* + // sqdmulh v0.4S, v24.4S, v23.4S // .........................................*.. sub count, count, #2 poly_decompose_32_loop: - // Instructions: 36 - // Expected cycles: 19 - // Expected IPC: 1.89 - // - // Cycle bound: 19.0 - // IPC bound: 1.89 - // - // Wall time: 210.08s - // User time: 210.08s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - add v24.4S, v7.4S, v30.4S // *............................. - str q1, [x0], #4*16 // *............................. - bic v18.16B, v26.16B, v27.16B // .l............................ - ldr q7, [x2], #4*16 // .e............................ - sqdmulh v31.4S, v17.4S, v23.4S // ..l........................... - add v19.4S, v25.4S, v27.4S // ..l........................... - ldr q25, [x2, #-80] // ...*.......................... 
- bic v5.16B, v3.16B, v0.16B // ...l.......................... - add v26.4S, v29.4S, v0.4S // ....l......................... - ldr q29, [x2, #-112] // ....*......................... - sqdmulh v28.4S, v7.4S, v23.4S // .....e........................ - str q18, [x0, #-80] // .....l........................ - cmgt v2.4S, v17.4S, v21.4S // ......l....................... - str q19, [x1, #-16] // ......l....................... - srshr v18.4S, v31.4S, #18 // .......l...................... - sqdmulh v20.4S, v25.4S, v23.4S // .......*...................... - cmgt v0.4S, v29.4S, v21.4S // ........*..................... - str q5, [x0, #-112] // ........l..................... - sqdmulh v6.4S, v29.4S, v23.4S // .........*.................... - cmgt v27.4S, v25.4S, v21.4S // .........*.................... - srshr v4.4S, v28.4S, #18 // ..........e................... - str q26, [x1, #-48] // ..........l................... - mls v17.4S, v18.4S, v22.4S // ...........l.................. - bic v16.16B, v18.16B, v2.16B // ...........l.................. - srshr v26.4S, v20.4S, #18 // ............*................. - str q24, [x1], #4*16 // ............*................. - cmgt v30.4S, v7.4S, v21.4S // .............e................ - str q16, [x0, #-96] // .............l................ - srshr v3.4S, v6.4S, #18 // ..............*............... - mls v7.4S, v4.4S, v22.4S // ..............e............... - add v5.4S, v17.4S, v2.4S // ................l............. - mls v25.4S, v26.4S, v22.4S // ................*............. - ldr q17, [x2, #-96] // .................*............ - bic v1.16B, v4.16B, v30.16B // .................e............ - mls v29.4S, v3.4S, v22.4S // ..................*........... - str q5, [x1, #-96] // ..................l........... 
+ // Instructions: 36 + // Expected cycles: 40 + // Expected IPC: 0.90 + // + // Cycle bound: 40.0 + // IPC bound: 0.90 + // + // Wall time: 5.18s + // User time: 5.18s + // + // ---------- cycle (expected) -----------> + // 0 25 + // |------------------------|-------------- + cmgt v16.4S, v24.4S, v21.4S // *....................................... + srshr v4.4S, v6.4S, #18 // .*...................................... + cmgt v6.4S, v31.4S, v21.4S // ..*..................................... + sqdmulh v1.4S, v18.4S, v23.4S // ...*.................................... + str q5, [x1, #-48] // ....l................................... + srshr v7.4S, v0.4S, #18 // .....*.................................. + str q20, [x1, #-16] // ......l................................. + srshr v19.4S, v1.4S, #18 // .......*................................ + cmgt v25.4S, v18.4S, v21.4S // ........*............................... + str q28, [x0], #4*16 // .........l.............................. + bic v28.16B, v19.16B, v25.16B // ..........*............................. + ldr q2, [x2, #-32] // ...........*............................ + mls v31.4S, v4.4S, v22.4S // .............*.......................... + mls v18.4S, v19.4S, v22.4S // ..............*......................... + sqdmulh v1.4S, v2.4S, v23.4S // ...............*........................ + cmgt v29.4S, v2.4S, v21.4S // ................*....................... + add v20.4S, v31.4S, v6.4S // .................*...................... + ldr q31, [x2, #48] // ..................e..................... + srshr v17.4S, v1.4S, #18 // ....................*................... + add v3.4S, v18.4S, v25.4S // .....................*.................. + bic v27.16B, v4.16B, v6.16B // ......................*................. + bic v26.16B, v17.16B, v29.16B // .......................*................ + str q26, [x0, #32] // ........................*............... + mls v2.4S, v17.4S, v22.4S // .........................*.............. 
+ str q3, [x1], #4*16 // ..........................*............. + bic v0.16B, v7.16B, v16.16B // ...........................*............ + mls v24.4S, v7.4S, v22.4S // ............................*........... + str q0, [x0, #16] // .............................*.......... + add v18.4S, v2.4S, v29.4S // ..............................*......... + str q27, [x0, #48] // ...............................*........ + sqdmulh v6.4S, v31.4S, v23.4S // ................................e....... + str q18, [x1, #-32] // .................................*...... + add v5.4S, v24.4S, v16.4S // ..................................*..... + ldr q24, [x2, #16] // ...................................e.... + ldr q18, [x2], #4*16 // .....................................e.. + sqdmulh v0.4S, v24.4S, v23.4S // .......................................e - // ------------------ cycle (expected) -------------------> + // -------------------------- cycle (expected) ---------------------------> // 0 25 50 - // |------------------------|------------------------|----- - // ldr q1, [x2, #1*16] // ...~..............'...*..............'...~.............. - // ldr q2, [x2, #2*16] // ................~.'................*.'................~. - // ldr q3, [x2, #3*16] // ..~...............'..*...............'..~............... - // ldr q0, [x2], #4*16 // e.................'~.................'~................. - // sqdmulh v5.4s, v1.4s, v23.4s // ........~.........'........*.........'........~......... - // srshr v5.4s, v5.4s, #18 // .............~....'.............*....'.............~.... - // cmgt v24.4s, v1.4s, v21.4s // .......~..........'.......*..........'.......~.......... - // mls v1.4s, v5.4s, v22.4s // .................~'.................*'.................. - // bic v5.16b, v5.16b, v24.16b // ..~...............'..~...............'..l............... - // add v1.4s, v1.4s, v24.4s // ...~..............'...~..............'...l.............. 
- // sqdmulh v6.4s, v2.4s, v23.4s // .~................'.~................'.l................ - // srshr v6.4s, v6.4s, #18 // ......~...........'......~...........'......l........... - // cmgt v24.4s, v2.4s, v21.4s // .....~............'.....~............'.....l............ - // mls v2.4s, v6.4s, v22.4s // ..........~.......'..........~.......'..........l....... - // bic v6.16b, v6.16b, v24.16b // ..........~.......'..........~.......'..........l....... - // add v2.4s, v2.4s, v24.4s // ...............~..'...............~..'...............l.. - // sqdmulh v7.4s, v3.4s, v23.4s // ......~...........'......*...........'......~........... - // srshr v7.4s, v7.4s, #18 // ...........~......'...........*......'...........~...... - // cmgt v24.4s, v3.4s, v21.4s // ........~.........'........*.........'........~......... - // mls v3.4s, v7.4s, v22.4s // ...............~..'...............*..'...............~.. - // bic v7.16b, v7.16b, v24.16b // ~.................'~.................'l................. - // add v3.4s, v3.4s, v24.4s // .~................'.~................'.l................ - // sqdmulh v4.4s, v0.4s, v23.4s // ....e.............'....~.............'....~............. - // srshr v4.4s, v4.4s, #18 // .........e........'.........~........'.........~........ - // cmgt v24.4s, v0.4s, v21.4s // ............e.....'............~.....'............~..... - // mls v0.4s, v4.4s, v22.4s // .............e....'.............~....'.............~.... - // bic v4.16b, v4.16b, v24.16b // ................e.'................~.'................~. - // add v0.4s, v0.4s, v24.4s // ..................*..................~.................. - // str q5, [x0, #1*16] // .......~..........'.......~..........'.......l.......... - // str q6, [x0, #2*16] // ............~.....'............~.....'............l..... - // str q7, [x0, #3*16] // ....~.............'....~.............'....l............. - // str q4, [x0], #4*16 // ..................*..................~.................. 
- // str q1, [x1, #1*16] // .........~........'.........~........'.........l........ - // str q2, [x1, #2*16] // .................~'.................~'.................l - // str q3, [x1, #3*16] // .....~............'.....~............'.....l............ - // str q0, [x1], #4*16 // ...........~......'...........*......'...........~...... + // |------------------------|------------------------|--------------------- + // ldr q1, [x2, #1*16] // .................e....'..................................~....'......... + // ldr q2, [x2, #2*16] // ......................'..........*............................'......... + // ldr q3, [x2, #3*16] // e.....................'.................~.....................'......... + // ldr q0, [x2], #4*16 // ...................e..'....................................~..'......... + // sqdmulh v5.4s, v1.4s, v23.4s // .....................e'......................................~'......... + // srshr v5.4s, v5.4s, #18 // ......................'....*..................................'....~.... + // cmgt v24.4s, v1.4s, v21.4s // ......................*.......................................~......... + // mls v1.4s, v5.4s, v22.4s // ..........~...........'...........................*...........'......... + // bic v5.16b, v5.16b, v24.16b // .........~............'..........................*............'......... + // add v1.4s, v1.4s, v24.4s // ................~.....'.................................*.....'......... + // sqdmulh v6.4s, v2.4s, v23.4s // ......................'..............*........................'......... + // srshr v6.4s, v6.4s, #18 // ..~...................'...................*...................'......... + // cmgt v24.4s, v2.4s, v21.4s // ......................'...............*.......................'......... + // mls v2.4s, v6.4s, v22.4s // .......~..............'........................*..............'......... 
+ // bic v6.16b, v6.16b, v24.16b // .....~................'......................*................'......... + // add v2.4s, v2.4s, v24.4s // ............~.........'.............................*.........'......... + // sqdmulh v7.4s, v3.4s, v23.4s // ..............e.......'...............................~.......'......... + // srshr v7.4s, v7.4s, #18 // ......................'*......................................'~........ + // cmgt v24.4s, v3.4s, v21.4s // ......................'.*.....................................'.~....... + // mls v3.4s, v7.4s, v22.4s // ......................'............*..........................'......... + // bic v7.16b, v7.16b, v24.16b // ....~.................'.....................*.................'......... + // add v3.4s, v3.4s, v24.4s // ......................'................*......................'......... + // sqdmulh v4.4s, v0.4s, v23.4s // ......................'..*....................................'..~...... + // srshr v4.4s, v4.4s, #18 // ......................'......*................................'......~.. + // cmgt v24.4s, v0.4s, v21.4s // ......................'.......*...............................'.......~. + // mls v0.4s, v4.4s, v22.4s // ......................'.............*.........................'......... + // bic v4.16b, v4.16b, v24.16b // ......................'.........*.............................'......... + // add v0.4s, v0.4s, v24.4s // ...~..................'....................*..................'......... + // str q5, [x0, #1*16] // ...........~..........'............................*..........'......... + // str q6, [x0, #2*16] // ......~...............'.......................*...............'......... + // str q7, [x0, #3*16] // .............~........'..............................*........'......... 
+ // str q4, [x0], #4*16 // ......................'........~..............................'........l + // str q1, [x1, #1*16] // ......................'...~...................................'...l..... + // str q2, [x1, #2*16] // ...............~......'................................*......'......... + // str q3, [x1, #3*16] // ......................'.....~.................................'.....l... + // str q0, [x1], #4*16 // ........~.............'.........................*.............'......... subs count, count, #1 bne poly_decompose_32_loop - // Instructions: 46 - // Expected cycles: 25 - // Expected IPC: 1.84 + // Instructions: 34 + // Expected cycles: 36 + // Expected IPC: 0.94 // - // Cycle bound: 25.0 - // IPC bound: 1.84 + // Cycle bound: 36.0 + // IPC bound: 0.94 // - // Wall time: 0.51s - // User time: 0.51s + // Wall time: 6.58s + // User time: 6.58s // - // ----- cycle (expected) ------> + // -------- cycle (expected) ---------> // 0 25 - // |------------------------|---- - add v18.4S, v7.4S, v30.4S // *............................. - ldr q20, [x2, #-16] // *............................. - ldr q19, [x2, #-48] // .*............................ - sqdmulh v6.4S, v17.4S, v23.4S // .*............................ - str q18, [x1], #4*16 // ..*........................... - add v18.4S, v29.4S, v0.4S // ..*........................... - cmgt v4.4S, v17.4S, v21.4S // ...*.......................... - ldr q16, [x2, #-32] // ...*.......................... - sqdmulh v7.4S, v20.4S, v23.4S // ....*......................... - str q18, [x1, #-112] // ....*......................... - str q1, [x0], #4*16 // .....*........................ - cmgt v2.4S, v19.4S, v21.4S // .....*........................ - srshr v6.4S, v6.4S, #18 // ......*....................... - sqdmulh v28.4S, v19.4S, v23.4S // ......*....................... - bic v29.16B, v3.16B, v0.16B // .......*...................... - add v24.4S, v25.4S, v27.4S // ........*..................... 
- sqdmulh v30.4S, v16.4S, v23.4S // ........*..................... - srshr v1.4S, v7.4S, #18 // .........*.................... - str q29, [x0, #-112] // .........*.................... - mls v17.4S, v6.4S, v22.4S // ..........*................... - cmgt v7.4S, v20.4S, v21.4S // ..........*................... - srshr v5.4S, v28.4S, #18 // ...........*.................. - str q24, [x1, #-80] // ...........*.................. - cmgt v24.4S, v16.4S, v21.4S // ............*................. - mls v20.4S, v1.4S, v22.4S // .............*................ - srshr v31.4S, v30.4S, #18 // .............*................ - bic v1.16B, v1.16B, v7.16B // ..............*............... - mls v19.4S, v5.4S, v22.4S // ...............*.............. - add v0.4S, v17.4S, v4.4S // ...............*.............. - bic v17.16B, v26.16B, v27.16B // ................*............. - str q1, [x0, #-16] // ................*............. - mls v16.4S, v31.4S, v22.4S // .................*............ - bic v5.16B, v5.16B, v2.16B // .................*............ - add v25.4S, v20.4S, v7.4S // ..................*........... - str q17, [x0, #-80] // ..................*........... - bic v18.16B, v31.16B, v24.16B // ...................*.......... - str q0, [x1, #-96] // ...................*.......... - add v31.4S, v19.4S, v2.4S // ....................*......... - bic v2.16B, v6.16B, v4.16B // ....................*......... - str q5, [x0, #-48] // .....................*........ - str q25, [x1, #-16] // .....................*........ - add v20.4S, v16.4S, v24.4S // ......................*....... - str q2, [x0, #-96] // ......................*....... - str q18, [x0, #-32] // .......................*...... - str q31, [x1, #-48] // .......................*...... - str q20, [x1, #-32] // ........................*..... + // |------------------------|---------- + srshr v0.4S, v0.4S, #18 // *................................... + sqdmulh v25.4S, v18.4S, v23.4S // .*.................................. 
+ str q5, [x1, #-48] // ..*................................. + cmgt v17.4S, v18.4S, v21.4S // ...*................................ + str q20, [x1, #-16] // ....*............................... + srshr v16.4S, v25.4S, #18 // .....*.............................. + str q28, [x0], #4*16 // ......*............................. + srshr v28.4S, v6.4S, #18 // .......*............................ + mls v18.4S, v16.4S, v22.4S // ........*........................... + cmgt v3.4S, v31.4S, v21.4S // .........*.......................... + mls v31.4S, v28.4S, v22.4S // ..........*......................... + cmgt v30.4S, v24.4S, v21.4S // ...........*........................ + add v1.4S, v18.4S, v17.4S // ............*....................... + ldr q18, [x2, #-32] // .............*...................... + mls v24.4S, v0.4S, v22.4S // ...............*.................... + add v29.4S, v31.4S, v3.4S // ................*................... + sqdmulh v25.4S, v18.4S, v23.4S // .................*.................. + cmgt v6.4S, v18.4S, v21.4S // ..................*................. + bic v7.16B, v28.16B, v3.16B // ...................*................ + str q29, [x1, #48] // ....................*............... + srshr v31.4S, v25.4S, #18 // .....................*.............. + bic v25.16B, v16.16B, v17.16B // ......................*............. + str q25, [x0], #4*16 // .......................*............ + bic v25.16B, v31.16B, v6.16B // ........................*........... + str q25, [x0, #-32] // .........................*.......... + mls v18.4S, v31.4S, v22.4S // ..........................*......... + str q1, [x1], #4*16 // ...........................*........ + bic v25.16B, v0.16B, v30.16B // ............................*....... + str q25, [x0, #-48] // .............................*...... + add v25.4S, v18.4S, v6.4S // ..............................*..... + str q7, [x0, #-16] // ...............................*.... 
+ add v18.4S, v24.4S, v30.4S // ................................*... + str q25, [x1, #-32] // .................................*.. + str q18, [x1, #-48] // ...................................* - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // add v24.4S, v7.4S, v30.4S // *.............................. - // str q1, [x0], #4*16 // .....*......................... - // bic v18.16B, v26.16B, v27.16B // ................*.............. - // sqdmulh v31.4S, v17.4S, v23.4S // .*............................. - // add v19.4S, v25.4S, v27.4S // ........*...................... - // ldr q25, [x2, #-16] // *.............................. - // bic v5.16B, v3.16B, v0.16B // .......*....................... - // add v26.4S, v29.4S, v0.4S // ..*............................ - // ldr q29, [x2, #-48] // .*............................. - // str q18, [x0, #-80] // ..................*............ - // cmgt v2.4S, v17.4S, v21.4S // ...*........................... - // str q19, [x1, #-16] // ...........*................... - // srshr v18.4S, v31.4S, #18 // ......*........................ - // sqdmulh v20.4S, v25.4S, v23.4S // ....*.......................... - // cmgt v0.4S, v29.4S, v21.4S // .....*......................... - // str q5, [x0, #-112] // .........*..................... - // sqdmulh v6.4S, v29.4S, v23.4S // ......*........................ - // cmgt v27.4S, v25.4S, v21.4S // ..........*.................... - // str q26, [x1, #-48] // ....*.......................... - // mls v17.4S, v18.4S, v22.4S // ..........*.................... - // bic v16.16B, v18.16B, v2.16B // ....................*.......... - // srshr v26.4S, v20.4S, #18 // .........*..................... - // str q24, [x1], #4*16 // ..*............................ - // str q16, [x0, #-96] // ......................*........ - // srshr v3.4S, v6.4S, #18 // ...........*................... - // add v5.4S, v17.4S, v2.4S // ...............*............... 
- // mls v25.4S, v26.4S, v22.4S // .............*................. - // ldr q17, [x2, #-32] // ...*........................... - // mls v29.4S, v3.4S, v22.4S // ...............*............... - // str q5, [x1, #-96] // ...................*........... - // bic v18.16B, v26.16B, v27.16B // ..............*................ - // sqdmulh v31.4S, v17.4S, v23.4S // ........*...................... - // add v19.4S, v25.4S, v27.4S // ..................*............ - // bic v5.16B, v3.16B, v0.16B // .................*............. - // add v26.4S, v29.4S, v0.4S // ....................*.......... - // str q18, [x0, #-16] // ................*.............. - // cmgt v2.4S, v17.4S, v21.4S // ............*.................. - // str q19, [x1, #-16] // .....................*......... - // srshr v18.4S, v31.4S, #18 // .............*................. - // str q5, [x0, #-48] // .....................*......... - // str q26, [x1, #-48] // .......................*....... - // mls v17.4S, v18.4S, v22.4S // .................*............. - // bic v16.16B, v18.16B, v2.16B // ...................*........... - // str q16, [x0, #-32] // .......................*....... - // add v5.4S, v17.4S, v2.4S // ......................*........ - // str q5, [x1, #-32] // ........................*...... + // -------- cycle (expected) ---------> + // 0 25 + // |------------------------|---------- + // cmgt v16.4S, v24.4S, v21.4S // ...........*........................ + // srshr v4.4S, v6.4S, #18 // .......*............................ + // cmgt v6.4S, v31.4S, v21.4S // .........*.......................... + // sqdmulh v1.4S, v18.4S, v23.4S // .*.................................. + // str q5, [x1, #-48] // ..*................................. + // srshr v7.4S, v0.4S, #18 // *................................... + // str q20, [x1, #-16] // ....*............................... + // srshr v19.4S, v1.4S, #18 // .....*.............................. 
+ // cmgt v25.4S, v18.4S, v21.4S // ...*................................ + // str q28, [x0], #4*16 // ......*............................. + // bic v28.16B, v19.16B, v25.16B // ......................*............. + // ldr q2, [x2, #-32] // .............*...................... + // mls v31.4S, v4.4S, v22.4S // ..........*......................... + // mls v18.4S, v19.4S, v22.4S // ........*........................... + // sqdmulh v1.4S, v2.4S, v23.4S // .................*.................. + // cmgt v29.4S, v2.4S, v21.4S // ..................*................. + // add v20.4S, v31.4S, v6.4S // ................*................... + // srshr v17.4S, v1.4S, #18 // .....................*.............. + // add v3.4S, v18.4S, v25.4S // ............*....................... + // bic v27.16B, v4.16B, v6.16B // ...................*................ + // bic v26.16B, v17.16B, v29.16B // ........................*........... + // str q26, [x0, #32] // .........................*.......... + // mls v2.4S, v17.4S, v22.4S // ..........................*......... + // str q3, [x1], #4*16 // ...........................*........ + // bic v0.16B, v7.16B, v16.16B // ............................*....... + // mls v24.4S, v7.4S, v22.4S // ...............*.................... + // str q0, [x0, #16] // .............................*...... + // add v18.4S, v2.4S, v29.4S // ..............................*..... + // str q27, [x0, #48] // ...............................*.... + // str q18, [x1, #-32] // .................................*.. + // add v5.4S, v24.4S, v16.4S // ................................*... + // str q5, [x1, #-48] // ...................................* + // str q20, [x1, #-16] // ....................*............... + // str q28, [x0], #4*16 // .......................*............ ret