Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions dev/aarch64_clean/src/poly_decompose_32_asm.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)

// decompose32 a1, a, temp
//
// Decomposes the four 32-bit lanes of vector \a into a high part, written to
// \a1, and a remainder a0, written back into \a itself (a is aliased with
// a0, so the input value is destroyed). \temp is a scratch vector that is
// overwritten with the per-lane wrap-around mask.
//
// Besides its arguments, the macro reads the vector registers named
// barrett_const, q_bound and gamma2_2x; those names must be bound (.req)
// and hold the splatted constants wherever the macro is expanded.
.macro decompose32 a1, a, temp
// Compute a1 = round-(a / 523776) ≈ round(a * 1074791425 /
// 2^49), where round-() denotes "round half down". This is
// exact for 0 <= a < Q. Note that half is rounded down since
// 1074791425 / 2^49 < 1 / 523776
// (indeed 523776 * 1074791425 = 2^49 - 2^9).
//
// sqdmulh yields the high 32 bits of the doubled product, i.e.
// (2 * a * 1074791425) >> 32 = (a * 1074791425) >> 31; the
// rounding right shift by 18 completes the division by 2^49.
sqdmulh \a1\().4s, \a\().4s, barrett_const.4s
srshr \a1\().4s, \a1\().4s, #18

// If a1 = 16, i.e. a > 31*GAMMA2, proceed as if a' = a - Q was
// given instead. (For a = 31*GAMMA2 + 1 thus a' = -GAMMA2, we
// still round it to 0 like other "wrapped around" cases.)

// Check for wrap-around: each lane of temp becomes all-ones (-1)
// where a > 31*GAMMA2 and 0 elsewhere. This must read \a before
// the mls below overwrites it with the remainder.
cmgt \temp\().4s, \a\().4s, q_bound.4s

// Compute remainder a0 = a - a1 * (2*GAMMA2), in place in \a
mls \a\().4s, \a1\().4s, gamma2_2x.4s

// If wrap-around is required, set a1 = 0 and a0 -= 1
// (bic clears a1 in the masked lanes; adding the mask adds -1
// in exactly those lanes and 0 in all others).
bic \a1\().16b, \a1\().16b, \temp\().16b
add \a\().4s, \a\().4s, \temp\().4s
.endm

/* Parameters */
        a1_ptr          .req x0   // Output polynomial: high parts a1
        a0_ptr          .req x1   // Output polynomial: remainders a0
        a_ptr           .req x2   // Input polynomial

        count           .req x3   // Remaining loop iterations

/* Constant register assignments (read by the decompose32 macro) */
        q_bound         .req v21  // 31*GAMMA2 = 8118528
        gamma2_2x       .req v22  // 2*GAMMA2 = 523776
        barrett_const   .req v23  // Barrett constant = 1074791425

/*
 * poly_decompose_32_asm
 *
 * Applies the decompose32 macro to 256 32-bit coefficients: 16 loop
 * iterations, each handling 4 vectors of 4 lanes (64 bytes).
 *
 * In:    x0 = a1 output buffer, x1 = a0 output buffer, x2 = input buffer
 *        (1024 bytes each). The macro's rounding is documented as exact
 *        for coefficients in 0 <= a < Q.
 * Out:   a1 values stored through x0, a0 values stored through x1.
 * Clobb: x0-x2 (each advanced by 1024), x3, x5, x7, x11,
 *        v0-v7, v21-v24, NZCV flags.
 *        No callee-saved registers are touched and the stack is not used.
 *
 * The loop label poly_decompose_32_loop is referenced by name from the
 * SLOTHY invocation in the Makefile (-l option) and must not be renamed.
 */
        .text
        .global MLD_ASM_NAMESPACE(poly_decompose_32_asm)
        .balign 4
MLD_ASM_FN_SYMBOL(poly_decompose_32_asm)
        // Load constants into SIMD registers.
        // Each 32-bit value is assembled from two 16-bit halves and then
        // broadcast to all four lanes.

        // q_bound = 0x007be100 = 8118528 = 31*GAMMA2
        movz    w5, #0xe100
        movk    w5, #0x7b, lsl #16
        dup     q_bound.4s, w5

        // gamma2_2x = 0x0007fe00 = 523776 = 2*GAMMA2
        movz    w7, #0xfe00
        movk    w7, #7, lsl #16
        dup     gamma2_2x.4s, w7

        // barrett_const = 0x40100401 = 1074791425
        movz    w11, #0x0401
        movk    w11, #0x4010, lsl #16
        dup     barrett_const.4s, w11

        // 64 vectors in total, 4 vectors per iteration
        mov     count, #(64/4)

poly_decompose_32_loop:
        // Load 4 vectors; the last load post-increments a_ptr by 64 bytes,
        // so the first three use offsets relative to the old pointer.
        ldr     q1, [a_ptr, #1*16]
        ldr     q2, [a_ptr, #2*16]
        ldr     q3, [a_ptr, #3*16]
        ldr     q0, [a_ptr], #4*16

        // High parts land in v4-v7; remainders replace the inputs in v0-v3.
        // v24 is the shared scratch register for the wrap-around mask.
        decompose32 v5, v1, v24
        decompose32 v6, v2, v24
        decompose32 v7, v3, v24
        decompose32 v4, v0, v24

        // Store high parts, advancing a1_ptr with the final store
        str     q5, [a1_ptr, #1*16]
        str     q6, [a1_ptr, #2*16]
        str     q7, [a1_ptr, #3*16]
        str     q4, [a1_ptr], #4*16

        // Store remainders, advancing a0_ptr with the final store
        str     q1, [a0_ptr, #1*16]
        str     q2, [a0_ptr, #2*16]
        str     q3, [a0_ptr, #3*16]
        str     q0, [a0_ptr], #4*16

        subs    count, count, #1
        bne     poly_decompose_32_loop

        ret

        .unreq a1_ptr
        .unreq a0_ptr
        .unreq a_ptr
        .unreq count
        .unreq q_bound
        .unreq gamma2_2x
        .unreq barrett_const

#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
42 changes: 42 additions & 0 deletions mldsa/native/aarch64/src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) The mldsa-native project authors
# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT

######
# To run, see the README.md file
######
.PHONY: all clean

# ISA to optimize for
TARGET_ISA=Arm_AArch64

# MicroArch target to optimize for
TARGET_MICROARCH=Arm_Cortex_A55

# Additional SLOTHY options can be supplied from the command line or the
# environment; they are appended to SLOTHY_FLAGS.
SLOTHY_EXTRA_FLAGS ?=

SLOTHY_FLAGS=-c sw_pipelining.enabled=true \
             -c inputs_are_outputs \
             -c sw_pipelining.minimize_overlapping=False \
             -c sw_pipelining.allow_post \
             -c variable_size \
             -c constraints.stalls_first_attempt=64 \
             $(SLOTHY_EXTRA_FLAGS)

# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30.
# Allow SLOTHY to use all V-registers, but only caller-saved GPRs.
RESERVE_X_ONLY_FLAG=-c reserved_regs="[x18--x30,sp]"

# Used for kernels which don't stash callee-saved registers.
# Restrict SLOTHY to caller-saved registers.
RESERVE_ALL_FLAG=-c reserved_regs="[x18--x30,sp,v8--v15]"

all: poly_decompose_32_asm.S

# This kernel does not save or restore any callee-saved registers, so SLOTHY
# must be restricted to caller-saved registers (RESERVE_ALL_FLAG).
# The -l option names the loop label in the input file that SLOTHY optimizes.
poly_decompose_32_asm.S: ../../../../dev/aarch64_clean/src/poly_decompose_32_asm.S
	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_decompose_32_loop $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG)


clean:
	-$(RM) -rf poly_decompose_32_asm.S
Loading