diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 3ca4a7e62..3e50cc892 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -218,6 +218,34 @@ jobs: - name: make lib run: | make lib + simpasm: + strategy: + fail-fast: false + matrix: + backend: + - arg: '--aarch64-clean' + name: Clean + # TODO: add backend option after we have optimized/clean separation + # - arg: '' + # name: Optimized + simplify: + - arg: '' + name: Simplified + - arg: '--no-simplify' + name: Unmodified + runs-on: pqcp-arm64 + name: AArch64 dev backend (${{ matrix.simplify.name }}) + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - name: Reinstate and test backend + uses: ./.github/actions/setup-shell + with: + nix-shell: 'ci' + gh_token: ${{ secrets.GITHUB_TOKEN }} + script: | + ./scripts/autogen ${{ matrix.simplify.arg }} + make clean + OPT=1 make quickcheck scan-build: strategy: fail-fast: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 48c2d5f7d..030ad3821 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -461,7 +461,8 @@ jobs: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - uses: ./.github/actions/setup-shell with: - nix-shell: 'ci' + nix-shell: 'ci-cross' # Need cross-compiler for ASM simplification + nix-cache: 'true' gh_token: ${{ secrets.GITHUB_TOKEN }} script: | - python3 ./scripts/autogen --dry-run + python3 ./scripts/autogen --dry-run --force-cross diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index e3eed9e52..0bfa7f1c1 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -72,6 +72,9 @@ source code and documentation. - Matthias J. Kannwischer * URL: https://eprint.iacr.org/2022/1243 * Referenced from: + - [dev/fips202/aarch64/auto.h](dev/fips202/aarch64/auto.h) + - [dev/fips202/aarch64/src/keccak_f1600_x1_v84a_asm.S](dev/fips202/aarch64/src/keccak_f1600_x1_v84a_asm.S) + - [dev/fips202/aarch64/src/keccak_f1600_x2_v84a_asm.S](dev/fips202/aarch64/src/keccak_f1600_x2_v84a_asm.S) - [mldsa/fips202/native/aarch64/auto.h](mldsa/fips202/native/aarch64/auto.h) - [mldsa/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S](mldsa/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S) - [mldsa/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S](mldsa/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S) @@ -145,6 +148,27 @@ source code and documentation. 
- Damien Stehlé * URL: https://github.com/pq-crystals/dilithium/tree/master/avx2 * Referenced from: + - [dev/x86_64/src/align.h](dev/x86_64/src/align.h) + - [dev/x86_64/src/consts.c](dev/x86_64/src/consts.c) + - [dev/x86_64/src/consts.h](dev/x86_64/src/consts.h) + - [dev/x86_64/src/intt.S](dev/x86_64/src/intt.S) + - [dev/x86_64/src/ntt.S](dev/x86_64/src/ntt.S) + - [dev/x86_64/src/nttunpack.S](dev/x86_64/src/nttunpack.S) + - [dev/x86_64/src/pointwise.S](dev/x86_64/src/pointwise.S) + - [dev/x86_64/src/pointwise_acc_l4.S](dev/x86_64/src/pointwise_acc_l4.S) + - [dev/x86_64/src/pointwise_acc_l5.S](dev/x86_64/src/pointwise_acc_l5.S) + - [dev/x86_64/src/pointwise_acc_l7.S](dev/x86_64/src/pointwise_acc_l7.S) + - [dev/x86_64/src/poly_caddq_avx2.c](dev/x86_64/src/poly_caddq_avx2.c) + - [dev/x86_64/src/poly_chknorm_avx2.c](dev/x86_64/src/poly_chknorm_avx2.c) + - [dev/x86_64/src/poly_decompose_32_avx2.c](dev/x86_64/src/poly_decompose_32_avx2.c) + - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c) + - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c) + - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c) + - [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c) + - [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c) + - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c) + - [dev/x86_64/src/rej_uniform_eta2_avx2.c](dev/x86_64/src/rej_uniform_eta2_avx2.c) + - [dev/x86_64/src/rej_uniform_eta4_avx2.c](dev/x86_64/src/rej_uniform_eta4_avx2.c) - [mldsa/native/x86_64/src/align.h](mldsa/native/x86_64/src/align.h) - [mldsa/native/x86_64/src/consts.c](mldsa/native/x86_64/src/consts.c) - [mldsa/native/x86_64/src/consts.h](mldsa/native/x86_64/src/consts.h) diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h new file mode 100644 index 000000000..d2375644f --- /dev/null +++ b/dev/aarch64_clean/meta.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_AARCH64_META_H +#define MLD_NATIVE_AARCH64_META_H + +/* Set of primitives that this backend replaces */ +#define MLD_USE_NATIVE_NTT +#define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_REJ_UNIFORM +#define MLD_USE_NATIVE_REJ_UNIFORM_ETA2 +#define MLD_USE_NATIVE_REJ_UNIFORM_ETA4 +#define MLD_USE_NATIVE_POLY_DECOMPOSE_32 +#define MLD_USE_NATIVE_POLY_DECOMPOSE_88 +#define MLD_USE_NATIVE_POLY_CADDQ +#define MLD_USE_NATIVE_POLY_USE_HINT_32 +#define MLD_USE_NATIVE_POLY_USE_HINT_88 +#define MLD_USE_NATIVE_POLY_CHKNORM +#define MLD_USE_NATIVE_POLYZ_UNPACK_17 +#define MLD_USE_NATIVE_POLYZ_UNPACK_19 +#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLD_ARITH_BACKEND_AARCH64 + + +#if !defined(__ASSEMBLER__) +#include "src/arith_native_aarch64.h" + +static MLD_INLINE void mld_ntt_native(int32_t data[MLDSA_N]) +{ + mld_ntt_asm(data, mld_aarch64_ntt_zetas_layer123456, + mld_aarch64_ntt_zetas_layer78); +} + +static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N]) +{ + mld_intt_asm(data, mld_aarch64_intt_zetas_layer78, + mld_aarch64_intt_zetas_layer123456); +} + +static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +{ + if (len != MLDSA_N || buflen % 24 != 0) + { + return -1; + } + + /* Safety: outlen is at most MLDSA_N, hence, this cast is safe. */ + return (int)mld_rej_uniform_asm(r, buf, buflen, mld_rej_uniform_table); +} + +static MLD_INLINE int mld_rej_uniform_eta2_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +{ + unsigned int outlen; + /* AArch64 implementation assumes specific buffer lengths */ + if (len != MLDSA_N || buflen != MLD_AARCH64_REJ_UNIFORM_ETA2_BUFLEN) + { + return -1; + } + /* Constant time: Inputs and outputs to this function are secret. + * It is safe to leak which coefficients are accepted/rejected. + * The assembly implementation must not leak any other information about the + * accepted coefficients. Constant-time testing cannot cover this, and we + * hence have to manually verify the assembly. + * We declassify the input data beforehand and mark the outputs as secret. + */ + MLD_CT_TESTING_DECLASSIFY(buf, buflen); + outlen = mld_rej_uniform_eta2_asm(r, buf, buflen, mld_rej_uniform_eta_table); + MLD_CT_TESTING_SECRET(r, sizeof(int32_t) * outlen); + /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. */ + return (int)outlen; +} + +static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +{ + unsigned int outlen; + /* AArch64 implementation assumes specific buffer lengths */ + if (len != MLDSA_N || buflen != MLD_AARCH64_REJ_UNIFORM_ETA4_BUFLEN) + { + return -1; + } + /* Constant time: Inputs and outputs to this function are secret. + * It is safe to leak which coefficients are accepted/rejected. + * The assembly implementation must not leak any other information about the + * accepted coefficients. Constant-time testing cannot cover this, and we + * hence have to manually verify the assembly. + * We declassify the input data beforehand and mark the outputs as secret. + */ + MLD_CT_TESTING_DECLASSIFY(buf, buflen); + outlen = mld_rej_uniform_eta4_asm(r, buf, buflen, mld_rej_uniform_eta_table); + MLD_CT_TESTING_SECRET(r, sizeof(int32_t) * outlen); + /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. 
*/ + return (int)outlen; +} + +static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, + const int32_t *a) +{ + mld_poly_decompose_32_asm(a1, a0, a); +} + +static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, + const int32_t *a) +{ + mld_poly_decompose_88_asm(a1, a0, a); +} + +static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N]) +{ + mld_poly_caddq_asm(a); +} + +static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a, + const int32_t *h) +{ + mld_poly_use_hint_32_asm(b, a, h); +} + +static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a, + const int32_t *h) +{ + mld_poly_use_hint_88_asm(b, a, h); +} + +static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B) +{ + return mld_poly_chknorm_asm(a, B); +} + +static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, + const uint8_t *buf) +{ + mld_polyz_unpack_17_asm(r, buf, mld_polyz_unpack_17_indices); +} + +static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, + const uint8_t *buf) +{ + mld_polyz_unpack_19_asm(r, buf, mld_polyz_unpack_19_indices); +} + +static MLD_INLINE void mld_poly_pointwise_montgomery_native( + int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], + const int32_t in1[MLDSA_N]) +{ + mld_poly_pointwise_montgomery_asm(out, in0, in1); +} + +static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native( + int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N], + const int32_t v[4][MLDSA_N]) +{ + mld_polyvecl_pointwise_acc_montgomery_l4_asm(w, (const int32_t *)u, + (const int32_t *)v); +} + +static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native( + int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N], + const int32_t v[5][MLDSA_N]) +{ + mld_polyvecl_pointwise_acc_montgomery_l5_asm(w, (const int32_t *)u, + (const int32_t *)v); +} + +static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native( + int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N], + const int32_t v[7][MLDSA_N]) +{ + mld_polyvecl_pointwise_acc_montgomery_l7_asm(w, (const int32_t *)u, + (const int32_t *)v); +} + +#endif /* !__ASSEMBLER__ */ +#endif /* !MLD_NATIVE_AARCH64_META_H */ diff --git a/dev/aarch64_clean/src/aarch64_zetas.c b/dev/aarch64_clean/src/aarch64_zetas.c new file mode 100644 index 000000000..ed51b2b3b --- /dev/null +++ b/dev/aarch64_clean/src/aarch64_zetas.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) + +#include +#include "arith_native_aarch64.h" + +/* + * Table of zeta values used in the AArch64 forward NTT + * See autogen for details. 
+ */ +MLD_ALIGN const int32_t mld_aarch64_ntt_zetas_layer123456[] = { + -3572223, -915382907, 3765607, 964937599, 3761513, 963888510, + -3201494, -820383522, -2883726, -738955404, -3145678, -806080660, + -3201430, -820367122, 0, 0, -601683, -154181397, + -3370349, -863652652, -4063053, -1041158200, 3602218, 923069133, + 3182878, 815613168, 2740543, 702264730, -3586446, -919027554, + 0, 0, 3542485, 907762539, 2663378, 682491182, + -1674615, -429120452, -3110818, -797147778, 2101410, 538486762, + 3704823, 949361686, 1159875, 297218217, 0, 0, + 2682288, 687336873, -3524442, -903139016, -434125, -111244624, + 394148, 101000509, 928749, 237992130, 1095468, 280713909, + -3506380, -898510625, 0, 0, 2129892, 545785280, + 676590, 173376332, -1335936, -342333886, 2071829, 530906624, + -4018989, -1029866791, 3241972, 830756018, 2156050, 552488273, + 0, 0, 3764867, 964747974, -3227876, -827143915, + 1714295, 439288460, 3415069, 875112161, 1759347, 450833045, + -817536, -209493775, -3574466, -915957677, 0, 0, + -1005239, -257592709, 2453983, 628833668, 1460718, 374309300, + 3756790, 962678241, -1935799, -496048908, -1716988, -439978542, + -3950053, -1012201926, 0, 0, 557458, 142848732, + -642628, -164673562, -3585098, -918682129, -2897314, -742437332, + 3192354, 818041395, 556856, 142694469, 3870317, 991769559, + 0, 0, -1221177, -312926867, 2815639, 721508096, + 2283733, 585207070, 2917338, 747568486, 1853806, 475038184, + 3345963, 857403734, 1858416, 476219497, 0, 0, +}; + +MLD_ALIGN const int32_t mld_aarch64_ntt_zetas_layer78[] = { + 3073009, 1277625, -2635473, 3852015, 787459213, + 327391679, -675340520, 987079667, 1753, -2659525, + 2660408, -59148, 449207, -681503850, 681730119, + -15156688, -1935420, -1455890, -1780227, 2772600, + -495951789, -373072124, -456183549, 710479343, 4183372, + -3222807, -3121440, -274060, 1071989969, -825844983, + -799869667, -70227934, 1182243, 636927, -3956745, + -3284915, 302950022, 163212680, -1013916752, -841760171, + 87208, -3965306, -2296397, -3716946, 22347069, + -1016110510, -588452222, -952468207, 2508980, 2028118, + 1937570, -3815725, 642926661, 519705671, 496502727, + -977780347, -27812, 1009365, -1979497, -3956944, + -7126831, 258649997, -507246529, -1013967746, 822541, + -2454145, 1596822, -3759465, 210776307, -628875181, + 409185979, -963363710, 2811291, -2983781, -1109516, + 4158088, 720393920, -764594519, -284313712, 1065510939, + -1685153, 2678278, -3551006, -250446, -431820817, + 686309310, -909946047, -64176841, -3410568, -3768948, + 635956, -2455377, -873958779, -965793731, 162963861, + -629190881, 1528066, 482649, 1148858, -2962264, + 391567239, 123678909, 294395108, -759080783, -4146264, + 2192938, 2387513, -268456, -1062481036, 561940831, + 611800717, -68791907, -1772588, -1727088, -3611750, + -3180456, -454226054, -442566669, -925511710, -814992530, + -565603, 169688, 2462444, -3334383, -144935890, + 43482586, 631001801, -854436357, 3747250, 1239911, + 3195676, 1254190, 960233614, 317727459, 818892658, + 321386456, 2296099, -3838479, 2642980, -12417, + 588375860, -983611064, 677264190, -3181859, -4166425, + -3488383, 1987814, -3197248, -1067647297, -893898890, + 509377762, -819295484, 2998219, -89301, -1354892, + -1310261, 768294260, -22883400, -347191365, -335754661, + 141835, 2513018, 613238, -2218467, 36345249, + 643961400, 157142369, -568482643, 1736313, 235407, + -3250154, 3258457, 444930577, 60323094, -832852657, + 834980303, -458740, 4040196, 2039144, -818761, + -117552223, 1035301089, 522531086, -209807681, -1921994, + -3472069, 
-1879878, -2178965, -492511373, -889718424, + -481719139, -558360247, -2579253, 1787943, -2391089, + -2254727, -660934133, 458160776, -612717067, -577774276, + -1623354, -2374402, 586241, 527981, -415984810, + -608441020, 150224382, 135295244, 2105286, -2033807, + -1179613, -2743411, 539479988, -521163479, -302276083, + -702999655, 3482206, -4182915, -1300016, -2362063, + 892316032, -1071872863, -333129378, -605279149, -1476985, + 2491325, 507927, -724804, -378477722, 638402564, + 130156402, -185731180, 1994046, -1393159, -1187885, + -1834526, 510974714, -356997292, -304395785, -470097680, + -1317678, 2461387, 3035980, 621164, -337655269, + 630730945, 777970524, 159173408, -3033742, 2647994, + -2612853, 749577, -777397036, 678549029, -669544140, + 192079267, -338420, 3009748, 4148469, -4022750, + -86720197, 771248568, 1063046068, -1030830548, 3901472, + -1226661, 2925816, 3374250, 999753034, -314332144, + 749740976, 864652284, 3980599, -1615530, 1665318, + 1163598, 1020029345, -413979908, 426738094, 298172236, + 2569011, 1723229, 2028038, -3369273, 658309618, + 441577800, 519685171, -863376927, 1356448, -2775755, + 2683270, -2778788, 347590090, -711287812, 687588511, + -712065019, 3994671, -1370517, 3363542, 545376, + 1023635298, -351195274, 861908357, 139752717, -11879, + 3020393, 214880, -770441, -3043996, 773976352, + 55063046, -197425671, -3467665, 2312838, -653275, + -459163, -888589898, 592665232, -167401858, -117660617, + 3105558, 508145, 860144, 140244, 795799901, + 130212265, 220412084, 35937555, -1103344, -553718, + 3430436, -1514152, -282732136, -141890356, 879049958, + -388001774, 348812, -327848, 1011223, -2354215, + 89383150, -84011120, 259126110, -603268097, -2185084, + 2358373, -3014420, 2926054, -559928242, 604333585, + -772445769, 749801963, 3123762, -2193087, -1716814, + -392707, 800464680, -561979013, -439933955, -100631253, + -3818627, -1922253, -2236726, 1744507, -978523985, + -492577742, -573161516, 447030292, -303005, -3974485, + 1900052, 1054478, -77645096, -1018462631, 486888731, + 270210213, 3531229, -3773731, -781875, -731434, + 904878186, -967019376, -200355636, -187430119, +}; + +MLD_ALIGN const int32_t mld_aarch64_intt_zetas_layer78[] = { + -1744507, 2236726, 1922253, 3818627, -447030292, 573161516, + 492577742, 978523985, 731434, 781875, 3773731, -3531229, + 187430119, 200355636, 967019376, -904878186, -1054478, -1900052, + 3974485, 303005, -270210213, -486888731, 1018462631, 77645096, + 2354215, -1011223, 327848, -348812, 603268097, -259126110, + 84011120, -89383150, 392707, 1716814, 2193087, -3123762, + 100631253, 439933955, 561979013, -800464680, -2926054, 3014420, + -2358373, 2185084, -749801963, 772445769, -604333585, 559928242, + 459163, 653275, -2312838, 3467665, 117660617, 167401858, + -592665232, 888589898, 1514152, -3430436, 553718, 1103344, + 388001774, -879049958, 141890356, 282732136, -140244, -860144, + -508145, -3105558, -35937555, -220412084, -130212265, -795799901, + 2778788, -2683270, 2775755, -1356448, 712065019, -687588511, + 711287812, -347590090, 770441, -214880, -3020393, 11879, + 197425671, -55063046, -773976352, 3043996, -545376, -3363542, + 1370517, -3994671, -139752717, -861908357, 351195274, -1023635298, + -3374250, -2925816, 1226661, -3901472, -864652284, -749740976, + 314332144, -999753034, 3369273, -2028038, -1723229, -2569011, + 863376927, -519685171, -441577800, -658309618, -1163598, -1665318, + 1615530, -3980599, -298172236, -426738094, 413979908, -1020029345, + -621164, -3035980, -2461387, 1317678, -159173408, -777970524, + 
-630730945, 337655269, 4022750, -4148469, -3009748, 338420, + 1030830548, -1063046068, -771248568, 86720197, -749577, 2612853, + -2647994, 3033742, -192079267, 669544140, -678549029, 777397036, + 2362063, 1300016, 4182915, -3482206, 605279149, 333129378, + 1071872863, -892316032, 1834526, 1187885, 1393159, -1994046, + 470097680, 304395785, 356997292, -510974714, 724804, -507927, + -2491325, 1476985, 185731180, -130156402, -638402564, 378477722, + 2254727, 2391089, -1787943, 2579253, 577774276, 612717067, + -458160776, 660934133, 2743411, 1179613, 2033807, -2105286, + 702999655, 302276083, 521163479, -539479988, -527981, -586241, + 2374402, 1623354, -135295244, -150224382, 608441020, 415984810, + -3258457, 3250154, -235407, -1736313, -834980303, 832852657, + -60323094, -444930577, 2178965, 1879878, 3472069, 1921994, + 558360247, 481719139, 889718424, 492511373, 818761, -2039144, + -4040196, 458740, 209807681, -522531086, -1035301089, 117552223, + 3197248, -1987814, 3488383, 4166425, 819295484, -509377762, + 893898890, 1067647297, 2218467, -613238, -2513018, -141835, + 568482643, -157142369, -643961400, -36345249, 1310261, 1354892, + 89301, -2998219, 335754661, 347191365, 22883400, -768294260, + 3334383, -2462444, -169688, 565603, 854436357, -631001801, + -43482586, 144935890, 12417, -2642980, 3838479, -2296099, + 3181859, -677264190, 983611064, -588375860, -1254190, -3195676, + -1239911, -3747250, -321386456, -818892658, -317727459, -960233614, + 2962264, -1148858, -482649, -1528066, 759080783, -294395108, + -123678909, -391567239, 3180456, 3611750, 1727088, 1772588, + 814992530, 925511710, 442566669, 454226054, 268456, -2387513, + -2192938, 4146264, 68791907, -611800717, -561940831, 1062481036, + -4158088, 1109516, 2983781, -2811291, -1065510939, 284313712, + 764594519, -720393920, 2455377, -635956, 3768948, 3410568, + 629190881, -162963861, 965793731, 873958779, 250446, 3551006, + -2678278, 1685153, 64176841, 909946047, -686309310, 431820817, + 3815725, -1937570, -2028118, -2508980, 977780347, -496502727, + -519705671, -642926661, 3759465, -1596822, 2454145, -822541, + 963363710, -409185979, 628875181, -210776307, 3956944, 1979497, + -1009365, 27812, 1013967746, 507246529, -258649997, 7126831, + 274060, 3121440, 3222807, -4183372, 70227934, 799869667, + 825844983, -1071989969, 3716946, 2296397, 3965306, -87208, + 952468207, 588452222, 1016110510, -22347069, 3284915, 3956745, + -636927, -1182243, 841760171, 1013916752, -163212680, -302950022, + -3852015, 2635473, -1277625, -3073009, -987079667, 675340520, + -327391679, -787459213, -2772600, 1780227, 1455890, 1935420, + -710479343, 456183549, 373072124, 495951789, 59148, -2660408, + 2659525, -1753, 15156688, -681730119, 681503850, -449207, +}; + +MLD_ALIGN const int32_t mld_aarch64_intt_zetas_layer123456[] = { + -2283733, -585207070, -1858416, -476219497, -3345963, -857403734, + -2815639, -721508096, -1853806, -475038184, -2917338, -747568486, + 3585098, 918682129, -3870317, -991769559, -556856, -142694469, + 642628, 164673562, -3192354, -818041395, 2897314, 742437332, + -1460718, -374309300, 3950053, 1012201926, 1716988, 439978542, + -2453983, -628833668, 1935799, 496048908, -3756790, -962678241, + -1714295, -439288460, 3574466, 915957677, 817536, 209493775, + 3227876, 827143915, -1759347, -450833045, -3415069, -875112161, + 1335936, 342333886, -2156050, -552488273, -3241972, -830756018, + -676590, -173376332, 4018989, 1029866791, -2071829, -530906624, + 434125, 111244624, 3506380, 898510625, -1095468, -280713909, + 3524442, 903139016, 
-928749, -237992130, -394148, -101000509, + 1674615, 429120452, -1159875, -297218217, -3704823, -949361686, + -2663378, -682491182, -2101410, -538486762, 3110818, 797147778, + 4063053, 1041158200, 3586446, 919027554, -2740543, -702264730, + 3370349, 863652652, -3182878, -815613168, -3602218, -923069133, + -294725, -75523344, -3761513, -963888510, -3765607, -964937599, + 3201430, 820367122, 3145678, 806080660, 2883726, 738955404, + 3201494, 820383522, 1221177, 312926867, -557458, -142848732, + 1005239, 257592709, -3764867, -964747974, -2129892, -545785280, + -2682288, -687336873, -3542485, -907762539, 601683, 154181397, + 0, 0, +}; + +#else /* MLD_ARITH_BACKEND_AARCH64 */ + +MLD_EMPTY_CU(aarch64_zetas) + +#endif /* !MLD_ARITH_BACKEND_AARCH64 */ diff --git a/dev/aarch64_clean/src/arith_native_aarch64.h b/dev/aarch64_clean/src/arith_native_aarch64.h new file mode 100644 index 000000000..c55a205c9 --- /dev/null +++ b/dev/aarch64_clean/src/arith_native_aarch64.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H +#define MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H + +#include +#include "../../../common.h" + +#define mld_aarch64_ntt_zetas_layer123456 \ + MLD_NAMESPACE(aarch64_ntt_zetas_layer123456) +#define mld_aarch64_ntt_zetas_layer78 MLD_NAMESPACE(aarch64_ntt_zetas_layer78) + +#define mld_aarch64_intt_zetas_layer78 MLD_NAMESPACE(aarch64_intt_zetas_layer78) +#define mld_aarch64_intt_zetas_layer123456 \ + MLD_NAMESPACE(aarch64_intt_zetas_layer123456) + +extern const int32_t mld_aarch64_ntt_zetas_layer123456[]; +extern const int32_t mld_aarch64_ntt_zetas_layer78[]; + +extern const int32_t mld_aarch64_intt_zetas_layer78[]; +extern const int32_t mld_aarch64_intt_zetas_layer123456[]; + +#define mld_rej_uniform_table MLD_NAMESPACE(rej_uniform_table) +extern const uint8_t mld_rej_uniform_table[]; +#define mld_rej_uniform_eta_table MLD_NAMESPACE(rej_uniform_eta_table) +extern const uint8_t mld_rej_uniform_eta_table[]; + +#define mld_polyz_unpack_17_indices MLD_NAMESPACE(polyz_unpack_17_indices) +extern const uint8_t mld_polyz_unpack_17_indices[]; +#define mld_polyz_unpack_19_indices MLD_NAMESPACE(polyz_unpack_19_indices) +extern const uint8_t mld_polyz_unpack_19_indices[]; + + +/* + * Sampling 256 coefficients mod 15 using rejection sampling from 4 bits. + * Expected number of required bytes: (256 * (16/15))/2 = 136.5 bytes. + * We sample 1 block (=136 bytes) of SHAKE256_RATE output initially. + * Sampling 2 blocks initially results in slightly worse performance. + */ +#define MLD_AARCH64_REJ_UNIFORM_ETA2_BUFLEN (1 * 136) +/* + * Sampling 256 coefficients mod 9 using rejection sampling from 4 bits. + * Expected number of required bytes: (256 * (16/9))/2 = 227.5 bytes. + * We sample 2 blocks (=272 bytes) of SHAKE256_RATE output initially. 
+ */ +#define MLD_AARCH64_REJ_UNIFORM_ETA4_BUFLEN (2 * 136) + +#define mld_ntt_asm MLD_NAMESPACE(ntt_asm) +void mld_ntt_asm(int32_t *, const int32_t *, const int32_t *); + +#define mld_intt_asm MLD_NAMESPACE(intt_asm) +void mld_intt_asm(int32_t *, const int32_t *, const int32_t *); + +#define mld_rej_uniform_asm MLD_NAMESPACE(rej_uniform_asm) +uint64_t mld_rej_uniform_asm(int32_t *r, const uint8_t *buf, unsigned buflen, + const uint8_t *table); + +#define mld_rej_uniform_eta2_asm MLD_NAMESPACE(rej_uniform_eta2_asm) +unsigned mld_rej_uniform_eta2_asm(int32_t *r, const uint8_t *buf, + unsigned buflen, const uint8_t *table); + +#define mld_rej_uniform_eta4_asm MLD_NAMESPACE(rej_uniform_eta4_asm) +unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf, + unsigned buflen, const uint8_t *table); + +#define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm) +void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a); + +#define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm) +void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a); + +#define mld_poly_caddq_asm MLD_NAMESPACE(poly_caddq_asm) +void mld_poly_caddq_asm(int32_t *a); + +#define mld_poly_use_hint_32_asm MLD_NAMESPACE(poly_use_hint_32_asm) +void mld_poly_use_hint_32_asm(int32_t *b, const int32_t *a, const int32_t *h); + +#define mld_poly_use_hint_88_asm MLD_NAMESPACE(poly_use_hint_88_asm) +void mld_poly_use_hint_88_asm(int32_t *b, const int32_t *a, const int32_t *h); + +#define mld_poly_chknorm_asm MLD_NAMESPACE(poly_chknorm_asm) +uint32_t mld_poly_chknorm_asm(const int32_t *a, int32_t B); + +#define mld_polyz_unpack_17_asm MLD_NAMESPACE(polyz_unpack_17_asm) +void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf, + const uint8_t *indices); + +#define mld_polyz_unpack_19_asm MLD_NAMESPACE(polyz_unpack_19_asm) +void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf, + const uint8_t *indices); + +#define mld_poly_pointwise_montgomery_asm \ + MLD_NAMESPACE(poly_pointwise_montgomery_asm) +void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *, + const int32_t *); + +#define mld_polyvecl_pointwise_acc_montgomery_l4_asm \ + MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l4_asm) +void mld_polyvecl_pointwise_acc_montgomery_l4_asm(int32_t *, const int32_t *, + const int32_t *); + +#define mld_polyvecl_pointwise_acc_montgomery_l5_asm \ + MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l5_asm) +void mld_polyvecl_pointwise_acc_montgomery_l5_asm(int32_t *, const int32_t *, + const int32_t *); + +#define mld_polyvecl_pointwise_acc_montgomery_l7_asm \ + MLD_NAMESPACE(polyvecl_pointwise_acc_montgomery_l7_asm) +void mld_polyvecl_pointwise_acc_montgomery_l7_asm(int32_t *, const int32_t *, + const int32_t *); + +#endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */ diff --git a/dev/aarch64_clean/src/intt.S b/dev/aarch64_clean/src/intt.S new file mode 100644 index 000000000..85898dd5f --- /dev/null +++ b/dev/aarch64_clean/src/intt.S @@ -0,0 +1,372 @@ +/* Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Hanno Becker + * Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) +/* simpasm: header-end */ + +.macro mulmodq dst, src, const, idx0, idx1 + sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()] + mul \dst\().4s, \src\().4s, \const\().s[\idx0\()] + mls \dst\().4s, t2.4s, modulus.s[0] +.endm + +.macro mulmod dst, src, const, const_twisted + sqrdmulh t2.4s, \src\().4s, \const_twisted\().4s + mul \dst\().4s, \src\().4s, \const\().4s + mls \dst\().4s, t2.4s, modulus.s[0] +.endm + + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr q_root0, [\r_ptr], #(8*16) + ldr q_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr q_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr q_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr q_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr q_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr q_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr q_root7, [\r_ptr, #(-8*16 + 7*16)] +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr q_\root0, [\r_ptr0], #16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr q_\root0, [\r_ptr0], #8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr q_\root0, [\r_ptr1], #(6*16) + ldr q_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr q_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr q_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr q_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr q_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, 
sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + +.text +.global MLD_ASM_NAMESPACE(intt_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(intt_asm) + push_stack + + in .req x0 + r5678_ptr .req x1 + r1234_ptr .req x2 + inp .req x3 + count .req x4 + xtmp .req x5 + + wtmp .req w5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + q_data0 .req q8 + q_data1 .req q9 + q_data2 .req q10 + q_data3 .req q11 + q_data4 .req q12 + q_data5 .req q13 + q_data6 .req q14 + q_data7 .req q15 + q_data8 .req q16 + q_data9 .req q17 + q_data10 .req q18 + q_data11 .req q19 + q_data12 .req q20 + q_data13 .req q21 + q_data14 .req q22 + q_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + q_root0 .req q0 + q_root1 .req q1 + q_root2 .req q2 + q_root3 .req q3 + q_root0_tw .req q4 + q_root1_tw .req q5 + q_root2_tw .req q6 + q_root3_tw .req q7 + + + tmp .req v24 + q_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup modulus.4s, wtmp + + mov inp, in + + mov count, #16 + + .p2align 2 +layer5678_start: + + ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in] + + load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r5678_ptr + + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + transpose4 data0, data1, data2, data3 + + load_next_roots_6 root1, r1234_ptr + load_next_roots_56 root0, r1234_ptr + + gs_butterfly data0, data1, root0, 0, 1 + gs_butterfly data2, data3, root0, 2, 3 + gs_butterfly data0, data2, root1, 0, 1 + gs_butterfly data1, data3, root1, 0, 1 + + str q_data0, [in], #(16*4) + str q_data1, [in, #(-16*4 + 1*16)] + str q_data2, [in, #(-16*4 + 2*16)] + str q_data3, [in, #(-16*4 + 3*16)] + + subs count, count, #1 + cbnz count, layer5678_start + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq q_root0_tw + .unreq q_root1_tw + .unreq q_root2_tw + .unreq q_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + q_root4 .req q4 + q_root5 .req q5 + q_root6 .req q6 + q_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + + mov in, inp + mov count, #4 + + // load ninv + mov wtmp, #16382 // 2^(32 - 8) mod Q + dup ninv.4s, wtmp + + // load ninv_tw = 4197891 + movz wtmp, #3587 + movk wtmp, #64, lsl #16 + dup ninv_tw.4s, wtmp + + load_roots_1234 r1234_ptr + + .p2align 2 +layer1234_start: + ldr q_data0, [in, #(0*(512/8))] + ldr q_data1, [in, #(1*(512/8))] + ldr q_data2, [in, #(2*(512/8))] + ldr q_data3, [in, #(3*(512/8))] + ldr q_data4, [in, #(4*(512/8))] + ldr q_data5, [in, #(5*(512/8))] + ldr q_data6, [in, #(6*(512/8))] + ldr q_data7, [in, #(7*(512/8))] + ldr q_data8, [in, #(8*(512/8))] + ldr q_data9, [in, #(9*(512/8))] + ldr q_data10, [in, #(10*(512/8))] + ldr q_data11, [in, #(11*(512/8))] + ldr q_data12, [in, #(12*(512/8))] + ldr q_data13, [in, #(13*(512/8))] + ldr q_data14, [in, #(14*(512/8))] + ldr q_data15, [in, #(15*(512/8))] + + // layer4 + gs_butterfly data0, data1, root3, 2, 3 + gs_butterfly data2, 
data3, root4, 0, 1 + gs_butterfly data4, data5, root4, 2, 3 + gs_butterfly data6, data7, root5, 0, 1 + gs_butterfly data8, data9, root5, 2, 3 + gs_butterfly data10, data11, root6, 0, 1 + gs_butterfly data12, data13, root6, 2, 3 + gs_butterfly data14, data15, root7, 0, 1 + + // layer3 + gs_butterfly data0, data2, root1, 2, 3 + gs_butterfly data1, data3, root1, 2, 3 + gs_butterfly data4, data6, root2, 0, 1 + gs_butterfly data5, data7, root2, 0, 1 + gs_butterfly data8, data10, root2, 2, 3 + gs_butterfly data9, data11, root2, 2, 3 + gs_butterfly data12, data14, root3, 0, 1 + gs_butterfly data13, data15, root3, 0, 1 + + // layer2 + gs_butterfly data0, data4, root0, 2, 3 + gs_butterfly data1, data5, root0, 2, 3 + gs_butterfly data2, data6, root0, 2, 3 + gs_butterfly data3, data7, root0, 2, 3 + gs_butterfly data8, data12, root1, 0, 1 + gs_butterfly data9, data13, root1, 0, 1 + gs_butterfly data10, data14, root1, 0, 1 + gs_butterfly data11, data15, root1, 0, 1 + + // layer 1 + gs_butterfly data0, data8, root0, 0, 1 + gs_butterfly data1, data9, root0, 0, 1 + gs_butterfly data2, data10, root0, 0, 1 + gs_butterfly data3, data11, root0, 0, 1 + gs_butterfly data4, data12, root0, 0, 1 + gs_butterfly data5, data13, root0, 0, 1 + gs_butterfly data6, data14, root0, 0, 1 + gs_butterfly data7, data15, root0, 0, 1 + + str q_data8, [in, #(8*(512/8))] + str q_data9, [in, #(9*(512/8))] + str q_data10, [in, #(10*(512/8))] + str q_data11, [in, #(11*(512/8))] + str q_data12, [in, #(12*(512/8))] + str q_data13, [in, #(13*(512/8))] + str q_data14, [in, #(14*(512/8))] + str q_data15, [in, #(15*(512/8))] + + // Scale half the coeffs 2^-8 and the Montgomery factor 2^32. + // For the other half, the scaling has been merged into the + // multiplication with the twiddle factor on the last layer. 
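// Illustrative C sketch (names invented for this aside, not part of the
// backend): a quick check of the ninv constant loaded above. 16382 equals
// 2^-8 * 2^32 mod Q, so the single modular multiplication performed by
// mul_ninv applies both the 2^-8 scaling and the Montgomery factor 2^32
// mentioned in the comment.
#include <stdint.h>
#include <stdio.h>

#define Q 8380417 /* prime, so 2^-8 = 2^(Q-1-8) mod Q by Fermat */

/* (a * b) mod Q for inputs already reduced mod Q */
static int64_t mulmod_q(int64_t a, int64_t b) { return (a * b) % Q; }

/* b^e mod Q by square-and-multiply */
static int64_t powmod_q(int64_t b, uint32_t e)
{
  int64_t r = 1;
  while (e > 0)
  {
    if (e & 1)
    {
      r = mulmod_q(r, b);
    }
    b = mulmod_q(b, b);
    e >>= 1;
  }
  return r;
}

int main(void)
{
  int64_t mont = powmod_q(2, 32);          /* 2^32 mod Q = 4193792 */
  int64_t inv256 = powmod_q(2, Q - 1 - 8); /* 2^-8 mod Q           */
  /* 2^32 * 2^-8 = 2^24 mod Q; prints 16382, the ninv constant above */
  printf("%lld\n", (long long)mulmod_q(mont, inv256));
  return 0;
}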
+ mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 + + str q_data0, [in], #(16) + str q_data1, [in, #(-16 + 1*(512/8))] + str q_data2, [in, #(-16 + 2*(512/8))] + str q_data3, [in, #(-16 + 3*(512/8))] + str q_data4, [in, #(-16 + 4*(512/8))] + str q_data5, [in, #(-16 + 5*(512/8))] + str q_data6, [in, #(-16 + 6*(512/8))] + str q_data7, [in, #(-16 + 7*(512/8))] + + subs count, count, #1 + cbnz count, layer1234_start + + pop_stack + ret + +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l4.S b/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l4.S new file mode 100644 index 000000000..ed949b5fb --- /dev/null +++ b/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l4.S @@ -0,0 +1,115 @@ +/* Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) +/* simpasm: header-end */ + +.macro montgomery_reduce_long res, inl, inh + uzp1 \res\().4s, \inl\().4s, \inh\().4s + mul \res\().4s, \res\().4s, modulus_twisted.4s + smlal \inl\().2d, \res\().2s, modulus.2s + smlal2 \inh\().2d, \res\().4s, modulus.4s + uzp2 \res\().4s, \inl\().4s, \inh\().4s +.endm + +.macro load_polys p0, p1, p2, p3, ptr, idx +.if \idx == 0 + ldr \p1, [\ptr, #1*16] + ldr \p2, [\ptr, #2*16] + ldr \p3, [\ptr, #3*16] + ldr \p0, [\ptr], #4*16 +.else + ldr \p0, [\ptr, #(1024*\idx-4*16)] + ldr \p1, [\ptr, #(1024*\idx-3*16)] + ldr \p2, [\ptr, #(1024*\idx-2*16)] + ldr \p3, [\ptr, #(1024*\idx-1*16)] +.endif +.endm + +.macro pmull dl, dh, a, b + smull \dl\().2d, \a\().2s, \b\().2s + smull2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro pmlal dl, dh, a, b + smlal \dl\().2d, \a\().2s, \b\().2s + smlal2 \dh\().2d, \a\().4s, \b\().4s +.endm + +out_ptr .req x0 +a_ptr .req x1 +b_ptr .req x2 +count .req x3 +wtmp .req w3 + +modulus .req v0 +modulus_twisted .req v1 + +.text +.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l4_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l4_asm) + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup modulus.4s, wtmp + + // load -q^-1 = 4236238847 + movz wtmp, #57343 + movk wtmp, #64639, lsl #16 + dup modulus_twisted.4s, wtmp + + mov count, #(MLDSA_N / 4) + +l4_loop_start: + load_polys q16, q17, q18, q19, a_ptr, 0 + load_polys q20, q21, q22, q23, b_ptr, 0 + + pmull v24, v25, v16, v20 + pmull v26, v27, v17, v21 + pmull v28, v29, v18, v22 + pmull v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 1 + load_polys q20, q21, q22, q23, b_ptr, 1 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 2 + load_polys q20, q21, q22, q23, b_ptr, 2 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 3 + load_polys q20, q21, q22, q23, b_ptr, 3 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + montgomery_reduce_long v16, v24, v25 + montgomery_reduce_long v17, v26, v27 + montgomery_reduce_long v18, v28, v29 + montgomery_reduce_long v19, v30, v31 + + str q17, [out_ptr, #1*16] + str q18, [out_ptr, #2*16] + str q19, [out_ptr, #3*16] + str q16, [out_ptr], #4*16 + + subs count, count, #4 + cbnz count, l4_loop_start + + ret + 
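// Illustrative C sketch (not part of the backend; names, MLDSA_N, Q and QINV
// follow the usual ML-DSA reference conventions and are assumptions here):
// what the L=4 accumulation above computes per output coefficient. Products
// are kept as 64-bit values across all four polynomial pairs (smull/smlal)
// and Montgomery-reduced only once (montgomery_reduce_long). The reduction
// below uses the common +Q^-1/subtract form, whereas the macro multiplies by
// -Q^-1 (modulus_twisted) and adds; both results are congruent to
// p * 2^-32 mod Q.
#include <stdint.h>

#define MLDSA_N 256
#define Q 8380417
#define QINV 58728449 /* Q^-1 mod 2^32 */

/* Signed Montgomery reduction: result is congruent to a * 2^-32 mod Q */
static int32_t montgomery_reduce(int64_t a)
{
  int32_t t = (int32_t)((uint64_t)a * QINV); /* a * Q^-1 mod 2^32 */
  return (int32_t)((a - (int64_t)t * Q) >> 32);
}

/* Reference shape of the L=4 pointwise accumulation: one 64-bit accumulator
 * per coefficient, one Montgomery reduction per output coefficient. */
static void pointwise_acc_l4_ref(int32_t w[MLDSA_N],
                                 const int32_t u[4][MLDSA_N],
                                 const int32_t v[4][MLDSA_N])
{
  unsigned i, j;
  for (j = 0; j < MLDSA_N; j++)
  {
    int64_t acc = 0;
    for (i = 0; i < 4; i++)
    {
      acc += (int64_t)u[i][j] * v[i][j]; /* accumulate in 64 bits */
    }
    w[j] = montgomery_reduce(acc);
  }
}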
+/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l5.S b/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l5.S new file mode 100644 index 000000000..11a472fe7 --- /dev/null +++ b/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l5.S @@ -0,0 +1,122 @@ +/* Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) +/* simpasm: header-end */ + +.macro montgomery_reduce_long res, inl, inh + uzp1 \res\().4s, \inl\().4s, \inh\().4s + mul \res\().4s, \res\().4s, modulus_twisted.4s + smlal \inl\().2d, \res\().2s, modulus.2s + smlal2 \inh\().2d, \res\().4s, modulus.4s + uzp2 \res\().4s, \inl\().4s, \inh\().4s +.endm + +.macro load_polys p0, p1, p2, p3, ptr, idx +.if \idx == 0 + ldr \p1, [\ptr, #1*16] + ldr \p2, [\ptr, #2*16] + ldr \p3, [\ptr, #3*16] + ldr \p0, [\ptr], #4*16 +.else + ldr \p0, [\ptr, #(1024*\idx-4*16)] + ldr \p1, [\ptr, #(1024*\idx-3*16)] + ldr \p2, [\ptr, #(1024*\idx-2*16)] + ldr \p3, [\ptr, #(1024*\idx-1*16)] +.endif +.endm + +.macro pmull dl, dh, a, b + smull \dl\().2d, \a\().2s, \b\().2s + smull2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro pmlal dl, dh, a, b + smlal \dl\().2d, \a\().2s, \b\().2s + smlal2 \dh\().2d, \a\().4s, \b\().4s +.endm + +out_ptr .req x0 +a_ptr .req x1 +b_ptr .req x2 +count .req x3 +wtmp .req w3 + +modulus .req v0 +modulus_twisted .req v1 + +.text +.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l5_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l5_asm) + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup modulus.4s, wtmp + + // load -q^-1 = 4236238847 + movz wtmp, #57343 + movk wtmp, #64639, lsl #16 + dup modulus_twisted.4s, wtmp + + mov count, #(MLDSA_N / 4) + +l5_loop_start: + load_polys q16, q17, q18, q19, a_ptr, 0 + load_polys q20, q21, q22, q23, b_ptr, 0 + + pmull v24, v25, v16, v20 + pmull v26, v27, v17, v21 + pmull v28, v29, v18, v22 + pmull v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 1 + load_polys q20, q21, q22, q23, b_ptr, 1 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 2 + load_polys q20, q21, q22, q23, b_ptr, 2 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 3 + load_polys q20, q21, q22, q23, b_ptr, 3 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 4 + load_polys q20, q21, q22, q23, b_ptr, 4 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + montgomery_reduce_long v16, v24, v25 + montgomery_reduce_long v17, v26, v27 + montgomery_reduce_long v18, v28, v29 + montgomery_reduce_long v19, v30, v31 + + str q17, [out_ptr, #1*16] + str q18, [out_ptr, #2*16] + str q19, [out_ptr, #3*16] + str q16, [out_ptr], #4*16 + + subs count, count, #4 + cbnz count, l5_loop_start + + ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l7.S b/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l7.S new file mode 100644 index 000000000..ce558ced8 --- /dev/null +++ 
b/dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l7.S @@ -0,0 +1,138 @@ +/* Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) +/* simpasm: header-end */ + +.macro montgomery_reduce_long res, inl, inh + uzp1 \res\().4s, \inl\().4s, \inh\().4s + mul \res\().4s, \res\().4s, modulus_twisted.4s + smlal \inl\().2d, \res\().2s, modulus.2s + smlal2 \inh\().2d, \res\().4s, modulus.4s + uzp2 \res\().4s, \inl\().4s, \inh\().4s +.endm + +.macro load_polys p0, p1, p2, p3, ptr, idx +.if \idx == 0 + ldr \p1, [\ptr, #1*16] + ldr \p2, [\ptr, #2*16] + ldr \p3, [\ptr, #3*16] + ldr \p0, [\ptr], #4*16 +.else + ldr \p0, [\ptr, #(1024*\idx-4*16)] + ldr \p1, [\ptr, #(1024*\idx-3*16)] + ldr \p2, [\ptr, #(1024*\idx-2*16)] + ldr \p3, [\ptr, #(1024*\idx-1*16)] +.endif +.endm + +.macro pmull dl, dh, a, b + smull \dl\().2d, \a\().2s, \b\().2s + smull2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro pmlal dl, dh, a, b + smlal \dl\().2d, \a\().2s, \b\().2s + smlal2 \dh\().2d, \a\().4s, \b\().4s +.endm + +out_ptr .req x0 +a_ptr .req x1 +b_ptr .req x2 +count .req x3 +wtmp .req w3 + +modulus .req v0 +modulus_twisted .req v1 + +.text +.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l7_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l7_asm) + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup modulus.4s, wtmp + + // load -q^-1 = 4236238847 + movz wtmp, #57343 + movk wtmp, #64639, lsl #16 + dup modulus_twisted.4s, wtmp + + mov count, #(MLDSA_N / 4) + +l7_loop_start: + load_polys q16, q17, q18, q19, a_ptr, 0 + load_polys q20, q21, q22, q23, b_ptr, 0 + + pmull v24, v25, v16, v20 + pmull v26, v27, v17, v21 + pmull v28, v29, v18, v22 + pmull v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 1 + load_polys q20, q21, q22, q23, b_ptr, 1 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 2 + load_polys q20, q21, q22, q23, b_ptr, 2 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 3 + load_polys q20, q21, q22, q23, b_ptr, 3 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 4 + load_polys q20, q21, q22, q23, b_ptr, 4 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 5 + load_polys q20, q21, q22, q23, b_ptr, 5 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + load_polys q16, q17, q18, q19, a_ptr, 6 + load_polys q20, q21, q22, q23, b_ptr, 6 + + pmlal v24, v25, v16, v20 + pmlal v26, v27, v17, v21 + pmlal v28, v29, v18, v22 + pmlal v30, v31, v19, v23 + + montgomery_reduce_long v16, v24, v25 + montgomery_reduce_long v17, v26, v27 + montgomery_reduce_long v18, v28, v29 + montgomery_reduce_long v19, v30, v31 + + str q17, [out_ptr, #1*16] + str q18, [out_ptr, #2*16] + str q19, [out_ptr, #3*16] + str q16, [out_ptr], #4*16 + + subs count, count, #4 + cbnz count, l7_loop_start + + ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/dev/aarch64_clean/src/ntt.S b/dev/aarch64_clean/src/ntt.S new file mode 100644 index 000000000..def498c66 --- /dev/null 
+++ b/dev/aarch64_clean/src/ntt.S @@ -0,0 +1,307 @@ +/* Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Hanno Becker + * Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) +/* simpasm: header-end */ + +.macro mulmodq dst, src, const, idx0, idx1 + sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()] + mul \dst\().4s, \src\().4s, \const\().s[\idx0\()] + mls \dst\().4s, t2.4s, consts.s[0] +.endm + +.macro mulmod dst, src, const, const_twisted + sqrdmulh t2.4s, \src\().4s, \const_twisted\().4s + mul \dst\().4s, \src\().4s, \const\().4s + mls \dst\().4s, t2.4s, consts.s[0] +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 + sub \b\().4s, \a\().4s, tmp.4s + add \a\().4s, \a\().4s, tmp.4s +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted + sub \b\().4s, \a\().4s, tmp.4s + add \a\().4s, \a\().4s, tmp.4s +.endm + +.macro load_roots_123 + ldr q_root0, [r012345_ptr], #64 + ldr q_root1, [r012345_ptr, #(-64 + 16)] + ldr q_root2, [r012345_ptr, #(-64 + 32)] + ldr q_root3, [r012345_ptr, #(-64 + 48)] +.endm + +.macro load_roots_456 + ldr q_root0, [r012345_ptr], #64 + ldr q_root1, [r012345_ptr, #(-64 + 16)] + ldr q_root2, [r012345_ptr, #(-64 + 32)] + ldr q_root3, [r012345_ptr, #(-64 + 48)] +.endm + +.macro load_roots_78_part1 + ldr q_root0, [r67_ptr], #(12*16) + ldr q_root0_tw, [r67_ptr, #(-12*16 + 1*16)] + ldr q_root1, [r67_ptr, #(-12*16 + 2*16)] + ldr q_root1_tw, [r67_ptr, #(-12*16 + 3*16)] + ldr q_root2, [r67_ptr, #(-12*16 + 4*16)] + ldr q_root2_tw, [r67_ptr, #(-12*16 + 5*16)] +.endm + +.macro load_roots_78_part2 + ldr q_root0, [r67_ptr, (-12*16 + 6*16)] + ldr q_root0_tw, [r67_ptr, (-12*16 + 7*16)] + ldr q_root1, [r67_ptr, (-12*16 + 8*16)] + ldr q_root1_tw, [r67_ptr, (-12*16 + 9*16)] + ldr q_root2, [r67_ptr, (-12*16 + 10*16)] + ldr q_root2_tw, [r67_ptr, (-12*16 + 11*16)] +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp 
d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + // Inputs + in .req x0 // Input/output buffer + r012345_ptr .req x1 // twiddles for layer 0,1,2,3,4,5 + r67_ptr .req x2 // twiddles for layer 6,7 + + count .req x3 + inp .req x4 + inpp .req x5 + xtmp .req x6 + wtmp .req w6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + q_data0 .req q9 + q_data1 .req q10 + q_data2 .req q11 + q_data3 .req q12 + q_data4 .req q13 + q_data5 .req q14 + q_data6 .req q15 + q_data7 .req q16 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + q_root0 .req q0 + q_root1 .req q1 + q_root2 .req q2 + q_root3 .req q3 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + q_root0_tw .req q4 + q_root1_tw .req q5 + q_root2_tw .req q6 + q_root3_tw .req q7 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + consts .req v8 + q_consts .req q8 + +.text +.global MLD_ASM_NAMESPACE(ntt_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(ntt_asm) + push_stack + + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup consts.4s, wtmp + + mov inp, in + mov count, #8 + + load_roots_123 + + .p2align 2 +layer123_start: + ldr q_data0, [in, #(0*(1024/8))] + ldr q_data1, [in, #(1*(1024/8))] + ldr q_data2, [in, #(2*(1024/8))] + ldr q_data3, [in, #(3*(1024/8))] + ldr q_data4, [in, #(4*(1024/8))] + ldr q_data5, [in, #(5*(1024/8))] + ldr q_data6, [in, #(6*(1024/8))] + ldr q_data7, [in, #(7*(1024/8))] + + ct_butterfly data0, data4, root0, 0, 1 + ct_butterfly data1, data5, root0, 0, 1 + ct_butterfly data2, data6, root0, 0, 1 + ct_butterfly data3, data7, root0, 0, 1 + + ct_butterfly data0, data2, root0, 2, 3 + ct_butterfly data1, data3, root0, 2, 3 + ct_butterfly data4, data6, root1, 0, 1 + ct_butterfly data5, data7, root1, 0, 1 + + ct_butterfly data0, data1, root1, 2, 3 + ct_butterfly data2, data3, root2, 0, 1 + ct_butterfly data4, data5, root2, 2, 3 + ct_butterfly data6, data7, root3, 0, 1 + + str q_data0, [in], #16 + str q_data1, [in, #(-16 + 1*(1024/8))] + str q_data2, [in, #(-16 + 2*(1024/8))] + str q_data3, [in, #(-16 + 3*(1024/8))] + str q_data4, [in, #(-16 + 4*(1024/8))] + str q_data5, [in, #(-16 + 5*(1024/8))] + str q_data6, [in, #(-16 + 6*(1024/8))] + str q_data7, [in, #(-16 + 7*(1024/8))] + + subs count, count, #1 + cbnz count, layer123_start + + mov in, inp + add inpp, in, #64 + mov count, #8 + + // Use two data pointers and carefully arrange + // increments to facilitate reordering of loads + // and stores by SLOTHY. + // + // TODO: Think of alternatives here -- the start with `in` + // pointing to 64 byte below the actual data, which in theory + // could underflow. It's unclear how the CPU would behave in this case. 
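// Illustrative C sketch (not part of the backend): the Cooley-Tukey butterfly
// that the ct_butterfly/mulmodq macros above implement, written in the
// plain-C style of the ML-DSA reference NTT. The reference keeps twiddles in
// Montgomery form and reduces 64-bit products; the assembly instead pairs
// each twiddle with a precomputed "twisted" companion so the product can be
// reduced with sqrdmulh/mul/mls. Names and constants below follow the
// reference conventions and are assumptions here.
#include <stdint.h>

#define Q 8380417
#define QINV 58728449 /* Q^-1 mod 2^32 */

/* Signed Montgomery reduction: result is congruent to a * 2^-32 mod Q */
static int32_t montgomery_reduce(int64_t a)
{
  int32_t t = (int32_t)((uint64_t)a * QINV);
  return (int32_t)((a - (int64_t)t * Q) >> 32);
}

/* One CT butterfly: (a, b) -> (a + zeta*b, a - zeta*b) mod Q, with zeta
 * supplied in Montgomery form (zeta * 2^32 mod Q). */
static void ct_butterfly_ref(int32_t *a, int32_t *b, int32_t zeta_mont)
{
  int32_t t = montgomery_reduce((int64_t)zeta_mont * (*b));
  *b = *a - t;
  *a = *a + t;
}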
+ sub in, in, #64 + sub inpp, inpp, #64 + + .p2align 2 +layer45678_start: + ldr q_data0, [in, #(64 + 16*0)] + ldr q_data1, [in, #(64 + 16*1)] + ldr q_data2, [in, #(64 + 16*2)] + ldr q_data3, [in, #(64 + 16*3)] + ldr q_data4, [inpp, #(64 + 16*0)] + ldr q_data5, [inpp, #(64 + 16*1)] + ldr q_data6, [inpp, #(64 + 16*2)] + ldr q_data7, [inpp, #(64 + 16*3)] + + add in, in, #64 + add inpp, inpp, #64 + + load_roots_456 + + ct_butterfly data0, data4, root0, 0, 1 + ct_butterfly data1, data5, root0, 0, 1 + ct_butterfly data2, data6, root0, 0, 1 + ct_butterfly data3, data7, root0, 0, 1 + + ct_butterfly data0, data2, root0, 2, 3 + ct_butterfly data1, data3, root0, 2, 3 + ct_butterfly data4, data6, root1, 0, 1 + ct_butterfly data5, data7, root1, 0, 1 + + ct_butterfly data0, data1, root1, 2, 3 + ct_butterfly data2, data3, root2, 0, 1 + ct_butterfly data4, data5, root2, 2, 3 + ct_butterfly data6, data7, root3, 0, 1 + + // Transpose using trn + transpose4 data0, data1, data2, data3 + transpose4 data4, data5, data6, data7 + + load_roots_78_part1 + + ct_butterfly_v data0, data2, root0, root0_tw + ct_butterfly_v data1, data3, root0, root0_tw + ct_butterfly_v data0, data1, root1, root1_tw + ct_butterfly_v data2, data3, root2, root2_tw + + load_roots_78_part2 + + ct_butterfly_v data4, data6, root0, root0_tw + ct_butterfly_v data5, data7, root0, root0_tw + ct_butterfly_v data4, data5, root1, root1_tw + ct_butterfly_v data6, data7, root2, root2_tw + + // Transpose as part of st4 + st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 + st4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp], #64 + + subs count, count, #1 + cbnz count, layer45678_start + + pop_stack + ret + +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/dev/aarch64_clean/src/pointwise_montgomery.S b/dev/aarch64_clean/src/pointwise_montgomery.S new file mode 100644 index 000000000..641d3644a --- /dev/null +++ b/dev/aarch64_clean/src/pointwise_montgomery.S @@ -0,0 +1,78 @@ +/* Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) +/* simpasm: header-end */ + +.macro montgomery_reduce_long res, inl, inh + uzp1 \res\().4s, \inl\().4s, \inh\().4s + mul \res\().4s, \res\().4s, modulus_twisted.4s + smlal \inl\().2d, \res\().2s, modulus.2s + smlal2 \inh\().2d, \res\().4s, modulus.4s + uzp2 \res\().4s, \inl\().4s, \inh\().4s +.endm + +.macro pmull dl, dh, a, b + smull \dl\().2d, \a\().2s, \b\().2s + smull2 \dh\().2d, \a\().4s, \b\().4s +.endm + +out_ptr .req x0 +a_ptr .req x1 +b_ptr .req x2 +count .req x3 +wtmp .req w3 + +modulus .req v0 +modulus_twisted .req v1 + +.text +.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm) + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup modulus.4s, wtmp + + // load -q^-1 = 4236238847 + movz wtmp, #57343 + movk wtmp, #64639, lsl #16 + dup modulus_twisted.4s, wtmp + + mov count, #(MLDSA_N / 4) + +loop_start: + ldr q17, [a_ptr, #1*16] + ldr q18, [a_ptr, #2*16] + ldr q19, [a_ptr, #3*16] + ldr q16, [a_ptr], #4*16 + + ldr q21, [b_ptr, #1*16] + ldr q22, [b_ptr, #2*16] + ldr q23, [b_ptr, #3*16] + ldr q20, [b_ptr], #4*16 + + pmull v24, v25, v16, v20 + pmull v26, v27, v17, v21 + pmull v28, v29, v18, v22 + pmull v30, v31, v19, v23 + + montgomery_reduce_long v16, v24, v25 + montgomery_reduce_long v17, v26, v27 + montgomery_reduce_long v18, v28, v29 + montgomery_reduce_long v19, v30, v31 + 
+ str q17, [out_ptr, #1*16] + str q18, [out_ptr, #2*16] + str q19, [out_ptr, #3*16] + str q16, [out_ptr], #4*16 + + subs count, count, #4 + cbnz count, loop_start + + ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/dev/aarch64_clean/src/poly_caddq_asm.S b/dev/aarch64_clean/src/poly_caddq_asm.S new file mode 100644 index 000000000..a65d62583 --- /dev/null +++ b/dev/aarch64_clean/src/poly_caddq_asm.S @@ -0,0 +1,60 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.macro caddq inout + ushr tmp.4s, \inout\().4s, #31 + mla \inout\().4s, tmp.4s, q_reg.4s +.endm + +.global MLD_ASM_NAMESPACE(poly_caddq_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_caddq_asm) + // Function signature: void mld_poly_caddq_asm(int32_t *a) + // x0: pointer to polynomial coefficients + + // Register assignments + a_ptr .req x0 + count .req x1 + q_reg .req v4 + tmp .req v5 + + // Load constants + // MLDSA_Q = 8380417 = 0x7FE001 + movz w9, #0xE001 + movk w9, #0x7F, lsl #16 + dup q_reg.4s, w9 // Load Q values + + mov count, #64/4 +poly_caddq_loop: + ldr q0, [a_ptr, #0*16] + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + + caddq v0 + caddq v1 + caddq v2 + caddq v3 + + str q1, [a_ptr, #1*16] + str q2, [a_ptr, #2*16] + str q3, [a_ptr, #3*16] + str q0, [a_ptr], #4*16 + + subs count, count, #1 + bne poly_caddq_loop + + ret + + .unreq a_ptr + .unreq count + .unreq q_reg + .unreq tmp +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/poly_chknorm_asm.S b/dev/aarch64_clean/src/poly_chknorm_asm.S new file mode 100644 index 000000000..5cbe117ba --- /dev/null +++ b/dev/aarch64_clean/src/poly_chknorm_asm.S @@ -0,0 +1,63 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.macro chknorm a + abs \a\().4s, \a\().4s + cmge \a\().4s, \a\().4s, bound.4s + orr flags.16b, flags.16b, \a\().16b +.endm + + /* Parameters */ + a_ptr .req x0 // Input polynomial + B .req w1 // Input norm bound + + count .req x2 + + /* Constant register assignments */ + bound .req v20 + flags .req v21 + +.text +.global MLD_ASM_NAMESPACE(poly_chknorm_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_chknorm_asm) + // Load constants + dup bound.4s, B + + movi flags.4s, 0 + + mov count, #(64/4) + +poly_chknorm_loop: + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + ldr q0, [a_ptr], #4*16 + + chknorm v1 + chknorm v2 + chknorm v3 + chknorm v0 + + subs count, count, #1 + bne poly_chknorm_loop + + // Return 0xffffffff if any of the 4 lanes is 0xffffffff + umaxv s21, flags.4s + fmov w0, s21 + + ret + + .unreq a_ptr + .unreq B + .unreq count + .unreq bound + .unreq flags +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/poly_decompose_32_asm.S b/dev/aarch64_clean/src/poly_decompose_32_asm.S new file mode 100644 index 000000000..fd29df11b --- /dev/null +++ b/dev/aarch64_clean/src/poly_decompose_32_asm.S @@ -0,0 +1,105 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: 
Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +// a aliased with a0 +.macro decompose32 a1, a, temp + // Compute a1 = round-(a / 523776) ≈ round(a * 1074791425 / + // 2^49), where round-() denotes "round half down". This is + // exact for 0 <= a < Q. Note that half is rounded down since + // 1074791425 / 2^49 ≲ 1 / 523776. + sqdmulh \a1\().4s, \a\().4s, barrett_const.4s + srshr \a1\().4s, \a1\().4s, #18 + + // If a1 = 16, i.e. a > 31*GAMMA2, proceed as if a' = a - Q was + // given instead. (For a = 31*GAMMA2 + 1 thus a' = -GAMMA2, we + // still round it to 0 like other "wrapped around" cases.) + + // Check for wrap-around + cmgt \temp\().4s, \a\().4s, q_bound.4s + + // Compute remainder a0 + mls \a\().4s, \a1\().4s, gamma2_2x.4s + + // If wrap-around is required, set a1 = 0 and a0 -= 1 + bic \a1\().16b, \a1\().16b, \temp\().16b + add \a\().4s, \a\().4s, \temp\().4s +.endm + + /* Parameters */ + a1_ptr .req x0 // Output polynomial with coefficients c1 + a0_ptr .req x1 // Output polynomial with coefficients c0 + a_ptr .req x2 // Input polynomial + + count .req x3 + + /* Constant register assignments */ + q .req v20 // Q = 8380417 + q_bound .req v21 // 31*GAMMA2 = 8118528 + gamma2_2x .req v22 // 2*GAMMA2 = 523776 + barrett_const .req v23 // Barrett constant = 1074791425 + + +.text +.global MLD_ASM_NAMESPACE(poly_decompose_32_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) + // Load constants into SIMD registers + movz w4, #57345 + movk w4, #127, lsl #16 + dup q.4s, w4 + + movz w5, #0xe100 + movk w5, #0x7b, lsl #16 + dup q_bound.4s, w5 + + movz w7, #0xfe00 + movk w7, #7, lsl #16 + dup gamma2_2x.4s, w7 + + movz w11, #0x0401 + movk w11, #0x4010, lsl #16 + dup barrett_const.4s, w11 + + mov count, #(64/4) + +poly_decompose_32_loop: + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + ldr q0, [a_ptr], #4*16 + + decompose32 v5, v1, v24 + decompose32 v6, v2, v24 + decompose32 v7, v3, v24 + decompose32 v4, v0, v24 + + str q5, [a1_ptr, #1*16] + str q6, [a1_ptr, #2*16] + str q7, [a1_ptr, #3*16] + str q4, [a1_ptr], #4*16 + str q1, [a0_ptr, #1*16] + str q2, [a0_ptr, #2*16] + str q3, [a0_ptr, #3*16] + str q0, [a0_ptr], #4*16 + + subs count, count, #1 + bne poly_decompose_32_loop + + ret + + .unreq a1_ptr + .unreq a0_ptr + .unreq a_ptr + .unreq count + .unreq q + .unreq q_bound + .unreq gamma2_2x + .unreq barrett_const +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/poly_decompose_88_asm.S b/dev/aarch64_clean/src/poly_decompose_88_asm.S new file mode 100644 index 000000000..a16390777 --- /dev/null +++ b/dev/aarch64_clean/src/poly_decompose_88_asm.S @@ -0,0 +1,103 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +// a aliased with a0 +.macro decompose88 a1, a, temp + // Compute a1 = round-(a / 190464) ≈ round(a * 1477838209 / + // 2^48), where round-() denotes "round half down". This is + // exact for 0 <= a < Q. Note that half is rounded down since + // 1477838209 / 2^48 ≲ 1 / 190464. + sqdmulh \a1\().4s, \a\().4s, barrett_const.4s + srshr \a1\().4s, \a1\().4s, #17 + + // If a1 = 44, i.e. 
a > 87*GAMMA2, proceed as if a' = a - Q was + // given instead. (For a = 87*GAMMA2 + 1 thus a' = -GAMMA2, we + // still round it to 0 like other "wrapped around" cases.) + + // Check for wrap-around + cmgt \temp\().4s, \a\().4s, q_bound.4s + + // Compute remainder a0 + mls \a\().4s, \a1\().4s, gamma2_2x.4s + + // If wrap-around is required, set a1 = 0 and a0 -= 1 + bic \a1\().16b, \a1\().16b, \temp\().16b + add \a\().4s, \a\().4s, \temp\().4s +.endm + + /* Parameters */ + a1_ptr .req x0 // Output polynomial with coefficients c1 + a0_ptr .req x1 // Output polynomial with coefficients c0 + a_ptr .req x2 // Input polynomial + + count .req x3 + + /* Constant register assignments */ + q .req v20 // Q = 8380417 + q_bound .req v21 // 87*GAMMA2 = 8285184 + gamma2_2x .req v22 // 2*GAMMA2 = 190464 + barrett_const .req v23 // Barrett constant = 1477838209 + +.text +.global MLD_ASM_NAMESPACE(poly_decompose_88_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_decompose_88_asm) + // Load constants into SIMD registers + movz w4, #57345 + movk w4, #127, lsl #16 + dup q.4s, w4 + + movz w5, #0x6c00 + movk w5, #0x7e, lsl #16 + dup q_bound.4s, w5 + + movz w7, #0xe800 + movk w7, #0x2, lsl #16 + dup gamma2_2x.4s, w7 + + movz w11, #0x0581 + movk w11, #0x5816, lsl #16 + dup barrett_const.4s, w11 + + mov count, #(64/4) +poly_decompose_88_loop: + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + ldr q0, [a_ptr], #4*16 + + decompose88 v5, v1, v24 + decompose88 v6, v2, v24 + decompose88 v7, v3, v24 + decompose88 v4, v0, v24 + + str q5, [a1_ptr, #1*16] + str q6, [a1_ptr, #2*16] + str q7, [a1_ptr, #3*16] + str q4, [a1_ptr], #4*16 + str q1, [a0_ptr, #1*16] + str q2, [a0_ptr, #2*16] + str q3, [a0_ptr, #3*16] + str q0, [a0_ptr], #4*16 + + subs count, count, #1 + bne poly_decompose_88_loop + + ret + + .unreq a1_ptr + .unreq a0_ptr + .unreq a_ptr + .unreq count + .unreq q + .unreq q_bound + .unreq gamma2_2x + .unreq barrett_const +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/poly_use_hint_32_asm.S b/dev/aarch64_clean/src/poly_use_hint_32_asm.S new file mode 100644 index 000000000..259338546 --- /dev/null +++ b/dev/aarch64_clean/src/poly_use_hint_32_asm.S @@ -0,0 +1,122 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +// a aliased with a0 +.macro decompose32 a1, a, temp + // Compute a1 = round-(a / 523776) ≈ round(a * 1074791425 / + // 2^49), where round-() denotes "round half down". This is + // exact for 0 <= a < Q. Note that half is rounded down since + // 1074791425 / 2^49 ≲ 1 / 523776. + sqdmulh \a1\().4s, \a\().4s, barrett_const.4s + srshr \a1\().4s, \a1\().4s, #18 + + // If a1 = 16, i.e. a > 31*GAMMA2, proceed as if a' = a - Q was + // given instead. (For a = 31*GAMMA2 + 1 thus a' = -GAMMA2, we + // still round it to 0 like other "wrapped around" cases.) + + // Check for wrap-around + cmgt \temp\().4s, \a\().4s, q_bound.4s + + // Compute remainder a0 + mls \a\().4s, \a1\().4s, gamma2_2x.4s + + // If wrap-around is required, set a1 = 0 and a0 -= 1 + bic \a1\().16b, \a1\().16b, \temp\().16b + add \a\().4s, \a\().4s, \temp\().4s +.endm + +// a aliased with delta +.macro use_hint32 b, a, h, temp + decompose32 \b, \a, \temp + + // delta = (a0 <= 0) ? 
-1 : 1 + cmle \a\().4s, \a\().4s, #0 + orr \a\().4s, #1 + + // b = (b + delta * h) % 16 + mla \b\().4s, \a\().4s, \h\().4s + and \b\().16b, \b\().16b, mask_15.16b +.endm + + /* Parameters */ + b_ptr .req x0 // Output polynomial + a_ptr .req x1 // Input polynomial + h_ptr .req x2 // Input hints + + count .req x3 + + /* Constant register assignments */ + q .req v20 // Q = 8380417 + q_bound .req v21 // 31*GAMMA2 = 8118528 + gamma2_2x .req v22 // 2*GAMMA2 = 523776 + barrett_const .req v23 // Barrett constant = 1074791425 + mask_15 .req v24 // mask = 15 + +.text +.global MLD_ASM_NAMESPACE(poly_use_hint_32_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_use_hint_32_asm) + // Load constants into SIMD registers + movz w4, #57345 + movk w4, #127, lsl #16 + dup q.4s, w4 + + movz w5, #0xe100 + movk w5, #0x7b, lsl #16 + dup q_bound.4s, w5 + + movz w7, #0xfe00 + movk w7, #7, lsl #16 + dup gamma2_2x.4s, w7 + + movz w11, #0x0401 + movk w11, #0x4010, lsl #16 + dup barrett_const.4s, w11 + + movi mask_15.4s, #15 + + mov count, #(64/4) + +poly_use_hint_32_loop: + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + ldr q0, [a_ptr], #4*16 + + ldr q5, [h_ptr, #1*16] + ldr q6, [h_ptr, #2*16] + ldr q7, [h_ptr, #3*16] + ldr q4, [h_ptr], #4*16 + + use_hint32 v17, v1, v5, v25 + use_hint32 v18, v2, v6, v25 + use_hint32 v19, v3, v7, v25 + use_hint32 v16, v0, v4, v25 + + str q17, [b_ptr, #1*16] + str q18, [b_ptr, #2*16] + str q19, [b_ptr, #3*16] + str q16, [b_ptr], #4*16 + + subs count, count, #1 + bne poly_use_hint_32_loop + + ret + + .unreq b_ptr + .unreq a_ptr + .unreq h_ptr + .unreq count + .unreq q + .unreq q_bound + .unreq gamma2_2x + .unreq barrett_const + .unreq mask_15 +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/poly_use_hint_88_asm.S b/dev/aarch64_clean/src/poly_use_hint_88_asm.S new file mode 100644 index 000000000..bbe39ee0d --- /dev/null +++ b/dev/aarch64_clean/src/poly_use_hint_88_asm.S @@ -0,0 +1,124 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +// a aliased with a0 +.macro decompose88 a1, a, temp + // Compute a1 = round-(a / 190464) ≈ round(a * 1477838209 / + // 2^48), where round-() denotes "round half down". This is + // exact for 0 <= a < Q. Note that half is rounded down since + // 1477838209 / 2^48 ≲ 1 / 190464. + sqdmulh \a1\().4s, \a\().4s, barrett_const.4s + srshr \a1\().4s, \a1\().4s, #17 + + // If a1 = 44, i.e. a > 87*GAMMA2, proceed as if a' = a - Q was + // given instead. (For a = 87*GAMMA2 + 1 thus a' = -GAMMA2, we + // still round it to 0 like other "wrapped around" cases.) + + // Check for wrap-around + cmgt \temp\().4s, \a\().4s, q_bound.4s + + // Compute remainder a0 + mls \a\().4s, \a1\().4s, gamma2_2x.4s + + // If wrap-around is required, set a1 = 0 and a0 -= 1 + bic \a1\().16b, \a1\().16b, \temp\().16b + add \a\().4s, \a\().4s, \temp\().4s +.endm + +// a aliased with delta +.macro use_hint88 b, a, h, temp + decompose88 \b, \a, \temp + + // delta = (a0 <= 0) ? 
-1 : 1 + cmle \a\().4s, \a\().4s, #0 + orr \a\().4s, #1 + + // b = (b + delta * h) % 44 + mla \b\().4s, \a\().4s, \h\().4s + cmgt \temp\().4s, \b\().4s, const_43.4s + bic \b\().16b, \b\().16b, \temp\().16b + umin \b\().4s, \b\().4s, const_43.4s +.endm + + /* Parameters */ + b_ptr .req x0 // Output polynomial + a_ptr .req x1 // Input polynomial + h_ptr .req x2 // Input hints + + count .req x3 + + /* Constant register assignments */ + q .req v20 // Q = 8380417 + q_bound .req v21 // 87*GAMMA2 = 8285184 + gamma2_2x .req v22 // 2*GAMMA2 = 190464 + barrett_const .req v23 // Barrett constant = 1477838209 + const_43 .req v24 // mask = 43 + +.text +.global MLD_ASM_NAMESPACE(poly_use_hint_88_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_use_hint_88_asm) + // Load constants into SIMD registers + movz w4, #57345 + movk w4, #127, lsl #16 + dup q.4s, w4 + + movz w5, #0x6c00 + movk w5, #0x7e, lsl #16 + dup q_bound.4s, w5 + + movz w7, #0xe800 + movk w7, #0x2, lsl #16 + dup gamma2_2x.4s, w7 + + movz w11, #0x0581 + movk w11, #0x5816, lsl #16 + dup barrett_const.4s, w11 + + movi const_43.4s, #43 + + mov count, #(64/4) + +poly_use_hint_88_loop: + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + ldr q0, [a_ptr], #4*16 + + ldr q5, [h_ptr, #1*16] + ldr q6, [h_ptr, #2*16] + ldr q7, [h_ptr, #3*16] + ldr q4, [h_ptr], #4*16 + + use_hint88 v17, v1, v5, v25 + use_hint88 v18, v2, v6, v25 + use_hint88 v19, v3, v7, v25 + use_hint88 v16, v0, v4, v25 + + str q17, [b_ptr, #1*16] + str q18, [b_ptr, #2*16] + str q19, [b_ptr, #3*16] + str q16, [b_ptr], #4*16 + + subs count, count, #1 + bne poly_use_hint_88_loop + + ret + + .unreq b_ptr + .unreq a_ptr + .unreq h_ptr + .unreq count + .unreq q + .unreq q_bound + .unreq gamma2_2x + .unreq barrett_const + .unreq const_43 +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/polyz_unpack_17_asm.S b/dev/aarch64_clean/src/polyz_unpack_17_asm.S new file mode 100644 index 000000000..1c28d965c --- /dev/null +++ b/dev/aarch64_clean/src/polyz_unpack_17_asm.S @@ -0,0 +1,104 @@ +/* + * Copyright (c) The mldsa-native project authors + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + #include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.macro trim_map_17 a + // Keep only 18 out of 24 bits in each 32-bit lane + // Lane 0 1 2 3 + // Bits 0..23 16..39 32..55 48..71 + ushl \a\().4s, \a\().4s, shifts.4s + // Bits 0..23 18..39 36..55 54..71 + and \a\().16b, \a\().16b, mask.16b + // Bits 0..17 18..35 36..53 54..71 + + // Map [0, 1, ..., 2^18-1] to [2^17, 2^17-1, ..., -2^17+1] + sub \a\().4s, gamma1.4s, \a\().4s +.endm + + /* Parameters */ + output .req x0 + buf .req x1 + indices .req x2 + + xtmp .req x3 + count .req x9 + + /* Constant register assignments */ + idx0 .req v24 + idx1 .req v25 + idx2 .req v26 + idx3 .req v27 + shifts .req v28 + mask .req v29 // 2^18 - 1 + gamma1 .req v30 // 2^17 + +.text +.global MLD_ASM_NAMESPACE(polyz_unpack_17_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyz_unpack_17_asm) + // Load indices + ldr q24, [indices] + ldr q25, [indices, #1*16] + ldr q26, [indices, #2*16] + ldr q27, [indices, #3*16] + + // Load per-lane shifts 0, -2, -4, -6. (Negative means right shift.) + // The shifts for the 4 32-bit lanes are sign-extended from the lowest + // 8 bits, so it suffices to set up only byte 0, 4, 8, 12. 
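Before the shift constants are set up below, it may help to keep the scalar computation in mind: for GAMMA1 = 2^17, every 9 bytes of input encode four consecutive 18-bit fields, and each field t becomes the coefficient GAMMA1 - t. The tbl lookups select the overlapping byte triples per lane, and ushl/and/sub then perform exactly this extraction and mapping. A scalar sketch for one 9-byte group follows (illustrative helper, not part of the patch):

    #include <stdint.h>

    #define GAMMA1_17 (1 << 17)

    /* Unpack one 9-byte group a[0..8] into four coefficients r[0..3]. */
    static void unpack4_gamma1_17(int32_t r[4], const uint8_t a[9])
    {
      uint32_t t;
      t = (uint32_t)a[0] | ((uint32_t)a[1] << 8) | ((uint32_t)a[2] << 16);
      r[0] = GAMMA1_17 - (int32_t)(t & 0x3FFFF);
      t = ((uint32_t)a[2] >> 2) | ((uint32_t)a[3] << 6) | ((uint32_t)a[4] << 14);
      r[1] = GAMMA1_17 - (int32_t)(t & 0x3FFFF);
      t = ((uint32_t)a[4] >> 4) | ((uint32_t)a[5] << 4) | ((uint32_t)a[6] << 12);
      r[2] = GAMMA1_17 - (int32_t)(t & 0x3FFFF);
      t = ((uint32_t)a[6] >> 6) | ((uint32_t)a[7] << 2) | ((uint32_t)a[8] << 10);
      r[3] = GAMMA1_17 - (int32_t)(t & 0x3FFFF);
    }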
+ movz xtmp, 0xfe, lsl 32 + mov shifts.d[0], xtmp + movz xtmp, 0xfc + movk xtmp, 0xfa, lsl 32 + mov shifts.d[1], xtmp + + movi mask.4s, 0x3, msl 16 + + movi gamma1.4s, 0x2, lsl 16 + + mov count, #(64/4) + +polyz_unpack_17_loop: + ldr q1, [buf, #16] + ldr q2, [buf, #32] + ldr q0, [buf], #36 + + tbl v4.16b, {v0.16b}, idx0.16b + tbl v5.16b, {v0.16b - v1.16b}, idx1.16b + tbl v6.16b, {v1.16b}, idx2.16b + tbl v7.16b, {v1.16b - v2.16b}, idx3.16b + + trim_map_17 v4 + trim_map_17 v5 + trim_map_17 v6 + trim_map_17 v7 + + str q5, [output, #1*16] + str q6, [output, #2*16] + str q7, [output, #3*16] + str q4, [output], #4*16 + + subs count, count, #1 + bne polyz_unpack_17_loop + + ret + + .unreq output + .unreq buf + .unreq indices + .unreq xtmp + .unreq count + .unreq idx0 + .unreq idx1 + .unreq idx2 + .unreq idx3 + .unreq shifts + .unreq mask + .unreq gamma1 +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/polyz_unpack_19_asm.S b/dev/aarch64_clean/src/polyz_unpack_19_asm.S new file mode 100644 index 000000000..a6eac6882 --- /dev/null +++ b/dev/aarch64_clean/src/polyz_unpack_19_asm.S @@ -0,0 +1,101 @@ +/* + * Copyright (c) The mldsa-native project authors + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + #include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.macro trim_map_19 a + // Keep only 20 out of 24 bits in each 32-bit lane + // Lane 0 1 2 3 + // Bits 0..23 16..39 40..63 56..79 + ushl \a\().4s, \a\().4s, shifts.4s + // Bits 0..23 20..39 40..63 60..79 + and \a\().16b, \a\().16b, mask.16b + // Bits 0..19 20..39 40..59 60..79 + + // Map [0, 1, ..., 2^20-1] to [2^19, 2^19-1, ..., -2^19+1] + sub \a\().4s, gamma1.4s, \a\().4s +.endm + + /* Parameters */ + output .req x0 + buf .req x1 + indices .req x2 + + xtmp .req x3 + count .req x9 + + /* Constant register assignments */ + idx0 .req v24 + idx1 .req v25 + idx2 .req v26 + idx3 .req v27 + shifts .req v28 + mask .req v29 // 2^20 - 1 + gamma1 .req v30 // 2^19 + +.text +.global MLD_ASM_NAMESPACE(polyz_unpack_19_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyz_unpack_19_asm) + // Load indices + ldr q24, [indices] + ldr q25, [indices, #1*16] + ldr q26, [indices, #2*16] + ldr q27, [indices, #3*16] + + // Load per-lane shifts 0, -4, 0, -4. (Negative means right shift.) + // The shifts for the 4 32-bit lanes are sign-extended from the lowest + // 8 bits, so it suffices to set up only byte 0, 4, 8, 12. 
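As in the 17-bit case, a scalar view of what the lanes compute: for GAMMA1 = 2^19, every 5 bytes encode two consecutive 20-bit fields t, each mapped to GAMMA1 - t, which is why the per-lane shifts alternate between 0 and -4. A scalar sketch for one 5-byte group (illustrative helper, not part of the patch):

    #include <stdint.h>

    #define GAMMA1_19 (1 << 19)

    /* Unpack one 5-byte group a[0..4] into two coefficients r[0..1]. */
    static void unpack2_gamma1_19(int32_t r[2], const uint8_t a[5])
    {
      uint32_t t;
      t = (uint32_t)a[0] | ((uint32_t)a[1] << 8) | ((uint32_t)a[2] << 16);
      r[0] = GAMMA1_19 - (int32_t)(t & 0xFFFFF);
      t = ((uint32_t)a[2] >> 4) | ((uint32_t)a[3] << 4) | ((uint32_t)a[4] << 12);
      r[1] = GAMMA1_19 - (int32_t)(t & 0xFFFFF);
    }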
+ movz xtmp, 0xfc, lsl 32 + dup shifts.2d, xtmp + + movi mask.4s, 0xf, msl 16 + + movi gamma1.4s, 0x8, lsl 16 + + mov count, #(64/4) + +polyz_unpack_19_loop: + ldr q1, [buf, #16] + ldr q2, [buf, #32] + ldr q0, [buf], #40 + + tbl v4.16b, {v0.16b}, idx0.16b + tbl v5.16b, {v0.16b - v1.16b}, idx1.16b + tbl v6.16b, {v1.16b}, idx2.16b + tbl v7.16b, {v1.16b - v2.16b}, idx3.16b + + trim_map_19 v4 + trim_map_19 v5 + trim_map_19 v6 + trim_map_19 v7 + + str q5, [output, #1*16] + str q6, [output, #2*16] + str q7, [output, #3*16] + str q4, [output], #4*16 + + subs count, count, #1 + bne polyz_unpack_19_loop + + ret + + .unreq output + .unreq buf + .unreq indices + .unreq xtmp + .unreq count + .unreq idx0 + .unreq idx1 + .unreq idx2 + .unreq idx3 + .unreq shifts + .unreq mask + .unreq gamma1 +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/polyz_unpack_table.c b/dev/aarch64_clean/src/polyz_unpack_table.c new file mode 100644 index 000000000..7ef915b58 --- /dev/null +++ b/dev/aarch64_clean/src/polyz_unpack_table.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include "arith_native_aarch64.h" + +/* Table of indices used for tbl instructions in polyz_unpack_{17,19}. */ + +MLD_ALIGN const uint8_t mld_polyz_unpack_17_indices[] = { + 0, 1, 2, 255, 2, 3, 4, 255, 4, 5, 6, 255, 6, 7, 8, 255, + 9, 10, 11, 255, 11, 12, 13, 255, 13, 14, 15, 255, 15, 16, 17, 255, + 2, 3, 4, 255, 4, 5, 6, 255, 6, 7, 8, 255, 8, 9, 10, 255, + 11, 12, 13, 255, 13, 14, 15, 255, 15, 16, 17, 255, 17, 18, 19, 255, +}; + +MLD_ALIGN const uint8_t mld_polyz_unpack_19_indices[] = { + 0, 1, 2, 255, 2, 3, 4, 255, 5, 6, 7, 255, 7, 8, 9, 255, + 10, 11, 12, 255, 12, 13, 14, 255, 15, 16, 17, 255, 17, 18, 19, 255, + 4, 5, 6, 255, 6, 7, 8, 255, 9, 10, 11, 255, 11, 12, 13, 255, + 14, 15, 16, 255, 16, 17, 18, 255, 19, 20, 21, 255, 21, 22, 23, 255, +}; + +#else /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(aarch64_polyz_unpack_table) + +#endif /* !(MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/aarch64_clean/src/rej_uniform_asm.S b/dev/aarch64_clean/src/rej_uniform_asm.S new file mode 100644 index 000000000..dadde25f3 --- /dev/null +++ b/dev/aarch64_clean/src/rej_uniform_asm.S @@ -0,0 +1,410 @@ +/* + * Copyright (c) The mldsa-native project authors + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + #include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +// We save the output on the stack first, and copy to the actual +// output buffer only in the end. This is because the main loop can overwrite +// by up to 60 bytes, which we account for here (we use 64 bytes for alignment). 
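The loop below is a vectorized form of the standard uniform rejection rule: each 3-byte group, with the top bit cleared, yields a 23-bit candidate that is accepted iff it is below Q = 8380417. The SIMD code additionally compacts accepted lanes via tbl and writes through the overallocated stack buffer described above. A scalar sketch of the rule itself, with illustrative names (a sketch, not the project's reference implementation verbatim):

    #include <stdint.h>

    #define MLDSA_Q 8380417

    /* Scalar sketch of uniform rejection sampling; returns the number of
     * accepted coefficients written to a[0..len-1]. */
    static unsigned rej_uniform_sketch(int32_t *a, unsigned len,
                                       const uint8_t *buf, unsigned buflen)
    {
      unsigned ctr = 0, pos = 0;
      while (ctr < len && pos + 3 <= buflen)
      {
        uint32_t t = (uint32_t)buf[pos] | ((uint32_t)buf[pos + 1] << 8) |
                     ((uint32_t)buf[pos + 2] << 16);
        t &= 0x7FFFFF; /* keep 23 bits */
        pos += 3;
        if (t < MLDSA_Q)
        {
          a[ctr++] = (int32_t)t;
        }
      }
      return ctr;
    }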
+#define STACK_SIZE (4*MLDSA_N + 64) + +.macro push_stack + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack + add sp, sp, #STACK_SIZE +.endm + + /* Parameters */ + output .req x0 + buf .req x1 + buflen .req x2 + table_idx .req x3 + + len .req x4 + + /* Temporary output on the stack */ + xtmp .req x7 + wtmp .req w7 + output_tmp .req x7 + output_tmp_base .req x8 + + /* Number of coefficients sampled so far */ + count .req x9 + + /* Temporary registers */ + initial_zero_count .req x11 + final_copy_count .req x11 + + rec_idx_0 .req x12 + rec_idx_1 .req x13 + rec_idx_2 .req x14 + rec_idx_3 .req x15 + + ctr0 .req x12 + ctr1 .req x13 + ctr2 .req x14 + ctr3 .req x15 + + ctr01 .req ctr0 + ctr23 .req ctr2 + + /* Vector registers */ + + buf0 .req v0 + buf1 .req v1 + buf2 .req v2 + + tmp0 .req v4 + tmp1 .req v5 + tmp2 .req v6 + tmp3 .req v7 + + sign0 .req v4 + sign1 .req v5 + sign2 .req v6 + sign3 .req v7 + + val0 .req v16 + val0q .req q16 + val1 .req v17 + val1q .req q17 + val2 .req v18 + val2q .req q18 + val3 .req v19 + val3q .req q19 + + t0 .req d20 + t1 .req d21 + t2 .req d22 + t3 .req d23 + + table0 .req v24 + table0q .req q24 + table1 .req v25 + table1q .req q25 + table2 .req v26 + table2q .req q26 + table3 .req v27 + table3q .req q27 + + mldsa_q .req v30 + bits .req v31 + + .text + .global MLD_ASM_NAMESPACE(rej_uniform_asm) + .balign 4 +MLD_ASM_FN_SYMBOL(rej_uniform_asm) + push_stack + + // Load 0x1, 0x2, 0x4, 0x8 + movz xtmp, 0x1 + movk xtmp, 0x2, lsl 32 + mov bits.d[0], xtmp + + movz xtmp, 0x4 + movk xtmp, 0x8, lsl 32 + mov bits.d[1], xtmp + + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup mldsa_q.4s, wtmp + + mov output_tmp_base, sp + mov output_tmp, output_tmp_base + + // The entire temporary stack buffer is copied to the output buffer + // at the end of this routine. To avoid leaking original stack contents + // in case not enough bytes have been sampled, zero the temporary buffer. + mov initial_zero_count, #0 + eor val0.16b, val0.16b, val0.16b +rej_uniform_initial_zero: + str val0q, [output_tmp], #64 + str val0q, [output_tmp, #-48] + str val0q, [output_tmp, #-32] + str val0q, [output_tmp, #-16] + add initial_zero_count, initial_zero_count, #16 + cmp initial_zero_count, #MLDSA_N + b.lt rej_uniform_initial_zero + + mov output_tmp, output_tmp_base + + mov count, #0 + mov len, #MLDSA_N + + cmp buflen, #48 + b.lo rej_uniform_loop48_end + +rej_uniform_loop48: + // Finish once we've generated sufficiently many coefficients + cmp count, len + b.hs rej_uniform_memory_copy + + // First, we unpack the byte stream into a stream of signed + // coefficients, interpreting each consecutive 3 bytes as one + // signed 24-bit coefficients, presented as 32-bit integers. + // The topmost bit is masked out making it a 23-bit coefficient. + // + // We handle 16 coefficients a time, and use ld3 for the required + // de-interleaving of the byte stream. + sub buflen, buflen, #48 + ld3 {buf0.16b, buf1.16b, buf2.16b}, [buf], #48 + + // Mask out top-most bit + movi tmp0.16b, #0x80 + bic buf2.16b, buf2.16b, tmp0.16b + + // Unpack 16 triples of bytes into 16 32-bit integers, + // represented as 4 vectors val0-val3. + zip1 tmp0.16b, buf0.16b, buf1.16b + zip2 tmp1.16b, buf0.16b, buf1.16b + uxtl tmp2.8h, buf2.8b + uxtl2 tmp3.8h, buf2.16b + + zip1 val0.8h, tmp0.8h, tmp2.8h + zip2 val1.8h, tmp0.8h, tmp2.8h + zip1 val2.8h, tmp1.8h, tmp3.8h + zip2 val3.8h, tmp1.8h, tmp3.8h + + // At this point, val0-val3 are the signed integers to do rejection + // sampling on. 
For each of them, do the following: + // - Check which coefficients are within range, and represent the set + // of lane-indices of those coefficients as an 4-bit bitmap. + // - Move the respective lanes to the front of the vector. This is the + // most complex part, and is done by interpreting the 4-bit bitmap as + // an index into a lookup table giving the lane-table to be use for + // the `tbl` instruction. + // - Write the vector to the output buffer, but merely increase the output + // buffer pointer by the number of valid coefficients. + + + // Set valid lanes to -1 (0b1...1) + cmhi sign0.4s, mldsa_q.4s, val0.4s + cmhi sign1.4s, mldsa_q.4s, val1.4s + cmhi sign2.4s, mldsa_q.4s, val2.4s + cmhi sign3.4s, mldsa_q.4s, val3.4s + + // If lane i is valid and has value -1, retain only i-th bit + and sign0.16b, sign0.16b, bits.16b + and sign1.16b, sign1.16b, bits.16b + and sign2.16b, sign2.16b, bits.16b + and sign3.16b, sign3.16b, bits.16b + + // Get 4-bit bitmap of valid lane indices by adding lanes + uaddlv t0, sign0.4s + uaddlv t1, sign1.4s + uaddlv t2, sign2.4s + uaddlv t3, sign3.4s + + fmov rec_idx_0, t0 + fmov rec_idx_1, t1 + fmov rec_idx_2, t2 + fmov rec_idx_3, t3 + + ldr table0q, [table_idx, rec_idx_0, lsl #4] + ldr table1q, [table_idx, rec_idx_1, lsl #4] + ldr table2q, [table_idx, rec_idx_2, lsl #4] + ldr table3q, [table_idx, rec_idx_3, lsl #4] + + // Compute number of valid coefficients. Recall that at this + // point, lane i has value 2^i (hence popcount 1) if its coefficient + // is valid, and 0 otherwise. + cnt sign0.16b, sign0.16b + cnt sign1.16b, sign1.16b + cnt sign2.16b, sign2.16b + cnt sign3.16b, sign3.16b + + // Extract number of valid coefficients + uaddlv t0, sign0.4s + uaddlv t1, sign1.4s + uaddlv t2, sign2.4s + uaddlv t3, sign3.4s + + fmov ctr0, t0 + fmov ctr1, t1 + fmov ctr2, t2 + fmov ctr3, t3 + + // Move valid coefficients to the front + tbl val0.16b, {val0.16b}, table0.16b + tbl val1.16b, {val1.16b}, table1.16b + tbl val2.16b, {val2.16b}, table2.16b + tbl val3.16b, {val3.16b}, table3.16b + + str val0q, [output_tmp] + add output_tmp, output_tmp, ctr0, lsl #2 + + str val1q, [output_tmp] + add output_tmp, output_tmp, ctr1, lsl #2 + + str val2q, [output_tmp] + add output_tmp, output_tmp, ctr2, lsl #2 + + str val3q, [output_tmp] + add output_tmp, output_tmp, ctr3, lsl #2 + + add ctr01, ctr0, ctr1 + add ctr23, ctr2, ctr3 + add count, count, ctr01 + add count, count, ctr23 + + cmp buflen, #48 + b.hs rej_uniform_loop48 +rej_uniform_loop48_end: + + // Finish once we've generated sufficiently many coefficients + cmp count, len + b.hs rej_uniform_memory_copy + + cmp buflen, #24 + b.lo rej_uniform_memory_copy + + sub buflen, buflen, #24 + ld3 {buf0.8b, buf1.8b, buf2.8b}, [buf], #24 + + // mask out top-most bit + movi tmp0.16b, #0x80 + bic buf2.16b, buf2.16b, tmp0.16b + + zip1 tmp0.16b, buf0.16b, buf1.16b + uxtl tmp2.8h, buf2.8b + + zip1 val0.8h, tmp0.8h, tmp2.8h + zip2 val1.8h, tmp0.8h, tmp2.8h + + cmhi sign0.4s, mldsa_q.4s, val0.4s + cmhi sign1.4s, mldsa_q.4s, val1.4s + + and sign0.16b, sign0.16b, bits.16b + and sign1.16b, sign1.16b, bits.16b + + uaddlv t0, sign0.4s + uaddlv t1, sign1.4s + + fmov rec_idx_0, t0 + fmov rec_idx_1, t1 + + ldr table0q, [table_idx, rec_idx_0, lsl #4] + ldr table1q, [table_idx, rec_idx_1, lsl #4] + + cnt sign0.16b, sign0.16b + cnt sign1.16b, sign1.16b + + uaddlv t0, sign0.4s + uaddlv t1, sign1.4s + + fmov ctr0, t0 + fmov ctr1, t1 + + tbl val0.16b, {val0.16b}, table0.16b + tbl val1.16b, {val1.16b}, table1.16b + + str val0q, [output_tmp] + add 
output_tmp, output_tmp, ctr0, lsl #2 + + str val1q, [output_tmp] + add output_tmp, output_tmp, ctr1, lsl #2 + + add count, count, ctr0 + add count, count, ctr1 + +rej_uniform_memory_copy: + // min = min(count,len) + cmp count, len + csel count, count, len, lo + + // Always copy MLDSA_N coefficients from the stack to the destination, + // even if not all of them may be valid. This simplifies the loop and + // allows us to stick to vectorized code. + mov final_copy_count, #0 + mov output_tmp, output_tmp_base +rej_uniform_final_copy: + ldr val0q, [output_tmp], #64 + ldr val1q, [output_tmp, #-48] + ldr val2q, [output_tmp, #-32] + ldr val3q, [output_tmp, #-16] + str val0q, [output], #64 + str val1q, [output, #-48] + str val2q, [output, #-32] + str val3q, [output, #-16] + add final_copy_count, final_copy_count, #16 + cmp final_copy_count, #MLDSA_N + b.lt rej_uniform_final_copy + + mov x0, count + b rej_uniform_return + + +rej_uniform_return: + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq output + .unreq buf + .unreq buflen + .unreq table_idx + .unreq len + .unreq output_tmp + .unreq output_tmp_base + .unreq count + .unreq xtmp + .unreq wtmp + .unreq final_copy_count + .unreq initial_zero_count + .unreq rec_idx_0 + .unreq rec_idx_1 + .unreq rec_idx_2 + .unreq rec_idx_3 + .unreq ctr0 + .unreq ctr1 + .unreq ctr2 + .unreq ctr3 + .unreq ctr01 + .unreq ctr23 + .unreq buf0 + .unreq buf1 + .unreq buf2 + .unreq tmp0 + .unreq tmp1 + .unreq tmp2 + .unreq tmp3 + .unreq sign0 + .unreq sign1 + .unreq sign2 + .unreq sign3 + .unreq val0 + .unreq val0q + .unreq val1 + .unreq val1q + .unreq val2 + .unreq val2q + .unreq val3 + .unreq val3q + .unreq t0 + .unreq t1 + .unreq t2 + .unreq t3 + .unreq table0 + .unreq table0q + .unreq table1 + .unreq table1q + .unreq table2 + .unreq table2q + .unreq table3 + .unreq table3q + .unreq mldsa_q + .unreq bits + +/* TODO: autogenerate this in autogen */ +#undef STACK_SIZE +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/rej_uniform_eta2_asm.S b/dev/aarch64_clean/src/rej_uniform_eta2_asm.S new file mode 100644 index 000000000..594d95810 --- /dev/null +++ b/dev/aarch64_clean/src/rej_uniform_eta2_asm.S @@ -0,0 +1,333 @@ +/* + * Copyright (c) The mldsa-native project authors + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +// We save the output on the stack first, and copy to the actual +// output buffer only in the end. This is because the main loop can overwrite +// by up to 60 bytes, which we account for here (we use 64 bytes for alignment). 
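As orientation for the eta = 2 sampler: each nibble t is accepted iff t < 15 and then mapped to 2 - (t mod 5). In the assembly, the loop below performs only the t < 15 test and the lane compaction on 16-bit values; the mod-5 Barrett step (done there with sqdmulh against round(2^15/5) = 6554) and the final 2 - t mapping are deferred to the copy-out loop. A scalar sketch of the complete rule, with illustrative names:

    #include <stdint.h>

    /* Scalar sketch of eta = 2 rejection sampling on nibbles. */
    static unsigned rej_eta2_sketch(int32_t *a, unsigned len,
                                    const uint8_t *buf, unsigned buflen)
    {
      unsigned ctr = 0, pos = 0;
      while (ctr < len && pos < buflen)
      {
        uint32_t t0 = buf[pos] & 0x0F;
        uint32_t t1 = buf[pos] >> 4;
        pos++;
        if (t0 < 15)
        {
          t0 = t0 - (205 * t0 >> 10) * 5; /* t0 mod 5 via Barrett */
          a[ctr++] = 2 - (int32_t)t0;
        }
        if (t1 < 15 && ctr < len)
        {
          t1 = t1 - (205 * t1 >> 10) * 5; /* t1 mod 5 via Barrett */
          a[ctr++] = 2 - (int32_t)t1;
        }
      }
      return ctr;
    }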
+#define STACK_SIZE (2*MLDSA_N + 64) + +.macro push_stack + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack + add sp, sp, #STACK_SIZE +.endm + + /* Parameters */ + output .req x0 + buf .req x1 + buflen .req x2 + table_idx .req x3 + + len .req x4 + + /* Temporary output on the stack */ + xtmp .req x7 + wtmp .req w7 + output_tmp .req x7 + output_tmp_base .req x8 + + /* Number of coefficients sampled so far */ + count .req x9 + buf_consumed .req x10 + + /* Temporary registers */ + tmp .req w11 + initial_zero_count .req x11 + final_copy_count .req x11 + + rec_idx_0 .req x12 + rec_idx_1 .req x13 + + rec_idx_0_w .req w12 + rec_idx_1_w .req w13 + + ctr0 .req x12 + ctr1 .req x13 + + ctr0_w .req w12 + ctr1_w .req w13 + + ctr01 .req ctr0 + + /* Vector registers */ + buf0 .req v0 + + tmp0 .req v26 + tmp1 .req v27 + tmp2 .req v28 + tmp3 .req v29 + + sign0 .req v4 + sign1 .req v5 + const2 .req v7 + + // Barrett reduction constants + barrett_const .req v26 + modulus5 .req v27 + barrett_tmp .req v28 + + val0 .req v16 + val0q .req q16 + val1 .req v17 + val1q .req q17 + val2 .req v18 + val2q .req q18 + val3 .req v19 + val3q .req q19 + + t0 .req s20 + t1 .req s21 + + table0 .req v24 + table0q .req q24 + table1 .req v25 + table1q .req q25 + + eta_bound .req v30 + bits .req v31 + + .text + .global MLD_ASM_NAMESPACE(rej_uniform_eta2_asm) + .balign 4 +MLD_ASM_FN_SYMBOL(rej_uniform_eta2_asm) + push_stack + + // Load 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + movz xtmp, 0x1 + movk xtmp, 0x2, lsl 16 + movk xtmp, 0x4, lsl 32 + movk xtmp, 0x8, lsl 48 + mov bits.d[0], xtmp + + movz xtmp, 0x10 + movk xtmp, 0x20, lsl 16 + movk xtmp, 0x40, lsl 32 + movk xtmp, 0x80, lsl 48 + mov bits.d[1], xtmp + + // Load eta2 bound = 15 + movi eta_bound.8h, #15 + + mov output_tmp_base, sp + mov output_tmp, output_tmp_base + + // The entire temporary stack buffer is copied to the output buffer + // at the end of this routine. To avoid leaking original stack contents + // in case not enough bytes have been sampled, zero the temporary buffer. + // The temporary buffer holds 16-bit values that are expanded to 32-bit + // on copy out + mov initial_zero_count, #0 + eor val0.16b, val0.16b, val0.16b +rej_uniform_eta2_initial_zero: + str val0q, [output_tmp], #64 + str val0q, [output_tmp, #-48] + str val0q, [output_tmp, #-32] + str val0q, [output_tmp, #-16] + add initial_zero_count, initial_zero_count, #32 + cmp initial_zero_count, #MLDSA_N + b.lt rej_uniform_eta2_initial_zero + + mov output_tmp, output_tmp_base + + mov count, #0 + mov len, #MLDSA_N + +rej_uniform_eta2_loop8: + // Finish once we've generated sufficiently many coefficients + cmp count, len + b.hs rej_uniform_eta2_memory_copy + + // Load 8 bytes and extract nibbles to get 16 4-bit values + sub buflen, buflen, #8 + ld1 {buf0.8b}, [buf], #8 + + // Extract nibbles + movi tmp0.8b, #0x0F + and tmp1.8b, buf0.8b, tmp0.8b // Low nibbles [L0, L1, L2, L3, L4, L5, L6, L7] + ushr tmp2.8b, buf0.8b, #4 // High nibbles [H0, H1, H2, H3, H4, H5, H6, H7] + + // Interleave low and high nibbles: L0,H0,L1,H1,L2,H2,L3,H3,... + zip1 tmp0.8b, tmp1.8b, tmp2.8b // First 8 nibbles interleaved [L0,H0,L1,H1,L2,H2,L3,H3] + zip2 tmp3.8b, tmp1.8b, tmp2.8b // Next 8 nibbles interleaved [L4,H4,L5,H5,L6,H6,L7,H7] + + // Convert to 16-bit values + uxtl val0.8h, tmp0.8b + uxtl val1.8h, tmp3.8b + + // At this point, val0-val1 are the signed integers to do rejection + // sampling on. 
For each of them, do the following: + // - Check which coefficients are within range, and represent the set + // of lane-indices of those coefficients as an 8-bit bitmap. + // - Move the respective lanes to the front of the vector. This is the + // most complex part, and is done by interpreting the 8-bit bitmap as + // an index into a lookup table giving the lane-table to be use for + // the `tbl` instruction. + // - Write the vector to the output buffer, but merely increase the output + // buffer pointer by the number of valid coefficients. + + // Check which coefficients are within range (< 15) + cmhi sign0.8h, eta_bound.8h, val0.8h + cmhi sign1.8h, eta_bound.8h, val1.8h + + // If lane i is valid and has value -1, retain only i-th bit + and sign0.16b, sign0.16b, bits.16b + and sign1.16b, sign1.16b, bits.16b + + // Get 8-bit bitmap of valid lane indices by adding lanes + uaddlv t0, sign0.8h + uaddlv t1, sign1.8h + + fmov rec_idx_0_w, t0 + fmov rec_idx_1_w, t1 + + ldr table0q, [table_idx, rec_idx_0, lsl #4] + ldr table1q, [table_idx, rec_idx_1, lsl #4] + + // Compute number of valid coefficients. Recall that at this + // point, lane i has value 2^i (hence popcount 1) if its coefficient + // is valid, and 0 otherwise. + cnt sign0.16b, sign0.16b + cnt sign1.16b, sign1.16b + + // Extract number of valid coefficients + uaddlv t0, sign0.8h + uaddlv t1, sign1.8h + + fmov ctr0_w, t0 + fmov ctr1_w, t1 + + // Move valid coefficients to the front + tbl val0.16b, {val0.16b}, table0.16b + tbl val1.16b, {val1.16b}, table1.16b + + + // We store 16-bit coefficients here. They will be expanded to 32-bit + // on copy out + str val0q, [output_tmp] + add output_tmp, output_tmp, ctr0, lsl #1 + + str val1q, [output_tmp] + add output_tmp, output_tmp, ctr1, lsl #1 + + add ctr01, ctr0, ctr1 + add count, count, ctr01 + + cmp buflen, #8 + b.hs rej_uniform_eta2_loop8 + +rej_uniform_eta2_memory_copy: + // min = min(count,len) + cmp count, len + csel count, count, len, lo + + // Initialize constant vectors for Barrett reduction + movz wtmp, #6554 // round((2**15)/5) + dup barrett_const.8h, wtmp + movi modulus5.8h, #5 + movi const2.8h, #2 + + // Always copy MLDSA_N coefficients from the stack to the destination + mov final_copy_count, #0 + mov output_tmp, output_tmp_base +rej_uniform_eta2_final_copy: + ldr val0q, [output_tmp], #32 + ldr val2q, [output_tmp, #-16] + + // Reference: + // Barrett reduction: t0 = t0 - (205 * t0 >> 10) * 5; + + // To make efficient use of sqdmulh, we use the equivalent + // t0 = t0 - (13108 * t0 >> 16) * 5; + + sqdmulh barrett_tmp.8h, val0.8h, barrett_const.8h + mls val0.8h, barrett_tmp.8h, modulus5.8h + + sqdmulh barrett_tmp.8h, val2.8h, barrett_const.8h + mls val2.8h, barrett_tmp.8h, modulus5.8h + + sub val0.8h, const2.8h, val0.8h + sub val2.8h, const2.8h, val2.8h + + // Expand from 16-bit to 32-bit + sxtl2 val1.4s, val0.8h + sxtl val0.4s, val0.4h + + sxtl2 val3.4s, val2.8h + sxtl val2.4s, val2.4h + + str val0q, [output], #64 + str val1q, [output, #-48] + str val2q, [output, #-32] + str val3q, [output, #-16] + add final_copy_count, final_copy_count, #16 + cmp final_copy_count, #MLDSA_N + b.lt rej_uniform_eta2_final_copy + + mov x0, count + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq output + .unreq buf + .unreq buflen + .unreq table_idx + .unreq len + .unreq output_tmp + .unreq output_tmp_base + .unreq count + .unreq buf_consumed + .unreq tmp + .unreq xtmp + .unreq final_copy_count + .unreq initial_zero_count + .unreq rec_idx_0 + .unreq 
rec_idx_1 + .unreq rec_idx_0_w + .unreq rec_idx_1_w + .unreq ctr0 + .unreq ctr1 + .unreq ctr0_w + .unreq ctr1_w + .unreq ctr01 + .unreq buf0 + .unreq tmp0 + .unreq tmp1 + .unreq tmp2 + .unreq tmp3 + .unreq sign0 + .unreq sign1 + .unreq val0 + .unreq val0q + .unreq val1 + .unreq val1q + .unreq val2 + .unreq val2q + .unreq val3 + .unreq val3q + .unreq t0 + .unreq t1 + .unreq table0 + .unreq table0q + .unreq table1 + .unreq table1q + .unreq eta_bound + .unreq bits + .unreq const2 + .unreq barrett_const + .unreq modulus5 + .unreq barrett_tmp + +#undef STACK_SIZE +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/rej_uniform_eta4_asm.S b/dev/aarch64_clean/src/rej_uniform_eta4_asm.S new file mode 100644 index 000000000..42e503d01 --- /dev/null +++ b/dev/aarch64_clean/src/rej_uniform_eta4_asm.S @@ -0,0 +1,309 @@ +/* + * Copyright (c) The mldsa-native project authors + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +// We save the output on the stack first, and copy to the actual +// output buffer only in the end. This is because the main loop can overwrite +// by up to 60 bytes, which we account for here (we use 64 bytes for alignment). +#define STACK_SIZE (2*MLDSA_N + 64) + +.macro push_stack + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack + add sp, sp, #STACK_SIZE +.endm + + /* Parameters */ + output .req x0 + buf .req x1 + buflen .req x2 + table_idx .req x3 + + len .req x4 + + /* Temporary output on the stack */ + xtmp .req x7 + wtmp .req w7 + output_tmp .req x7 + output_tmp_base .req x8 + + /* Number of coefficients sampled so far */ + count .req x9 + buf_consumed .req x10 + + /* Temporary registers */ + tmp .req w11 + initial_zero_count .req x11 + final_copy_count .req x11 + + rec_idx_0 .req x12 + rec_idx_1 .req x13 + + rec_idx_0_w .req w12 + rec_idx_1_w .req w13 + + ctr0 .req x12 + ctr1 .req x13 + + ctr0_w .req w12 + ctr1_w .req w13 + + ctr01 .req ctr0 + + /* Vector registers */ + buf0 .req v0 + + tmp0 .req v26 + tmp1 .req v27 + tmp2 .req v28 + tmp3 .req v29 + + sign0 .req v4 + sign1 .req v5 + const4 .req v7 + + + val0 .req v16 + val0q .req q16 + val1 .req v17 + val1q .req q17 + val2 .req v18 + val2q .req q18 + val3 .req v19 + val3q .req q19 + + t0 .req s20 + t1 .req s21 + + table0 .req v24 + table0q .req q24 + table1 .req v25 + table1q .req q25 + + eta_bound .req v30 + bits .req v31 + + .text + .global MLD_ASM_NAMESPACE(rej_uniform_eta4_asm) + .balign 4 +MLD_ASM_FN_SYMBOL(rej_uniform_eta4_asm) + push_stack + + // Load 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + movz xtmp, 0x1 + movk xtmp, 0x2, lsl 16 + movk xtmp, 0x4, lsl 32 + movk xtmp, 0x8, lsl 48 + mov bits.d[0], xtmp + + movz xtmp, 0x10 + movk xtmp, 0x20, lsl 16 + movk xtmp, 0x40, lsl 32 + movk xtmp, 0x80, lsl 48 + mov bits.d[1], xtmp + + // Load eta4 bound = 9 + movi eta_bound.8h, #9 + movi const4.8h, #4 + + mov output_tmp_base, sp + mov output_tmp, output_tmp_base + + // The entire temporary stack buffer is copied to the output buffer + // at the end of this routine. To avoid leaking original stack contents + // in case not enough bytes have been sampled, zero the temporary buffer. 
+ // The temporary buffer holds 16-bit values that are expanded to 32-bit + // on copy out + mov initial_zero_count, #0 + eor val0.16b, val0.16b, val0.16b +rej_uniform_eta4_initial_zero: + str val0q, [output_tmp], #64 + str val0q, [output_tmp, #-48] + str val0q, [output_tmp, #-32] + str val0q, [output_tmp, #-16] + add initial_zero_count, initial_zero_count, #32 + cmp initial_zero_count, #MLDSA_N + b.lt rej_uniform_eta4_initial_zero + + mov output_tmp, output_tmp_base + + mov count, #0 + mov len, #MLDSA_N + +rej_uniform_eta4_loop8: + // Finish once we've generated sufficiently many coefficients + cmp count, len + b.hs rej_uniform_eta4_memory_copy + + // Load 8 bytes and extract nibbles to get 16 4-bit values + sub buflen, buflen, #8 + ld1 {buf0.8b}, [buf], #8 + + // Extract nibbles + movi tmp0.8b, #0x0F + and tmp1.8b, buf0.8b, tmp0.8b // Low nibbles [L0, L1, L2, L3, L4, L5, L6, L7] + ushr tmp2.8b, buf0.8b, #4 // High nibbles [H0, H1, H2, H3, H4, H5, H6, H7] + + // Interleave low and high nibbles: L0,H0,L1,H1,L2,H2,L3,H3,... + zip1 tmp0.8b, tmp1.8b, tmp2.8b // First 8 nibbles interleaved [L0,H0,L1,H1,L2,H2,L3,H3] + zip2 tmp3.8b, tmp1.8b, tmp2.8b // Next 8 nibbles interleaved [L4,H4,L5,H5,L6,H6,L7,H7] + + // Convert to 16-bit values + uxtl val0.8h, tmp0.8b + uxtl val1.8h, tmp3.8b + + // At this point, val0-val1 are the signed integers to do rejection + // sampling on. For each of them, do the following: + // - Check which coefficients are within range, and represent the set + // of lane-indices of those coefficients as an 8-bit bitmap. + // - Move the respective lanes to the front of the vector. This is the + // most complex part, and is done by interpreting the 8-bit bitmap as + // an index into a lookup table giving the lane-table to be use for + // the `tbl` instruction. + // - Write the vector to the output buffer, but merely increase the output + // buffer pointer by the number of valid coefficients. + + // Check which coefficients are within range (< 9) + cmhi sign0.8h, eta_bound.8h, val0.8h + cmhi sign1.8h, eta_bound.8h, val1.8h + + // If lane i is valid and has value -1, retain only i-th bit + and sign0.16b, sign0.16b, bits.16b + and sign1.16b, sign1.16b, bits.16b + + // Get 8-bit bitmap of valid lane indices by adding lanes + uaddlv t0, sign0.8h + uaddlv t1, sign1.8h + + fmov rec_idx_0_w, t0 + fmov rec_idx_1_w, t1 + + ldr table0q, [table_idx, rec_idx_0, lsl #4] + ldr table1q, [table_idx, rec_idx_1, lsl #4] + + // Compute number of valid coefficients. Recall that at this + // point, lane i has value 2^i (hence popcount 1) if its coefficient + // is valid, and 0 otherwise. + cnt sign0.16b, sign0.16b + cnt sign1.16b, sign1.16b + + // Extract number of valid coefficients + uaddlv t0, sign0.8h + uaddlv t1, sign1.8h + + fmov ctr0_w, t0 + fmov ctr1_w, t1 + + // Move valid coefficients to the front + tbl val0.16b, {val0.16b}, table0.16b + tbl val1.16b, {val1.16b}, table1.16b + + + // We store 16-bit coefficients here. 
They will be expanded to 32-bit + // on copy out + str val0q, [output_tmp] + add output_tmp, output_tmp, ctr0, lsl #1 + + str val1q, [output_tmp] + add output_tmp, output_tmp, ctr1, lsl #1 + + add ctr01, ctr0, ctr1 + add count, count, ctr01 + + cmp buflen, #8 + b.hs rej_uniform_eta4_loop8 + +rej_uniform_eta4_memory_copy: + // min = min(count,len) + cmp count, len + csel count, count, len, lo + + // Always copy MLDSA_N coefficients from the stack to the destination + mov final_copy_count, #0 + mov output_tmp, output_tmp_base +rej_uniform_eta4_final_copy: + ldr val0q, [output_tmp], #32 + ldr val2q, [output_tmp, #-16] + + // Apply eta4 transformation: 4 - nibble + sub val0.8h, const4.8h, val0.8h + sub val2.8h, const4.8h, val2.8h + + // Expand from 16-bit to 32-bit + sxtl2 val1.4s, val0.8h + sxtl val0.4s, val0.4h + + sxtl2 val3.4s, val2.8h + sxtl val2.4s, val2.4h + + str val0q, [output], #64 + str val1q, [output, #-48] + str val2q, [output, #-32] + str val3q, [output, #-16] + add final_copy_count, final_copy_count, #16 + cmp final_copy_count, #MLDSA_N + b.lt rej_uniform_eta4_final_copy + + mov x0, count + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq output + .unreq buf + .unreq buflen + .unreq table_idx + .unreq len + .unreq output_tmp + .unreq output_tmp_base + .unreq count + .unreq buf_consumed + .unreq tmp + .unreq xtmp + .unreq final_copy_count + .unreq initial_zero_count + .unreq rec_idx_0 + .unreq rec_idx_1 + .unreq rec_idx_0_w + .unreq rec_idx_1_w + .unreq ctr0 + .unreq ctr1 + .unreq ctr0_w + .unreq ctr1_w + .unreq ctr01 + .unreq buf0 + .unreq tmp0 + .unreq tmp1 + .unreq tmp2 + .unreq tmp3 + .unreq sign0 + .unreq sign1 + .unreq val0 + .unreq val0q + .unreq val1 + .unreq val1q + .unreq val2 + .unreq val2q + .unreq val3 + .unreq val3q + .unreq t0 + .unreq t1 + .unreq table0 + .unreq table0q + .unreq table1 + .unreq table1q + .unreq eta_bound + .unreq bits + +#undef STACK_SIZE +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/aarch64_clean/src/rej_uniform_eta_table.c b/dev/aarch64_clean/src/rej_uniform_eta_table.c new file mode 100644 index 000000000..cf21509cb --- /dev/null +++ b/dev/aarch64_clean/src/rej_uniform_eta_table.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include "arith_native_aarch64.h" + +/* + * Lookup table used by 16-bit rejection sampling (rej_eta). + * Adapted from ML-KEM for ML-DSA eta rejection sampling. + * See autogen for details. 
+ */ +MLD_ALIGN const uint8_t mld_rej_uniform_eta_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 0 */, + 0, 1, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 1 */, + 2, 3, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 2 */, + 0, 1, 2, 3, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 3 */, + 4, 5, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 4 */, + 0, 1, 4, 5, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 5 */, + 2, 3, 4, 5, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 6 */, + 0, 1, 2, 3, 4, 5, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 7 */, + 6, 7, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 8 */, + 0, 1, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 9 */, + 2, 3, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 10 */, + 0, 1, 2, 3, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 11 */, + 4, 5, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 12 */, + 0, 1, 4, 5, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 13 */, + 2, 3, 4, 5, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 14 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 255, 255, 255, 255, 255, 255, 255, 255 /* 15 */, + 8, 9, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 16 */, + 0, 1, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 17 */, + 2, 3, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 18 */, + 0, 1, 2, 3, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 19 */, + 4, 5, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 20 */, + 0, 1, 4, 5, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 21 */, + 2, 3, 4, 5, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 22 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 23 */, + 6, 7, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 24 */, + 0, 1, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 25 */, + 2, 3, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 26 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 27 */, + 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 28 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 29 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 30 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 255, 255, 255, 255, 255, 255 /* 31 */, + 10, 11, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 32 */, + 0, 1, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 33 */, + 2, 3, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 34 */, + 0, 1, 2, 3, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 35 */, + 4, 5, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 36 */, + 0, 1, 4, 5, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 37 */, + 2, 3, 4, 5, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 38 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 39 */, + 6, 7, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 40 */, + 0, 1, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 41 
*/, + 2, 3, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 42 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 43 */, + 4, 5, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 44 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 45 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 46 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 255, 255, 255, 255, 255, 255 /* 47 */, + 8, 9, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 48 */, + 0, 1, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 49 */, + 2, 3, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 50 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 51 */, + 4, 5, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 52 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 53 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 54 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 55 */, + 6, 7, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 56 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 57 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 58 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 59 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 60 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 61 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 62 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 255, 255, 255, 255 /* 63 */, + 12, 13, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 64 */, + 0, 1, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 65 */, + 2, 3, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 66 */, + 0, 1, 2, 3, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 67 */, + 4, 5, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 68 */, + 0, 1, 4, 5, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 69 */, + 2, 3, 4, 5, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 70 */, + 0, 1, 2, 3, 4, 5, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 71 */, + 6, 7, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 72 */, + 0, 1, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 73 */, + 2, 3, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 74 */, + 0, 1, 2, 3, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 75 */, + 4, 5, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 76 */, + 0, 1, 4, 5, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 77 */, + 2, 3, 4, 5, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 78 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 255, 255, 255, 255, 255, 255 /* 79 */, + 8, 9, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 80 */, + 0, 1, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 81 */, + 2, 3, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 82 */, + 0, 1, 2, 3, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 83 */, + 4, 5, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 84 */, + 0, 1, 4, 5, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 
255 /* 85 */, + 2, 3, 4, 5, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 86 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 87 */, + 6, 7, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 88 */, + 0, 1, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 89 */, + 2, 3, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 90 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 91 */, + 4, 5, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 92 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 93 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 94 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 12, 13, 255, 255, 255, 255 /* 95 */, + 10, 11, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 96 */, + 0, 1, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 97 */, + 2, 3, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 98 */, + 0, 1, 2, 3, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 99 */, + 4, 5, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 100 */, + 0, 1, 4, 5, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 101 */, + 2, 3, 4, 5, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 102 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 103 */, + 6, 7, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 104 */, + 0, 1, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 105 */, + 2, 3, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 106 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 107 */, + 4, 5, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 108 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 109 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 110 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 12, 13, 255, 255, 255, 255 /* 111 */, + 8, 9, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 112 */, + 0, 1, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 113 */, + 2, 3, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 114 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 115 */, + 4, 5, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 116 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 117 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 118 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 119 */, + 6, 7, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 120 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 121 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 122 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 123 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 124 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 125 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 126 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 255, 255 /* 127 */, + 14, 15, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 128 */, + 0, 1, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 129 */, + 2, 3, 14, 15, 255, 255, 255, 
255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 130 */, + 0, 1, 2, 3, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 131 */, + 4, 5, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 132 */, + 0, 1, 4, 5, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 133 */, + 2, 3, 4, 5, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 134 */, + 0, 1, 2, 3, 4, 5, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 135 */, + 6, 7, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 136 */, + 0, 1, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 137 */, + 2, 3, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 138 */, + 0, 1, 2, 3, 6, 7, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 139 */, + 4, 5, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 140 */, + 0, 1, 4, 5, 6, 7, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 141 */, + 2, 3, 4, 5, 6, 7, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 142 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 14, 15, 255, 255, 255, 255, 255, 255 /* 143 */, + 8, 9, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 144 */, + 0, 1, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 145 */, + 2, 3, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 146 */, + 0, 1, 2, 3, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 147 */, + 4, 5, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 148 */, + 0, 1, 4, 5, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 149 */, + 2, 3, 4, 5, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 150 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 151 */, + 6, 7, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 152 */, + 0, 1, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 153 */, + 2, 3, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 154 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 155 */, + 4, 5, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 156 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 157 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 158 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 14, 15, 255, 255, 255, 255 /* 159 */, + 10, 11, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 160 */, + 0, 1, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 161 */, + 2, 3, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 162 */, + 0, 1, 2, 3, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 163 */, + 4, 5, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 164 */, + 0, 1, 4, 5, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 165 */, + 2, 3, 4, 5, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 166 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 167 */, + 6, 7, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 168 */, + 0, 1, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 169 */, + 2, 3, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 170 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 171 */, + 4, 5, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 172 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 
255, 255 /* 173 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 174 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 14, 15, 255, 255, 255, 255 /* 175 */, + 8, 9, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 176 */, + 0, 1, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 177 */, + 2, 3, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 178 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 179 */, + 4, 5, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 180 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 181 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 182 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 183 */, + 6, 7, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 184 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 185 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 186 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 187 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 188 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 189 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 190 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 14, 15, 255, 255 /* 191 */, + 12, 13, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 192 */, + 0, 1, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 193 */, + 2, 3, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 194 */, + 0, 1, 2, 3, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 195 */, + 4, 5, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 196 */, + 0, 1, 4, 5, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 197 */, + 2, 3, 4, 5, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 198 */, + 0, 1, 2, 3, 4, 5, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 199 */, + 6, 7, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 200 */, + 0, 1, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 201 */, + 2, 3, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 202 */, + 0, 1, 2, 3, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 203 */, + 4, 5, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 204 */, + 0, 1, 4, 5, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 205 */, + 2, 3, 4, 5, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 206 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 255, 255, 255, 255 /* 207 */, + 8, 9, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 208 */, + 0, 1, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 209 */, + 2, 3, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 210 */, + 0, 1, 2, 3, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 211 */, + 4, 5, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 212 */, + 0, 1, 4, 5, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 213 */, + 2, 3, 4, 5, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 214 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 215 */, + 6, 7, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 216 */, + 0, 1, 6, 7, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 217 */, + 2, 3, 6, 7, 8, 9, 
12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 218 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 219 */, + 4, 5, 6, 7, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 220 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 221 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 222 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 12, 13, 14, 15, 255, 255 /* 223 */, + 10, 11, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 224 */, + 0, 1, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 225 */, + 2, 3, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 226 */, + 0, 1, 2, 3, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 227 */, + 4, 5, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 228 */, + 0, 1, 4, 5, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 229 */, + 2, 3, 4, 5, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 230 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 231 */, + 6, 7, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 232 */, + 0, 1, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 233 */, + 2, 3, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 234 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 235 */, + 4, 5, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 236 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 237 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 238 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 12, 13, 14, 15, 255, 255 /* 239 */, + 8, 9, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 240 */, + 0, 1, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 241 */, + 2, 3, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 242 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 243 */, + 4, 5, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 244 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 245 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 246 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 247 */, + 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 248 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 249 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 250 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 251 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 252 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 253 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 254 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 /* 255 */, +}; + +#else /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(aarch64_rej_uniform_eta_table) + +#endif /* !(MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/aarch64_clean/src/rej_uniform_table.c b/dev/aarch64_clean/src/rej_uniform_table.c new file mode 100644 index 000000000..65bc3b78d --- /dev/null +++ b/dev/aarch64_clean/src/rej_uniform_table.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the 
mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include "arith_native_aarch64.h" + +/* + * Lookup table used by rejection sampling of the public matrix. + * See autogen for details. + */ +MLD_ALIGN const uint8_t mld_rej_uniform_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 0 */, + 0, 1, 2, 3, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 1 */, + 4, 5, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 2 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 255, 255, 255, 255, 255, 255, 255, 255 /* 3 */, + 8, 9, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 4 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 5 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 6 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 255, 255, 255, 255 /* 7 */, + 12, 13, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 8 */, + 0, 1, 2, 3, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 9 */, + 4, 5, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 10 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 255, 255, 255, 255 /* 11 */, + 8, 9, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 12 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 13 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 14 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 /* 15 */, +}; + +#else /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(aarch64_rej_uniform_table) + +#endif /* !(MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/fips202/aarch64/auto.h b/dev/fips202/aarch64/auto.h new file mode 100644 index 000000000..4375cc197 --- /dev/null +++ b/dev/fips202/aarch64/auto.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [HYBRID] + * Hybrid scalar/vector implementations of Keccak and SPHINCS+ on AArch64 + * Becker, Kannwischer + * https://eprint.iacr.org/2022/1243 + */ + +#ifndef MLD_FIPS202_NATIVE_AARCH64_AUTO_H +#define MLD_FIPS202_NATIVE_AARCH64_AUTO_H +/* Default FIPS202 assembly profile for AArch64 systems */ + +/* + * Default logic to decide which implementation to use. + * + */ + +/* + * Keccak-f1600 + * + * - On Arm-based Apple CPUs, we pick a pure Neon implementation. + * - Otherwise, unless MLD_SYS_AARCH64_SLOW_BARREL_SHIFTER is set, + * we use lazy-rotation scalar assembly from @[HYBRID]. + * - Otherwise, if MLD_SYS_AARCH64_SLOW_BARREL_SHIFTER is set, we + * fall back to the standard C implementation. + */ +#if defined(__ARM_FEATURE_SHA3) && defined(__APPLE__) +#include "x1_v84a.h" +#elif !defined(MLD_SYS_AARCH64_SLOW_BARREL_SHIFTER) +#include "x1_scalar.h" +#endif + +/* + * Keccak-f1600x2/x4 + * + * The optimal implementation is highly CPU-specific; see @[HYBRID]. + * + * For now, if v8.4-A is not implemented, we fall back to Keccak-f1600. + * If v8.4-A is implemented and we are on an Apple CPU, we use a plain + * Neon-based implementation. + * If v8.4-A is implemented and we are not on an Apple CPU, we use a + * scalar/Neon/Neon hybrid. 
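
Editorial note (illustration only, not part of the patch): the Keccak-f1600 (x1) dispatch above resolves to one of the assembly entry points declared in dev/fips202/aarch64/src/fips202_native_aarch64.h later in this diff. The following is a minimal C sketch of how a caller could drive the selected x1 permutation, assuming the AArch64 FIPS-202 backend is enabled and the include path is adjusted to the caller's location; the wrapper name keccak_f1600_x1 is hypothetical and not part of the patch.

    #include <stdint.h>
    #include "fips202_native_aarch64.h" /* path is an assumption */

    /* Hypothetical wrapper mirroring the x1 selection logic of auto.h above. */
    static void keccak_f1600_x1(uint64_t state[25])
    {
    #if defined(__ARM_FEATURE_SHA3) && defined(__APPLE__)
      /* Apple cores: v8.4-A SHA3 implementation */
      mld_keccak_f1600_x1_v84a_asm(state, mld_keccakf1600_round_constants);
    #elif !defined(MLD_SYS_AARCH64_SLOW_BARREL_SHIFTER)
      /* Otherwise: lazy-rotation scalar assembly from @[HYBRID] */
      mld_keccak_f1600_x1_scalar_asm(state, mld_keccakf1600_round_constants);
    #else
      /* Fall back to the C implementation (not shown in this sketch) */
      (void)state;
    #endif
    }

Note that both entry points require the second argument to be exactly mld_keccakf1600_round_constants, as stated by their contracts in fips202_native_aarch64.h.
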
+ * The reason for this distinction is that Apple CPUs appear to implement + * the SHA3 instructions on all SIMD units, while Arm CPUs prior to Cortex-X4 + * don't, and ordinary Neon instructions are still needed. + */ +#if defined(__ARM_FEATURE_SHA3) +/* + * For Apple-M cores, we use a plain implementation leveraging SHA3 + * instructions only. + */ +#if defined(__APPLE__) +#include "x2_v84a.h" +#else +#include "x4_v8a_v84a_scalar.h" +#endif + +#else /* __ARM_FEATURE_SHA3 */ + +#include "x4_v8a_scalar.h" + +#endif /* !__ARM_FEATURE_SHA3 */ + +#endif /* !MLD_FIPS202_NATIVE_AARCH64_AUTO_H */ diff --git a/dev/fips202/aarch64/src/fips202_native_aarch64.h b/dev/fips202/aarch64/src/fips202_native_aarch64.h new file mode 100644 index 000000000..20f616803 --- /dev/null +++ b/dev/fips202/aarch64/src/fips202_native_aarch64.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLD_FIPS202_NATIVE_AARCH64_SRC_FIPS202_NATIVE_AARCH64_H +#define MLD_FIPS202_NATIVE_AARCH64_SRC_FIPS202_NATIVE_AARCH64_H + +#include +#include "../../../../cbmc.h" +#include "../../../../common.h" + + +#define mld_keccakf1600_round_constants \ + MLD_NAMESPACE(keccakf1600_round_constants) +extern const uint64_t mld_keccakf1600_round_constants[]; + +#define mld_keccak_f1600_x1_scalar_asm MLD_NAMESPACE(keccak_f1600_x1_scalar_asm) +void mld_keccak_f1600_x1_scalar_asm(uint64_t *state, uint64_t const *rc) +__contract__( + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 1)) + requires(rc == mld_keccakf1600_round_constants) + assigns(memory_slice(state, sizeof(uint64_t) * 25 * 1)) +); + +#define mld_keccak_f1600_x1_v84a_asm MLD_NAMESPACE(keccak_f1600_x1_v84a_asm) +void mld_keccak_f1600_x1_v84a_asm(uint64_t *state, uint64_t const *rc) +__contract__( + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 1)) + requires(rc == mld_keccakf1600_round_constants) + assigns(memory_slice(state, sizeof(uint64_t) * 25 * 1)) +); + +#define mld_keccak_f1600_x2_v84a_asm MLD_NAMESPACE(keccak_f1600_x2_v84a_asm) +void mld_keccak_f1600_x2_v84a_asm(uint64_t *state, uint64_t const *rc) +__contract__( + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 2)) + requires(rc == mld_keccakf1600_round_constants) + assigns(memory_slice(state, sizeof(uint64_t) * 25 * 2)) +); + +#define mld_keccak_f1600_x4_scalar_v8a_hybrid_asm \ + MLD_NAMESPACE(keccak_f1600_x4_scalar_v8a_hybrid_asm) +void mld_keccak_f1600_x4_scalar_v8a_hybrid_asm(uint64_t *state, + uint64_t const *rc) +__contract__( + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 4)) + requires(rc == mld_keccakf1600_round_constants) + assigns(memory_slice(state, sizeof(uint64_t) * 25 * 4)) +); + +#define mld_keccak_f1600_x4_scalar_v8a_v84a_hybrid_asm \ + MLD_NAMESPACE(keccak_f1600_x4_scalar_v8a_v84a_hybrid_asm) +void mld_keccak_f1600_x4_scalar_v8a_v84a_hybrid_asm(uint64_t *state, + uint64_t const *rc) +__contract__( + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 4)) + requires(rc == mld_keccakf1600_round_constants) + assigns(memory_slice(state, sizeof(uint64_t) * 25 * 4)) +); + +#endif /* !MLD_FIPS202_NATIVE_AARCH64_SRC_FIPS202_NATIVE_AARCH64_H */ diff --git a/dev/fips202/aarch64/src/keccak_f1600_x1_scalar_asm.S b/dev/fips202/aarch64/src/keccak_f1600_x1_scalar_asm.S new file mode 100644 index 000000000..6d02406b8 --- /dev/null +++ b/dev/fips202/aarch64/src/keccak_f1600_x1_scalar_asm.S @@ -0,0 +1,322 @@ +/* + * Copyright (c) The mlkem-native 
project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// Author: Hanno Becker +// Author: Matthias Kannwischer + +#include "../../../../common.h" +#if defined(MLD_FIPS202_AARCH64_NEED_X1_SCALAR) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/aarch64/src/keccak_f1600_x1_scalar_asm.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x1_scalar_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x1_scalar_asm) + + sub sp, sp, #0x80 + stp x19, x20, [sp, #0x20] + stp x21, x22, [sp, #0x30] + stp x23, x24, [sp, #0x40] + stp x25, x26, [sp, #0x50] + stp x27, x28, [sp, #0x60] + stp x29, x30, [sp, #0x70] + +keccak_f1600_x1_scalar_initial: + mov x26, x1 + str x1, [sp, #0x8] + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + str x0, [sp] + eor x30, x24, x25 + eor x27, x9, x10 + eor x0, x30, x21 + eor x26, x27, x6 + eor x27, x26, x7 + eor x29, x0, x22 + eor x26, x29, x23 + eor x29, x4, x5 + eor x30, x29, x1 + eor x0, x27, x8 + eor x29, x30, x2 + eor x30, x19, x20 + eor x30, x30, x16 + eor x27, x26, x0, ror #63 + eor x4, x4, x27 + eor x30, x30, x17 + eor x30, x30, x28 + eor x29, x29, x3 + eor x0, x0, x30, ror #63 + eor x30, x30, x29, ror #63 + eor x22, x22, x30 + eor x23, x23, x30 + str x23, [sp, #0x18] + eor x23, x14, x15 + eor x14, x14, x0 + eor x23, x23, x11 + eor x15, x15, x0 + eor x1, x1, x27 + eor x23, x23, x12 + eor x23, x23, x13 + eor x11, x11, x0 + eor x29, x29, x23, ror #63 + eor x23, x23, x26, ror #63 + eor x26, x13, x0 + eor x13, x28, x23 + eor x28, x24, x30 + eor x24, x16, x23 + eor x16, x21, x30 + eor x21, x25, x30 + eor x30, x19, x23 + eor x19, x20, x23 + eor x20, x17, x23 + eor x17, x12, x0 + eor x0, x2, x27 + eor x2, x6, x29 + eor x6, x8, x29 + bic x8, x28, x13, ror #47 + eor x12, x3, x27 + bic x3, x13, x17, ror #19 + eor x5, x5, x27 + ldr x27, [sp, #0x18] + bic x25, x17, x2, ror #5 + eor x9, x9, x29 + eor x23, x25, x5, ror #52 + eor x3, x3, x2, ror #24 + eor x8, x8, x17, ror #2 + eor x17, x10, x29 + bic x25, x12, x22, ror #47 + eor x29, x7, x29 + bic x10, x4, x27, ror #2 + bic x7, x5, x28, ror #10 + eor x10, x10, x20, ror #50 + eor x13, x7, x13, ror #57 + bic x7, x2, x5, ror #47 + eor x2, x25, x24, ror #39 + bic x25, x20, x11, ror #57 + bic x5, x17, x4, ror #25 + eor x25, x25, x17, ror #53 + bic x17, x11, x17, ror #60 + eor x28, x7, x28, ror #57 + bic x7, x9, x12, ror #42 + eor x7, x7, x22, ror #25 + bic x22, x22, x24, ror #56 + bic x24, x24, x15, ror #31 + eor x22, x22, x15, ror #23 + bic x20, x27, x20, ror #48 + bic x15, x15, x9, ror #16 + eor x12, x15, x12, ror #58 + eor x15, x5, x27, ror #27 + eor x5, x20, x11, ror #41 + ldr x11, [sp, #0x8] + eor x20, x17, x4, ror #21 + eor x17, x24, x9, ror #47 + mov x24, #0x1 // =1 + bic x9, x0, x16, ror #9 + str x24, [sp, #0x10] + bic x24, x29, x1, ror #44 + bic x27, x1, x21, ror #50 + bic x4, x26, x29, ror #63 + eor x1, x1, x4, ror #21 + ldr x11, [x11] + bic x4, x21, x30, ror #57 + eor x21, x24, x21, ror #30 + eor x24, x9, x19, 
ror #44 + bic x9, x14, x6, ror #5 + eor x9, x9, x0, ror #43 + bic x0, x6, x0, ror #38 + eor x1, x1, x11 + eor x11, x4, x26, ror #35 + eor x4, x0, x16, ror #47 + bic x0, x16, x19, ror #35 + eor x16, x27, x30, ror #43 + bic x27, x30, x26, ror #42 + bic x26, x19, x14, ror #41 + eor x19, x0, x14, ror #12 + eor x14, x26, x6, ror #46 + eor x6, x27, x29, ror #41 + +keccak_f1600_x1_scalar_loop: + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + eor x26, x30, x21, ror #26 + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + str x28, [sp, #0x18] + eor x29, x29, x17, ror #36 + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + eor x20, x26, x3, ror #39 + eor x11, x0, x11, ror #50 + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + eor x26, x26, x5, ror #25 + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + eor x7, x7, x22, ror #25 + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x10] + eor x17, x10, x9, ror #47 + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + bic x20, x4, x28, ror #2 + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0x18] + add x25, x25, #0x1 + str x25, [sp, #0x10] + cmp x25, #0x17 + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + eor x11, x19, x13, ror #35 + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic x29, x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + b.le keccak_f1600_x1_scalar_loop + ror x6, x6, #0x2b + ror x11, x11, #0x32 + ror x21, x21, #0x14 + ror x2, x2, 
#0x3d + ror x7, x7, #0x13 + ror x12, x12, #0x3 + ror x17, x17, #0x24 + ror x22, x22, #0x2c + ror x3, x3, #0x27 + ror x8, x8, #0x38 + ror x13, x13, #0x2e + ror x28, x28, #0x3f + ror x23, x23, #0x3a + ror x4, x4, #0x36 + ror x9, x9, #0x31 + ror x14, x14, #0x8 + ror x19, x19, #0x25 + ror x24, x24, #0x1c + ror x5, x5, #0x19 + ror x10, x10, #0x17 + ror x15, x15, #0x3e + ror x20, x20, #0x2 + ror x25, x25, #0x9 + ldr x0, [sp] + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + ldp x19, x20, [sp, #0x20] + ldp x21, x22, [sp, #0x30] + ldp x23, x24, [sp, #0x40] + ldp x25, x26, [sp, #0x50] + ldp x27, x28, [sp, #0x60] + ldp x29, x30, [sp, #0x70] + add sp, sp, #0x80 + ret +/* simpasm: footer-start */ +#endif /* MLD_FIPS202_AARCH64_NEED_X1_SCALAR && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/fips202/aarch64/src/keccak_f1600_x1_v84a_asm.S b/dev/fips202/aarch64/src/keccak_f1600_x1_v84a_asm.S new file mode 100644 index 000000000..5cc4ee325 --- /dev/null +++ b/dev/fips202/aarch64/src/keccak_f1600_x1_v84a_asm.S @@ -0,0 +1,158 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [HYBRID] + * Hybrid scalar/vector implementations of Keccak and SPHINCS+ on AArch64 + * Becker, Kannwischer + * https://eprint.iacr.org/2022/1243 + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// +// This implementation is essentially from the paper @[HYBRID]. +// The only difference is interleaving/deinterleaving of Keccak state +// during load and store, so that the caller need not do this. +// + +#include "../../../../common.h" +#if defined(MLD_FIPS202_AARCH64_NEED_X1_V84A) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#if defined(__ARM_FEATURE_SHA3) +/* simpasm: header-end */ + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/aarch64/src/keccak_f1600_x1_v84a_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x1_v84a_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x1_v84a_asm) + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + ldp d0, d1, [x0] + ldp d2, d3, [x0, #0x10] + ldp d4, d5, [x0, #0x20] + ldp d6, d7, [x0, #0x30] + ldp d8, d9, [x0, #0x40] + ldp d10, d11, [x0, #0x50] + ldp d12, d13, [x0, #0x60] + ldp d14, d15, [x0, #0x70] + ldp d16, d17, [x0, #0x80] + ldp d18, d19, [x0, #0x90] + ldp d20, d21, [x0, #0xa0] + ldp d22, d23, [x0, #0xb0] + ldr d24, [x0, #0xc0] + mov x2, #0x18 // =24 + +keccak_f1600_x1_v84a_loop: + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor3 v30.16b, v30.16b, v15.16b, v20.16b + eor3 v29.16b, v29.16b, v16.16b, v21.16b + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor3 v27.16b, v27.16b, v18.16b, v23.16b + eor3 v26.16b, v26.16b, v19.16b, v24.16b + rax1 v25.2d, v30.2d, v28.2d + rax1 v28.2d, v28.2d, v26.2d + rax1 v26.2d, v26.2d, v29.2d + rax1 v29.2d, v29.2d, v27.2d + rax1 v27.2d, v27.2d, v30.2d + eor v30.16b, v0.16b, v26.16b + xar v0.2d, v2.2d, v29.2d, #0x2 + xar v2.2d, v12.2d, v29.2d, #0x15 + xar v12.2d, v13.2d, v28.2d, #0x27 + xar v13.2d, v19.2d, v27.2d, #0x38 + xar v19.2d, v23.2d, v28.2d, #0x8 + xar v23.2d, v15.2d, v26.2d, #0x17 + xar v15.2d, v1.2d, v25.2d, #0x3f + xar v1.2d, v8.2d, v28.2d, #0x9 + xar v8.2d, v16.2d, v25.2d, #0x13 + xar v16.2d, v7.2d, v29.2d, #0x3a + xar v7.2d, v10.2d, v26.2d, #0x3d + xar v10.2d, v3.2d, v28.2d, #0x24 + xar v3.2d, v18.2d, v28.2d, #0x2b + xar v18.2d, v17.2d, v29.2d, #0x31 + xar v17.2d, v11.2d, v25.2d, #0x36 + xar v11.2d, v9.2d, v27.2d, #0x2c + xar v9.2d, v22.2d, v29.2d, #0x3 + xar v22.2d, v14.2d, v27.2d, #0x19 + xar v14.2d, v20.2d, v26.2d, #0x2e + xar v20.2d, v4.2d, v27.2d, #0x25 + xar v4.2d, v24.2d, v27.2d, #0x32 + xar v24.2d, v21.2d, v25.2d, #0x3e + xar v21.2d, v5.2d, v26.2d, #0x1c + xar v27.2d, v6.2d, v25.2d, #0x14 + ld1r { v31.2d }, [x1], #8 + bcax v5.16b, v10.16b, v7.16b, v11.16b + bcax v6.16b, v11.16b, v8.16b, v7.16b + bcax v7.16b, v7.16b, v9.16b, v8.16b + bcax v8.16b, v8.16b, v10.16b, v9.16b + bcax v9.16b, v9.16b, v11.16b, v10.16b + bcax v10.16b, v15.16b, v12.16b, v16.16b + bcax v11.16b, v16.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v15.16b, v14.16b + bcax v14.16b, v14.16b, v16.16b, v15.16b + bcax v15.16b, v20.16b, v17.16b, v21.16b + bcax v16.16b, v21.16b, v18.16b, v17.16b + bcax v17.16b, v17.16b, v19.16b, v18.16b + bcax v18.16b, v18.16b, v20.16b, v19.16b + bcax v19.16b, v19.16b, v21.16b, v20.16b + bcax v20.16b, v0.16b, v22.16b, v1.16b + bcax v21.16b, v1.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v0.16b, v24.16b + bcax v24.16b, v24.16b, v1.16b, v0.16b + bcax v0.16b, v30.16b, v2.16b, v27.16b + bcax v1.16b, v27.16b, v3.16b, v2.16b + bcax v2.16b, v2.16b, v4.16b, v3.16b + bcax v3.16b, v3.16b, v30.16b, v4.16b + bcax v4.16b, v4.16b, v27.16b, v30.16b + eor v0.16b, v0.16b, v31.16b + sub x2, x2, #0x1 + cbnz x2, keccak_f1600_x1_v84a_loop + stp d0, d1, [x0] + stp d2, d3, [x0, #0x10] + stp d4, d5, [x0, #0x20] + stp d6, d7, [x0, #0x30] + stp d8, d9, [x0, #0x40] + stp d10, d11, [x0, #0x50] + stp d12, d13, [x0, #0x60] + stp d14, d15, [x0, #0x70] + stp d16, d17, [x0, #0x80] + stp d18, d19, [x0, #0x90] + stp d20, d21, [x0, #0xa0] + stp d22, d23, [x0, #0xb0] + str d24, [x0, #0xc0] + 
ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret +/* simpasm: footer-start */ +#endif /* __ARM_FEATURE_SHA3 */ + +#endif /* MLD_FIPS202_AARCH64_NEED_X1_V84A && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/fips202/aarch64/src/keccak_f1600_x2_v84a_asm.S b/dev/fips202/aarch64/src/keccak_f1600_x2_v84a_asm.S new file mode 100644 index 000000000..0fcadc3af --- /dev/null +++ b/dev/fips202/aarch64/src/keccak_f1600_x2_v84a_asm.S @@ -0,0 +1,212 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [HYBRID] + * Hybrid scalar/vector implementations of Keccak and SPHINCS+ on AArch64 + * Becker, Kannwischer + * https://eprint.iacr.org/2022/1243 + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// +// This implementation is essentially from the paper @[HYBRID]. +// The only difference is interleaving/deinterleaving of Keccak state +// during load and store, so that the caller need not do this. +// + +#include "../../../../common.h" +#if defined(MLD_FIPS202_AARCH64_NEED_X2_V84A) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#if defined(__ARM_FEATURE_SHA3) +/* simpasm: header-end */ + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/aarch64/src/keccak_f1600_x2_v84a_asm.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x2_v84a_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x2_v84a_asm) + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + add x2, x0, #0xc8 + ldp q25, q26, [x0] + ldp q27, q28, [x2] + trn1 v0.2d, v25.2d, v27.2d + trn2 v1.2d, v25.2d, v27.2d + trn1 v2.2d, v26.2d, v28.2d + trn2 v3.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x20] + ldp q27, q28, [x2, #0x20] + trn1 v4.2d, v25.2d, v27.2d + trn2 v5.2d, v25.2d, v27.2d + trn1 v6.2d, v26.2d, v28.2d + trn2 v7.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x40] + ldp q27, q28, [x2, #0x40] + trn1 v8.2d, v25.2d, v27.2d + trn2 v9.2d, v25.2d, v27.2d + trn1 v10.2d, v26.2d, v28.2d + trn2 v11.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x60] + ldp q27, q28, [x2, #0x60] + trn1 v12.2d, v25.2d, v27.2d + trn2 v13.2d, v25.2d, v27.2d + trn1 v14.2d, v26.2d, v28.2d + trn2 v15.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x80] + ldp q27, q28, [x2, #0x80] + trn1 v16.2d, v25.2d, v27.2d + trn2 v17.2d, v25.2d, v27.2d + trn1 v18.2d, v26.2d, v28.2d + trn2 v19.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0xa0] + ldp q27, q28, [x2, #0xa0] + trn1 v20.2d, v25.2d, v27.2d + trn2 v21.2d, v25.2d, v27.2d + trn1 v22.2d, v26.2d, v28.2d + trn2 v23.2d, v26.2d, v28.2d + ldr d25, [x0, #0xc0] + ldr d27, [x2, #0xc0] + trn1 v24.2d, v25.2d, v27.2d + mov x2, #0x18 // =24 + +keccak_f1600_x2_v84a_loop: + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor3 v30.16b, v30.16b, v15.16b, v20.16b + eor3 v29.16b, v29.16b, v16.16b, v21.16b + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor3 v27.16b, v27.16b, v18.16b, v23.16b + eor3 v26.16b, v26.16b, v19.16b, v24.16b + rax1 v25.2d, v30.2d, v28.2d + rax1 v28.2d, v28.2d, v26.2d + rax1 v26.2d, v26.2d, v29.2d 
+ rax1 v29.2d, v29.2d, v27.2d + rax1 v27.2d, v27.2d, v30.2d + eor v30.16b, v0.16b, v26.16b + xar v0.2d, v2.2d, v29.2d, #0x2 + xar v2.2d, v12.2d, v29.2d, #0x15 + xar v12.2d, v13.2d, v28.2d, #0x27 + xar v13.2d, v19.2d, v27.2d, #0x38 + xar v19.2d, v23.2d, v28.2d, #0x8 + xar v23.2d, v15.2d, v26.2d, #0x17 + xar v15.2d, v1.2d, v25.2d, #0x3f + xar v1.2d, v8.2d, v28.2d, #0x9 + xar v8.2d, v16.2d, v25.2d, #0x13 + xar v16.2d, v7.2d, v29.2d, #0x3a + xar v7.2d, v10.2d, v26.2d, #0x3d + xar v10.2d, v3.2d, v28.2d, #0x24 + xar v3.2d, v18.2d, v28.2d, #0x2b + xar v18.2d, v17.2d, v29.2d, #0x31 + xar v17.2d, v11.2d, v25.2d, #0x36 + xar v11.2d, v9.2d, v27.2d, #0x2c + xar v9.2d, v22.2d, v29.2d, #0x3 + xar v22.2d, v14.2d, v27.2d, #0x19 + xar v14.2d, v20.2d, v26.2d, #0x2e + xar v20.2d, v4.2d, v27.2d, #0x25 + xar v4.2d, v24.2d, v27.2d, #0x32 + xar v24.2d, v21.2d, v25.2d, #0x3e + xar v21.2d, v5.2d, v26.2d, #0x1c + xar v27.2d, v6.2d, v25.2d, #0x14 + ld1r { v31.2d }, [x1], #8 + bcax v5.16b, v10.16b, v7.16b, v11.16b + bcax v6.16b, v11.16b, v8.16b, v7.16b + bcax v7.16b, v7.16b, v9.16b, v8.16b + bcax v8.16b, v8.16b, v10.16b, v9.16b + bcax v9.16b, v9.16b, v11.16b, v10.16b + bcax v10.16b, v15.16b, v12.16b, v16.16b + bcax v11.16b, v16.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v15.16b, v14.16b + bcax v14.16b, v14.16b, v16.16b, v15.16b + bcax v15.16b, v20.16b, v17.16b, v21.16b + bcax v16.16b, v21.16b, v18.16b, v17.16b + bcax v17.16b, v17.16b, v19.16b, v18.16b + bcax v18.16b, v18.16b, v20.16b, v19.16b + bcax v19.16b, v19.16b, v21.16b, v20.16b + bcax v20.16b, v0.16b, v22.16b, v1.16b + bcax v21.16b, v1.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v0.16b, v24.16b + bcax v24.16b, v24.16b, v1.16b, v0.16b + bcax v0.16b, v30.16b, v2.16b, v27.16b + bcax v1.16b, v27.16b, v3.16b, v2.16b + bcax v2.16b, v2.16b, v4.16b, v3.16b + bcax v3.16b, v3.16b, v30.16b, v4.16b + bcax v4.16b, v4.16b, v27.16b, v30.16b + eor v0.16b, v0.16b, v31.16b + sub x2, x2, #0x1 + cbnz x2, keccak_f1600_x2_v84a_loop + add x2, x0, #0xc8 + trn1 v25.2d, v0.2d, v1.2d + trn1 v26.2d, v2.2d, v3.2d + stp q25, q26, [x0] + trn2 v27.2d, v0.2d, v1.2d + trn2 v28.2d, v2.2d, v3.2d + stp q27, q28, [x2] + trn1 v25.2d, v4.2d, v5.2d + trn1 v26.2d, v6.2d, v7.2d + stp q25, q26, [x0, #0x20] + trn2 v27.2d, v4.2d, v5.2d + trn2 v28.2d, v6.2d, v7.2d + stp q27, q28, [x2, #0x20] + trn1 v25.2d, v8.2d, v9.2d + trn1 v26.2d, v10.2d, v11.2d + stp q25, q26, [x0, #0x40] + trn2 v27.2d, v8.2d, v9.2d + trn2 v28.2d, v10.2d, v11.2d + stp q27, q28, [x2, #0x40] + trn1 v25.2d, v12.2d, v13.2d + trn1 v26.2d, v14.2d, v15.2d + stp q25, q26, [x0, #0x60] + trn2 v27.2d, v12.2d, v13.2d + trn2 v28.2d, v14.2d, v15.2d + stp q27, q28, [x2, #0x60] + trn1 v25.2d, v16.2d, v17.2d + trn1 v26.2d, v18.2d, v19.2d + stp q25, q26, [x0, #0x80] + trn2 v27.2d, v16.2d, v17.2d + trn2 v28.2d, v18.2d, v19.2d + stp q27, q28, [x2, #0x80] + trn1 v25.2d, v20.2d, v21.2d + trn1 v26.2d, v22.2d, v23.2d + stp q25, q26, [x0, #0xa0] + trn2 v27.2d, v20.2d, v21.2d + trn2 v28.2d, v22.2d, v23.2d + stp q27, q28, [x2, #0xa0] + str d24, [x0, #0xc0] + trn2 v25.2d, v24.2d, v24.2d + str d25, [x2, #0xc0] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret +/* simpasm: footer-start */ +#endif /* __ARM_FEATURE_SHA3 */ + +#endif /* MLD_FIPS202_AARCH64_NEED_X2_V84A && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/fips202/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S 
b/dev/fips202/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S new file mode 100644 index 000000000..f95f2ee62 --- /dev/null +++ b/dev/fips202/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S @@ -0,0 +1,1006 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// Author: Hanno Becker +// Author: Matthias Kannwischer + +#include "../../../../common.h" +#if defined(MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_scalar_v8a_hybrid_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_scalar_v8a_hybrid_asm) + + sub sp, sp, #0xe0 + stp x19, x20, [sp, #0x30] + stp x21, x22, [sp, #0x40] + stp x23, x24, [sp, #0x50] + stp x25, x26, [sp, #0x60] + stp x27, x28, [sp, #0x70] + stp x29, x30, [sp, #0x80] + stp d8, d9, [sp, #0x90] + stp d10, d11, [sp, #0xa0] + stp d12, d13, [sp, #0xb0] + stp d14, d15, [sp, #0xc0] + mov x29, x1 + mov x30, #0x0 // =0 + str x30, [sp, #0x20] + str x29, [sp, #0x8] + str x29, [sp, #0x10] + str x0, [sp] + add x4, x0, #0xc8 + ldp q25, q26, [x0] + ldp q27, q28, [x4] + trn1 v0.2d, v25.2d, v27.2d + trn2 v1.2d, v25.2d, v27.2d + trn1 v2.2d, v26.2d, v28.2d + trn2 v3.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x20] + ldp q27, q28, [x4, #0x20] + trn1 v4.2d, v25.2d, v27.2d + trn2 v5.2d, v25.2d, v27.2d + trn1 v6.2d, v26.2d, v28.2d + trn2 v7.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x40] + ldp q27, q28, [x4, #0x40] + trn1 v8.2d, v25.2d, v27.2d + trn2 v9.2d, v25.2d, v27.2d + trn1 v10.2d, v26.2d, v28.2d + trn2 v11.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x60] + ldp q27, q28, [x4, #0x60] + trn1 v12.2d, v25.2d, v27.2d + trn2 v13.2d, v25.2d, v27.2d + trn1 v14.2d, v26.2d, v28.2d + trn2 v15.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x80] + ldp q27, q28, [x4, #0x80] + trn1 v16.2d, v25.2d, v27.2d + trn2 v17.2d, v25.2d, v27.2d + trn1 v18.2d, v26.2d, v28.2d + trn2 v19.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0xa0] + ldp q27, q28, [x4, #0xa0] + trn1 v20.2d, v25.2d, v27.2d + trn2 v21.2d, v25.2d, v27.2d + trn1 v22.2d, v26.2d, v28.2d + trn2 v23.2d, v26.2d, v28.2d + ldr d25, [x0, #0xc0] + ldr d27, [x4, #0xc0] + trn1 v24.2d, v25.2d, v27.2d + add x0, x0, #0x190 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x190 + +keccak_f1600_x4_v8a_scalar_hybrid_initial: + eor x30, x24, x25 + eor x27, x9, x10 + eor v30.16b, v0.16b, v5.16b + eor v30.16b, v30.16b, v10.16b + eor x0, x30, x21 + eor v30.16b, v30.16b, v15.16b + eor x26, x27, x6 + eor x27, x26, x7 + eor v30.16b, v30.16b, v20.16b + eor x29, x0, x22 + eor v29.16b, v1.16b, v6.16b + eor x26, x29, x23 + eor v29.16b, v29.16b, v11.16b + eor x29, x4, x5 + eor x30, x29, x1 + eor v29.16b, v29.16b, v16.16b + eor x0, x27, x8 + eor v29.16b, v29.16b, v21.16b + eor x29, x30, x2 + eor v28.16b, v2.16b, v7.16b + eor x30, x19, x20 + eor x30, x30, x16 
+ eor v28.16b, v28.16b, v12.16b + eor x27, x26, x0, ror #63 + eor v28.16b, v28.16b, v17.16b + eor x4, x4, x27 + eor v28.16b, v28.16b, v22.16b + eor x30, x30, x17 + eor x30, x30, x28 + eor v27.16b, v3.16b, v8.16b + eor x29, x29, x3 + eor v27.16b, v27.16b, v13.16b + eor x0, x0, x30, ror #63 + eor v27.16b, v27.16b, v18.16b + eor x30, x30, x29, ror #63 + eor x22, x22, x30 + eor v27.16b, v27.16b, v23.16b + eor x23, x23, x30 + eor v26.16b, v4.16b, v9.16b + str x23, [sp, #0xd0] + eor v26.16b, v26.16b, v14.16b + eor x23, x14, x15 + eor x14, x14, x0 + eor v26.16b, v26.16b, v19.16b + eor x23, x23, x11 + eor v26.16b, v26.16b, v24.16b + eor x15, x15, x0 + eor x1, x1, x27 + add v31.2d, v28.2d, v28.2d + eor x23, x23, x12 + sri v31.2d, v28.2d, #0x3f + eor x23, x23, x13 + eor v25.16b, v31.16b, v30.16b + eor x11, x11, x0 + eor x29, x29, x23, ror #63 + add v31.2d, v26.2d, v26.2d + eor x23, x23, x26, ror #63 + sri v31.2d, v26.2d, #0x3f + eor x26, x13, x0 + eor v28.16b, v31.16b, v28.16b + eor x13, x28, x23 + eor x28, x24, x30 + add v31.2d, v29.2d, v29.2d + eor x24, x16, x23 + sri v31.2d, v29.2d, #0x3f + eor x16, x21, x30 + eor v26.16b, v31.16b, v26.16b + eor x21, x25, x30 + eor x30, x19, x23 + add v31.2d, v27.2d, v27.2d + eor x19, x20, x23 + sri v31.2d, v27.2d, #0x3f + eor x20, x17, x23 + eor v29.16b, v31.16b, v29.16b + eor x17, x12, x0 + eor x0, x2, x27 + add v31.2d, v30.2d, v30.2d + eor x2, x6, x29 + sri v31.2d, v30.2d, #0x3f + eor x6, x8, x29 + eor v27.16b, v31.16b, v27.16b + bic x8, x28, x13, ror #47 + eor x12, x3, x27 + eor v30.16b, v0.16b, v26.16b + bic x3, x13, x17, ror #19 + eor v31.16b, v2.16b, v29.16b + eor x5, x5, x27 + ldr x27, [sp, #0xd0] + shl v0.2d, v31.2d, #0x3e + bic x25, x17, x2, ror #5 + sri v0.2d, v31.2d, #0x2 + eor x9, x9, x29 + eor v31.16b, v12.16b, v29.16b + eor x23, x25, x5, ror #52 + eor x3, x3, x2, ror #24 + shl v2.2d, v31.2d, #0x2b + eor x8, x8, x17, ror #2 + sri v2.2d, v31.2d, #0x15 + eor x17, x10, x29 + eor v31.16b, v13.16b, v28.16b + bic x25, x12, x22, ror #47 + eor x29, x7, x29 + shl v12.2d, v31.2d, #0x19 + bic x10, x4, x27, ror #2 + sri v12.2d, v31.2d, #0x27 + bic x7, x5, x28, ror #10 + eor v31.16b, v19.16b, v27.16b + eor x10, x10, x20, ror #50 + eor x13, x7, x13, ror #57 + shl v13.2d, v31.2d, #0x8 + bic x7, x2, x5, ror #47 + sri v13.2d, v31.2d, #0x38 + eor x2, x25, x24, ror #39 + eor v31.16b, v23.16b, v28.16b + bic x25, x20, x11, ror #57 + bic x5, x17, x4, ror #25 + shl v19.2d, v31.2d, #0x38 + eor x25, x25, x17, ror #53 + sri v19.2d, v31.2d, #0x8 + bic x17, x11, x17, ror #60 + eor v31.16b, v15.16b, v26.16b + eor x28, x7, x28, ror #57 + bic x7, x9, x12, ror #42 + shl v23.2d, v31.2d, #0x29 + eor x7, x7, x22, ror #25 + sri v23.2d, v31.2d, #0x17 + bic x22, x22, x24, ror #56 + bic x24, x24, x15, ror #31 + eor v31.16b, v1.16b, v25.16b + eor x22, x22, x15, ror #23 + shl v15.2d, v31.2d, #0x1 + bic x20, x27, x20, ror #48 + sri v15.2d, v31.2d, #0x3f + bic x15, x15, x9, ror #16 + eor x12, x15, x12, ror #58 + eor v31.16b, v8.16b, v28.16b + eor x15, x5, x27, ror #27 + shl v1.2d, v31.2d, #0x37 + eor x5, x20, x11, ror #41 + sri v1.2d, v31.2d, #0x9 + ldr x11, [sp, #0x8] + eor x20, x17, x4, ror #21 + eor v31.16b, v16.16b, v25.16b + eor x17, x24, x9, ror #47 + shl v8.2d, v31.2d, #0x2d + mov x24, #0x1 // =1 + sri v8.2d, v31.2d, #0x13 + bic x9, x0, x16, ror #9 + str x24, [sp, #0x18] + eor v31.16b, v7.16b, v29.16b + bic x24, x29, x1, ror #44 + shl v16.2d, v31.2d, #0x6 + bic x27, x1, x21, ror #50 + sri v16.2d, v31.2d, #0x3a + bic x4, x26, x29, ror #63 + eor x1, x1, x4, ror #21 + eor v31.16b, 
v10.16b, v26.16b + ldr x11, [x11] + shl v7.2d, v31.2d, #0x3 + bic x4, x21, x30, ror #57 + sri v7.2d, v31.2d, #0x3d + eor x21, x24, x21, ror #30 + eor x24, x9, x19, ror #44 + eor v31.16b, v3.16b, v28.16b + bic x9, x14, x6, ror #5 + shl v10.2d, v31.2d, #0x1c + eor x9, x9, x0, ror #43 + sri v10.2d, v31.2d, #0x24 + bic x0, x6, x0, ror #38 + eor x1, x1, x11 + eor v31.16b, v18.16b, v28.16b + eor x11, x4, x26, ror #35 + shl v3.2d, v31.2d, #0x15 + eor x4, x0, x16, ror #47 + bic x0, x16, x19, ror #35 + sri v3.2d, v31.2d, #0x2b + eor x16, x27, x30, ror #43 + eor v31.16b, v17.16b, v29.16b + bic x27, x30, x26, ror #42 + shl v18.2d, v31.2d, #0xf + bic x26, x19, x14, ror #41 + eor x19, x0, x14, ror #12 + sri v18.2d, v31.2d, #0x31 + eor x14, x26, x6, ror #46 + eor v31.16b, v11.16b, v25.16b + eor x6, x27, x29, ror #41 + shl v17.2d, v31.2d, #0xa + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + sri v17.2d, v31.2d, #0x36 + eor x26, x8, x9, ror #57 + eor v31.16b, v9.16b, v27.16b + eor x27, x0, x14, ror #10 + shl v11.2d, v31.2d, #0x14 + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + sri v11.2d, v31.2d, #0x2c + eor x30, x23, x22, ror #50 + eor v31.16b, v22.16b, v29.16b + eor x0, x26, x10, ror #31 + shl v9.2d, v31.2d, #0x3d + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + sri v9.2d, v31.2d, #0x3 + eor x30, x30, x24, ror #34 + eor v31.16b, v14.16b, v27.16b + eor x0, x0, x7, ror #27 + shl v22.2d, v31.2d, #0x27 + eor x26, x30, x21, ror #26 + eor x26, x26, x25, ror #15 + sri v22.2d, v31.2d, #0x19 + ror x30, x27, #0x3e + eor v31.16b, v20.16b, v26.16b + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + shl v14.2d, v31.2d, #0x12 + eor x16, x30, x16 + sri v14.2d, v31.2d, #0x2e + eor x28, x30, x28, ror #63 + eor v31.16b, v4.16b, v27.16b + str x28, [sp, #0xd0] + eor x29, x29, x17, ror #36 + shl v20.2d, v31.2d, #0x1b + eor x28, x1, x2, ror #61 + sri v20.2d, v31.2d, #0x25 + eor x19, x30, x19, ror #37 + eor v31.16b, v24.16b, v27.16b + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + shl v4.2d, v31.2d, #0xe + eor x26, x26, x0, ror #55 + sri v4.2d, v31.2d, #0x32 + eor x28, x28, x3, ror #39 + eor v31.16b, v21.16b, v25.16b + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + shl v24.2d, v31.2d, #0x2 + eor x0, x0, x29, ror #63 + sri v24.2d, v31.2d, #0x3e + eor x27, x28, x27, ror #61 + eor v31.16b, v5.16b, v26.16b + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + shl v21.2d, v31.2d, #0x24 + eor x29, x30, x20, ror #2 + sri v21.2d, v31.2d, #0x1c + eor x20, x26, x3, ror #39 + eor v31.16b, v6.16b, v25.16b + eor x11, x0, x11, ror #50 + eor x25, x28, x25, ror #9 + shl v27.2d, v31.2d, #0x2c + eor x3, x28, x21, ror #20 + sri v27.2d, v31.2d, #0x14 + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + bic v31.16b, v7.16b, v11.16b + eor x24, x28, x24, ror #28 + eor v5.16b, v31.16b, v10.16b + eor x1, x30, x17, ror #36 + bic v31.16b, v8.16b, v7.16b + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + eor v6.16b, v31.16b, v11.16b + eor x8, x27, x8, ror #56 + bic v31.16b, v9.16b, v8.16b + eor x17, x27, x7, ror #19 + eor v7.16b, v31.16b, v7.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + bic v31.16b, v10.16b, v9.16b + eor x4, x26, x4, ror #54 + eor v8.16b, v31.16b, v8.16b + eor x0, x0, x12, ror #3 + bic v31.16b, v11.16b, v10.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + eor v9.16b, v31.16b, v9.16b + eor x26, x26, x5, ror #25 + bic v31.16b, v12.16b, v16.16b + eor x2, x7, x16, ror #39 + eor v10.16b, v31.16b, v15.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + bic 
v31.16b, v13.16b, v12.16b + eor x7, x7, x22, ror #25 + eor v11.16b, v31.16b, v16.16b + eor x12, x30, x20, ror #58 + bic v31.16b, v14.16b, v13.16b + bic x20, x22, x16, ror #56 + eor x30, x27, x6, ror #43 + eor v12.16b, v31.16b, v12.16b + eor x22, x20, x15, ror #23 + bic v31.16b, v15.16b, v14.16b + bic x6, x19, x13, ror #42 + eor v13.16b, v31.16b, v13.16b + eor x6, x6, x17, ror #41 + bic x5, x13, x17, ror #63 + bic v31.16b, v16.16b, v15.16b + eor x5, x21, x5, ror #21 + eor v14.16b, v31.16b, v14.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + bic v31.16b, v17.16b, v21.16b + bic x21, x21, x25, ror #50 + eor v15.16b, v31.16b, v20.16b + bic x20, x27, x4, ror #25 + bic v31.16b, v18.16b, v17.16b + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + eor v16.16b, v31.16b, v21.16b + eor x21, x17, x25, ror #30 + bic v31.16b, v19.16b, v18.16b + bic x19, x25, x19, ror #57 + eor v17.16b, v31.16b, v17.16b + ldr x25, [sp, #0x18] + eor x17, x10, x9, ror #47 + bic v31.16b, v20.16b, v19.16b + ldr x9, [sp, #0x8] + eor v18.16b, v31.16b, v18.16b + eor x15, x20, x28, ror #27 + bic v31.16b, v21.16b, v20.16b + bic x20, x4, x28, ror #2 + eor x10, x20, x1, ror #50 + eor v19.16b, v31.16b, v19.16b + bic x20, x11, x27, ror #60 + bic v31.16b, v22.16b, v1.16b + eor x20, x20, x4, ror #21 + eor v20.16b, v31.16b, v0.16b + bic x4, x28, x1, ror #48 + bic x1, x1, x11, ror #57 + bic v31.16b, v23.16b, v22.16b + ldr x28, [x9, x25, lsl #3] + eor v21.16b, v31.16b, v1.16b + ldr x9, [sp, #0xd0] + bic v31.16b, v24.16b, v23.16b + add x25, x25, #0x1 + str x25, [sp, #0x18] + eor v22.16b, v31.16b, v22.16b + cmp x25, #0x17 + bic v31.16b, v0.16b, v24.16b + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + eor v23.16b, v31.16b, v23.16b + eor x1, x5, x28 + bic v31.16b, v1.16b, v0.16b + eor x5, x4, x11, ror #41 + eor v24.16b, v31.16b, v24.16b + eor x11, x19, x13, ror #35 + bic x13, x26, x24, ror #10 + bic v31.16b, v2.16b, v27.16b + eor x28, x27, x24, ror #57 + eor v0.16b, v31.16b, v30.16b + bic x27, x24, x9, ror #47 + bic v31.16b, v3.16b, v2.16b + bic x19, x23, x3, ror #9 + bic x4, x29, x14, ror #41 + eor v1.16b, v31.16b, v27.16b + eor x24, x19, x29, ror #44 + bic v31.16b, v4.16b, v3.16b + bic x29, x3, x29, ror #35 + eor v2.16b, v31.16b, v2.16b + eor x13, x13, x9, ror #57 + eor x19, x29, x14, ror #12 + bic v31.16b, v30.16b, v4.16b + bic x29, x9, x0, ror #19 + eor v3.16b, v31.16b, v3.16b + bic x14, x14, x8, ror #5 + bic v31.16b, v27.16b, v30.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + eor v4.16b, v31.16b, v4.16b + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b + +keccak_f1600_x4_v8a_scalar_hybrid_loop: + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor v30.16b, v0.16b, v5.16b + eor v30.16b, v30.16b, v10.16b + eor x26, x8, x9, ror #57 + eor v30.16b, v30.16b, v15.16b + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + eor v30.16b, v30.16b, v20.16b + eor x26, x26, x6, ror #51 + eor v29.16b, v1.16b, v6.16b + eor x30, x23, x22, ror #50 + eor v29.16b, v29.16b, v11.16b + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + eor v29.16b, v29.16b, v16.16b + eor x27, x27, x12, ror #5 + eor v29.16b, v29.16b, v21.16b + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + eor v28.16b, v2.16b, v7.16b + eor x26, x30, x21, ror #26 + eor v28.16b, v28.16b, v12.16b + eor x26, x26, 
x25, ror #15 + eor v28.16b, v28.16b, v17.16b + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + eor v28.16b, v28.16b, v22.16b + ror x26, x26, #0x3a + eor v27.16b, v3.16b, v8.16b + eor x16, x30, x16 + eor v27.16b, v27.16b, v13.16b + eor x28, x30, x28, ror #63 + str x28, [sp, #0xd0] + eor v27.16b, v27.16b, v18.16b + eor x29, x29, x17, ror #36 + eor v27.16b, v27.16b, v23.16b + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + eor v26.16b, v4.16b, v9.16b + eor x29, x29, x20, ror #2 + eor v26.16b, v26.16b, v14.16b + eor x28, x28, x4, ror #54 + eor v26.16b, v26.16b, v19.16b + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + eor v26.16b, v26.16b, v24.16b + eor x28, x28, x5, ror #25 + add v31.2d, v28.2d, v28.2d + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + sri v31.2d, v28.2d, #0x3f + eor x27, x28, x27, ror #61 + eor v25.16b, v31.16b, v30.16b + eor x13, x0, x13, ror #46 + add v31.2d, v26.2d, v26.2d + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + sri v31.2d, v26.2d, #0x3f + eor x20, x26, x3, ror #39 + eor v28.16b, v31.16b, v28.16b + eor x11, x0, x11, ror #50 + add v31.2d, v29.2d, v29.2d + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + sri v31.2d, v29.2d, #0x3f + eor x21, x26, x1 + eor v26.16b, v31.16b, v26.16b + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + add v31.2d, v27.2d, v27.2d + eor x1, x30, x17, ror #36 + sri v31.2d, v27.2d, #0x3f + eor x14, x0, x14, ror #8 + eor v29.16b, v31.16b, v29.16b + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + add v31.2d, v30.2d, v30.2d + eor x17, x27, x7, ror #19 + sri v31.2d, v30.2d, #0x3f + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + eor v27.16b, v31.16b, v27.16b + eor x4, x26, x4, ror #54 + eor v30.16b, v0.16b, v26.16b + eor x0, x0, x12, ror #3 + eor v31.16b, v2.16b, v29.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + shl v0.2d, v31.2d, #0x3e + eor x26, x26, x5, ror #25 + sri v0.2d, v31.2d, #0x2 + eor x2, x7, x16, ror #39 + eor v31.16b, v12.16b, v29.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + shl v2.2d, v31.2d, #0x2b + eor x7, x7, x22, ror #25 + sri v2.2d, v31.2d, #0x15 + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + eor v31.16b, v13.16b, v28.16b + eor x30, x27, x6, ror #43 + shl v12.2d, v31.2d, #0x19 + eor x22, x20, x15, ror #23 + sri v12.2d, v31.2d, #0x27 + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + eor v31.16b, v19.16b, v27.16b + bic x5, x13, x17, ror #63 + shl v13.2d, v31.2d, #0x8 + eor x5, x21, x5, ror #21 + sri v13.2d, v31.2d, #0x38 + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + eor v31.16b, v23.16b, v28.16b + bic x21, x21, x25, ror #50 + shl v19.2d, v31.2d, #0x38 + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + sri v19.2d, v31.2d, #0x8 + eor x16, x21, x19, ror #43 + eor v31.16b, v15.16b, v26.16b + eor x21, x17, x25, ror #30 + shl v23.2d, v31.2d, #0x29 + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x18] + sri v23.2d, v31.2d, #0x17 + eor x17, x10, x9, ror #47 + eor v31.16b, v1.16b, v25.16b + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + shl v15.2d, v31.2d, #0x1 + bic x20, x4, x28, ror #2 + sri v15.2d, v31.2d, #0x3f + eor x10, x20, x1, ror #50 + eor v31.16b, v8.16b, v28.16b + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + shl v1.2d, v31.2d, #0x37 + bic x4, x28, x1, ror #48 + sri v1.2d, v31.2d, #0x9 + bic x1, x1, x11, ror #57 + eor v31.16b, v16.16b, v25.16b + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0xd0] + shl v8.2d, v31.2d, #0x2d + add x25, x25, #0x1 + sri v8.2d, v31.2d, #0x13 + str x25, [sp, 
#0x18] + cmp x25, #0x17 + eor v31.16b, v7.16b, v29.16b + eor x25, x1, x27, ror #53 + shl v16.2d, v31.2d, #0x6 + bic x27, x30, x26, ror #47 + sri v16.2d, v31.2d, #0x3a + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + eor v31.16b, v10.16b, v26.16b + eor x11, x19, x13, ror #35 + shl v7.2d, v31.2d, #0x3 + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + sri v7.2d, v31.2d, #0x3d + bic x27, x24, x9, ror #47 + eor v31.16b, v3.16b, v28.16b + bic x19, x23, x3, ror #9 + shl v10.2d, v31.2d, #0x1c + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + sri v10.2d, v31.2d, #0x24 + bic x29, x3, x29, ror #35 + eor v31.16b, v18.16b, v28.16b + eor x13, x13, x9, ror #57 + shl v3.2d, v31.2d, #0x15 + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + sri v3.2d, v31.2d, #0x2b + bic x14, x14, x8, ror #5 + eor v31.16b, v17.16b, v29.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + shl v18.2d, v31.2d, #0xf + bic x23, x8, x23, ror #38 + sri v18.2d, v31.2d, #0x31 + eor x8, x27, x0, ror #2 + eor v31.16b, v11.16b, v25.16b + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + shl v17.2d, v31.2d, #0xa + eor x23, x3, x26, ror #52 + sri v17.2d, v31.2d, #0x36 + eor x3, x29, x30, ror #24 + eor x0, x15, x11, ror #52 + eor v31.16b, v9.16b, v27.16b + eor x0, x0, x13, ror #48 + shl v11.2d, v31.2d, #0x14 + eor x26, x8, x9, ror #57 + sri v11.2d, v31.2d, #0x2c + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + eor v31.16b, v22.16b, v29.16b + eor x26, x26, x6, ror #51 + shl v9.2d, v31.2d, #0x3d + eor x30, x23, x22, ror #50 + sri v9.2d, v31.2d, #0x3 + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + eor v31.16b, v14.16b, v27.16b + eor x27, x27, x12, ror #5 + shl v22.2d, v31.2d, #0x27 + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + sri v22.2d, v31.2d, #0x19 + eor x26, x30, x21, ror #26 + eor v31.16b, v20.16b, v26.16b + eor x26, x26, x25, ror #15 + shl v14.2d, v31.2d, #0x12 + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + sri v14.2d, v31.2d, #0x2e + ror x26, x26, #0x3a + eor v31.16b, v4.16b, v27.16b + eor x16, x30, x16 + shl v20.2d, v31.2d, #0x1b + eor x28, x30, x28, ror #63 + str x28, [sp, #0xd0] + sri v20.2d, v31.2d, #0x25 + eor x29, x29, x17, ror #36 + eor v31.16b, v24.16b, v27.16b + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + shl v4.2d, v31.2d, #0xe + eor x29, x29, x20, ror #2 + sri v4.2d, v31.2d, #0x32 + eor x28, x28, x4, ror #54 + eor v31.16b, v21.16b, v25.16b + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + shl v24.2d, v31.2d, #0x2 + eor x28, x28, x5, ror #25 + sri v24.2d, v31.2d, #0x3e + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + eor v31.16b, v5.16b, v26.16b + eor x27, x28, x27, ror #61 + shl v21.2d, v31.2d, #0x24 + eor x13, x0, x13, ror #46 + sri v21.2d, v31.2d, #0x1c + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + eor v31.16b, v6.16b, v25.16b + eor x20, x26, x3, ror #39 + shl v27.2d, v31.2d, #0x2c + eor x11, x0, x11, ror #50 + sri v27.2d, v31.2d, #0x14 + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + bic v31.16b, v7.16b, v11.16b + eor x21, x26, x1 + eor v5.16b, v31.16b, v10.16b + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + bic v31.16b, v8.16b, v7.16b + eor x1, x30, x17, ror #36 + eor v6.16b, v31.16b, v11.16b + eor x14, x0, x14, ror #8 + bic v31.16b, v9.16b, v8.16b + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + eor v7.16b, v31.16b, v7.16b + eor x17, x27, x7, ror #19 + bic v31.16b, v10.16b, v9.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + eor v8.16b, v31.16b, v8.16b + eor x4, x26, x4, ror 
#54 + bic v31.16b, v11.16b, v10.16b + eor x0, x0, x12, ror #3 + eor v9.16b, v31.16b, v9.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + bic v31.16b, v12.16b, v16.16b + eor x26, x26, x5, ror #25 + eor v10.16b, v31.16b, v15.16b + eor x2, x7, x16, ror #39 + bic v31.16b, v13.16b, v12.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + eor v11.16b, v31.16b, v16.16b + eor x7, x7, x22, ror #25 + bic v31.16b, v14.16b, v13.16b + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + eor v12.16b, v31.16b, v12.16b + eor x30, x27, x6, ror #43 + bic v31.16b, v15.16b, v14.16b + eor x22, x20, x15, ror #23 + eor v13.16b, v31.16b, v13.16b + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + bic v31.16b, v16.16b, v15.16b + bic x5, x13, x17, ror #63 + eor v14.16b, v31.16b, v14.16b + eor x5, x21, x5, ror #21 + bic v31.16b, v17.16b, v21.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + eor v15.16b, v31.16b, v20.16b + bic x21, x21, x25, ror #50 + bic v31.16b, v18.16b, v17.16b + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + eor v16.16b, v31.16b, v21.16b + eor x16, x21, x19, ror #43 + bic v31.16b, v19.16b, v18.16b + eor x21, x17, x25, ror #30 + eor v17.16b, v31.16b, v17.16b + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x18] + bic v31.16b, v20.16b, v19.16b + eor x17, x10, x9, ror #47 + eor v18.16b, v31.16b, v18.16b + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + bic v31.16b, v21.16b, v20.16b + bic x20, x4, x28, ror #2 + eor v19.16b, v31.16b, v19.16b + eor x10, x20, x1, ror #50 + bic v31.16b, v22.16b, v1.16b + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + eor v20.16b, v31.16b, v0.16b + bic x4, x28, x1, ror #48 + bic v31.16b, v23.16b, v22.16b + bic x1, x1, x11, ror #57 + eor v21.16b, v31.16b, v1.16b + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0xd0] + bic v31.16b, v24.16b, v23.16b + add x25, x25, #0x1 + eor v22.16b, v31.16b, v22.16b + str x25, [sp, #0x18] + cmp x25, #0x17 + bic v31.16b, v0.16b, v24.16b + eor x25, x1, x27, ror #53 + eor v23.16b, v31.16b, v23.16b + bic x27, x30, x26, ror #47 + bic v31.16b, v1.16b, v0.16b + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + eor v24.16b, v31.16b, v24.16b + eor x11, x19, x13, ror #35 + bic v31.16b, v2.16b, v27.16b + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + eor v0.16b, v31.16b, v30.16b + bic x27, x24, x9, ror #47 + bic v31.16b, v3.16b, v2.16b + bic x19, x23, x3, ror #9 + eor v1.16b, v31.16b, v27.16b + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic v31.16b, v4.16b, v3.16b + bic x29, x3, x29, ror #35 + eor v2.16b, v31.16b, v2.16b + eor x13, x13, x9, ror #57 + bic v31.16b, v30.16b, v4.16b + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + eor v3.16b, v31.16b, v3.16b + bic x14, x14, x8, ror #5 + bic v31.16b, v27.16b, v30.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + eor v4.16b, v31.16b, v4.16b + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b + +keccak_f1600_x4_v8a_scalar_hybrid_loop_end: + b.le keccak_f1600_x4_v8a_scalar_hybrid_loop + ror x2, x2, #0x3d + ror x3, x3, #0x27 + ror x4, x4, #0x36 + ror x5, x5, #0x19 + ror x6, x6, #0x2b + ror x7, x7, #0x13 + ror x8, x8, #0x38 + ror x9, x9, #0x31 + ror x10, x10, #0x17 + ror x11, x11, #0x32 + ror x12, x12, #0x3 + ror x13, x13, #0x2e + ror x14, x14, #0x8 + ror x15, x15, #0x3e + ror x17, x17, #0x24 + ror 
x28, x28, #0x3f + ror x19, x19, #0x25 + ror x20, x20, #0x2 + ror x21, x21, #0x14 + ror x22, x22, #0x2c + ror x23, x23, #0x3a + ror x24, x24, #0x1c + ror x25, x25, #0x9 + ldr x30, [sp, #0x20] + cmp x30, #0x1 + b.eq keccak_f1600_x4_scalar_v8a_hybrid_done + mov x30, #0x1 // =1 + str x30, [sp, #0x20] + ldr x0, [sp] + add x0, x0, #0x190 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x190 + add x0, x0, #0x258 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x258 + b keccak_f1600_x4_v8a_scalar_hybrid_initial + +keccak_f1600_x4_scalar_v8a_hybrid_done: + ldr x0, [sp] + add x0, x0, #0x258 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x258 + add x4, x0, #0xc8 + trn1 v25.2d, v0.2d, v1.2d + trn1 v26.2d, v2.2d, v3.2d + stp q25, q26, [x0] + trn2 v27.2d, v0.2d, v1.2d + trn2 v28.2d, v2.2d, v3.2d + stp q27, q28, [x4] + trn1 v25.2d, v4.2d, v5.2d + trn1 v26.2d, v6.2d, v7.2d + stp q25, q26, [x0, #0x20] + trn2 v27.2d, v4.2d, v5.2d + trn2 v28.2d, v6.2d, v7.2d + stp q27, q28, [x4, #0x20] + trn1 v25.2d, v8.2d, v9.2d + trn1 v26.2d, v10.2d, v11.2d + stp q25, q26, [x0, #0x40] + trn2 v27.2d, v8.2d, v9.2d + trn2 v28.2d, v10.2d, v11.2d + stp q27, q28, [x4, #0x40] + trn1 v25.2d, v12.2d, v13.2d + trn1 v26.2d, v14.2d, v15.2d + stp q25, q26, [x0, #0x60] + trn2 v27.2d, v12.2d, v13.2d + trn2 v28.2d, v14.2d, v15.2d + stp q27, q28, [x4, #0x60] + trn1 v25.2d, v16.2d, v17.2d + trn1 v26.2d, v18.2d, v19.2d + stp q25, q26, [x0, #0x80] + trn2 v27.2d, v16.2d, v17.2d + trn2 v28.2d, v18.2d, v19.2d + stp q27, q28, [x4, #0x80] + trn1 v25.2d, v20.2d, v21.2d + trn1 v26.2d, v22.2d, v23.2d + stp q25, q26, [x0, #0xa0] + trn2 v27.2d, v20.2d, v21.2d + trn2 v28.2d, v22.2d, v23.2d + stp q27, q28, [x4, #0xa0] + str d24, [x0, #0xc0] + trn2 v25.2d, v24.2d, v24.2d + str d25, [x4, #0xc0] + ldp d14, d15, [sp, #0xc0] + ldp d12, d13, [sp, #0xb0] + ldp d10, d11, [sp, #0xa0] + ldp d8, d9, [sp, #0x90] + ldp x19, x20, [sp, #0x30] + ldp x21, x22, [sp, #0x40] + ldp x23, x24, [sp, #0x50] + ldp x25, x26, [sp, #0x60] + ldp x27, x28, [sp, #0x70] + ldp x29, x30, [sp, #0x80] + add sp, sp, #0xe0 + ret +/* simpasm: footer-start */ +#endif /* MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/fips202/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S b/dev/fips202/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S new file mode 100644 index 000000000..6be374a10 --- /dev/null +++ b/dev/fips202/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S @@ -0,0 +1,916 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 
2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// Author: Hanno Becker +// Author: Matthias Kannwischer + +#include "../../../../common.h" +#if defined(MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#if defined(__ARM_FEATURE_SHA3) +/* simpasm: header-end */ + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_scalar_v8a_v84a_hybrid_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_scalar_v8a_v84a_hybrid_asm) + + sub sp, sp, #0xe0 + stp x19, x20, [sp, #0x30] + stp x21, x22, [sp, #0x40] + stp x23, x24, [sp, #0x50] + stp x25, x26, [sp, #0x60] + stp x27, x28, [sp, #0x70] + stp x29, x30, [sp, #0x80] + stp d8, d9, [sp, #0x90] + stp d10, d11, [sp, #0xa0] + stp d12, d13, [sp, #0xb0] + stp d14, d15, [sp, #0xc0] + mov x29, x1 + mov x30, #0x0 // =0 + str x30, [sp, #0x20] + str x29, [sp, #0x8] + str x29, [sp, #0x10] + str x0, [sp] + add x4, x0, #0xc8 + ldp q25, q26, [x0] + ldp q27, q28, [x4] + trn1 v0.2d, v25.2d, v27.2d + trn2 v1.2d, v25.2d, v27.2d + trn1 v2.2d, v26.2d, v28.2d + trn2 v3.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x20] + ldp q27, q28, [x4, #0x20] + trn1 v4.2d, v25.2d, v27.2d + trn2 v5.2d, v25.2d, v27.2d + trn1 v6.2d, v26.2d, v28.2d + trn2 v7.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x40] + ldp q27, q28, [x4, #0x40] + trn1 v8.2d, v25.2d, v27.2d + trn2 v9.2d, v25.2d, v27.2d + trn1 v10.2d, v26.2d, v28.2d + trn2 v11.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x60] + ldp q27, q28, [x4, #0x60] + trn1 v12.2d, v25.2d, v27.2d + trn2 v13.2d, v25.2d, v27.2d + trn1 v14.2d, v26.2d, v28.2d + trn2 v15.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0x80] + ldp q27, q28, [x4, #0x80] + trn1 v16.2d, v25.2d, v27.2d + trn2 v17.2d, v25.2d, v27.2d + trn1 v18.2d, v26.2d, v28.2d + trn2 v19.2d, v26.2d, v28.2d + ldp q25, q26, [x0, #0xa0] + ldp q27, q28, [x4, #0xa0] + trn1 v20.2d, v25.2d, v27.2d + trn2 v21.2d, v25.2d, v27.2d + trn1 v22.2d, v26.2d, v28.2d + trn2 v23.2d, v26.2d, v28.2d + ldr d25, [x0, #0xc0] + ldr d27, [x4, #0xc0] + trn1 v24.2d, v25.2d, v27.2d + add x0, x0, #0x190 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x190 + +keccak_f1600_x4_v8a_v84a_scalar_hybrid_initial: + eor x30, x24, x25 + eor x27, x9, x10 + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor v30.16b, v30.16b, v15.16b + eor x0, x30, x21 + eor x26, x27, x6 + eor v30.16b, v30.16b, v20.16b + eor x27, x26, x7 + eor x29, x0, x22 + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor x26, x29, x23 + eor x29, x4, x5 + eor v29.16b, v29.16b, v16.16b + eor x30, x29, x1 + eor x0, x27, x8 + eor v29.16b, v29.16b, v21.16b + eor x29, x30, x2 + eor x30, x19, x20 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x30, x30, x16 + eor x27, x26, x0, ror #63 + eor v28.16b, v28.16b, v17.16b + eor x4, x4, x27 + eor x30, x30, x17 + eor v28.16b, v28.16b, v22.16b + eor x30, x30, x28 + eor x29, x29, x3 + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x30, ror #63 + eor x30, x30, x29, ror #63 + eor v27.16b, v27.16b, v18.16b + eor x22, x22, 
x30 + eor v27.16b, v27.16b, v23.16b + eor x23, x23, x30 + str x23, [sp, #0xd0] + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor x23, x14, x15 + eor x14, x14, x0 + eor v26.16b, v26.16b, v19.16b + eor x23, x23, x11 + eor x15, x15, x0 + eor v26.16b, v26.16b, v24.16b + eor x1, x1, x27 + eor x23, x23, x12 + rax1 v25.2d, v30.2d, v28.2d + eor x23, x23, x13 + eor x11, x11, x0 + add v31.2d, v26.2d, v26.2d + eor x29, x29, x23, ror #63 + eor x23, x23, x26, ror #63 + sri v31.2d, v26.2d, #0x3f + eor x26, x13, x0 + eor x13, x28, x23 + eor v28.16b, v31.16b, v28.16b + eor x28, x24, x30 + eor x24, x16, x23 + rax1 v26.2d, v26.2d, v29.2d + eor x16, x21, x30 + eor x21, x25, x30 + add v31.2d, v27.2d, v27.2d + eor x30, x19, x23 + sri v31.2d, v27.2d, #0x3f + eor x19, x20, x23 + eor x20, x17, x23 + eor v29.16b, v31.16b, v29.16b + eor x17, x12, x0 + eor x0, x2, x27 + rax1 v27.2d, v27.2d, v30.2d + eor x2, x6, x29 + eor x6, x8, x29 + eor v30.16b, v0.16b, v26.16b + bic x8, x28, x13, ror #47 + eor x12, x3, x27 + eor v31.16b, v2.16b, v29.16b + bic x3, x13, x17, ror #19 + eor x5, x5, x27 + shl v0.2d, v31.2d, #0x3e + ldr x27, [sp, #0xd0] + bic x25, x17, x2, ror #5 + sri v0.2d, v31.2d, #0x2 + eor x9, x9, x29 + eor x23, x25, x5, ror #52 + xar v2.2d, v12.2d, v29.2d, #0x15 + eor x3, x3, x2, ror #24 + eor x8, x8, x17, ror #2 + eor v31.16b, v13.16b, v28.16b + eor x17, x10, x29 + bic x25, x12, x22, ror #47 + shl v12.2d, v31.2d, #0x19 + eor x29, x7, x29 + bic x10, x4, x27, ror #2 + sri v12.2d, v31.2d, #0x27 + bic x7, x5, x28, ror #10 + xar v13.2d, v19.2d, v27.2d, #0x38 + eor x10, x10, x20, ror #50 + eor x13, x7, x13, ror #57 + eor v31.16b, v23.16b, v28.16b + bic x7, x2, x5, ror #47 + eor x2, x25, x24, ror #39 + shl v19.2d, v31.2d, #0x38 + bic x25, x20, x11, ror #57 + bic x5, x17, x4, ror #25 + sri v19.2d, v31.2d, #0x8 + eor x25, x25, x17, ror #53 + bic x17, x11, x17, ror #60 + xar v23.2d, v15.2d, v26.2d, #0x17 + eor x28, x7, x28, ror #57 + bic x7, x9, x12, ror #42 + eor v31.16b, v1.16b, v25.16b + eor x7, x7, x22, ror #25 + bic x22, x22, x24, ror #56 + shl v15.2d, v31.2d, #0x1 + bic x24, x24, x15, ror #31 + eor x22, x22, x15, ror #23 + sri v15.2d, v31.2d, #0x3f + bic x20, x27, x20, ror #48 + bic x15, x15, x9, ror #16 + xar v1.2d, v8.2d, v28.2d, #0x9 + eor x12, x15, x12, ror #58 + eor x15, x5, x27, ror #27 + eor v31.16b, v16.16b, v25.16b + eor x5, x20, x11, ror #41 + shl v8.2d, v31.2d, #0x2d + ldr x11, [sp, #0x8] + eor x20, x17, x4, ror #21 + sri v8.2d, v31.2d, #0x13 + eor x17, x24, x9, ror #47 + mov x24, #0x1 // =1 + xar v16.2d, v7.2d, v29.2d, #0x3a + bic x9, x0, x16, ror #9 + str x24, [sp, #0x18] + eor v31.16b, v10.16b, v26.16b + bic x24, x29, x1, ror #44 + bic x27, x1, x21, ror #50 + shl v7.2d, v31.2d, #0x3 + bic x4, x26, x29, ror #63 + eor x1, x1, x4, ror #21 + sri v7.2d, v31.2d, #0x3d + ldr x11, [x11] + bic x4, x21, x30, ror #57 + xar v10.2d, v3.2d, v28.2d, #0x24 + eor x21, x24, x21, ror #30 + eor x24, x9, x19, ror #44 + eor v31.16b, v18.16b, v28.16b + bic x9, x14, x6, ror #5 + eor x9, x9, x0, ror #43 + shl v3.2d, v31.2d, #0x15 + bic x0, x6, x0, ror #38 + eor x1, x1, x11 + sri v3.2d, v31.2d, #0x2b + eor x11, x4, x26, ror #35 + eor x4, x0, x16, ror #47 + xar v18.2d, v17.2d, v29.2d, #0x31 + bic x0, x16, x19, ror #35 + eor v31.16b, v11.16b, v25.16b + eor x16, x27, x30, ror #43 + bic x27, x30, x26, ror #42 + shl v17.2d, v31.2d, #0xa + bic x26, x19, x14, ror #41 + eor x19, x0, x14, ror #12 + sri v17.2d, v31.2d, #0x36 + eor x14, x26, x6, ror #46 + eor x6, x27, x29, ror #41 + xar v11.2d, v9.2d, v27.2d, #0x2c + eor x0, x15, x11, ror #52 
+ eor x0, x0, x13, ror #48 + eor v31.16b, v22.16b, v29.16b + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + shl v9.2d, v31.2d, #0x3d + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + sri v9.2d, v31.2d, #0x3 + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + xar v22.2d, v14.2d, v27.2d, #0x19 + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor v31.16b, v20.16b, v26.16b + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + shl v14.2d, v31.2d, #0x12 + eor x26, x30, x21, ror #26 + sri v14.2d, v31.2d, #0x2e + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + xar v20.2d, v4.2d, v27.2d, #0x25 + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + eor v31.16b, v24.16b, v27.16b + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + shl v4.2d, v31.2d, #0xe + str x28, [sp, #0xd0] + eor x29, x29, x17, ror #36 + sri v4.2d, v31.2d, #0x32 + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + xar v24.2d, v21.2d, v25.2d, #0x3e + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + eor v31.16b, v5.16b, v26.16b + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + shl v21.2d, v31.2d, #0x24 + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + sri v21.2d, v31.2d, #0x1c + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + xar v27.2d, v6.2d, v25.2d, #0x14 + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + bic v31.16b, v7.16b, v11.16b + eor x29, x30, x20, ror #2 + eor v5.16b, v31.16b, v10.16b + eor x20, x26, x3, ror #39 + eor x11, x0, x11, ror #50 + bcax v6.16b, v11.16b, v8.16b, v7.16b + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + bic v31.16b, v9.16b, v8.16b + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + eor v7.16b, v31.16b, v7.16b + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + bcax v8.16b, v8.16b, v10.16b, v9.16b + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + bic v31.16b, v11.16b, v10.16b + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + eor v9.16b, v31.16b, v9.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + bcax v10.16b, v15.16b, v12.16b, v16.16b + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + bic v31.16b, v13.16b, v12.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + eor v11.16b, v31.16b, v16.16b + eor x26, x26, x5, ror #25 + bcax v12.16b, v12.16b, v14.16b, v13.16b + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + bic v31.16b, v15.16b, v14.16b + bic x30, x15, x9, ror #16 + eor x7, x7, x22, ror #25 + eor v13.16b, v31.16b, v13.16b + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + bic v31.16b, v16.16b, v15.16b + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + eor v14.16b, v31.16b, v14.16b + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + bcax v15.16b, v20.16b, v17.16b, v21.16b + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + bic v31.16b, v18.16b, v17.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + eor v16.16b, v31.16b, v21.16b + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + bcax v17.16b, v17.16b, v19.16b, v18.16b + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + bic v31.16b, v20.16b, v19.16b + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + eor v18.16b, v31.16b, v18.16b + ldr x25, [sp, #0x18] + bcax v19.16b, v19.16b, v21.16b, v20.16b + eor x17, x10, x9, ror #47 + ldr x9, [sp, #0x8] + bic v31.16b, v22.16b, v1.16b + eor x15, x20, x28, ror #27 + bic x20, x4, x28, ror #2 + eor v20.16b, v31.16b, v0.16b + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + bcax v21.16b, v1.16b, v23.16b, v22.16b + eor 
x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + bic v31.16b, v24.16b, v23.16b + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + eor v22.16b, v31.16b, v22.16b + ldr x9, [sp, #0xd0] + add x25, x25, #0x1 + bcax v23.16b, v23.16b, v0.16b, v24.16b + str x25, [sp, #0x18] + cmp x25, #0x17 + bic v31.16b, v1.16b, v0.16b + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + eor v24.16b, v31.16b, v24.16b + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + bcax v0.16b, v30.16b, v2.16b, v27.16b + eor x11, x19, x13, ror #35 + bic v31.16b, v3.16b, v2.16b + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + eor v1.16b, v31.16b, v27.16b + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bcax v2.16b, v2.16b, v4.16b, v3.16b + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic v31.16b, v30.16b, v4.16b + bic x29, x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor v3.16b, v31.16b, v3.16b + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bcax v4.16b, v4.16b, v27.16b, v30.16b + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b + +keccak_f1600_x4_v8a_v84a_scalar_hybrid_loop: + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor v30.16b, v30.16b, v15.16b + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + eor v30.16b, v30.16b, v20.16b + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + eor v29.16b, v29.16b, v16.16b + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor v29.16b, v29.16b, v21.16b + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x26, x30, x21, ror #26 + eor x26, x26, x25, ror #15 + eor v28.16b, v28.16b, v17.16b + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + eor v28.16b, v28.16b, v22.16b + ror x26, x26, #0x3a + eor x16, x30, x16 + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor x28, x30, x28, ror #63 + str x28, [sp, #0xd0] + eor v27.16b, v27.16b, v18.16b + eor x29, x29, x17, ror #36 + eor x28, x1, x2, ror #61 + eor v27.16b, v27.16b, v23.16b + eor x19, x30, x19, ror #37 + eor x29, x29, x20, ror #2 + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor x28, x28, x4, ror #54 + eor x26, x26, x0, ror #55 + eor v26.16b, v26.16b, v19.16b + eor x28, x28, x3, ror #39 + eor x28, x28, x5, ror #25 + eor v26.16b, v26.16b, v24.16b + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + rax1 v25.2d, v30.2d, v28.2d + eor x27, x28, x27, ror #61 + eor x13, x0, x13, ror #46 + add v31.2d, v26.2d, v26.2d + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + sri v31.2d, v26.2d, #0x3f + eor x20, x26, x3, ror #39 + eor x11, x0, x11, ror #50 + eor v28.16b, v31.16b, v28.16b + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + rax1 v26.2d, v26.2d, v29.2d + eor x21, x26, x1 + add v31.2d, v27.2d, v27.2d + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + sri v31.2d, v27.2d, #0x3f + eor x1, x30, x17, ror #36 + eor x14, x0, x14, ror #8 + eor v29.16b, v31.16b, v29.16b + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + rax1 v27.2d, v27.2d, v30.2d + eor x17, x27, x7, ror #19 + eor x15, x0, x15, ror #62 + eor v30.16b, v0.16b, v26.16b + bic x7, x20, x22, ror #47 + eor x4, x26, x4, ror 
#54 + eor v31.16b, v2.16b, v29.16b + eor x0, x0, x12, ror #3 + eor x28, x28, x23, ror #58 + shl v0.2d, v31.2d, #0x3e + eor x23, x26, x2, ror #61 + eor x26, x26, x5, ror #25 + sri v0.2d, v31.2d, #0x2 + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + xar v2.2d, v12.2d, v29.2d, #0x15 + bic x30, x15, x9, ror #16 + eor x7, x7, x22, ror #25 + eor v31.16b, v13.16b, v28.16b + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + shl v12.2d, v31.2d, #0x19 + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + sri v12.2d, v31.2d, #0x27 + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + xar v13.2d, v19.2d, v27.2d, #0x38 + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + eor v31.16b, v23.16b, v28.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + shl v19.2d, v31.2d, #0x38 + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + sri v19.2d, v31.2d, #0x8 + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + xar v23.2d, v15.2d, v26.2d, #0x17 + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + eor v31.16b, v1.16b, v25.16b + ldr x25, [sp, #0x18] + eor x17, x10, x9, ror #47 + shl v15.2d, v31.2d, #0x1 + ldr x9, [sp, #0x8] + sri v15.2d, v31.2d, #0x3f + eor x15, x20, x28, ror #27 + bic x20, x4, x28, ror #2 + xar v1.2d, v8.2d, v28.2d, #0x9 + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + eor v31.16b, v16.16b, v25.16b + eor x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + shl v8.2d, v31.2d, #0x2d + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + sri v8.2d, v31.2d, #0x13 + ldr x9, [sp, #0xd0] + add x25, x25, #0x1 + xar v16.2d, v7.2d, v29.2d, #0x3a + str x25, [sp, #0x18] + cmp x25, #0x17 + eor v31.16b, v10.16b, v26.16b + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + shl v7.2d, v31.2d, #0x3 + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + sri v7.2d, v31.2d, #0x3d + eor x11, x19, x13, ror #35 + bic x13, x26, x24, ror #10 + xar v10.2d, v3.2d, v28.2d, #0x24 + eor x28, x27, x24, ror #57 + bic x27, x24, x9, ror #47 + eor v31.16b, v18.16b, v28.16b + bic x19, x23, x3, ror #9 + bic x4, x29, x14, ror #41 + shl v3.2d, v31.2d, #0x15 + eor x24, x19, x29, ror #44 + bic x29, x3, x29, ror #35 + sri v3.2d, v31.2d, #0x2b + eor x13, x13, x9, ror #57 + eor x19, x29, x14, ror #12 + xar v18.2d, v17.2d, v29.2d, #0x31 + bic x29, x9, x0, ror #19 + bic x14, x14, x8, ror #5 + eor v31.16b, v11.16b, v25.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + shl v17.2d, v31.2d, #0xa + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + sri v17.2d, v31.2d, #0x36 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + xar v11.2d, v9.2d, v27.2d, #0x2c + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + eor v31.16b, v22.16b, v29.16b + eor x0, x15, x11, ror #52 + shl v9.2d, v31.2d, #0x3d + eor x0, x0, x13, ror #48 + eor x26, x8, x9, ror #57 + sri v9.2d, v31.2d, #0x3 + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + xar v22.2d, v14.2d, v27.2d, #0x19 + eor x26, x26, x6, ror #51 + eor x30, x23, x22, ror #50 + eor v31.16b, v20.16b, v26.16b + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + shl v14.2d, v31.2d, #0x12 + eor x27, x27, x12, ror #5 + eor x30, x30, x24, ror #34 + sri v14.2d, v31.2d, #0x2e + eor x0, x0, x7, ror #27 + eor x26, x30, x21, ror #26 + xar v20.2d, v4.2d, v27.2d, #0x25 + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + eor v31.16b, v24.16b, v27.16b + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + shl v4.2d, v31.2d, #0xe + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + sri v4.2d, v31.2d, #0x32 + str x28, [sp, #0xd0] 
+ eor x29, x29, x17, ror #36 + xar v24.2d, v21.2d, v25.2d, #0x3e + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + eor v31.16b, v5.16b, v26.16b + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + shl v21.2d, v31.2d, #0x24 + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + sri v21.2d, v31.2d, #0x1c + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + xar v27.2d, v6.2d, v25.2d, #0x14 + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + bic v31.16b, v7.16b, v11.16b + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + eor v5.16b, v31.16b, v10.16b + eor x29, x30, x20, ror #2 + eor x20, x26, x3, ror #39 + bcax v6.16b, v11.16b, v8.16b, v7.16b + eor x11, x0, x11, ror #50 + eor x25, x28, x25, ror #9 + bic v31.16b, v9.16b, v8.16b + eor x3, x28, x21, ror #20 + eor v7.16b, v31.16b, v7.16b + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + bcax v8.16b, v8.16b, v10.16b, v9.16b + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + bic v31.16b, v11.16b, v10.16b + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + eor v9.16b, v31.16b, v9.16b + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + bcax v10.16b, v15.16b, v12.16b, v16.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + bic v31.16b, v13.16b, v12.16b + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + eor v11.16b, v31.16b, v16.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + bcax v12.16b, v12.16b, v14.16b, v13.16b + eor x26, x26, x5, ror #25 + eor x2, x7, x16, ror #39 + bic v31.16b, v15.16b, v14.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + eor v13.16b, v31.16b, v13.16b + eor x7, x7, x22, ror #25 + eor x12, x30, x20, ror #58 + bic v31.16b, v16.16b, v15.16b + bic x20, x22, x16, ror #56 + eor x30, x27, x6, ror #43 + eor v14.16b, v31.16b, v14.16b + eor x22, x20, x15, ror #23 + bic x6, x19, x13, ror #42 + bcax v15.16b, v20.16b, v17.16b, v21.16b + eor x6, x6, x17, ror #41 + bic x5, x13, x17, ror #63 + bic v31.16b, v18.16b, v17.16b + eor x5, x21, x5, ror #21 + bic x17, x17, x21, ror #44 + eor v16.16b, v31.16b, v21.16b + eor x27, x27, x10, ror #23 + bic x21, x21, x25, ror #50 + bcax v17.16b, v17.16b, v19.16b, v18.16b + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + bic v31.16b, v20.16b, v19.16b + eor x16, x21, x19, ror #43 + eor x21, x17, x25, ror #30 + eor v18.16b, v31.16b, v18.16b + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x18] + bcax v19.16b, v19.16b, v21.16b, v20.16b + eor x17, x10, x9, ror #47 + bic v31.16b, v22.16b, v1.16b + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + eor v20.16b, v31.16b, v0.16b + bic x20, x4, x28, ror #2 + eor x10, x20, x1, ror #50 + bcax v21.16b, v1.16b, v23.16b, v22.16b + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + bic v31.16b, v24.16b, v23.16b + bic x4, x28, x1, ror #48 + bic x1, x1, x11, ror #57 + eor v22.16b, v31.16b, v22.16b + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0xd0] + bcax v23.16b, v23.16b, v0.16b, v24.16b + add x25, x25, #0x1 + str x25, [sp, #0x18] + bic v31.16b, v1.16b, v0.16b + cmp x25, #0x17 + eor x25, x1, x27, ror #53 + eor v24.16b, v31.16b, v24.16b + bic x27, x30, x26, ror #47 + eor x1, x5, x28 + bcax v0.16b, v30.16b, v2.16b, v27.16b + eor x5, x4, x11, ror #41 + eor x11, x19, x13, ror #35 + bic v31.16b, v3.16b, v2.16b + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + eor v1.16b, v31.16b, v27.16b + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bcax v2.16b, v2.16b, v4.16b, v3.16b + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic v31.16b, v30.16b, v4.16b + bic x29, 
x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor v3.16b, v31.16b, v3.16b + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bcax v4.16b, v4.16b, v27.16b, v30.16b + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b + +keccak_f1600_x4_v8a_v84a_scalar_hybrid_loop_end: + b.le keccak_f1600_x4_v8a_v84a_scalar_hybrid_loop + ror x2, x2, #0x3d + ror x3, x3, #0x27 + ror x4, x4, #0x36 + ror x5, x5, #0x19 + ror x6, x6, #0x2b + ror x7, x7, #0x13 + ror x8, x8, #0x38 + ror x9, x9, #0x31 + ror x10, x10, #0x17 + ror x11, x11, #0x32 + ror x12, x12, #0x3 + ror x13, x13, #0x2e + ror x14, x14, #0x8 + ror x15, x15, #0x3e + ror x17, x17, #0x24 + ror x28, x28, #0x3f + ror x19, x19, #0x25 + ror x20, x20, #0x2 + ror x21, x21, #0x14 + ror x22, x22, #0x2c + ror x23, x23, #0x3a + ror x24, x24, #0x1c + ror x25, x25, #0x9 + ldr x30, [sp, #0x20] + cmp x30, #0x1 + b.eq keccak_f1600_x4_scalar_v8a_v84a_hybrid_done + mov x30, #0x1 // =1 + str x30, [sp, #0x20] + ldr x0, [sp] + add x0, x0, #0x190 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x190 + add x0, x0, #0x258 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x258 + b keccak_f1600_x4_v8a_v84a_scalar_hybrid_initial + +keccak_f1600_x4_scalar_v8a_v84a_hybrid_done: + ldr x0, [sp] + add x0, x0, #0x258 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x258 + add x4, x0, #0xc8 + trn1 v25.2d, v0.2d, v1.2d + trn1 v26.2d, v2.2d, v3.2d + stp q25, q26, [x0] + trn2 v27.2d, v0.2d, v1.2d + trn2 v28.2d, v2.2d, v3.2d + stp q27, q28, [x4] + trn1 v25.2d, v4.2d, v5.2d + trn1 v26.2d, v6.2d, v7.2d + stp q25, q26, [x0, #0x20] + trn2 v27.2d, v4.2d, v5.2d + trn2 v28.2d, v6.2d, v7.2d + stp q27, q28, [x4, #0x20] + trn1 v25.2d, v8.2d, v9.2d + trn1 v26.2d, v10.2d, v11.2d + stp q25, q26, [x0, #0x40] + trn2 v27.2d, v8.2d, v9.2d + trn2 v28.2d, v10.2d, v11.2d + stp q27, q28, [x4, #0x40] + trn1 v25.2d, v12.2d, v13.2d + trn1 v26.2d, v14.2d, v15.2d + stp q25, q26, [x0, #0x60] + trn2 v27.2d, v12.2d, v13.2d + trn2 v28.2d, v14.2d, v15.2d + stp q27, q28, [x4, #0x60] + trn1 v25.2d, v16.2d, v17.2d + trn1 v26.2d, v18.2d, v19.2d + stp q25, q26, [x0, #0x80] + trn2 v27.2d, v16.2d, v17.2d + trn2 v28.2d, v18.2d, v19.2d + stp q27, q28, [x4, #0x80] + trn1 v25.2d, v20.2d, v21.2d + trn1 v26.2d, v22.2d, v23.2d + stp q25, q26, [x0, #0xa0] + trn2 v27.2d, v20.2d, v21.2d + trn2 v28.2d, v22.2d, v23.2d + stp q27, 
q28, [x4, #0xa0] + str d24, [x0, #0xc0] + trn2 v25.2d, v24.2d, v24.2d + str d25, [x4, #0xc0] + ldp d14, d15, [sp, #0xc0] + ldp d12, d13, [sp, #0xb0] + ldp d10, d11, [sp, #0xa0] + ldp d8, d9, [sp, #0x90] + ldp x19, x20, [sp, #0x30] + ldp x21, x22, [sp, #0x40] + ldp x23, x24, [sp, #0x50] + ldp x25, x26, [sp, #0x60] + ldp x27, x28, [sp, #0x70] + ldp x29, x30, [sp, #0x80] + add sp, sp, #0xe0 + ret +/* simpasm: footer-start */ +#endif /* __ARM_FEATURE_SHA3 */ + +#endif /* MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/fips202/aarch64/src/keccakf1600_round_constants.c b/dev/fips202/aarch64/src/keccakf1600_round_constants.c new file mode 100644 index 000000000..05e9cbfee --- /dev/null +++ b/dev/fips202/aarch64/src/keccakf1600_round_constants.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../../common.h" + +#if (defined(MLD_FIPS202_AARCH64_NEED_X1_SCALAR) || \ + defined(MLD_FIPS202_AARCH64_NEED_X1_V84A) || \ + defined(MLD_FIPS202_AARCH64_NEED_X2_V84A) || \ + defined(MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID) || \ + defined(MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID)) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include "fips202_native_aarch64.h" + +MLD_ALIGN const uint64_t mld_keccakf1600_round_constants[] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008, +}; + +#else /* (MLD_FIPS202_AARCH64_NEED_X1_SCALAR || \ + MLD_FIPS202_AARCH64_NEED_X1_V84A || MLD_FIPS202_AARCH64_NEED_X2_V84A \ + || MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID || \ + MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID) && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(fips202_aarch64_round_constants) + +#endif /* !((MLD_FIPS202_AARCH64_NEED_X1_SCALAR || \ + MLD_FIPS202_AARCH64_NEED_X1_V84A || MLD_FIPS202_AARCH64_NEED_X2_V84A \ + || MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID || \ + MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID) && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/fips202/aarch64/x1_scalar.h b/dev/fips202/aarch64/x1_scalar.h new file mode 100644 index 000000000..eeafcaaff --- /dev/null +++ b/dev/fips202/aarch64/x1_scalar.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_FIPS202_NATIVE_AARCH64_X1_SCALAR_H +#define MLD_FIPS202_NATIVE_AARCH64_X1_SCALAR_H + +/* Part of backend API */ +#define MLD_USE_FIPS202_X1_NATIVE +/* Guard for assembly file */ +#define MLD_FIPS202_AARCH64_NEED_X1_SCALAR + +#if !defined(__ASSEMBLER__) +#include "src/fips202_native_aarch64.h" +static MLD_INLINE void mld_keccak_f1600_x1_native(uint64_t *state) +{ + mld_keccak_f1600_x1_scalar_asm(state, mld_keccakf1600_round_constants); +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_FIPS202_NATIVE_AARCH64_X1_SCALAR_H */ diff --git a/dev/fips202/aarch64/x1_v84a.h 
b/dev/fips202/aarch64/x1_v84a.h new file mode 100644 index 000000000..b35861ab2 --- /dev/null +++ b/dev/fips202/aarch64/x1_v84a.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_FIPS202_NATIVE_AARCH64_X1_V84A_H +#define MLD_FIPS202_NATIVE_AARCH64_X1_V84A_H + +#if !defined(__ARM_FEATURE_SHA3) +#error This backend can only be used if SHA3 extensions are available. +#endif + +/* Part of backend API */ +#define MLD_USE_FIPS202_X1_NATIVE +/* Guard for assembly file */ +#define MLD_FIPS202_AARCH64_NEED_X1_V84A + +#if !defined(__ASSEMBLER__) +#include "src/fips202_native_aarch64.h" +static MLD_INLINE void mld_keccak_f1600_x1_native(uint64_t *state) +{ + mld_keccak_f1600_x1_v84a_asm(state, mld_keccakf1600_round_constants); +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_FIPS202_NATIVE_AARCH64_X1_V84A_H */ diff --git a/dev/fips202/aarch64/x2_v84a.h b/dev/fips202/aarch64/x2_v84a.h new file mode 100644 index 000000000..d03c8e8ee --- /dev/null +++ b/dev/fips202/aarch64/x2_v84a.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_FIPS202_NATIVE_AARCH64_X2_V84A_H +#define MLD_FIPS202_NATIVE_AARCH64_X2_V84A_H + +#if !defined(__ARM_FEATURE_SHA3) +#error This backend can only be used if SHA3 extensions are available. +#endif + +/* Part of backend API */ +#define MLD_USE_FIPS202_X2_NATIVE +/* Guard for assembly file */ +#define MLD_FIPS202_AARCH64_NEED_X2_V84A + +#if !defined(__ASSEMBLER__) +#include "src/fips202_native_aarch64.h" +static MLD_INLINE void mld_keccak_f1600_x2_native(uint64_t *state) +{ + mld_keccak_f1600_x2_v84a_asm(state, mld_keccakf1600_round_constants); +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_FIPS202_NATIVE_AARCH64_X2_V84A_H */ diff --git a/dev/fips202/aarch64/x4_v8a_scalar.h b/dev/fips202/aarch64/x4_v8a_scalar.h new file mode 100644 index 000000000..76a71672d --- /dev/null +++ b/dev/fips202/aarch64/x4_v8a_scalar.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_FIPS202_NATIVE_AARCH64_X4_V8A_SCALAR_H +#define MLD_FIPS202_NATIVE_AARCH64_X4_V8A_SCALAR_H + +/* Part of backend API */ +#define MLD_USE_FIPS202_X4_NATIVE +/* Guard for assembly file */ +#define MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID + +#if !defined(__ASSEMBLER__) +#include "src/fips202_native_aarch64.h" +static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state) +{ + mld_keccak_f1600_x4_scalar_v8a_hybrid_asm(state, + mld_keccakf1600_round_constants); +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_FIPS202_NATIVE_AARCH64_X4_V8A_SCALAR_H */ diff --git a/dev/fips202/aarch64/x4_v8a_v84a_scalar.h b/dev/fips202/aarch64/x4_v8a_v84a_scalar.h new file mode 100644 index 000000000..c95b97699 --- /dev/null +++ b/dev/fips202/aarch64/x4_v8a_v84a_scalar.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_FIPS202_NATIVE_AARCH64_X4_V8A_V84A_SCALAR_H +#define MLD_FIPS202_NATIVE_AARCH64_X4_V8A_V84A_SCALAR_H + +#if !defined(__ARM_FEATURE_SHA3) +#error This backend can only be used if SHA3 extensions are available. 
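+/* __ARM_FEATURE_SHA3 is the ACLE feature macro for the Armv8.4-A SHA3
+ * instructions (EOR3, RAX1, XAR, BCAX) used by the vector half of this
+ * hybrid; it is defined, for example, when building with
+ * -march=armv8.4-a+sha3. */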
+#endif + +/* Part of backend API */ +#define MLD_USE_FIPS202_X4_NATIVE +/* Guard for assembly file */ +#define MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID + +#if !defined(__ASSEMBLER__) +#include "src/fips202_native_aarch64.h" +static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state) +{ + mld_keccak_f1600_x4_scalar_v8a_v84a_hybrid_asm( + state, mld_keccakf1600_round_constants); +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_FIPS202_NATIVE_AARCH64_X4_V8A_V84A_SCALAR_H */ diff --git a/mldsa/native/x86_64/README.md b/dev/x86_64/README.md similarity index 100% rename from mldsa/native/x86_64/README.md rename to dev/x86_64/README.md diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h new file mode 100644 index 000000000..59704f0f2 --- /dev/null +++ b/dev/x86_64/meta.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_X86_64_META_H +#define MLD_NATIVE_X86_64_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLD_ARITH_BACKEND_X86_64_DEFAULT + +#define MLD_USE_NATIVE_NTT_CUSTOM_ORDER +#define MLD_USE_NATIVE_NTT +#define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_REJ_UNIFORM +#define MLD_USE_NATIVE_REJ_UNIFORM_ETA2 +#define MLD_USE_NATIVE_REJ_UNIFORM_ETA4 +#define MLD_USE_NATIVE_POLY_DECOMPOSE_32 +#define MLD_USE_NATIVE_POLY_DECOMPOSE_88 +#define MLD_USE_NATIVE_POLY_CADDQ +#define MLD_USE_NATIVE_POLY_USE_HINT_32 +#define MLD_USE_NATIVE_POLY_USE_HINT_88 +#define MLD_USE_NATIVE_POLY_CHKNORM +#define MLD_USE_NATIVE_POLYZ_UNPACK_17 +#define MLD_USE_NATIVE_POLYZ_UNPACK_19 +#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "src/arith_native_x86_64.h" + +static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t data[MLDSA_N]) +{ + mld_nttunpack_avx2((__m256i *)(data)); +} + +static MLD_INLINE void mld_ntt_native(int32_t data[MLDSA_N]) +{ + mld_ntt_avx2((__m256i *)data, mld_qdata.vec); +} +static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N]) +{ + mld_invntt_avx2((__m256i *)data, mld_qdata.vec); +} + +static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +{ + /* AVX2 implementation assumes specific buffer lengths */ + if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_BUFLEN) + { + return -1; + } + + /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. */ + return (int)mld_rej_uniform_avx2(r, buf); +} + +static MLD_INLINE int mld_rej_uniform_eta2_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +{ + unsigned int outlen; + /* AVX2 implementation assumes specific buffer lengths */ + if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN) + { + return -1; + } + + /* Constant time: Inputs and outputs to this function are secret. + * It is safe to leak which coefficients are accepted/rejected. + * The assembly implementation must not leak any other information about the + * accepted coefficients. Constant-time testing cannot cover this, and we + * hence have to manually verify the assembly. + * We declassify prior the input data and mark the outputs as secret. 
+ */ + MLD_CT_TESTING_DECLASSIFY(buf, buflen); + outlen = mld_rej_uniform_eta2_avx2(r, buf); + MLD_CT_TESTING_SECRET(r, sizeof(int32_t) * outlen); + /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. */ + return (int)outlen; +} + +static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +{ + unsigned int outlen; + /* AVX2 implementation assumes specific buffer lengths */ + if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN) + { + return -1; + } + + /* Constant time: Inputs and outputs to this function are secret. + * It is safe to leak which coefficients are accepted/rejected. + * The assembly implementation must not leak any other information about the + * accepted coefficients. Constant-time testing cannot cover this, and we + * hence have to manually verify the assembly. + * We declassify prior the input data and mark the outputs as secret. + */ + MLD_CT_TESTING_DECLASSIFY(buf, buflen); + outlen = mld_rej_uniform_eta4_avx2(r, buf); + MLD_CT_TESTING_SECRET(r, sizeof(int32_t) * outlen); + /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. */ + return (int)outlen; +} + +static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, + const int32_t *a) +{ + mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a); +} + +static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, + const int32_t *a) +{ + mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a); +} + +static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N]) +{ + mld_poly_caddq_avx2(a); +} +static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a, + const int32_t *h) +{ + mld_poly_use_hint_32_avx2((__m256i *)b, (const __m256i *)a, + (const __m256i *)h); +} + +static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a, + const int32_t *h) +{ + mld_poly_use_hint_88_avx2((__m256i *)b, (const __m256i *)a, + (const __m256i *)h); +} + +static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B) +{ + return mld_poly_chknorm_avx2((const __m256i *)a, B); +} + +static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a) +{ + mld_polyz_unpack_17_avx2((__m256i *)r, a); +} + +static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a) +{ + mld_polyz_unpack_19_avx2((__m256i *)r, a); +} + +static MLD_INLINE void mld_poly_pointwise_montgomery_native( + int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) +{ + mld_pointwise_avx2((__m256i *)c, (const __m256i *)a, (const __m256i *)b, + mld_qdata.vec); +} + +static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native( + int32_t w[MLDSA_N], const int32_t u[MLDSA_L][MLDSA_N], + const int32_t v[MLDSA_L][MLDSA_N]) +{ + mld_pointwise_acc_l4_avx2((__m256i *)w, (const __m256i *)u, + (const __m256i *)v, mld_qdata.vec); +} + +static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native( + int32_t w[MLDSA_N], const int32_t u[MLDSA_L][MLDSA_N], + const int32_t v[MLDSA_L][MLDSA_N]) +{ + mld_pointwise_acc_l5_avx2((__m256i *)w, (const __m256i *)u, + (const __m256i *)v, mld_qdata.vec); +} + +static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native( + int32_t w[MLDSA_N], const int32_t u[MLDSA_L][MLDSA_N], + const int32_t v[MLDSA_L][MLDSA_N]) +{ + mld_pointwise_acc_l7_avx2((__m256i *)w, (const __m256i *)u, + (const __m256i *)v, mld_qdata.vec); +} + +#endif /* 
!__ASSEMBLER__ */ + +#endif /* !MLD_NATIVE_X86_64_META_H */ diff --git a/dev/x86_64/src/align.h b/dev/x86_64/src/align.h new file mode 100644 index 000000000..8a4ca0cb2 --- /dev/null +++ b/dev/x86_64/src/align.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +#ifndef MLD_NATIVE_X86_64_SRC_ALIGN_H +#define MLD_NATIVE_X86_64_SRC_ALIGN_H + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include +#include + +#define MLD_ALIGNED_INT32(N) \ + union \ + { \ + int32_t coeffs[N]; \ + __m256i vec[(N + 7) / 8]; \ + } + +#endif /* !MLD_NATIVE_X86_64_SRC_ALIGN_H */ diff --git a/dev/x86_64/src/arith_native_x86_64.h b/dev/x86_64/src/arith_native_x86_64.h new file mode 100644 index 000000000..822b29bca --- /dev/null +++ b/dev/x86_64/src/arith_native_x86_64.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H +#define MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H +#include "../../../common.h" + +#include +#include +#include "consts.h" + +#define MLD_AVX2_REJ_UNIFORM_BUFLEN \ + (5 * 168) /* REJ_UNIFORM_NBLOCKS * SHAKE128_RATE */ + + +/* + * Sampling 256 coefficients mod 15 using rejection sampling from 4 bits. + * Expected number of required bytes: (256 * (16/15))/2 = 136.5 bytes. + * We sample 1 block (=136 bytes) of SHAKE256_RATE output initially. + * Sampling 2 blocks initially results in slightly worse performance. + */ +#define MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN (1 * 136) + +/* + * Sampling 256 coefficients mod 9 using rejection sampling from 4 bits. + * Expected number of required bytes: (256 * (16/9))/2 = 227.5 bytes. + * We sample 2 blocks (=272 bytes) of SHAKE256_RATE output initially. 
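+ * (In both samplers each byte supplies two 4-bit candidates; a candidate is
+ * accepted with probability 15/16 for eta2 and 9/16 for eta4, which is where
+ * the expected byte counts above come from.)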
+ */ +#define MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN (2 * 136) + +#define mld_rej_uniform_table MLD_NAMESPACE(mld_rej_uniform_table) +extern const uint8_t mld_rej_uniform_table[256][8]; + +#define mld_ntt_avx2 MLD_NAMESPACE(ntt_avx2) +void mld_ntt_avx2(__m256i *r, const __m256i *mld_qdata); + +#define mld_invntt_avx2 MLD_NAMESPACE(invntt_avx2) +void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata); + +#define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2) +void mld_nttunpack_avx2(__m256i *r); + +#define mld_rej_uniform_avx2 MLD_NAMESPACE(mld_rej_uniform_avx2) +unsigned mld_rej_uniform_avx2(int32_t *r, + const uint8_t buf[MLD_AVX2_REJ_UNIFORM_BUFLEN]); + +#define mld_rej_uniform_eta2_avx2 MLD_NAMESPACE(mld_rej_uniform_eta2_avx2) +unsigned mld_rej_uniform_eta2_avx2( + int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN]); + +#define mld_rej_uniform_eta4_avx2 MLD_NAMESPACE(mld_rej_uniform_eta4_avx2) +unsigned mld_rej_uniform_eta4_avx2( + int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]); + +#define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2) +void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a); + +#define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2) +void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a); + +#define mld_poly_caddq_avx2 MLD_NAMESPACE(poly_caddq_avx2) +void mld_poly_caddq_avx2(int32_t *r); + +#define mld_poly_use_hint_32_avx2 MLD_NAMESPACE(mld_poly_use_hint_32_avx2) +void mld_poly_use_hint_32_avx2(__m256i *b, const __m256i *a, const __m256i *h); + +#define mld_poly_use_hint_88_avx2 MLD_NAMESPACE(mld_poly_use_hint_88_avx2) +void mld_poly_use_hint_88_avx2(__m256i *b, const __m256i *a, const __m256i *h); + +#define mld_poly_chknorm_avx2 MLD_NAMESPACE(mld_poly_chknorm_avx2) +uint32_t mld_poly_chknorm_avx2(const __m256i *a, int32_t B); + +#define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2) +void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a); + +#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2) +void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a); + +#define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2) +void mld_pointwise_avx2(__m256i *c, const __m256i *a, const __m256i *b, + const __m256i *qdata); + +#define mld_pointwise_acc_l4_avx2 MLD_NAMESPACE(pointwise_acc_l4_avx2) +void mld_pointwise_acc_l4_avx2(__m256i *c, const __m256i *a, const __m256i *b, + const __m256i *qdata); + +#define mld_pointwise_acc_l5_avx2 MLD_NAMESPACE(pointwise_acc_l5_avx2) +void mld_pointwise_acc_l5_avx2(__m256i *c, const __m256i *a, const __m256i *b, + const __m256i *qdata); + +#define mld_pointwise_acc_l7_avx2 MLD_NAMESPACE(pointwise_acc_l7_avx2) +void mld_pointwise_acc_l7_avx2(__m256i *c, const __m256i *a, const __m256i *b, + const __m256i *qdata); + +#endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */ diff --git a/dev/x86_64/src/consts.c b/dev/x86_64/src/consts.c new file mode 100644 index 000000000..cd4bfde5a --- /dev/null +++ b/dev/x86_64/src/consts.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public 
domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include "align.h" +#include "consts.h" +#define MLD_AVX2_Q MLDSA_Q +/* check-magic: 58728449 == pow(MLDSA_Q,-1,2^32) */ +#define MLD_AVX2_QINV 58728449 +/* check-magic: -4186625 == pow(2,32,MLDSA_Q) */ +#define MLD_AVX2_MONT -4186625 +/* check-magic: 41978 == pow(2,64-8,MLDSA_Q) */ +#define MLD_AVX2_DIV 41978 +/* check-magic: -8395782 == signed_mod(MLD_AVX2_QINV*MLD_AVX2_DIV,2^32) */ +#define MLD_AVX2_DIV_QINV -8395782 + +const qdata_t mld_qdata = {{ +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQ 0 + MLD_AVX2_Q, MLD_AVX2_Q, MLD_AVX2_Q, MLD_AVX2_Q, + MLD_AVX2_Q, MLD_AVX2_Q, MLD_AVX2_Q, MLD_AVX2_Q, + +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV 8 + MLD_AVX2_QINV, MLD_AVX2_QINV, MLD_AVX2_QINV, MLD_AVX2_QINV, + MLD_AVX2_QINV, MLD_AVX2_QINV, MLD_AVX2_QINV, MLD_AVX2_QINV, + +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV 16 + MLD_AVX2_DIV_QINV, MLD_AVX2_DIV_QINV, MLD_AVX2_DIV_QINV, MLD_AVX2_DIV_QINV, + MLD_AVX2_DIV_QINV, MLD_AVX2_DIV_QINV, MLD_AVX2_DIV_QINV, MLD_AVX2_DIV_QINV, + +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV 24 + MLD_AVX2_DIV, MLD_AVX2_DIV, MLD_AVX2_DIV, MLD_AVX2_DIV, + MLD_AVX2_DIV, MLD_AVX2_DIV, MLD_AVX2_DIV, MLD_AVX2_DIV, + +#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV 32 +#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS 328 +#include "x86_64_zetas.i" + +}}; + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_consts) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/consts.h b/dev/x86_64/src/consts.h new file mode 100644 index 000000000..34c9e8f5a --- /dev/null +++ b/dev/x86_64/src/consts.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +#ifndef MLD_NATIVE_X86_64_SRC_CONSTS_H +#define MLD_NATIVE_X86_64_SRC_CONSTS_H +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQ 0 +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV 8 +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV 16 +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV 24 +#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV 32 +#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS 328 + + +#ifndef __ASSEMBLER__ +#include "align.h" +typedef MLD_ALIGNED_INT32(624) qdata_t; +#define mld_qdata MLD_NAMESPACE(qdata) +extern const qdata_t mld_qdata; +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_NATIVE_X86_64_SRC_CONSTS_H */ diff --git a/dev/x86_64/src/intt.S b/dev/x86_64/src/intt.S new file mode 100644 index 000000000..f45d0fd87 --- /dev/null +++ b/dev/x86_64/src/intt.S @@ -0,0 +1,285 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
+ */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpsubd %ymm\l,%ymm\h,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vpmuldq %ymm\zl0,%ymm12,%ymm13 +vmovshdup %ymm12,%ymm\h +vpmuldq %ymm\zl1,%ymm\h,%ymm14 + +vpmuldq %ymm\zh0,%ymm12,%ymm12 +vpmuldq %ymm\zh1,%ymm\h,%ymm\h + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vpsubd %ymm13,%ymm12,%ymm12 +vpsubd %ymm14,%ymm\h,%ymm\h + +vmovshdup %ymm12,%ymm12 +vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h +.endm + +.macro levels0t5 off +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +/* level 0 */ +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,5,1,3,2,15 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 6,7,1,3,2,15 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,9,1,3,2,15 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 10,11,1,3,2,15 + +/* level 1 */ +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,6,1,3,2,15 +butterfly 5,7,1,3,2,15 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,10,1,3,2,15 +butterfly 9,11,1,3,2,15 + +/* level 2 */ +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,8,1,3,2,15 +butterfly 5,9,1,3,2,15 +butterfly 6,10,1,3,2,15 +butterfly 7,11,1,3,2,15 + +/* level 3 */ +shuffle2 4,5,3,5 +shuffle2 6,7,4,7 +shuffle2 8,9,6,9 +shuffle2 
10,11,8,11 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 +butterfly 3,5 +butterfly 4,7 +butterfly 6,9 +butterfly 8,11 + +/* level 4 */ +shuffle4 3,4,10,4 +shuffle4 6,8,3,8 +shuffle4 5,7,6,7 +shuffle4 9,11,5,11 + +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 +butterfly 10,4 +butterfly 3,8 +butterfly 6,7 +butterfly 5,11 + +/* level 5 */ +shuffle8 10,3,9,3 +shuffle8 6,5,10,5 +shuffle8 4,8,6,8 +shuffle8 7,11,4,11 + +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+7-\off)*4(%rsi),%ymm2 +butterfly 9,3 +butterfly 10,5 +butterfly 6,8 +butterfly 4,11 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm10,256*\off+ 32(%rdi) +vmovdqa %ymm6,256*\off+ 64(%rdi) +vmovdqa %ymm4,256*\off+ 96(%rdi) +vmovdqa %ymm3,256*\off+128(%rdi) +vmovdqa %ymm5,256*\off+160(%rdi) +vmovdqa %ymm8,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.macro levels6t7 off +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +/* level 6 */ +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 + +/* level 7 */ +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV)*4(%rsi),%ymm2 +vpmuldq %ymm1,%ymm4,%ymm12 +vpmuldq %ymm1,%ymm5,%ymm13 +vmovshdup %ymm4,%ymm8 +vmovshdup %ymm5,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm4,%ymm4 +vpmuldq %ymm2,%ymm5,%ymm5 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm4,%ymm4 +vpsubd %ymm13,%ymm5,%ymm5 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm4,%ymm4 +vmovshdup %ymm5,%ymm5 +vpblendd $0xAA,%ymm8,%ymm4,%ymm4 +vpblendd $0xAA,%ymm9,%ymm5,%ymm5 + +vpmuldq %ymm1,%ymm6,%ymm12 +vpmuldq %ymm1,%ymm7,%ymm13 +vmovshdup %ymm6,%ymm8 +vmovshdup %ymm7,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm6,%ymm6 +vpmuldq %ymm2,%ymm7,%ymm7 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm6,%ymm6 +vpsubd %ymm13,%ymm7,%ymm7 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm6,%ymm6 +vmovshdup %ymm7,%ymm7 +vpblendd $0xAA,%ymm8,%ymm6,%ymm6 +vpblendd $0xAA,%ymm9,%ymm7,%ymm7 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa 
%ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +.endm + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(invntt_avx2) +MLD_ASM_FN_SYMBOL(invntt_avx2) + +vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi),%ymm0 + +levels0t5 0 +levels0t5 1 +levels0t5 2 +levels0t5 3 + +levels6t7 0 +levels6t7 1 +levels6t7 2 +levels6t7 3 + +ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/x86_64/src/ntt.S b/dev/x86_64/src/ntt.S new file mode 100644 index 000000000..8fae4ccbc --- /dev/null +++ b/dev/x86_64/src/ntt.S @@ -0,0 +1,243 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + + /* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpmuldq %ymm\zl0,%ymm\h,%ymm13 +vmovshdup %ymm\h,%ymm12 +vpmuldq %ymm\zl1,%ymm12,%ymm14 + +vpmuldq %ymm\zh0,%ymm\h,%ymm\h +vpmuldq %ymm\zh1,%ymm12,%ymm12 + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vmovshdup %ymm\h,%ymm\h +vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h + +vpsubd %ymm\h,%ymm\l,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vmovshdup %ymm13,%ymm13 +vpblendd $0xAA,%ymm14,%ymm13,%ymm13 + +vpaddd %ymm13,%ymm12,%ymm\h +vpsubd %ymm13,%ymm\l,%ymm\l +.endm + +.macro levels0t1 off +/* level 0 */ +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1)*4(%rsi),%ymm2 + +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +/* level 1 */ +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) +.endm + +.macro levels2t7 off +/* level 2 */ +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 
64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 +vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4+\off)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +/* level 3 */ +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2 + +butterfly 3,5 +butterfly 8,10 +butterfly 4,6 +butterfly 9,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +/* level 4 */ +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2 + +butterfly 7,8 +butterfly 5,6 +butterfly 3,4 +butterfly 10,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +/* level 5 */ +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 + +butterfly 9,5,1,10,2,15 +butterfly 8,4,1,10,2,15 +butterfly 7,3,1,10,2,15 +butterfly 6,11,1,10,2,15 + +/* level 6 */ +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,7,1,10,2,15 +butterfly 8,6,1,10,2,15 + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,3,1,10,2,15 +butterfly 4,11,1,10,2,15 + +/* level 7 */ +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,8,1,10,2,15 + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 7,6,1,10,2,15 + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,4,1,10,2,15 + +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 +vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 3,11,1,10,2,15 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm8,256*\off+ 32(%rdi) +vmovdqa %ymm7,256*\off+ 64(%rdi) +vmovdqa %ymm6,256*\off+ 96(%rdi) +vmovdqa %ymm5,256*\off+128(%rdi) +vmovdqa %ymm4,256*\off+160(%rdi) +vmovdqa %ymm3,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(ntt_avx2) +MLD_ASM_FN_SYMBOL(ntt_avx2) +vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi),%ymm0 + +levels0t1 0 +levels0t1 1 +levels0t1 2 +levels0t1 3 + +levels2t7 0 +levels2t7 1 +levels2t7 2 +levels2t7 3 + +ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/x86_64/src/nttunpack.S 
b/dev/x86_64/src/nttunpack.S new file mode 100644 index 000000000..41f3d40de --- /dev/null +++ b/dev/x86_64/src/nttunpack.S @@ -0,0 +1,101 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + + /* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + #include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(nttunpack_avx2) +MLD_ASM_FN_SYMBOL(nttunpack_avx2) + +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret + +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +#store +vmovdqa %ymm9,(%rdi) +vmovdqa %ymm8,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm3,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/x86_64/src/pointwise.S b/dev/x86_64/src/pointwise.S new file mode 100644 index 000000000..8bd73616f --- /dev/null +++ b/dev/x86_64/src/pointwise.S @@ -0,0 +1,151 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
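+ *
+ * Added note (illustration only, not part of the reference sources): per
+ * coefficient, mld_pointwise_avx2 computes the Montgomery product
+ * c[i] = a[i] * b[i] * 2^(-32) mod MLDSA_Q. A scalar sketch, where
+ * montgomery_reduce() is an illustrative helper returning
+ * x * 2^(-32) mod MLDSA_Q for a 64-bit input x:
+ *
+ *   for (i = 0; i < MLDSA_N; i++)
+ *   {
+ *     c[i] = montgomery_reduce((int64_t)a[i] * b[i]);
+ *   }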
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + + .intel_syntax noprefix + .text + +/* + * void mld_pointwise_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) + * + * Pointwise multiplication of polynomials in NTT domain with Montgomery reduction + * + * Arguments: + * rdi: pointer to output polynomial c + * rsi: pointer to input polynomial a + * rdx: pointer to input polynomial b + * rcx: pointer to qdata constants + */ + .balign 4 + .global MLD_ASM_NAMESPACE(pointwise_avx2) +MLD_ASM_FN_SYMBOL(pointwise_avx2) + +// Load constants + vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] + vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] + + xor eax, eax +_looptop1: +// Load + vmovdqa ymm2, [rsi] + vmovdqa ymm4, [rsi + 32] + vmovdqa ymm6, [rsi + 64] + vmovdqa ymm10, [rdx] + vmovdqa ymm12, [rdx + 32] + vmovdqa ymm14, [rdx + 64] + vpsrlq ymm3, ymm2, 32 + vpsrlq ymm5, ymm4, 32 + vmovshdup ymm7, ymm6 + vpsrlq ymm11, ymm10, 32 + vpsrlq ymm13, ymm12, 32 + vmovshdup ymm15, ymm14 + +// Multiply + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm4, ymm4, ymm12 + vpmuldq ymm5, ymm5, ymm13 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm7, ymm7, ymm15 + +// Reduce + vpmuldq ymm10, ymm0, ymm2 + vpmuldq ymm11, ymm0, ymm3 + vpmuldq ymm12, ymm0, ymm4 + vpmuldq ymm13, ymm0, ymm5 + vpmuldq ymm14, ymm0, ymm6 + vpmuldq ymm15, ymm0, ymm7 + vpmuldq ymm10, ymm1, ymm10 + vpmuldq ymm11, ymm1, ymm11 + vpmuldq ymm12, ymm1, ymm12 + vpmuldq ymm13, ymm1, ymm13 + vpmuldq ymm14, ymm1, ymm14 + vpmuldq ymm15, ymm1, ymm15 + vpsubq ymm2, ymm2, ymm10 + vpsubq ymm3, ymm3, ymm11 + vpsubq ymm4, ymm4, ymm12 + vpsubq ymm5, ymm5, ymm13 + vpsubq ymm6, ymm6, ymm14 + vpsubq ymm7, ymm7, ymm15 + vpsrlq ymm2, ymm2, 32 + vpsrlq ymm4, ymm4, 32 + vmovshdup ymm6, ymm6 + +// Store + vpblendd ymm2, ymm2, ymm3, 0xAA + vpblendd ymm4, ymm4, ymm5, 0xAA + vpblendd ymm6, ymm6, ymm7, 0xAA + vmovdqa [rdi], ymm2 + vmovdqa [rdi + 32], ymm4 + vmovdqa [rdi + 64], ymm6 + + add rdi, 96 + add rsi, 96 + add rdx, 96 + add eax, 1 + cmp eax, 10 + jb _looptop1 + + vmovdqa ymm2, [rsi] + vmovdqa ymm4, [rsi + 32] + vmovdqa ymm10, [rdx] + vmovdqa ymm12, [rdx + 32] + vpsrlq ymm3, ymm2, 32 + vpsrlq ymm5, ymm4, 32 + vmovshdup ymm11, ymm10 + vmovshdup ymm13, ymm12 + +// Multiply + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm4, ymm4, ymm12 + vpmuldq ymm5, ymm5, ymm13 + +// Reduce + vpmuldq ymm10, ymm0, ymm2 + vpmuldq ymm11, ymm0, ymm3 + vpmuldq ymm12, ymm0, ymm4 + vpmuldq ymm13, ymm0, ymm5 + vpmuldq ymm10, ymm1, ymm10 + vpmuldq ymm11, ymm1, ymm11 + vpmuldq ymm12, ymm1, ymm12 + vpmuldq ymm13, ymm1, ymm13 + vpsubq ymm2, ymm2, ymm10 + vpsubq ymm3, ymm3, ymm11 + vpsubq ymm4, ymm4, ymm12 + vpsubq ymm5, ymm5, ymm13 + vpsrlq ymm2, ymm2, 32 + vmovshdup ymm4, ymm4 + +// Store + vpblendd ymm2, ymm3, ymm2, 0x55 + vpblendd ymm4, ymm5, ymm4, 0x55 + vmovdqa [rdi], ymm2 + vmovdqa [rdi + 32], ymm4 + + ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/x86_64/src/pointwise_acc_l4.S b/dev/x86_64/src/pointwise_acc_l4.S new file mode 100644 index 000000000..e64881ccb --- /dev/null +++ b/dev/x86_64/src/pointwise_acc_l4.S @@ -0,0 +1,126 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - 
[REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + + .intel_syntax noprefix + .text + +.macro pointwise off +// Load + vmovdqa ymm6, [rsi + \off] + vmovdqa ymm8, [rsi + \off + 32] + vmovdqa ymm10, [rdx + \off] + vmovdqa ymm12, [rdx + \off + 32] + vpsrlq ymm7, ymm6, 32 + vpsrlq ymm9, ymm8, 32 + vmovshdup ymm11, ymm10 + vmovshdup ymm13, ymm12 + +// Multiply + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm8, ymm8, ymm12 + vpmuldq ymm9, ymm9, ymm13 +.endm + +.macro acc + vpaddq ymm2, ymm6, ymm2 + vpaddq ymm3, ymm7, ymm3 + vpaddq ymm4, ymm8, ymm4 + vpaddq ymm5, ymm9, ymm5 +.endm + +/* + * void mld_pointwise_acc_l4_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) + * + * Pointwise multiplication with accumulation across multiple polynomial vectors + * + * Arguments: + * rdi: pointer to output polynomial c + * rsi: pointer to input polynomial a (multiple vectors) + * rdx: pointer to input polynomial b (multiple vectors) + * rcx: pointer to qdata constants + */ + .balign 4 + .global MLD_ASM_NAMESPACE(pointwise_acc_l4_avx2) +MLD_ASM_FN_SYMBOL(pointwise_acc_l4_avx2) + +// Load constants + vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] + vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] + + xor eax, eax +_looptop2: + pointwise 0 + +// Move + vmovdqa ymm2, ymm6 + vmovdqa ymm3, ymm7 + vmovdqa ymm4, ymm8 + vmovdqa ymm5, ymm9 + + pointwise 1024 + acc + + pointwise 2048 + acc + + pointwise 3072 + acc + +// Reduce + vpmuldq ymm6, ymm0, ymm2 + vpmuldq ymm7, ymm0, ymm3 + vpmuldq ymm8, ymm0, ymm4 + vpmuldq ymm9, ymm0, ymm5 + vpmuldq ymm6, ymm1, ymm6 + vpmuldq ymm7, ymm1, ymm7 + vpmuldq ymm8, ymm1, ymm8 + vpmuldq ymm9, ymm1, ymm9 + vpsubq ymm2, ymm2, ymm6 + vpsubq ymm3, ymm3, ymm7 + vpsubq ymm4, ymm4, ymm8 + vpsubq ymm5, ymm5, ymm9 + vpsrlq ymm2, ymm2, 32 + vmovshdup ymm4, ymm4 + +// Store + vpblendd ymm2, ymm2, ymm3, 0xAA + vpblendd ymm4, ymm4, ymm5, 0xAA + + vmovdqa [rdi], ymm2 + vmovdqa [rdi + 32], ymm4 + + add rsi, 64 + add rdx, 64 + add rdi, 64 + add eax, 1 + cmp eax, 16 + jb _looptop2 + + ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/x86_64/src/pointwise_acc_l5.S b/dev/x86_64/src/pointwise_acc_l5.S new file mode 100644 index 000000000..db7348f19 --- /dev/null +++ b/dev/x86_64/src/pointwise_acc_l5.S @@ -0,0 +1,129 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
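+ *
+ * Added note (illustration only, not part of the reference sources): this
+ * routine multiplies and accumulates L = 5 polynomial pairs before a single
+ * Montgomery reduction per coefficient. A scalar sketch, with
+ * montgomery_reduce() as an illustrative helper returning
+ * x * 2^(-32) mod MLDSA_Q:
+ *
+ *   for (i = 0; i < MLDSA_N; i++)
+ *   {
+ *     int64_t t = 0;
+ *     for (k = 0; k < 5; k++)
+ *     {
+ *       t += (int64_t)a[k][i] * b[k][i];
+ *     }
+ *     c[i] = montgomery_reduce(t);
+ *   }
+ *
+ * The byte offsets 1024, 2048, ... used below step from one polynomial of
+ * 256 32-bit coefficients to the next.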
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + + .intel_syntax noprefix + .text + +.macro pointwise off +// Load + vmovdqa ymm6, [rsi + \off] + vmovdqa ymm8, [rsi + \off + 32] + vmovdqa ymm10, [rdx + \off] + vmovdqa ymm12, [rdx + \off + 32] + vpsrlq ymm7, ymm6, 32 + vpsrlq ymm9, ymm8, 32 + vmovshdup ymm11, ymm10 + vmovshdup ymm13, ymm12 + +// Multiply + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm8, ymm8, ymm12 + vpmuldq ymm9, ymm9, ymm13 +.endm + +.macro acc + vpaddq ymm2, ymm6, ymm2 + vpaddq ymm3, ymm7, ymm3 + vpaddq ymm4, ymm8, ymm4 + vpaddq ymm5, ymm9, ymm5 +.endm + +/* + * void mld_pointwise_acc_l5_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) + * + * Pointwise multiplication with accumulation across multiple polynomial vectors + * + * Arguments: + * rdi: pointer to output polynomial c + * rsi: pointer to input polynomial a (multiple vectors) + * rdx: pointer to input polynomial b (multiple vectors) + * rcx: pointer to qdata constants + */ + .balign 4 + .global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2) +MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2) + +// Load constants + vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] + vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] + + xor eax, eax +_looptop2: + pointwise 0 + +// Move + vmovdqa ymm2, ymm6 + vmovdqa ymm3, ymm7 + vmovdqa ymm4, ymm8 + vmovdqa ymm5, ymm9 + + pointwise 1024 + acc + + pointwise 2048 + acc + + pointwise 3072 + acc + + pointwise 4096 + acc + +// Reduce + vpmuldq ymm6, ymm0, ymm2 + vpmuldq ymm7, ymm0, ymm3 + vpmuldq ymm8, ymm0, ymm4 + vpmuldq ymm9, ymm0, ymm5 + vpmuldq ymm6, ymm1, ymm6 + vpmuldq ymm7, ymm1, ymm7 + vpmuldq ymm8, ymm1, ymm8 + vpmuldq ymm9, ymm1, ymm9 + vpsubq ymm2, ymm2, ymm6 + vpsubq ymm3, ymm3, ymm7 + vpsubq ymm4, ymm4, ymm8 + vpsubq ymm5, ymm5, ymm9 + vpsrlq ymm2, ymm2, 32 + vmovshdup ymm4, ymm4 + +// Store + vpblendd ymm2, ymm2, ymm3, 0xAA + vpblendd ymm4, ymm4, ymm5, 0xAA + + vmovdqa [rdi], ymm2 + vmovdqa [rdi + 32], ymm4 + + add rsi, 64 + add rdx, 64 + add rdi, 64 + add eax, 1 + cmp eax, 16 + jb _looptop2 + + ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/x86_64/src/pointwise_acc_l7.S b/dev/x86_64/src/pointwise_acc_l7.S new file mode 100644 index 000000000..bae230d75 --- /dev/null +++ b/dev/x86_64/src/pointwise_acc_l7.S @@ -0,0 +1,135 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
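+ *
+ * Added note (illustration only): accumulating seven 64-bit products before
+ * a single Montgomery reduction is safe since MLDSA_Q < 2^23, so
+ *   7 * (MLDSA_Q - 1)^2 < 7 * 2^46 < 2^49,
+ * which fits comfortably in the signed 64-bit lanes added by vpaddq and
+ * stays within the input range of the Montgomery reduction.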
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + + .intel_syntax noprefix + .text + +.macro pointwise off +// Load + vmovdqa ymm6, [rsi + \off] + vmovdqa ymm8, [rsi + \off + 32] + vmovdqa ymm10, [rdx + \off] + vmovdqa ymm12, [rdx + \off + 32] + vpsrlq ymm7, ymm6, 32 + vpsrlq ymm9, ymm8, 32 + vmovshdup ymm11, ymm10 + vmovshdup ymm13, ymm12 + +// Multiply + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm8, ymm8, ymm12 + vpmuldq ymm9, ymm9, ymm13 +.endm + +.macro acc + vpaddq ymm2, ymm6, ymm2 + vpaddq ymm3, ymm7, ymm3 + vpaddq ymm4, ymm8, ymm4 + vpaddq ymm5, ymm9, ymm5 +.endm + +/* + * void mld_pointwise_acc_l7_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) + * + * Pointwise multiplication with accumulation across multiple polynomial vectors + * + * Arguments: + * rdi: pointer to output polynomial c + * rsi: pointer to input polynomial a (multiple vectors) + * rdx: pointer to input polynomial b (multiple vectors) + * rcx: pointer to qdata constants + */ + .balign 4 + .global MLD_ASM_NAMESPACE(pointwise_acc_l7_avx2) +MLD_ASM_FN_SYMBOL(pointwise_acc_l7_avx2) + +// Load constants + vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] + vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] + + xor eax, eax +_looptop2: + pointwise 0 + +// Move + vmovdqa ymm2, ymm6 + vmovdqa ymm3, ymm7 + vmovdqa ymm4, ymm8 + vmovdqa ymm5, ymm9 + + pointwise 1024 + acc + + pointwise 2048 + acc + + pointwise 3072 + acc + + pointwise 4096 + acc + + pointwise 5120 + acc + + pointwise 6144 + acc + +// Reduce + vpmuldq ymm6, ymm0, ymm2 + vpmuldq ymm7, ymm0, ymm3 + vpmuldq ymm8, ymm0, ymm4 + vpmuldq ymm9, ymm0, ymm5 + vpmuldq ymm6, ymm1, ymm6 + vpmuldq ymm7, ymm1, ymm7 + vpmuldq ymm8, ymm1, ymm8 + vpmuldq ymm9, ymm1, ymm9 + vpsubq ymm2, ymm2, ymm6 + vpsubq ymm3, ymm3, ymm7 + vpsubq ymm4, ymm4, ymm8 + vpsubq ymm5, ymm5, ymm9 + vpsrlq ymm2, ymm2, 32 + vmovshdup ymm4, ymm4 + +// Store + vpblendd ymm2, ymm2, ymm3, 0xAA + vpblendd ymm4, ymm4, ymm5, 0xAA + + vmovdqa [rdi], ymm2 + vmovdqa [rdi + 32], ymm4 + + add rsi, 64 + add rdx, 64 + add rdi, 64 + add eax, 1 + cmp eax, 16 + jb _looptop2 + + ret +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/x86_64/src/poly_caddq_avx2.c b/dev/x86_64/src/poly_caddq_avx2.c new file mode 100644 index 000000000..05d86d14f --- /dev/null +++ b/dev/x86_64/src/poly_caddq_avx2.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include "arith_native_x86_64.h" +#include "consts.h" + +/************************************************* + * Name: mld_poly_caddq_avx2 + * + * Description: For all coefficients of in/out polynomial add Q if + * coefficient is negative. 
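+ *
+ *              Added note (illustration only): this is equivalent to the
+ *              scalar constant-time form
+ *
+ *                r[i] += (r[i] >> 31) & MLDSA_Q;
+ *
+ *              assuming arithmetic right shift on negative int32_t values.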
+ * + * Arguments: - int32_t *r: pointer to input/output polynomial + **************************************************/ +void mld_poly_caddq_avx2(int32_t *r) +{ + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_set1_epi32(MLDSA_Q); + const __m256i zero = _mm256_setzero_si256(); + __m256i *rr = (__m256i *)r; + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_load_si256(&rr[i]); + g = _mm256_cmpgt_epi32(zero, f); + g = _mm256_and_si256(g, q); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&rr[i], f); + } +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_reduce) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/poly_chknorm_avx2.c b/dev/x86_64/src/poly_chknorm_avx2.c new file mode 100644 index 000000000..3fdda60c0 --- /dev/null +++ b/dev/x86_64/src/poly_chknorm_avx2.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include "arith_native_x86_64.h" + +uint32_t mld_poly_chknorm_avx2(const __m256i *a, int32_t B) +{ + unsigned int i; + __m256i f, t; + const __m256i bound = _mm256_set1_epi32(B - 1); + + t = _mm256_setzero_si256(); + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_load_si256(&a[i]); + f = _mm256_abs_epi32(f); + f = _mm256_cmpgt_epi32(f, bound); + t = _mm256_or_si256(t, f); + } + + return (uint32_t)(_mm256_testz_si256(t, t) - 1); +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_poly_chknorm) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c new file mode 100644 index 000000000..89bf93e2b --- /dev/null +++ b/dev/x86_64/src/poly_decompose_32_avx2.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
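+ *
+ * Added note (a scalar sketch for illustration, not part of the reference
+ * sources): per coefficient this computes the gamma2 = (MLDSA_Q - 1)/32
+ * decomposition a = a1*2*gamma2 + a0 with a0 centered around 0, roughly:
+ *
+ *   a1  = (a + 127) >> 7;
+ *   a1  = (a1 * 1025 + (1 << 21)) >> 22;
+ *   a1 &= 15;
+ *   a0  = a - a1 * 2 * MLDSA_GAMMA2;
+ *   a0 -= (((MLDSA_Q - 1) / 2 - a0) >> 31) & MLDSA_Q;
+ *
+ * The constants 1025 and 512 used below are the mulhi/mulhrs counterparts
+ * of the (a1 * 1025 + 2^21) >> 22 rounding step.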
+ */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + +#define _mm256_blendv_epi32(a, b, mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) +{ + unsigned int i; + __m256i f, f0, f1; + const __m256i q = + _mm256_load_si256(&mld_qdata.vec[MLD_AVX2_BACKEND_DATA_OFFSET_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + /* check-magic: 1025 == round((2**22*128) / ((MLDSA_Q - 1) / 16)) */ + const __m256i v = _mm256_set1_epi32(1025); + const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(512); + const __m256i mask = _mm256_set1_epi32(15); + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + f1 = _mm256_and_si256(f1, mask); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} + +#undef _mm256_blendv_epi32 + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_poly_decompose) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c new file mode 100644 index 000000000..f17d663c9 --- /dev/null +++ b/dev/x86_64/src/poly_decompose_88_avx2.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
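+ *
+ * Added note (illustration only): this mirrors poly_decompose_32_avx2.c for
+ * gamma2 = (MLDSA_Q - 1)/88. The scalar high-part step is roughly
+ *
+ *   a1  = (a1 * 11275 + (1 << 23)) >> 24;
+ *   a1 ^= ((43 - a1) >> 31) & a1;
+ *
+ * where the second line maps the rounding result 44 back to 0, so a1 ranges
+ * over 0..43; the max/blendv pair below plays the same role.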
+ */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + + +#define _mm256_blendv_epi32(a, b, mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) +{ + unsigned int i; + __m256i f, f0, f1, t; + const __m256i q = + _mm256_load_si256(&mld_qdata.vec[MLD_AVX2_BACKEND_DATA_OFFSET_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + /* check-magic: 11275 == round((2**24*128) / ((MLDSA_Q - 1) / 44)) */ + const __m256i v = _mm256_set1_epi32(11275); + const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(128); + const __m256i max = _mm256_set1_epi32(43); + const __m256i zero = _mm256_setzero_si256(); + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + t = _mm256_sub_epi32(max, f1); + f1 = _mm256_blendv_epi32(f1, zero, t); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} +#undef _mm256_blendv_epi32 + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_poly_decompose) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/poly_use_hint_32_avx2.c b/dev/x86_64/src/poly_use_hint_32_avx2.c new file mode 100644 index 000000000..ad5d71de2 --- /dev/null +++ b/dev/x86_64/src/poly_use_hint_32_avx2.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
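+ *
+ * Added note (a scalar sketch for illustration, not part of the reference
+ * sources): with (a1, a0) the gamma2 = (MLDSA_Q - 1)/32 decomposition of a
+ * coefficient a and h the corresponding hint bit, the result is roughly
+ *
+ *   if (h == 0)      b = a1;
+ *   else if (a0 > 0) b = (a1 + 1) & 15;
+ *   else             b = (a1 - 1) & 15;
+ *
+ * i.e. the hint moves a1 by one step modulo 16, in the direction given by
+ * the sign of a0.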
+ */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + +#define _mm256_blendv_epi32(a, b, mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +void mld_poly_use_hint_32_avx2(__m256i *b, const __m256i *a, + const __m256i *hint) +{ + unsigned int i; + __m256i f, f0, f1, h, t; + const __m256i q = + _mm256_load_si256(&mld_qdata.vec[MLD_AVX2_BACKEND_DATA_OFFSET_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + /* check-magic: 1025 == round((2**22*128) / ((MLDSA_Q - 1) / 16)) */ + const __m256i v = _mm256_set1_epi32(1025); + const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(512); + const __m256i mask = _mm256_set1_epi32(15); + const __m256i zero = _mm256_setzero_si256(); + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_load_si256(&a[i]); + h = _mm256_load_si256(&hint[i]); + + /* Reference: The reference avx2 implementation calls poly_decompose to + * compute all a1, a0 before the loop. + */ + /* decompose */ + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + f1 = _mm256_and_si256(f1, mask); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + + /* Reference: The reference avx2 implementation checks a0 >= 0, which is + * different from the specification and the reference C implementation. We + * follow the specification and check a0 > 0. + */ + /* t = (a0 > 0) ? h : -h */ + f0 = _mm256_cmpgt_epi32(f0, zero); + t = _mm256_blendv_epi32(h, zero, f0); + t = _mm256_slli_epi32(t, 1); + h = _mm256_sub_epi32(h, t); + + /* f1 = (f1 + t) % 16 */ + f1 = _mm256_add_epi32(f1, h); + f1 = _mm256_and_si256(f1, mask); + + _mm256_store_si256(&b[i], f1); + } +} + +#undef _mm256_blendv_epi32 + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_poly_use_hint) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/poly_use_hint_88_avx2.c b/dev/x86_64/src/poly_use_hint_88_avx2.c new file mode 100644 index 000000000..a91fa80b9 --- /dev/null +++ b/dev/x86_64/src/poly_use_hint_88_avx2.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
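+ *
+ * Added note (illustration only): same structure as poly_use_hint_32_avx2.c,
+ * but the high part lives modulo 44 rather than 16, so the wraparound cannot
+ * be a simple bit mask; for hint bits equal to 1, roughly
+ *
+ *   if (a0 > 0) b = (a1 == 43) ?  0 : a1 + 1;
+ *   else        b = (a1 ==  0) ? 43 : a1 - 1;
+ *
+ * The two blendv steps below implement this wraparound.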
+ */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + +#define _mm256_blendv_epi32(a, b, mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +void mld_poly_use_hint_88_avx2(__m256i *b, const __m256i *a, + const __m256i *hint) +{ + unsigned int i; + __m256i f, f0, f1, h, t; + const __m256i q = + _mm256_load_si256(&mld_qdata.vec[MLD_AVX2_BACKEND_DATA_OFFSET_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + /* check-magic: 11275 == round((2**24*128) / ((MLDSA_Q - 1) / 44)) */ + const __m256i v = _mm256_set1_epi32(11275); + const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(128); + const __m256i max = _mm256_set1_epi32(43); + const __m256i zero = _mm256_setzero_si256(); + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_load_si256(&a[i]); + h = _mm256_load_si256(&hint[i]); + + /* Reference: The reference avx2 implementation calls poly_decompose to + * compute all a1, a0 before the loop. + */ + /* decompose */ + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + t = _mm256_sub_epi32(max, f1); + f1 = _mm256_blendv_epi32(f1, zero, t); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + + /* Reference: The reference avx2 implementation checks a0 >= 0, which is + * different from the specification and the reference C implementation. We + * follow the specification and check a0 > 0. + */ + /* t = (a0 > 0) ? h : -h */ + f0 = _mm256_cmpgt_epi32(f0, zero); + t = _mm256_blendv_epi32(h, zero, f0); + t = _mm256_slli_epi32(t, 1); + h = _mm256_sub_epi32(h, t); + + /* f1 = (f1 + t) % 44 */ + f1 = _mm256_add_epi32(f1, h); + f1 = _mm256_blendv_epi32(f1, max, f1); + f = _mm256_cmpgt_epi32(f1, max); + f1 = _mm256_blendv_epi32(f1, zero, f); + + _mm256_store_si256(&b[i], f1); + } +} + +#undef _mm256_blendv_epi32 + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_poly_use_hint) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/polyz_unpack_17_avx2.c b/dev/x86_64/src/polyz_unpack_17_avx2.c new file mode 100644 index 000000000..b48eaacab --- /dev/null +++ b/dev/x86_64/src/polyz_unpack_17_avx2.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
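+ *
+ * Added note (a scalar sketch for illustration, not part of the reference
+ * sources): each coefficient occupies 18 bits of the byte stream and is
+ * mapped to MLDSA_GAMMA1 - t, roughly:
+ *
+ *   for (j = 0; j < MLDSA_N; j++)
+ *   {
+ *     size_t bit = 18 * (size_t)j;
+ *     uint32_t t = (uint32_t)a[bit / 8]
+ *                | ((uint32_t)a[bit / 8 + 1] << 8)
+ *                | ((uint32_t)a[bit / 8 + 2] << 16);
+ *     t = (t >> (bit % 8)) & 0x3FFFF;
+ *     r[j] = (int32_t)MLDSA_GAMMA1 - (int32_t)t;
+ *   }
+ *
+ * The permute/shuffle/srlv steps below perform the same extraction eight
+ * coefficients at a time.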
+ */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include "arith_native_x86_64.h" + +void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a) +{ + unsigned int i; + __m256i f; + const __m256i shufbidx = + _mm256_set_epi8(-1, 9, 8, 7, -1, 7, 6, 5, -1, 5, 4, 3, -1, 3, 2, 1, -1, 8, + 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0); + const __m256i srlvdidx = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0); + const __m256i mask = _mm256_set1_epi32(0x3FFFF); + const __m256i gamma1 = _mm256_set1_epi32(MLDSA_GAMMA1); + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_loadu_si256((__m256i *)&a[18 * i]); + + /* Permute 64-bit lanes + * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0] + * + * ╔═══════════════════════════════════════════════════════════════════════╗ + * ║ Original Layout ║ + * ╚═══════════════════════════════════════════════════════════════════════╝ + * ┌─────────────────┬─────────────────┬─────────────────┬─────────────────┐ + * │ Lane 0 │ Lane 1 │ Lane 2 │ Lane 3 │ + * │ bytes 0..7 │ bytes 8..15 │ bytes 16..23 │ bytes 24..31 │ + * └─────────────────┴─────────────────┴─────────────────┴─────────────────┘ + * + * ╔═══════════════════════════════════════════════════════════════════════╗ + * ║ Layout after permute ║ + * ║ Byte indices in high half shifted down by 8 positions ║ + * ╚═══════════════════════════════════════════════════════════════════════╝ + * ┌───────────────┬─────────────────┐ ┌─────────────────┬─────────────────┐ + * │ Lane 0 │ Lane 1 │ │ Lane 2 │ Lane 3 │ + * │ bytes 0..7 │ bytes 8..15 │ │ bytes 8..15 │ bytes 16..23 │ + * └───────────────┴─────────────────┘ └─────────────────┴─────────────────┘ + * Lower 128-bit lane (bytes 0-15) Upper 128-bit lane (bytes 16-31) + */ + f = _mm256_permute4x64_epi64(f, 0x94); + + /* Shuffling 8-bit lanes + * + * ┌─ Indices 0-8 into low 128-bit half of permuted vector ────────────────┐ + * │ Shuffle: [-1, 8, 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0] │ + * │ Result: [0, byte8, byte7, byte6, ..., 0, byte2, byte1, byte0] │ + * └───────────────────────────────────────────────────────────────────────┘ + * + * ┌─ Indices 1-9 into high 128-bit half of permuted vector ───────────────┐ + * │ Shuffle: [-1, 9, 8, 7, -1, 7, 6, 5, -1, 5, 4, 3, -1, 3, 2, 1] │ + * │ Result: [0, byte17, byte16, byte15, ..., 0, byte11, byte10, byte9] │ + * └───────────────────────────────────────────────────────────────────────┘ + */ + f = _mm256_shuffle_epi8(f, shufbidx); + + /* Keep only 18 out of 24 bits in each 32-bit lane */ + /* Bits 0..23 16..39 32..55 48..71 + * 72..95 88..111 104..127 120..143 */ + f = _mm256_srlv_epi32(f, srlvdidx); + /* Bits 0..23 18..39 36..55 54..71 + * 72..95 90..111 108..127 126..143 */ + f = _mm256_and_si256(f, mask); + /* Bits 0..17 18..35 36..53 54..71 + * 72..89 90..107 108..125 126..143 */ + + /* Map [0, 1, ..., 2^18-1] to [2^17, 2^17-1, ..., -2^17+1] */ + f = _mm256_sub_epi32(gamma1, f); + + _mm256_store_si256(&r[i], f); + } +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_polyz_unpack) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/polyz_unpack_19_avx2.c b/dev/x86_64/src/polyz_unpack_19_avx2.c new file mode 100644 index 000000000..e5f21db1c --- /dev/null +++ b/dev/x86_64/src/polyz_unpack_19_avx2.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) The mldsa-native project authors + * 
SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include "arith_native_x86_64.h" + +void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a) +{ + unsigned int i; + __m256i f; + const __m256i shufbidx = + _mm256_set_epi8(-1, 11, 10, 9, -1, 9, 8, 7, -1, 6, 5, 4, -1, 4, 3, 2, -1, + 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0); + /* Equivalent to _mm256_set_epi32(4, 0, 4, 0, 4, 0, 4, 0) */ + const __m256i srlvdidx = _mm256_set1_epi64x((uint64_t)4 << 32); + const __m256i mask = _mm256_set1_epi32(0xFFFFF); + const __m256i gamma1 = _mm256_set1_epi32(MLDSA_GAMMA1); + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); + + /* Permute 64-bit lanes + * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0] + * + * ╔═══════════════════════════════════════════════════════════════════════╗ + * ║ Original Layout ║ + * ╚═══════════════════════════════════════════════════════════════════════╝ + * ┌─────────────────┬─────────────────┬─────────────────┬─────────────────┐ + * │ Lane 0 │ Lane 1 │ Lane 2 │ Lane 3 │ + * │ bytes 0..7 │ bytes 8..15 │ bytes 16..23 │ bytes 24..31 │ + * └─────────────────┴─────────────────┴─────────────────┴─────────────────┘ + * + * ╔═══════════════════════════════════════════════════════════════════════╗ + * ║ Layout after permute ║ + * ║ Byte indices in high half shifted down by 8 positions ║ + * ╚═══════════════════════════════════════════════════════════════════════╝ + * ┌───────────────┬─────────────────┐ ┌─────────────────┬─────────────────┐ + * │ Lane 0 │ Lane 1 │ │ Lane 2 │ Lane 3 │ + * │ bytes 0..7 │ bytes 8..15 │ │ bytes 8..15 │ bytes 16..23 │ + * └───────────────┴─────────────────┘ └─────────────────┴─────────────────┘ + * Lower 128-bit lane (bytes 0-15) Upper 128-bit lane (bytes 16-31) + */ + f = _mm256_permute4x64_epi64(f, 0x94); + + /* Shuffling 8-bit lanes + * + * ┌─ Indices 0-9 into low 128-bit half of permuted vector ────────────────┐ + * │ Shuffle: [-1, 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0] │ + * │ Result: [0, byte9, byte8, byte7, ..., 0, byte2, byte1, byte0] │ + * └───────────────────────────────────────────────────────────────────────┘ + * + * ┌─ Indices 2-11 into high 128-bit half of permuted vector ──────────────┐ + * │ Shuffle: [-1, 11, 9, 8, -1, 9, 8, 7, -1, 6, 5, 4, -1, 4, 3, 2] │ + * │ Result: [0, byte19, byte18, byte17, ..., 0, byte12, byte11, byte10] │ + * └───────────────────────────────────────────────────────────────────────┘ + */ + f = _mm256_shuffle_epi8(f, shufbidx); + + /* Keep only 20 out of 24 bits in each 32-bit lane */ + /* Bits 0..23 16..39 40..63 56..79 + * 80..103 96..119 120..143 136..159 */ + f = _mm256_srlv_epi32(f, srlvdidx); + /* Bits 0..23 20..39 40..63 60..79 + * 80..103 100..119 120..143 140..159 */ + f = _mm256_and_si256(f, mask); + /* Bits 0..19 20..39 40..59 60..79 + * 80..99 100..119 120..139 140..159 */ + + /* Map [0, 1, ..., 2^20-1] to [2^19, 2^19-1, ..., -2^19+1] */ + f = _mm256_sub_epi32(gamma1, f); + + _mm256_store_si256(&r[i], f); + } +} + +#else /* 
MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_polyz_unpack) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/rej_uniform_avx2.c b/dev/x86_64/src/rej_uniform_avx2.c new file mode 100644 index 000000000..07a5df970 --- /dev/null +++ b/dev/x86_64/src/rej_uniform_avx2.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + +/* + * Reference: The pqcrystals implementation assumes a buffer that is 8 bytes + *. larger as the first loop overreads by 8 bytes that are then + * discarded. We instead do not pad the buffer and do not overread. + * The performance impact is negligible and it does not force the + * frontend to perform the unintuitive padding. + */ + +unsigned int mld_rej_uniform_avx2( + int32_t *MLD_RESTRICT r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_BUFLEN]) +{ + unsigned int ctr, pos; + uint32_t good; + __m256i d, tmp; + const __m256i bound = _mm256_set1_epi32(MLDSA_Q); + const __m256i mask = _mm256_set1_epi32(0x7FFFFF); + const __m256i idx8 = + _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, -1, 9, 8, 7, -1, 6, 5, 4, + -1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); + + ctr = pos = 0; + while (ctr <= MLDSA_N - 8 && pos <= MLD_AVX2_REJ_UNIFORM_BUFLEN - 32) + { + d = _mm256_loadu_si256((__m256i *)&buf[pos]); + + /* Permute 64-bit lanes + * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0] + * + * ╔═══════════════════════════════════════════════════════════════════════╗ + * ║ Original Layout ║ + * ╚═══════════════════════════════════════════════════════════════════════╝ + * ┌─────────────────┬─────────────────┬─────────────────┬─────────────────┐ + * │ Lane 0 │ Lane 1 │ Lane 2 │ Lane 3 │ + * │ bytes 0..7 │ bytes 8..15 │ bytes 16..23 │ bytes 24..31 │ + * └─────────────────┴─────────────────┴─────────────────┴─────────────────┘ + * + * ╔═══════════════════════════════════════════════════════════════════════╗ + * ║ Layout after permute ║ + * ║ Byte indices in high half shifted down by 8 positions ║ + * ╚═══════════════════════════════════════════════════════════════════════╝ + * ┌───────────────┬─────────────────┐ ┌─────────────────┬─────────────────┐ + * │ Lane 0 │ Lane 1 │ │ Lane 2 │ Lane 3 │ + * │ bytes 0..7 │ bytes 8..15 │ │ bytes 8..15 │ bytes 16..23 │ + * └───────────────┴─────────────────┘ └─────────────────┴─────────────────┘ + * Lower 128-bit lane (bytes 0-15) Upper 128-bit lane (bytes 16-31) + */ + d = _mm256_permute4x64_epi64(d, 0x94); + + /* Shuffling 8-bit lanes + * + * ┌─ Indices 0-11 into low 128-bit half of permuted vector────────────────┐ + * │ Shuffle: [-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0] │ + * │ Result: [0, byte11, byte10, byte9, ..., 0, byte2, byte1, byte0] │ + * └───────────────────────────────────────────────────────────────────────┘ + * + * ┌─ Indices 4-15 into high 128-bit 
half of permuted vector ──────────────┐ + * │ Shuffle: [-1, 15, 14, 13, -1, 12, 11, 10, -1, 9, 8, 7, -1, 6, 5, 4] │ + * │ Result: [0, byte23, byte22, byte21, ..., 0, byte14, byte13, byte12 │ + * └───────────────────────────────────────────────────────────────────────┘ + */ + d = _mm256_shuffle_epi8(d, idx8); + d = _mm256_and_si256(d, mask); + pos += 24; + + tmp = _mm256_sub_epi32(d, bound); + good = (uint32_t)_mm256_movemask_ps((__m256)tmp); + tmp = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good])); + d = _mm256_permutevar8x32_epi32(d, tmp); + + _mm256_storeu_si256((__m256i *)&r[ctr], d); + ctr += (unsigned)_mm_popcnt_u32(good); + } + + while (ctr < MLDSA_N && pos <= MLD_AVX2_REJ_UNIFORM_BUFLEN - 3) + { + uint32_t t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; + + if (t < MLDSA_Q) + { + /* Safe because t < MLDSA_Q. */ + r[ctr++] = (int32_t)t; + } + } + + return ctr; +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_rej_uniform) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/rej_uniform_eta2_avx2.c b/dev/x86_64/src/rej_uniform_eta2_avx2.c new file mode 100644 index 000000000..532f92b7c --- /dev/null +++ b/dev/x86_64/src/rej_uniform_eta2_avx2.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + +#define MLD_AVX2_ETA2 2 + +/* + * Reference: In the pqcrystals implementation this function is called + * rej_eta_avx and supports multiple values for ETA via preprocessor + * conditionals. We move the conditionals to the frontend. 
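+ *
+ * Added note (illustration only): an accepted half-byte t (t < 15) is mapped
+ * to the coefficient 2 - (t mod 5). The scalar tail loop at the end of this
+ * file reduces t mod 5 via the identity
+ *   (205 * t) >> 10 == t / 5   (integer division, for 0 <= t < 15),
+ * while the vector path performs the analogous centered reduction with
+ * mulhrs and a multiply by 5, using the check-magic constant -6560.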
+ */ +unsigned int mld_rej_uniform_eta2_avx2( + int32_t *MLD_RESTRICT r, + const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN]) +{ + unsigned int ctr, pos; + uint32_t good; + __m256i f0, f1, f2; + __m128i g0, g1; + const __m256i mask = _mm256_set1_epi8(15); + const __m256i eta = _mm256_set1_epi8(MLD_AVX2_ETA2); + const __m256i bound = mask; + /* check-magic: -6560 == 32*round(-2**10 / 5) */ + const __m256i v = _mm256_set1_epi32(-6560); + const __m256i p = _mm256_set1_epi32(5); + + ctr = pos = 0; + while (ctr <= MLDSA_N - 8 && pos <= MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN - 16) + { + f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); + f1 = _mm256_slli_epi16(f0, 4); + f0 = _mm256_or_si256(f0, f1); + f0 = _mm256_and_si256(f0, mask); + + f1 = _mm256_sub_epi8(f0, bound); + f0 = _mm256_sub_epi8(eta, f0); + good = (uint32_t)_mm256_movemask_epi8(f1); + + g0 = _mm256_castsi256_si128(f0); + g1 = _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += (unsigned)_mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > MLDSA_N - 8) + { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += (unsigned)_mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > MLDSA_N - 8) + { + break; + } + g0 = _mm256_extracti128_si256(f0, 1); + g1 = _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += (unsigned)_mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > MLDSA_N - 8) + { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += (unsigned)_mm_popcnt_u32(good); + pos += 4; + } + + while (ctr < MLDSA_N && pos < MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN) + { + uint32_t t0 = buf[pos] & 0x0F; + uint32_t t1 = buf[pos++] >> 4; + + if (t0 < 15) + { + t0 = t0 - (205 * t0 >> 10) * 5; + r[ctr++] = (int32_t)(2 - t0); + } + if (t1 < 15 && ctr < MLDSA_N) + { + t1 = t1 - (205 * t1 >> 10) * 5; + r[ctr++] = (int32_t)(2 - t1); + } + } + + return ctr; +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_rej_uniform_eta2) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/rej_uniform_eta4_avx2.c b/dev/x86_64/src/rej_uniform_eta4_avx2.c new file mode 100644 index 000000000..d382beca8 --- /dev/null +++ b/dev/x86_64/src/rej_uniform_eta4_avx2.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized 
AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + +#define MLD_AVX2_ETA4 4 + +/* + * Reference: In the pqcrystals implementation this function is called + * rej_eta_avx and supports multiple values for ETA via preprocessor + * conditionals. We move the conditionals to the frontend. + */ + +unsigned int mld_rej_uniform_eta4_avx2( + int32_t *MLD_RESTRICT r, + const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]) +{ + unsigned int ctr, pos; + uint32_t good; + __m256i f0, f1; + __m128i g0, g1; + const __m256i mask = _mm256_set1_epi8(15); + const __m256i eta = _mm256_set1_epi8(MLD_AVX2_ETA4); + const __m256i bound = _mm256_set1_epi8(9); + + ctr = pos = 0; + while (ctr <= MLDSA_N - 8 && pos <= MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN - 16) + { + f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); + f1 = _mm256_slli_epi16(f0, 4); + f0 = _mm256_or_si256(f0, f1); + f0 = _mm256_and_si256(f0, mask); + + f1 = _mm256_sub_epi8(f0, bound); + f0 = _mm256_sub_epi8(eta, f0); + good = (uint32_t)_mm256_movemask_epi8(f1); + + g0 = _mm256_castsi256_si128(f0); + g1 = _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += (unsigned)_mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > MLDSA_N - 8) + { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += (unsigned)_mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > MLDSA_N - 8) + { + break; + } + g0 = _mm256_extracti128_si256(f0, 1); + g1 = _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += (unsigned)_mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > MLDSA_N - 8) + { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&mld_rej_uniform_table[good]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += (unsigned)_mm_popcnt_u32(good); + pos += 4; + } + + while (ctr < MLDSA_N && pos < MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN) + { + uint32_t t0 = buf[pos] & 0x0F; + uint32_t t1 = buf[pos++] >> 4; + + if (t0 < 9) + { + r[ctr++] = (int32_t)(4 - t0); + } + if (t1 < 9 && ctr < MLDSA_N) + { + r[ctr++] = (int32_t)(4 - t1); + } + } + + return ctr; +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_rej_uniform_eta4) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/rej_uniform_table.c b/dev/x86_64/src/rej_uniform_table.c new file mode 100644 index 000000000..6c537b454 --- /dev/null +++ b/dev/x86_64/src/rej_uniform_table.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR 
MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include "arith_native_x86_64.h" + +/* + * Lookup table used by rejection sampling. + * See autogen for details. + */ +MLD_ALIGN const uint8_t mld_rej_uniform_table[256][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}, + {1, 0, 0, 0, 0, 0, 0, 0}, {0, 1, 0, 0, 0, 0, 0, 0}, + {2, 0, 0, 0, 0, 0, 0, 0}, {0, 2, 0, 0, 0, 0, 0, 0}, + {1, 2, 0, 0, 0, 0, 0, 0}, {0, 1, 2, 0, 0, 0, 0, 0}, + {3, 0, 0, 0, 0, 0, 0, 0}, {0, 3, 0, 0, 0, 0, 0, 0}, + {1, 3, 0, 0, 0, 0, 0, 0}, {0, 1, 3, 0, 0, 0, 0, 0}, + {2, 3, 0, 0, 0, 0, 0, 0}, {0, 2, 3, 0, 0, 0, 0, 0}, + {1, 2, 3, 0, 0, 0, 0, 0}, {0, 1, 2, 3, 0, 0, 0, 0}, + {4, 0, 0, 0, 0, 0, 0, 0}, {0, 4, 0, 0, 0, 0, 0, 0}, + {1, 4, 0, 0, 0, 0, 0, 0}, {0, 1, 4, 0, 0, 0, 0, 0}, + {2, 4, 0, 0, 0, 0, 0, 0}, {0, 2, 4, 0, 0, 0, 0, 0}, + {1, 2, 4, 0, 0, 0, 0, 0}, {0, 1, 2, 4, 0, 0, 0, 0}, + {3, 4, 0, 0, 0, 0, 0, 0}, {0, 3, 4, 0, 0, 0, 0, 0}, + {1, 3, 4, 0, 0, 0, 0, 0}, {0, 1, 3, 4, 0, 0, 0, 0}, + {2, 3, 4, 0, 0, 0, 0, 0}, {0, 2, 3, 4, 0, 0, 0, 0}, + {1, 2, 3, 4, 0, 0, 0, 0}, {0, 1, 2, 3, 4, 0, 0, 0}, + {5, 0, 0, 0, 0, 0, 0, 0}, {0, 5, 0, 0, 0, 0, 0, 0}, + {1, 5, 0, 0, 0, 0, 0, 0}, {0, 1, 5, 0, 0, 0, 0, 0}, + {2, 5, 0, 0, 0, 0, 0, 0}, {0, 2, 5, 0, 0, 0, 0, 0}, + {1, 2, 5, 0, 0, 0, 0, 0}, {0, 1, 2, 5, 0, 0, 0, 0}, + {3, 5, 0, 0, 0, 0, 0, 0}, {0, 3, 5, 0, 0, 0, 0, 0}, + {1, 3, 5, 0, 0, 0, 0, 0}, {0, 1, 3, 5, 0, 0, 0, 0}, + {2, 3, 5, 0, 0, 0, 0, 0}, {0, 2, 3, 5, 0, 0, 0, 0}, + {1, 2, 3, 5, 0, 0, 0, 0}, {0, 1, 2, 3, 5, 0, 0, 0}, + {4, 5, 0, 0, 0, 0, 0, 0}, {0, 4, 5, 0, 0, 0, 0, 0}, + {1, 4, 5, 0, 0, 0, 0, 0}, {0, 1, 4, 5, 0, 0, 0, 0}, + {2, 4, 5, 0, 0, 0, 0, 0}, {0, 2, 4, 5, 0, 0, 0, 0}, + {1, 2, 4, 5, 0, 0, 0, 0}, {0, 1, 2, 4, 5, 0, 0, 0}, + {3, 4, 5, 0, 0, 0, 0, 0}, {0, 3, 4, 5, 0, 0, 0, 0}, + {1, 3, 4, 5, 0, 0, 0, 0}, {0, 1, 3, 4, 5, 0, 0, 0}, + {2, 3, 4, 5, 0, 0, 0, 0}, {0, 2, 3, 4, 5, 0, 0, 0}, + {1, 2, 3, 4, 5, 0, 0, 0}, {0, 1, 2, 3, 4, 5, 0, 0}, + {6, 0, 0, 0, 0, 0, 0, 0}, {0, 6, 0, 0, 0, 0, 0, 0}, + {1, 6, 0, 0, 0, 0, 0, 0}, {0, 1, 6, 0, 0, 0, 0, 0}, + {2, 6, 0, 0, 0, 0, 0, 0}, {0, 2, 6, 0, 0, 0, 0, 0}, + {1, 2, 6, 0, 0, 0, 0, 0}, {0, 1, 2, 6, 0, 0, 0, 0}, + {3, 6, 0, 0, 0, 0, 0, 0}, {0, 3, 6, 0, 0, 0, 0, 0}, + {1, 3, 6, 0, 0, 0, 0, 0}, {0, 1, 3, 6, 0, 0, 0, 0}, + {2, 3, 6, 0, 0, 0, 0, 0}, {0, 2, 3, 6, 0, 0, 0, 0}, + {1, 2, 3, 6, 0, 0, 0, 0}, {0, 1, 2, 3, 6, 0, 0, 0}, + {4, 6, 0, 0, 0, 0, 0, 0}, {0, 4, 6, 0, 0, 0, 0, 0}, + {1, 4, 6, 0, 0, 0, 0, 0}, {0, 1, 4, 6, 0, 0, 0, 0}, + {2, 4, 6, 0, 0, 0, 0, 0}, {0, 2, 4, 6, 0, 0, 0, 0}, + {1, 2, 4, 6, 0, 0, 0, 0}, {0, 1, 2, 4, 6, 0, 0, 0}, + {3, 4, 6, 0, 0, 0, 0, 0}, {0, 3, 4, 6, 0, 0, 0, 0}, + {1, 3, 4, 6, 0, 0, 0, 0}, {0, 1, 3, 4, 6, 0, 0, 0}, + {2, 3, 4, 6, 0, 0, 0, 0}, {0, 2, 3, 4, 6, 0, 0, 0}, + {1, 2, 3, 4, 6, 0, 0, 0}, {0, 1, 2, 3, 4, 6, 0, 0}, + {5, 6, 0, 0, 0, 0, 0, 0}, {0, 5, 6, 0, 0, 0, 0, 0}, + {1, 5, 6, 0, 0, 0, 0, 0}, {0, 1, 5, 6, 0, 0, 0, 0}, + {2, 5, 6, 0, 0, 0, 0, 0}, {0, 2, 5, 6, 0, 0, 0, 0}, + {1, 2, 5, 6, 0, 0, 0, 0}, {0, 1, 2, 5, 6, 0, 0, 0}, + {3, 5, 6, 0, 0, 0, 0, 0}, {0, 3, 5, 6, 0, 0, 0, 0}, + {1, 3, 5, 6, 0, 0, 0, 0}, {0, 1, 3, 5, 6, 0, 0, 0}, + {2, 3, 5, 6, 0, 0, 0, 0}, {0, 2, 3, 5, 6, 0, 0, 0}, + {1, 2, 3, 5, 6, 0, 0, 0}, {0, 1, 2, 3, 5, 6, 0, 0}, + {4, 5, 6, 0, 0, 0, 0, 0}, {0, 4, 5, 6, 0, 0, 0, 0}, + {1, 
4, 5, 6, 0, 0, 0, 0}, {0, 1, 4, 5, 6, 0, 0, 0}, + {2, 4, 5, 6, 0, 0, 0, 0}, {0, 2, 4, 5, 6, 0, 0, 0}, + {1, 2, 4, 5, 6, 0, 0, 0}, {0, 1, 2, 4, 5, 6, 0, 0}, + {3, 4, 5, 6, 0, 0, 0, 0}, {0, 3, 4, 5, 6, 0, 0, 0}, + {1, 3, 4, 5, 6, 0, 0, 0}, {0, 1, 3, 4, 5, 6, 0, 0}, + {2, 3, 4, 5, 6, 0, 0, 0}, {0, 2, 3, 4, 5, 6, 0, 0}, + {1, 2, 3, 4, 5, 6, 0, 0}, {0, 1, 2, 3, 4, 5, 6, 0}, + {7, 0, 0, 0, 0, 0, 0, 0}, {0, 7, 0, 0, 0, 0, 0, 0}, + {1, 7, 0, 0, 0, 0, 0, 0}, {0, 1, 7, 0, 0, 0, 0, 0}, + {2, 7, 0, 0, 0, 0, 0, 0}, {0, 2, 7, 0, 0, 0, 0, 0}, + {1, 2, 7, 0, 0, 0, 0, 0}, {0, 1, 2, 7, 0, 0, 0, 0}, + {3, 7, 0, 0, 0, 0, 0, 0}, {0, 3, 7, 0, 0, 0, 0, 0}, + {1, 3, 7, 0, 0, 0, 0, 0}, {0, 1, 3, 7, 0, 0, 0, 0}, + {2, 3, 7, 0, 0, 0, 0, 0}, {0, 2, 3, 7, 0, 0, 0, 0}, + {1, 2, 3, 7, 0, 0, 0, 0}, {0, 1, 2, 3, 7, 0, 0, 0}, + {4, 7, 0, 0, 0, 0, 0, 0}, {0, 4, 7, 0, 0, 0, 0, 0}, + {1, 4, 7, 0, 0, 0, 0, 0}, {0, 1, 4, 7, 0, 0, 0, 0}, + {2, 4, 7, 0, 0, 0, 0, 0}, {0, 2, 4, 7, 0, 0, 0, 0}, + {1, 2, 4, 7, 0, 0, 0, 0}, {0, 1, 2, 4, 7, 0, 0, 0}, + {3, 4, 7, 0, 0, 0, 0, 0}, {0, 3, 4, 7, 0, 0, 0, 0}, + {1, 3, 4, 7, 0, 0, 0, 0}, {0, 1, 3, 4, 7, 0, 0, 0}, + {2, 3, 4, 7, 0, 0, 0, 0}, {0, 2, 3, 4, 7, 0, 0, 0}, + {1, 2, 3, 4, 7, 0, 0, 0}, {0, 1, 2, 3, 4, 7, 0, 0}, + {5, 7, 0, 0, 0, 0, 0, 0}, {0, 5, 7, 0, 0, 0, 0, 0}, + {1, 5, 7, 0, 0, 0, 0, 0}, {0, 1, 5, 7, 0, 0, 0, 0}, + {2, 5, 7, 0, 0, 0, 0, 0}, {0, 2, 5, 7, 0, 0, 0, 0}, + {1, 2, 5, 7, 0, 0, 0, 0}, {0, 1, 2, 5, 7, 0, 0, 0}, + {3, 5, 7, 0, 0, 0, 0, 0}, {0, 3, 5, 7, 0, 0, 0, 0}, + {1, 3, 5, 7, 0, 0, 0, 0}, {0, 1, 3, 5, 7, 0, 0, 0}, + {2, 3, 5, 7, 0, 0, 0, 0}, {0, 2, 3, 5, 7, 0, 0, 0}, + {1, 2, 3, 5, 7, 0, 0, 0}, {0, 1, 2, 3, 5, 7, 0, 0}, + {4, 5, 7, 0, 0, 0, 0, 0}, {0, 4, 5, 7, 0, 0, 0, 0}, + {1, 4, 5, 7, 0, 0, 0, 0}, {0, 1, 4, 5, 7, 0, 0, 0}, + {2, 4, 5, 7, 0, 0, 0, 0}, {0, 2, 4, 5, 7, 0, 0, 0}, + {1, 2, 4, 5, 7, 0, 0, 0}, {0, 1, 2, 4, 5, 7, 0, 0}, + {3, 4, 5, 7, 0, 0, 0, 0}, {0, 3, 4, 5, 7, 0, 0, 0}, + {1, 3, 4, 5, 7, 0, 0, 0}, {0, 1, 3, 4, 5, 7, 0, 0}, + {2, 3, 4, 5, 7, 0, 0, 0}, {0, 2, 3, 4, 5, 7, 0, 0}, + {1, 2, 3, 4, 5, 7, 0, 0}, {0, 1, 2, 3, 4, 5, 7, 0}, + {6, 7, 0, 0, 0, 0, 0, 0}, {0, 6, 7, 0, 0, 0, 0, 0}, + {1, 6, 7, 0, 0, 0, 0, 0}, {0, 1, 6, 7, 0, 0, 0, 0}, + {2, 6, 7, 0, 0, 0, 0, 0}, {0, 2, 6, 7, 0, 0, 0, 0}, + {1, 2, 6, 7, 0, 0, 0, 0}, {0, 1, 2, 6, 7, 0, 0, 0}, + {3, 6, 7, 0, 0, 0, 0, 0}, {0, 3, 6, 7, 0, 0, 0, 0}, + {1, 3, 6, 7, 0, 0, 0, 0}, {0, 1, 3, 6, 7, 0, 0, 0}, + {2, 3, 6, 7, 0, 0, 0, 0}, {0, 2, 3, 6, 7, 0, 0, 0}, + {1, 2, 3, 6, 7, 0, 0, 0}, {0, 1, 2, 3, 6, 7, 0, 0}, + {4, 6, 7, 0, 0, 0, 0, 0}, {0, 4, 6, 7, 0, 0, 0, 0}, + {1, 4, 6, 7, 0, 0, 0, 0}, {0, 1, 4, 6, 7, 0, 0, 0}, + {2, 4, 6, 7, 0, 0, 0, 0}, {0, 2, 4, 6, 7, 0, 0, 0}, + {1, 2, 4, 6, 7, 0, 0, 0}, {0, 1, 2, 4, 6, 7, 0, 0}, + {3, 4, 6, 7, 0, 0, 0, 0}, {0, 3, 4, 6, 7, 0, 0, 0}, + {1, 3, 4, 6, 7, 0, 0, 0}, {0, 1, 3, 4, 6, 7, 0, 0}, + {2, 3, 4, 6, 7, 0, 0, 0}, {0, 2, 3, 4, 6, 7, 0, 0}, + {1, 2, 3, 4, 6, 7, 0, 0}, {0, 1, 2, 3, 4, 6, 7, 0}, + {5, 6, 7, 0, 0, 0, 0, 0}, {0, 5, 6, 7, 0, 0, 0, 0}, + {1, 5, 6, 7, 0, 0, 0, 0}, {0, 1, 5, 6, 7, 0, 0, 0}, + {2, 5, 6, 7, 0, 0, 0, 0}, {0, 2, 5, 6, 7, 0, 0, 0}, + {1, 2, 5, 6, 7, 0, 0, 0}, {0, 1, 2, 5, 6, 7, 0, 0}, + {3, 5, 6, 7, 0, 0, 0, 0}, {0, 3, 5, 6, 7, 0, 0, 0}, + {1, 3, 5, 6, 7, 0, 0, 0}, {0, 1, 3, 5, 6, 7, 0, 0}, + {2, 3, 5, 6, 7, 0, 0, 0}, {0, 2, 3, 5, 6, 7, 0, 0}, + {1, 2, 3, 5, 6, 7, 0, 0}, {0, 1, 2, 3, 5, 6, 7, 0}, + {4, 5, 6, 7, 0, 0, 0, 0}, {0, 4, 5, 6, 7, 0, 0, 0}, + {1, 4, 5, 6, 7, 0, 0, 0}, {0, 1, 4, 5, 6, 7, 0, 0}, + {2, 4, 5, 6, 7, 0, 0, 0}, {0, 2, 4, 5, 6, 7, 0, 
0}, + {1, 2, 4, 5, 6, 7, 0, 0}, {0, 1, 2, 4, 5, 6, 7, 0}, + {3, 4, 5, 6, 7, 0, 0, 0}, {0, 3, 4, 5, 6, 7, 0, 0}, + {1, 3, 4, 5, 6, 7, 0, 0}, {0, 1, 3, 4, 5, 6, 7, 0}, + {2, 3, 4, 5, 6, 7, 0, 0}, {0, 2, 3, 4, 5, 6, 7, 0}, + {1, 2, 3, 4, 5, 6, 7, 0}, {0, 1, 2, 3, 4, 5, 6, 7}, +}; + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_rej_uniform_table) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/dev/x86_64/src/x86_64_zetas.i b/dev/x86_64/src/x86_64_zetas.i new file mode 100644 index 000000000..9b6c603ba --- /dev/null +++ b/dev/x86_64/src/x86_64_zetas.i @@ -0,0 +1,88 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +/* + * Table of zeta values used in the AVX2 NTTs + * See autogen for details. + */ +/* twiddles * q^-1 */ +- 151046689, + 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, 308362795, + 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, + -1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, + -1091570561, -1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, + 515185417, 515185417, -285697463, -285697463, -285697463, -285697463, 625853735, 625853735, + 625853735, 625853735, 1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, + 858240904, 858240904, 1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, + 684667771, 684667771, 1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, + -748618600, -748618600, 329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, + -1170414139, -1170414139, -1846138265, -1631226336, -1404529459, 1838055109, 1594295555, + -1076973524, -1898723372, -594436433, -202001019, -475984260, -561427818, 1797021249, + -1061813248, 2059733581, -1661512036, -1104976547, -1750224323, -901666090, 418987550, + 1831915353, -1925356481, 992097815, 879957084, 2024403852, 1484874664, -1636082790, -285388938, + -1983539117, -1495136972, -950076368, -1714807468, -952438995, -1574918427, 1350681039, + -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, -783134478, + -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, 1321868265, + 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, + -695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, + -654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, + -2131021878, -247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, + -1669960606, -916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, + -1729304568, 1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, + -894060583, -898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, + 1819892093, 2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, + 235104446, -1045062172, 831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, + -2040058690, -853476187, -2047270596, -1723816713, -1591599803, -440824168, 1119856484, + 1544891539, 155290192, -973777462, 991903578, 912367099, 
-44694137, 1176904444, -421552614, + -818371958, 1747917558, -325927722, 908452108, 1851023419, -1176751719, -1354528380, -72690498, + -314284737, 985022747, 963438279, -1078959975, 604552167, -1021949428, 608791570, 173440395, + -2126092136, -1316619236, -1039370342, 6087993, -110126092, 565464272, -1758099917, -1600929361, + 879867909, -1809756372, 400711272, 1363007700, 30313375, -326425360, 1683520342, -517299994, + 2027935492, -1372618620, 128353682, -1123881663, 137583815, -635454918, -642772911, 45766801, + 671509323, -2070602178, 419615363, 1216882040, -270590488, -1276805128, 371462360, -1357098057, + -384158533, 827959816, -596344473, 702390549, -279505433, -260424530, -71875110, -1208667171, + -1499603926, 2036925262, -540420426, 746144248, -1420958686, 2032221021, 1904936414, 1257750362, + 1926727420, 1931587462, 1258381762, 885133339, 1629985060, 1967222129, 6363718, -1287922800, + 1136965286, 1779436847, 1116720494, 1042326957, 1405999311, 713994583, 940195359, -1542497137, + 2061661095, -883155599, 1726753853, -1547952704, 394851342, 283780712, 776003547, 1123958025, + 201262505, 1934038751, 374860238, + /* twiddles */ + + -3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, 1826347, 1826347, 1826347, + 1826347, 2353451, 2353451, 2353451, 2353451, -359251, -359251, -359251, -359251, -2091905, + -2091905, -2091905, -2091905, 3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, + -2884855, 3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, 2725464, + 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, -549488, -549488, -1119584, + -1119584, 2619752, 2619752, -2108549, -2108549, -2118186, -2118186, -3859737, -3859737, + -1399561, -1399561, -3277672, -3277672, 1757237, 1757237, -19422, -19422, 4010497, 4010497, + 280005, 280005, 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, -1699267, -1643818, + 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, 811944, 531354, 954230, 3881043, + 3900724, -2556880, 2071892, -2797779, -3930395, -3677745, -1452451, 2176455, -1257611, -4083598, + -3190144, -3632928, 3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, + -3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, 189548, 3159746, -2409325, + 1315589, 1285669, -812732, -3019102, -3628969, -1528703, -3041255, 3475950, -1585221, 1939314, + -1000202, -3157330, 126922, -983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, + -1430430, 264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, -3553272, + -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, 2091667, -3342478, 266997, + -3520352, 900702, 495491, -655327, -3556995, 342297, 3437287, 2842341, 4055324, -3767016, + -2994039, -1333058, -451100, -1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, + 810149, -3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, 3407706, + 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, 286988, -3342277, 2691481, + 1247620, 1250494, 1869119, 1237275, 1312455, 1917081, 777191, -2831860, -3724270, 2432395, + 3369112, 162844, 1652634, 3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, + 1400424, 2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, -2437823, + 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, -2546312, 2235880, -1671176, + 594136, 2454455, 185531, 
1616392, -3694233, 3866901, 1717735, -1803090, -260646, -420899, + 1612842, -48306, -846154, 3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, + -3595838, 4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, -1374803, + 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, 269760, 472078, 1910376, + -3833893, -2286327, -3545687, -1362209, 1976782, diff --git a/mldsa/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S b/mldsa/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S index 862ada421..3663c140f 100644 --- a/mldsa/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +++ b/mldsa/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S @@ -14,7 +14,7 @@ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) /* - * WARNING: This file is auto-derived from the mlkem-native source file + * WARNING: This file is auto-derived from the mldsa-native source file * dev/fips202/aarch64/src/keccak_f1600_x1_scalar_asm.S using scripts/simpasm. Do not modify it directly. */ diff --git a/mldsa/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S b/mldsa/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S index 7679895ab..6bf58e28c 100644 --- a/mldsa/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +++ b/mldsa/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S @@ -31,7 +31,7 @@ #if defined(__ARM_FEATURE_SHA3) /* - * WARNING: This file is auto-derived from the mlkem-native source file + * WARNING: This file is auto-derived from the mldsa-native source file * dev/fips202/aarch64/src/keccak_f1600_x1_v84a_asm.S using scripts/simpasm. Do not modify it directly. */ diff --git a/mldsa/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S b/mldsa/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S index 5dfc04300..d387bc871 100644 --- a/mldsa/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +++ b/mldsa/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S @@ -31,7 +31,7 @@ #if defined(__ARM_FEATURE_SHA3) /* - * WARNING: This file is auto-derived from the mlkem-native source file + * WARNING: This file is auto-derived from the mldsa-native source file * dev/fips202/aarch64/src/keccak_f1600_x2_v84a_asm.S using scripts/simpasm. Do not modify it directly. */ diff --git a/mldsa/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S b/mldsa/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S index 08ccc0fb8..5bcea9ad2 100644 --- a/mldsa/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +++ b/mldsa/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S @@ -14,7 +14,7 @@ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) /* - * WARNING: This file is auto-derived from the mlkem-native source file + * WARNING: This file is auto-derived from the mldsa-native source file * dev/fips202/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S using scripts/simpasm. Do not modify it directly. 
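Referring back to the x86_64_zetas.i table above: it stores every twiddle twice, once premultiplied by q^-1 mod 2^32 (the block labelled "twiddles * q^-1") and once plain (the "twiddles" block). Keeping both allows a signed Montgomery multiplication without recomputing zeta*q^-1 on the fly. A minimal scalar sketch of that trick, with illustrative names and the constant q = 8380417 taken from the surrounding code:

#include <stdint.h>

#define MLDSA_Q 8380417

/* Montgomery multiply with a precomputed companion constant:
 * zeta_qinv = zeta * q^-1 mod 2^32 (first block of the table),
 * zeta      = the plain twiddle    (second block).
 * The result is congruent to a * zeta * 2^-32 modulo q. */
static int32_t fqmul_precomp(int32_t a, int32_t zeta, int32_t zeta_qinv)
{
  int64_t p = (int64_t)a * zeta;                            /* full product */
  int32_t t = (int32_t)((uint32_t)a * (uint32_t)zeta_qinv); /* mod 2^32 */
  /* p - t*q is divisible by 2^32; the quotient is a*zeta*2^-32 mod q. */
  return (int32_t)((p - (int64_t)t * MLDSA_Q) >> 32);
}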
*/ diff --git a/mldsa/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S b/mldsa/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S index c4949c268..d06cdea28 100644 --- a/mldsa/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +++ b/mldsa/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S @@ -16,7 +16,7 @@ #if defined(__ARM_FEATURE_SHA3) /* - * WARNING: This file is auto-derived from the mlkem-native source file + * WARNING: This file is auto-derived from the mldsa-native source file * dev/fips202/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S using scripts/simpasm. Do not modify it directly. */ diff --git a/mldsa/native/aarch64/src/intt.S b/mldsa/native/aarch64/src/intt.S index 7591271e3..699b4aaab 100644 --- a/mldsa/native/aarch64/src/intt.S +++ b/mldsa/native/aarch64/src/intt.S @@ -26,345 +26,332 @@ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) -.macro mulmodq dst, src, const, idx0, idx1 - sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()] - mul \dst\().4s, \src\().4s, \const\().s[\idx0\()] - mls \dst\().4s, t2.4s, modulus.s[0] -.endm - -.macro mulmod dst, src, const, const_twisted - sqrdmulh t2.4s, \src\().4s, \const_twisted\().4s - mul \dst\().4s, \src\().4s, \const\().4s - mls \dst\().4s, t2.4s, modulus.s[0] -.endm - - -.macro gs_butterfly a, b, root, idx0, idx1 - sub tmp.4s, \a\().4s, \b\().4s - add \a\().4s, \a\().4s, \b\().4s - mulmodq \b, tmp, \root, \idx0, \idx1 -.endm - -.macro gs_butterfly_v a, b, root, root_twisted - sub tmp.4s, \a\().4s, \b\().4s - add \a\().4s, \a\().4s, \b\().4s - mulmod \b, tmp, \root, \root_twisted -.endm - -.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 - mulmod \dst0, \src0, ninv, ninv_tw - mulmod \dst1, \src1, ninv, ninv_tw - mulmod \dst2, \src2, ninv, ninv_tw - mulmod \dst3, \src3, ninv, ninv_tw - mulmod \dst4, \src4, ninv, ninv_tw - mulmod \dst5, \src5, ninv, ninv_tw - mulmod \dst6, \src6, ninv, ninv_tw - mulmod \dst7, \src7, ninv, ninv_tw -.endm - -.macro load_roots_1234 r_ptr - ldr q_root0, [\r_ptr], #(8*16) - ldr q_root1, [\r_ptr, #(-8*16 + 1*16)] - ldr q_root2, [\r_ptr, #(-8*16 + 2*16)] - ldr q_root3, [\r_ptr, #(-8*16 + 3*16)] - ldr q_root4, [\r_ptr, #(-8*16 + 4*16)] - ldr q_root5, [\r_ptr, #(-8*16 + 5*16)] - ldr q_root6, [\r_ptr, #(-8*16 + 6*16)] - ldr q_root7, [\r_ptr, #(-8*16 + 7*16)] -.endm - -.macro load_next_roots_56 root0, r_ptr0 - ldr q_\root0, [\r_ptr0], #16 -.endm - -.macro load_next_roots_6 root0, r_ptr0 - ldr q_\root0, [\r_ptr0], #8 -.endm - -.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr q_\root0, [\r_ptr1], #(6*16) - ldr q_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] - ldr q_\root1, [\r_ptr1, #(-6*16 + 2*16)] - ldr q_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] - ldr q_\root2, [\r_ptr1, #(-6*16 + 4*16)] - ldr q_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] -.endm - -.macro transpose4 data0, data1, data2, data3 - trn1 t0.4s, \data0\().4s, \data1\().4s - trn2 t1.4s, \data0\().4s, \data1\().4s - trn1 t2.4s, \data2\().4s, \data3\().4s - trn2 t3.4s, \data2\().4s, \data3\().4s - - trn2 \data2\().2d, t0.2d, t2.2d - trn2 \data3\().2d, t1.2d, t3.2d - trn1 \data0\().2d, t0.2d, t2.2d - trn1 \data1\().2d, t1.2d, t3.2d -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, 
d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/intt.S using scripts/simpasm. Do not modify it directly. + */ -.macro pop_stack - restore_vregs -.endm .text -.global MLD_ASM_NAMESPACE(intt_asm) .balign 4 +.global MLD_ASM_NAMESPACE(intt_asm) MLD_ASM_FN_SYMBOL(intt_asm) - push_stack - - in .req x0 - r5678_ptr .req x1 - r1234_ptr .req x2 - inp .req x3 - count .req x4 - xtmp .req x5 - - wtmp .req w5 - - data0 .req v8 - data1 .req v9 - data2 .req v10 - data3 .req v11 - data4 .req v12 - data5 .req v13 - data6 .req v14 - data7 .req v15 - data8 .req v16 - data9 .req v17 - data10 .req v18 - data11 .req v19 - data12 .req v20 - data13 .req v21 - data14 .req v22 - data15 .req v23 - - q_data0 .req q8 - q_data1 .req q9 - q_data2 .req q10 - q_data3 .req q11 - q_data4 .req q12 - q_data5 .req q13 - q_data6 .req q14 - q_data7 .req q15 - q_data8 .req q16 - q_data9 .req q17 - q_data10 .req q18 - q_data11 .req q19 - q_data12 .req q20 - q_data13 .req q21 - q_data14 .req q22 - q_data15 .req q23 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root3 .req v3 - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - root3_tw .req v7 - - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root3 .req q3 - q_root0_tw .req q4 - q_root1_tw .req q5 - q_root2_tw .req q6 - q_root3_tw .req q7 - - - tmp .req v24 - q_tmp .req q24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - modulus .req v29 + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w5, #0xe001 // =57345 + movk w5, #0x7f, lsl #16 + dup v29.4s, w5 + mov x3, x0 + mov x4, #0x10 // =16 - // load q = 8380417 - movz wtmp, #57345 - movk wtmp, #127, lsl #16 - dup modulus.4s, wtmp - - mov inp, in - - mov count, #16 - - .p2align 2 layer5678_start: + ld4 { v8.4s, v9.4s, v10.4s, v11.4s }, [x0] + ldr q0, [x1], #0x60 + ldur q4, [x1, #-0x50] + ldur q1, [x1, #-0x40] + ldur q5, [x1, #-0x30] + ldur q2, [x1, #-0x20] + ldur q6, [x1, #-0x10] + sub v24.4s, v8.4s, v9.4s + add v8.4s, v8.4s, v9.4s + sqrdmulh v27.4s, v24.4s, v5.4s + mul v9.4s, v24.4s, v1.4s + mls v9.4s, v27.4s, v29.s[0] + sub v24.4s, v10.4s, v11.4s + add v10.4s, v10.4s, v11.4s + sqrdmulh v27.4s, v24.4s, v6.4s + mul v11.4s, v24.4s, v2.4s + mls v11.4s, v27.4s, v29.s[0] + sub v24.4s, v8.4s, v10.4s + add v8.4s, v8.4s, v10.4s + sqrdmulh v27.4s, v24.4s, v4.4s + mul v10.4s, v24.4s, v0.4s + mls v10.4s, v27.4s, v29.s[0] + sub v24.4s, v9.4s, v11.4s + add v9.4s, v9.4s, v11.4s + sqrdmulh v27.4s, v24.4s, v4.4s + mul v11.4s, v24.4s, v0.4s + mls v11.4s, v27.4s, v29.s[0] + trn1 v25.4s, v8.4s, v9.4s + trn2 v26.4s, v8.4s, v9.4s + trn1 v27.4s, v10.4s, v11.4s + trn2 v28.4s, v10.4s, v11.4s + trn2 v10.2d, v25.2d, v27.2d + trn2 v11.2d, v26.2d, v28.2d + trn1 v8.2d, v25.2d, v27.2d + trn1 v9.2d, v26.2d, v28.2d + ldr q1, [x2], #0x8 + ldr q0, [x2], #0x10 + sub v24.4s, v8.4s, v9.4s + add v8.4s, v8.4s, v9.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v9.4s, v24.4s, v0.s[0] + mls v9.4s, v27.4s, v29.s[0] + sub v24.4s, v10.4s, v11.4s + add v10.4s, v10.4s, v11.4s + sqrdmulh v27.4s, v24.4s, v0.s[3] + mul v11.4s, v24.4s, v0.s[2] + mls v11.4s, v27.4s, v29.s[0] + sub v24.4s, v8.4s, v10.4s + add v8.4s, v8.4s, v10.4s + sqrdmulh v27.4s, v24.4s, v1.s[1] + mul v10.4s, v24.4s, v1.s[0] + mls v10.4s, v27.4s, v29.s[0] + sub v24.4s, v9.4s, v11.4s + add v9.4s, v9.4s, v11.4s + 
sqrdmulh v27.4s, v24.4s, v1.s[1] + mul v11.4s, v24.4s, v1.s[0] + mls v11.4s, v27.4s, v29.s[0] + str q8, [x0], #0x40 + stur q9, [x0, #-0x30] + stur q10, [x0, #-0x20] + stur q11, [x0, #-0x10] + subs x4, x4, #0x1 + cbnz x4, layer5678_start + mov x0, x3 + mov x4, #0x4 // =4 + mov w5, #0x3ffe // =16382 + dup v25.4s, w5 + mov w5, #0xe03 // =3587 + movk w5, #0x40, lsl #16 + dup v26.4s, w5 + ldr q0, [x2], #0x80 + ldur q1, [x2, #-0x70] + ldur q2, [x2, #-0x60] + ldur q3, [x2, #-0x50] + ldur q4, [x2, #-0x40] + ldur q5, [x2, #-0x30] + ldur q6, [x2, #-0x20] + ldur q7, [x2, #-0x10] - ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in] - - load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r5678_ptr - - gs_butterfly_v data0, data1, root1, root1_tw - gs_butterfly_v data2, data3, root2, root2_tw - gs_butterfly_v data0, data2, root0, root0_tw - gs_butterfly_v data1, data3, root0, root0_tw - - transpose4 data0, data1, data2, data3 - - load_next_roots_6 root1, r1234_ptr - load_next_roots_56 root0, r1234_ptr - - gs_butterfly data0, data1, root0, 0, 1 - gs_butterfly data2, data3, root0, 2, 3 - gs_butterfly data0, data2, root1, 0, 1 - gs_butterfly data1, data3, root1, 0, 1 - - str q_data0, [in], #(16*4) - str q_data1, [in, #(-16*4 + 1*16)] - str q_data2, [in, #(-16*4 + 2*16)] - str q_data3, [in, #(-16*4 + 3*16)] - - subs count, count, #1 - cbnz count, layer5678_start - - .unreq root0_tw - .unreq root1_tw - .unreq root2_tw - .unreq root3_tw - .unreq q_root0_tw - .unreq q_root1_tw - .unreq q_root2_tw - .unreq q_root3_tw - .unreq t0 - .unreq t1 - - root4 .req v4 - root5 .req v5 - root6 .req v6 - root7 .req v7 - q_root4 .req q4 - q_root5 .req q5 - q_root6 .req q6 - q_root7 .req q7 - ninv .req v25 - ninv_tw .req v26 - - mov in, inp - mov count, #4 - - // load ninv - mov wtmp, #16382 // 2^(32 - 8) mod Q - dup ninv.4s, wtmp - - // load ninv_tw = 4197891 - movz wtmp, #3587 - movk wtmp, #64, lsl #16 - dup ninv_tw.4s, wtmp - - load_roots_1234 r1234_ptr - - .p2align 2 layer1234_start: - ldr q_data0, [in, #(0*(512/8))] - ldr q_data1, [in, #(1*(512/8))] - ldr q_data2, [in, #(2*(512/8))] - ldr q_data3, [in, #(3*(512/8))] - ldr q_data4, [in, #(4*(512/8))] - ldr q_data5, [in, #(5*(512/8))] - ldr q_data6, [in, #(6*(512/8))] - ldr q_data7, [in, #(7*(512/8))] - ldr q_data8, [in, #(8*(512/8))] - ldr q_data9, [in, #(9*(512/8))] - ldr q_data10, [in, #(10*(512/8))] - ldr q_data11, [in, #(11*(512/8))] - ldr q_data12, [in, #(12*(512/8))] - ldr q_data13, [in, #(13*(512/8))] - ldr q_data14, [in, #(14*(512/8))] - ldr q_data15, [in, #(15*(512/8))] - - // layer4 - gs_butterfly data0, data1, root3, 2, 3 - gs_butterfly data2, data3, root4, 0, 1 - gs_butterfly data4, data5, root4, 2, 3 - gs_butterfly data6, data7, root5, 0, 1 - gs_butterfly data8, data9, root5, 2, 3 - gs_butterfly data10, data11, root6, 0, 1 - gs_butterfly data12, data13, root6, 2, 3 - gs_butterfly data14, data15, root7, 0, 1 - - // layer3 - gs_butterfly data0, data2, root1, 2, 3 - gs_butterfly data1, data3, root1, 2, 3 - gs_butterfly data4, data6, root2, 0, 1 - gs_butterfly data5, data7, root2, 0, 1 - gs_butterfly data8, data10, root2, 2, 3 - gs_butterfly data9, data11, root2, 2, 3 - gs_butterfly data12, data14, root3, 0, 1 - gs_butterfly data13, data15, root3, 0, 1 - - // layer2 - gs_butterfly data0, data4, root0, 2, 3 - gs_butterfly data1, data5, root0, 2, 3 - gs_butterfly data2, data6, root0, 2, 3 - gs_butterfly data3, data7, root0, 2, 3 - gs_butterfly data8, data12, root1, 0, 1 - gs_butterfly data9, data13, root1, 0, 1 - gs_butterfly data10, data14, 
root1, 0, 1 - gs_butterfly data11, data15, root1, 0, 1 - - // layer 1 - gs_butterfly data0, data8, root0, 0, 1 - gs_butterfly data1, data9, root0, 0, 1 - gs_butterfly data2, data10, root0, 0, 1 - gs_butterfly data3, data11, root0, 0, 1 - gs_butterfly data4, data12, root0, 0, 1 - gs_butterfly data5, data13, root0, 0, 1 - gs_butterfly data6, data14, root0, 0, 1 - gs_butterfly data7, data15, root0, 0, 1 - - str q_data8, [in, #(8*(512/8))] - str q_data9, [in, #(9*(512/8))] - str q_data10, [in, #(10*(512/8))] - str q_data11, [in, #(11*(512/8))] - str q_data12, [in, #(12*(512/8))] - str q_data13, [in, #(13*(512/8))] - str q_data14, [in, #(14*(512/8))] - str q_data15, [in, #(15*(512/8))] - - // Scale half the coeffs 2^-8 and the Montgomery factor 2^32. - // For the other half, the scaling has been merged into the - // multiplication with the twiddle factor on the last layer. - mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 - - str q_data0, [in], #(16) - str q_data1, [in, #(-16 + 1*(512/8))] - str q_data2, [in, #(-16 + 2*(512/8))] - str q_data3, [in, #(-16 + 3*(512/8))] - str q_data4, [in, #(-16 + 4*(512/8))] - str q_data5, [in, #(-16 + 5*(512/8))] - str q_data6, [in, #(-16 + 6*(512/8))] - str q_data7, [in, #(-16 + 7*(512/8))] - - subs count, count, #1 - cbnz count, layer1234_start - - pop_stack + ldr q8, [x0] + ldr q9, [x0, #0x40] + ldr q10, [x0, #0x80] + ldr q11, [x0, #0xc0] + ldr q12, [x0, #0x100] + ldr q13, [x0, #0x140] + ldr q14, [x0, #0x180] + ldr q15, [x0, #0x1c0] + ldr q16, [x0, #0x200] + ldr q17, [x0, #0x240] + ldr q18, [x0, #0x280] + ldr q19, [x0, #0x2c0] + ldr q20, [x0, #0x300] + ldr q21, [x0, #0x340] + ldr q22, [x0, #0x380] + ldr q23, [x0, #0x3c0] + sub v24.4s, v8.4s, v9.4s + add v8.4s, v8.4s, v9.4s + sqrdmulh v27.4s, v24.4s, v3.s[3] + mul v9.4s, v24.4s, v3.s[2] + mls v9.4s, v27.4s, v29.s[0] + sub v24.4s, v10.4s, v11.4s + add v10.4s, v10.4s, v11.4s + sqrdmulh v27.4s, v24.4s, v4.s[1] + mul v11.4s, v24.4s, v4.s[0] + mls v11.4s, v27.4s, v29.s[0] + sub v24.4s, v12.4s, v13.4s + add v12.4s, v12.4s, v13.4s + sqrdmulh v27.4s, v24.4s, v4.s[3] + mul v13.4s, v24.4s, v4.s[2] + mls v13.4s, v27.4s, v29.s[0] + sub v24.4s, v14.4s, v15.4s + add v14.4s, v14.4s, v15.4s + sqrdmulh v27.4s, v24.4s, v5.s[1] + mul v15.4s, v24.4s, v5.s[0] + mls v15.4s, v27.4s, v29.s[0] + sub v24.4s, v16.4s, v17.4s + add v16.4s, v16.4s, v17.4s + sqrdmulh v27.4s, v24.4s, v5.s[3] + mul v17.4s, v24.4s, v5.s[2] + mls v17.4s, v27.4s, v29.s[0] + sub v24.4s, v18.4s, v19.4s + add v18.4s, v18.4s, v19.4s + sqrdmulh v27.4s, v24.4s, v6.s[1] + mul v19.4s, v24.4s, v6.s[0] + mls v19.4s, v27.4s, v29.s[0] + sub v24.4s, v20.4s, v21.4s + add v20.4s, v20.4s, v21.4s + sqrdmulh v27.4s, v24.4s, v6.s[3] + mul v21.4s, v24.4s, v6.s[2] + mls v21.4s, v27.4s, v29.s[0] + sub v24.4s, v22.4s, v23.4s + add v22.4s, v22.4s, v23.4s + sqrdmulh v27.4s, v24.4s, v7.s[1] + mul v23.4s, v24.4s, v7.s[0] + mls v23.4s, v27.4s, v29.s[0] + sub v24.4s, v8.4s, v10.4s + add v8.4s, v8.4s, v10.4s + sqrdmulh v27.4s, v24.4s, v1.s[3] + mul v10.4s, v24.4s, v1.s[2] + mls v10.4s, v27.4s, v29.s[0] + sub v24.4s, v9.4s, v11.4s + add v9.4s, v9.4s, v11.4s + sqrdmulh v27.4s, v24.4s, v1.s[3] + mul v11.4s, v24.4s, v1.s[2] + mls v11.4s, v27.4s, v29.s[0] + sub v24.4s, v12.4s, v14.4s + add v12.4s, v12.4s, v14.4s + sqrdmulh v27.4s, v24.4s, v2.s[1] + mul v14.4s, v24.4s, v2.s[0] + mls v14.4s, v27.4s, v29.s[0] + sub v24.4s, v13.4s, v15.4s + add v13.4s, v13.4s, v15.4s + sqrdmulh v27.4s, v24.4s, v2.s[1] + mul v15.4s, v24.4s, 
v2.s[0] + mls v15.4s, v27.4s, v29.s[0] + sub v24.4s, v16.4s, v18.4s + add v16.4s, v16.4s, v18.4s + sqrdmulh v27.4s, v24.4s, v2.s[3] + mul v18.4s, v24.4s, v2.s[2] + mls v18.4s, v27.4s, v29.s[0] + sub v24.4s, v17.4s, v19.4s + add v17.4s, v17.4s, v19.4s + sqrdmulh v27.4s, v24.4s, v2.s[3] + mul v19.4s, v24.4s, v2.s[2] + mls v19.4s, v27.4s, v29.s[0] + sub v24.4s, v20.4s, v22.4s + add v20.4s, v20.4s, v22.4s + sqrdmulh v27.4s, v24.4s, v3.s[1] + mul v22.4s, v24.4s, v3.s[0] + mls v22.4s, v27.4s, v29.s[0] + sub v24.4s, v21.4s, v23.4s + add v21.4s, v21.4s, v23.4s + sqrdmulh v27.4s, v24.4s, v3.s[1] + mul v23.4s, v24.4s, v3.s[0] + mls v23.4s, v27.4s, v29.s[0] + sub v24.4s, v8.4s, v12.4s + add v8.4s, v8.4s, v12.4s + sqrdmulh v27.4s, v24.4s, v0.s[3] + mul v12.4s, v24.4s, v0.s[2] + mls v12.4s, v27.4s, v29.s[0] + sub v24.4s, v9.4s, v13.4s + add v9.4s, v9.4s, v13.4s + sqrdmulh v27.4s, v24.4s, v0.s[3] + mul v13.4s, v24.4s, v0.s[2] + mls v13.4s, v27.4s, v29.s[0] + sub v24.4s, v10.4s, v14.4s + add v10.4s, v10.4s, v14.4s + sqrdmulh v27.4s, v24.4s, v0.s[3] + mul v14.4s, v24.4s, v0.s[2] + mls v14.4s, v27.4s, v29.s[0] + sub v24.4s, v11.4s, v15.4s + add v11.4s, v11.4s, v15.4s + sqrdmulh v27.4s, v24.4s, v0.s[3] + mul v15.4s, v24.4s, v0.s[2] + mls v15.4s, v27.4s, v29.s[0] + sub v24.4s, v16.4s, v20.4s + add v16.4s, v16.4s, v20.4s + sqrdmulh v27.4s, v24.4s, v1.s[1] + mul v20.4s, v24.4s, v1.s[0] + mls v20.4s, v27.4s, v29.s[0] + sub v24.4s, v17.4s, v21.4s + add v17.4s, v17.4s, v21.4s + sqrdmulh v27.4s, v24.4s, v1.s[1] + mul v21.4s, v24.4s, v1.s[0] + mls v21.4s, v27.4s, v29.s[0] + sub v24.4s, v18.4s, v22.4s + add v18.4s, v18.4s, v22.4s + sqrdmulh v27.4s, v24.4s, v1.s[1] + mul v22.4s, v24.4s, v1.s[0] + mls v22.4s, v27.4s, v29.s[0] + sub v24.4s, v19.4s, v23.4s + add v19.4s, v19.4s, v23.4s + sqrdmulh v27.4s, v24.4s, v1.s[1] + mul v23.4s, v24.4s, v1.s[0] + mls v23.4s, v27.4s, v29.s[0] + sub v24.4s, v8.4s, v16.4s + add v8.4s, v8.4s, v16.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v16.4s, v24.4s, v0.s[0] + mls v16.4s, v27.4s, v29.s[0] + sub v24.4s, v9.4s, v17.4s + add v9.4s, v9.4s, v17.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v17.4s, v24.4s, v0.s[0] + mls v17.4s, v27.4s, v29.s[0] + sub v24.4s, v10.4s, v18.4s + add v10.4s, v10.4s, v18.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v18.4s, v24.4s, v0.s[0] + mls v18.4s, v27.4s, v29.s[0] + sub v24.4s, v11.4s, v19.4s + add v11.4s, v11.4s, v19.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v19.4s, v24.4s, v0.s[0] + mls v19.4s, v27.4s, v29.s[0] + sub v24.4s, v12.4s, v20.4s + add v12.4s, v12.4s, v20.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v20.4s, v24.4s, v0.s[0] + mls v20.4s, v27.4s, v29.s[0] + sub v24.4s, v13.4s, v21.4s + add v13.4s, v13.4s, v21.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v21.4s, v24.4s, v0.s[0] + mls v21.4s, v27.4s, v29.s[0] + sub v24.4s, v14.4s, v22.4s + add v14.4s, v14.4s, v22.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v22.4s, v24.4s, v0.s[0] + mls v22.4s, v27.4s, v29.s[0] + sub v24.4s, v15.4s, v23.4s + add v15.4s, v15.4s, v23.4s + sqrdmulh v27.4s, v24.4s, v0.s[1] + mul v23.4s, v24.4s, v0.s[0] + mls v23.4s, v27.4s, v29.s[0] + str q16, [x0, #0x200] + str q17, [x0, #0x240] + str q18, [x0, #0x280] + str q19, [x0, #0x2c0] + str q20, [x0, #0x300] + str q21, [x0, #0x340] + str q22, [x0, #0x380] + str q23, [x0, #0x3c0] + sqrdmulh v27.4s, v8.4s, v26.4s + mul v8.4s, v8.4s, v25.4s + mls v8.4s, v27.4s, v29.s[0] + sqrdmulh v27.4s, v9.4s, v26.4s + mul v9.4s, v9.4s, v25.4s + mls v9.4s, v27.4s, v29.s[0] + sqrdmulh v27.4s, v10.4s, v26.4s + mul v10.4s, v10.4s, v25.4s + mls 
v10.4s, v27.4s, v29.s[0] + sqrdmulh v27.4s, v11.4s, v26.4s + mul v11.4s, v11.4s, v25.4s + mls v11.4s, v27.4s, v29.s[0] + sqrdmulh v27.4s, v12.4s, v26.4s + mul v12.4s, v12.4s, v25.4s + mls v12.4s, v27.4s, v29.s[0] + sqrdmulh v27.4s, v13.4s, v26.4s + mul v13.4s, v13.4s, v25.4s + mls v13.4s, v27.4s, v29.s[0] + sqrdmulh v27.4s, v14.4s, v26.4s + mul v14.4s, v14.4s, v25.4s + mls v14.4s, v27.4s, v29.s[0] + sqrdmulh v27.4s, v15.4s, v26.4s + mul v15.4s, v15.4s, v25.4s + mls v15.4s, v27.4s, v29.s[0] + str q8, [x0], #0x10 + str q9, [x0, #0x30] + str q10, [x0, #0x70] + str q11, [x0, #0xb0] + str q12, [x0, #0xf0] + str q13, [x0, #0x130] + str q14, [x0, #0x170] + str q15, [x0, #0x1b0] + subs x4, x4, #0x1 + cbnz x4, layer1234_start + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 ret #endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S index ab4f299b7..068db9b16 100644 --- a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S @@ -5,108 +5,116 @@ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) -.macro montgomery_reduce_long res, inl, inh - uzp1 \res\().4s, \inl\().4s, \inh\().4s - mul \res\().4s, \res\().4s, modulus_twisted.4s - smlal \inl\().2d, \res\().2s, modulus.2s - smlal2 \inh\().2d, \res\().4s, modulus.4s - uzp2 \res\().4s, \inl\().4s, \inh\().4s -.endm - -.macro load_polys p0, p1, p2, p3, ptr, idx -.if \idx == 0 - ldr \p1, [\ptr, #1*16] - ldr \p2, [\ptr, #2*16] - ldr \p3, [\ptr, #3*16] - ldr \p0, [\ptr], #4*16 -.else - ldr \p0, [\ptr, #(1024*\idx-4*16)] - ldr \p1, [\ptr, #(1024*\idx-3*16)] - ldr \p2, [\ptr, #(1024*\idx-2*16)] - ldr \p3, [\ptr, #(1024*\idx-1*16)] -.endif -.endm - -.macro pmull dl, dh, a, b - smull \dl\().2d, \a\().2s, \b\().2s - smull2 \dh\().2d, \a\().4s, \b\().4s -.endm - -.macro pmlal dl, dh, a, b - smlal \dl\().2d, \a\().2s, \b\().2s - smlal2 \dh\().2d, \a\().4s, \b\().4s -.endm - -out_ptr .req x0 -a_ptr .req x1 -b_ptr .req x2 -count .req x3 -wtmp .req w3 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l4.S using scripts/simpasm. Do not modify it directly. 
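The montgomery_reduce_long macro removed above, and the uzp1/mul/smlal/smlal2/uzp2 sequence that replaces it, reduce a 64-bit accumulator to a 32-bit value congruent to acc * 2^-32 modulo q. A minimal scalar sketch (constant names illustrative); the assembly uses -q^-1 = 4236238847 and adds t*q with smlal instead of subtracting, which gives the same result:

#include <stdint.h>

#define MLDSA_Q 8380417
#define MLD_QINV 58728449 /* q^-1 mod 2^32 */

/* Scalar model of montgomery_reduce_long: the return value is
 * congruent to a * 2^-32 modulo q. */
static int32_t montgomery_reduce64(int64_t a)
{
  int32_t t = (int32_t)((uint32_t)a * (uint32_t)MLD_QINV);
  return (int32_t)((a - (int64_t)t * MLDSA_Q) >> 32);
}

Performing this reduction once per coefficient, after all products have been accumulated with pmull/pmlal, is what lets these pointwise kernels avoid a reduction per polynomial.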
+ */ -modulus .req v0 -modulus_twisted .req v1 .text -.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l4_asm) .balign 4 +.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l4_asm) MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l4_asm) - // load q = 8380417 - movz wtmp, #57345 - movk wtmp, #127, lsl #16 - dup modulus.4s, wtmp - // load -q^-1 = 4236238847 - movz wtmp, #57343 - movk wtmp, #64639, lsl #16 - dup modulus_twisted.4s, wtmp - - mov count, #(MLDSA_N / 4) + mov w3, #0xe001 // =57345 + movk w3, #0x7f, lsl #16 + dup v0.4s, w3 + mov w3, #0xdfff // =57343 + movk w3, #0xfc7f, lsl #16 + dup v1.4s, w3 + mov x3, #0x40 // =64 l4_loop_start: - load_polys q16, q17, q18, q19, a_ptr, 0 - load_polys q20, q21, q22, q23, b_ptr, 0 - - pmull v24, v25, v16, v20 - pmull v26, v27, v17, v21 - pmull v28, v29, v18, v22 - pmull v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 1 - load_polys q20, q21, q22, q23, b_ptr, 1 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 2 - load_polys q20, q21, q22, q23, b_ptr, 2 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 3 - load_polys q20, q21, q22, q23, b_ptr, 3 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - montgomery_reduce_long v16, v24, v25 - montgomery_reduce_long v17, v26, v27 - montgomery_reduce_long v18, v28, v29 - montgomery_reduce_long v19, v30, v31 - - str q17, [out_ptr, #1*16] - str q18, [out_ptr, #2*16] - str q19, [out_ptr, #3*16] - str q16, [out_ptr], #4*16 - - subs count, count, #4 - cbnz count, l4_loop_start + ldr q17, [x1, #0x10] + ldr q18, [x1, #0x20] + ldr q19, [x1, #0x30] + ldr q16, [x1], #0x40 + ldr q21, [x2, #0x10] + ldr q22, [x2, #0x20] + ldr q23, [x2, #0x30] + ldr q20, [x2], #0x40 + smull v24.2d, v16.2s, v20.2s + smull2 v25.2d, v16.4s, v20.4s + smull v26.2d, v17.2s, v21.2s + smull2 v27.2d, v17.4s, v21.4s + smull v28.2d, v18.2s, v22.2s + smull2 v29.2d, v18.4s, v22.4s + smull v30.2d, v19.2s, v23.2s + smull2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x3c0] + ldr q17, [x1, #0x3d0] + ldr q18, [x1, #0x3e0] + ldr q19, [x1, #0x3f0] + ldr q20, [x2, #0x3c0] + ldr q21, [x2, #0x3d0] + ldr q22, [x2, #0x3e0] + ldr q23, [x2, #0x3f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x7c0] + ldr q17, [x1, #0x7d0] + ldr q18, [x1, #0x7e0] + ldr q19, [x1, #0x7f0] + ldr q20, [x2, #0x7c0] + ldr q21, [x2, #0x7d0] + ldr q22, [x2, #0x7e0] + ldr q23, [x2, #0x7f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xbc0] + ldr q17, [x1, #0xbd0] + ldr q18, [x1, #0xbe0] + ldr q19, [x1, #0xbf0] + ldr q20, [x2, #0xbc0] + ldr q21, [x2, #0xbd0] + ldr q22, [x2, #0xbe0] + ldr q23, [x2, #0xbf0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + uzp1 
v16.4s, v24.4s, v25.4s + mul v16.4s, v16.4s, v1.4s + smlal v24.2d, v16.2s, v0.2s + smlal2 v25.2d, v16.4s, v0.4s + uzp2 v16.4s, v24.4s, v25.4s + uzp1 v17.4s, v26.4s, v27.4s + mul v17.4s, v17.4s, v1.4s + smlal v26.2d, v17.2s, v0.2s + smlal2 v27.2d, v17.4s, v0.4s + uzp2 v17.4s, v26.4s, v27.4s + uzp1 v18.4s, v28.4s, v29.4s + mul v18.4s, v18.4s, v1.4s + smlal v28.2d, v18.2s, v0.2s + smlal2 v29.2d, v18.4s, v0.4s + uzp2 v18.4s, v28.4s, v29.4s + uzp1 v19.4s, v30.4s, v31.4s + mul v19.4s, v19.4s, v1.4s + smlal v30.2d, v19.2s, v0.2s + smlal2 v31.2d, v19.4s, v0.4s + uzp2 v19.4s, v30.4s, v31.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x4 + cbnz x3, l4_loop_start + ret - ret #endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S index 5b72b8c0a..3b5604393 100644 --- a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S @@ -5,116 +5,132 @@ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) -.macro montgomery_reduce_long res, inl, inh - uzp1 \res\().4s, \inl\().4s, \inh\().4s - mul \res\().4s, \res\().4s, modulus_twisted.4s - smlal \inl\().2d, \res\().2s, modulus.2s - smlal2 \inh\().2d, \res\().4s, modulus.4s - uzp2 \res\().4s, \inl\().4s, \inh\().4s -.endm - -.macro load_polys p0, p1, p2, p3, ptr, idx -.if \idx == 0 - ldr \p1, [\ptr, #1*16] - ldr \p2, [\ptr, #2*16] - ldr \p3, [\ptr, #3*16] - ldr \p0, [\ptr], #4*16 -.else - ldr \p0, [\ptr, #(1024*\idx-4*16)] - ldr \p1, [\ptr, #(1024*\idx-3*16)] - ldr \p2, [\ptr, #(1024*\idx-2*16)] - ldr \p3, [\ptr, #(1024*\idx-1*16)] -.endif -.endm - -.macro pmull dl, dh, a, b - smull \dl\().2d, \a\().2s, \b\().2s - smull2 \dh\().2d, \a\().4s, \b\().4s -.endm - -.macro pmlal dl, dh, a, b - smlal \dl\().2d, \a\().2s, \b\().2s - smlal2 \dh\().2d, \a\().4s, \b\().4s -.endm - -out_ptr .req x0 -a_ptr .req x1 -b_ptr .req x2 -count .req x3 -wtmp .req w3 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l5.S using scripts/simpasm. Do not modify it directly. 
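The l4/l5/l7 kernels differ only in how many polynomial pairs they accumulate before the single Montgomery reduction. A self-contained scalar sketch of the l5 variant (names illustrative; it repeats the montgomery_reduce64 helper from the previous sketch so it compiles on its own):

#include <stdint.h>

#define MLDSA_Q 8380417
#define MLD_QINV 58728449 /* q^-1 mod 2^32 */

static int32_t montgomery_reduce64(int64_t a)
{
  int32_t t = (int32_t)((uint32_t)a * (uint32_t)MLD_QINV);
  return (int32_t)((a - (int64_t)t * MLDSA_Q) >> 32);
}

/* Accumulate u[l][i]*v[l][i] over the L=5 vector entries in 64 bits,
 * then reduce once -- the structure of the pmull/pmlal chains above. */
static void pointwise_acc_l5(int32_t w[256], const int32_t u[5][256],
                             const int32_t v[5][256])
{
  unsigned i, l;
  for (i = 0; i < 256; i++)
  {
    int64_t acc = 0;
    for (l = 0; l < 5; l++)
    {
      acc += (int64_t)u[l][i] * v[l][i];
    }
    w[i] = montgomery_reduce64(acc);
  }
}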
+ */ -modulus .req v0 -modulus_twisted .req v1 .text -.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l5_asm) .balign 4 +.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l5_asm) MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l5_asm) - // load q = 8380417 - movz wtmp, #57345 - movk wtmp, #127, lsl #16 - dup modulus.4s, wtmp - - // load -q^-1 = 4236238847 - movz wtmp, #57343 - movk wtmp, #64639, lsl #16 - dup modulus_twisted.4s, wtmp - mov count, #(MLDSA_N / 4) + mov w3, #0xe001 // =57345 + movk w3, #0x7f, lsl #16 + dup v0.4s, w3 + mov w3, #0xdfff // =57343 + movk w3, #0xfc7f, lsl #16 + dup v1.4s, w3 + mov x3, #0x40 // =64 l5_loop_start: - load_polys q16, q17, q18, q19, a_ptr, 0 - load_polys q20, q21, q22, q23, b_ptr, 0 - - pmull v24, v25, v16, v20 - pmull v26, v27, v17, v21 - pmull v28, v29, v18, v22 - pmull v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 1 - load_polys q20, q21, q22, q23, b_ptr, 1 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 2 - load_polys q20, q21, q22, q23, b_ptr, 2 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 3 - load_polys q20, q21, q22, q23, b_ptr, 3 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 4 - load_polys q20, q21, q22, q23, b_ptr, 4 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - montgomery_reduce_long v16, v24, v25 - montgomery_reduce_long v17, v26, v27 - montgomery_reduce_long v18, v28, v29 - montgomery_reduce_long v19, v30, v31 - - str q17, [out_ptr, #1*16] - str q18, [out_ptr, #2*16] - str q19, [out_ptr, #3*16] - str q16, [out_ptr], #4*16 - - subs count, count, #4 - cbnz count, l5_loop_start + ldr q17, [x1, #0x10] + ldr q18, [x1, #0x20] + ldr q19, [x1, #0x30] + ldr q16, [x1], #0x40 + ldr q21, [x2, #0x10] + ldr q22, [x2, #0x20] + ldr q23, [x2, #0x30] + ldr q20, [x2], #0x40 + smull v24.2d, v16.2s, v20.2s + smull2 v25.2d, v16.4s, v20.4s + smull v26.2d, v17.2s, v21.2s + smull2 v27.2d, v17.4s, v21.4s + smull v28.2d, v18.2s, v22.2s + smull2 v29.2d, v18.4s, v22.4s + smull v30.2d, v19.2s, v23.2s + smull2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x3c0] + ldr q17, [x1, #0x3d0] + ldr q18, [x1, #0x3e0] + ldr q19, [x1, #0x3f0] + ldr q20, [x2, #0x3c0] + ldr q21, [x2, #0x3d0] + ldr q22, [x2, #0x3e0] + ldr q23, [x2, #0x3f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x7c0] + ldr q17, [x1, #0x7d0] + ldr q18, [x1, #0x7e0] + ldr q19, [x1, #0x7f0] + ldr q20, [x2, #0x7c0] + ldr q21, [x2, #0x7d0] + ldr q22, [x2, #0x7e0] + ldr q23, [x2, #0x7f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xbc0] + ldr q17, [x1, #0xbd0] + ldr q18, [x1, #0xbe0] + ldr q19, [x1, #0xbf0] + ldr q20, [x2, #0xbc0] + ldr q21, [x2, #0xbd0] + ldr q22, [x2, #0xbe0] + ldr q23, [x2, #0xbf0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s 
+ smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xfc0] + ldr q17, [x1, #0xfd0] + ldr q18, [x1, #0xfe0] + ldr q19, [x1, #0xff0] + ldr q20, [x2, #0xfc0] + ldr q21, [x2, #0xfd0] + ldr q22, [x2, #0xfe0] + ldr q23, [x2, #0xff0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + uzp1 v16.4s, v24.4s, v25.4s + mul v16.4s, v16.4s, v1.4s + smlal v24.2d, v16.2s, v0.2s + smlal2 v25.2d, v16.4s, v0.4s + uzp2 v16.4s, v24.4s, v25.4s + uzp1 v17.4s, v26.4s, v27.4s + mul v17.4s, v17.4s, v1.4s + smlal v26.2d, v17.2s, v0.2s + smlal2 v27.2d, v17.4s, v0.4s + uzp2 v17.4s, v26.4s, v27.4s + uzp1 v18.4s, v28.4s, v29.4s + mul v18.4s, v18.4s, v1.4s + smlal v28.2d, v18.2s, v0.2s + smlal2 v29.2d, v18.4s, v0.4s + uzp2 v18.4s, v28.4s, v29.4s + uzp1 v19.4s, v30.4s, v31.4s + mul v19.4s, v19.4s, v1.4s + smlal v30.2d, v19.2s, v0.2s + smlal2 v31.2d, v19.4s, v0.4s + uzp2 v19.4s, v30.4s, v31.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x4 + cbnz x3, l5_loop_start + ret - ret #endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S index bfa16e0e0..0c77248c0 100644 --- a/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +++ b/mldsa/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S @@ -5,132 +5,164 @@ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) -.macro montgomery_reduce_long res, inl, inh - uzp1 \res\().4s, \inl\().4s, \inh\().4s - mul \res\().4s, \res\().4s, modulus_twisted.4s - smlal \inl\().2d, \res\().2s, modulus.2s - smlal2 \inh\().2d, \res\().4s, modulus.4s - uzp2 \res\().4s, \inl\().4s, \inh\().4s -.endm - -.macro load_polys p0, p1, p2, p3, ptr, idx -.if \idx == 0 - ldr \p1, [\ptr, #1*16] - ldr \p2, [\ptr, #2*16] - ldr \p3, [\ptr, #3*16] - ldr \p0, [\ptr], #4*16 -.else - ldr \p0, [\ptr, #(1024*\idx-4*16)] - ldr \p1, [\ptr, #(1024*\idx-3*16)] - ldr \p2, [\ptr, #(1024*\idx-2*16)] - ldr \p3, [\ptr, #(1024*\idx-1*16)] -.endif -.endm - -.macro pmull dl, dh, a, b - smull \dl\().2d, \a\().2s, \b\().2s - smull2 \dh\().2d, \a\().4s, \b\().4s -.endm - -.macro pmlal dl, dh, a, b - smlal \dl\().2d, \a\().2s, \b\().2s - smlal2 \dh\().2d, \a\().4s, \b\().4s -.endm - -out_ptr .req x0 -a_ptr .req x1 -b_ptr .req x2 -count .req x3 -wtmp .req w3 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l7.S using scripts/simpasm. Do not modify it directly. 
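The l7 kernel accumulates seven products per 64-bit lane before reducing. Assuming NTT-domain coefficients stay below 9*q in absolute value (an assumption made for this sketch only; the repository establishes its own bounds), the accumulator has ample headroom, as a compile-time check can illustrate:

#include <assert.h>

/* Headroom check under the assumed 9*q coefficient bound:
 * 7 * (9*q)^2 is far below 2^62, so the smlal/smlal2 chains
 * cannot overflow a 64-bit accumulator. */
static_assert(7ULL * (9ULL * 8380417) * (9ULL * 8380417) < (1ULL << 62),
              "64-bit accumulator headroom (assumed 9*q bound)");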
+ */ -modulus .req v0 -modulus_twisted .req v1 .text -.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l7_asm) .balign 4 +.global MLD_ASM_NAMESPACE(polyvecl_pointwise_acc_montgomery_l7_asm) MLD_ASM_FN_SYMBOL(polyvecl_pointwise_acc_montgomery_l7_asm) - // load q = 8380417 - movz wtmp, #57345 - movk wtmp, #127, lsl #16 - dup modulus.4s, wtmp - - // load -q^-1 = 4236238847 - movz wtmp, #57343 - movk wtmp, #64639, lsl #16 - dup modulus_twisted.4s, wtmp - mov count, #(MLDSA_N / 4) + mov w3, #0xe001 // =57345 + movk w3, #0x7f, lsl #16 + dup v0.4s, w3 + mov w3, #0xdfff // =57343 + movk w3, #0xfc7f, lsl #16 + dup v1.4s, w3 + mov x3, #0x40 // =64 l7_loop_start: - load_polys q16, q17, q18, q19, a_ptr, 0 - load_polys q20, q21, q22, q23, b_ptr, 0 - - pmull v24, v25, v16, v20 - pmull v26, v27, v17, v21 - pmull v28, v29, v18, v22 - pmull v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 1 - load_polys q20, q21, q22, q23, b_ptr, 1 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 2 - load_polys q20, q21, q22, q23, b_ptr, 2 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 3 - load_polys q20, q21, q22, q23, b_ptr, 3 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 4 - load_polys q20, q21, q22, q23, b_ptr, 4 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 5 - load_polys q20, q21, q22, q23, b_ptr, 5 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - load_polys q16, q17, q18, q19, a_ptr, 6 - load_polys q20, q21, q22, q23, b_ptr, 6 - - pmlal v24, v25, v16, v20 - pmlal v26, v27, v17, v21 - pmlal v28, v29, v18, v22 - pmlal v30, v31, v19, v23 - - montgomery_reduce_long v16, v24, v25 - montgomery_reduce_long v17, v26, v27 - montgomery_reduce_long v18, v28, v29 - montgomery_reduce_long v19, v30, v31 - - str q17, [out_ptr, #1*16] - str q18, [out_ptr, #2*16] - str q19, [out_ptr, #3*16] - str q16, [out_ptr], #4*16 - - subs count, count, #4 - cbnz count, l7_loop_start + ldr q17, [x1, #0x10] + ldr q18, [x1, #0x20] + ldr q19, [x1, #0x30] + ldr q16, [x1], #0x40 + ldr q21, [x2, #0x10] + ldr q22, [x2, #0x20] + ldr q23, [x2, #0x30] + ldr q20, [x2], #0x40 + smull v24.2d, v16.2s, v20.2s + smull2 v25.2d, v16.4s, v20.4s + smull v26.2d, v17.2s, v21.2s + smull2 v27.2d, v17.4s, v21.4s + smull v28.2d, v18.2s, v22.2s + smull2 v29.2d, v18.4s, v22.4s + smull v30.2d, v19.2s, v23.2s + smull2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x3c0] + ldr q17, [x1, #0x3d0] + ldr q18, [x1, #0x3e0] + ldr q19, [x1, #0x3f0] + ldr q20, [x2, #0x3c0] + ldr q21, [x2, #0x3d0] + ldr q22, [x2, #0x3e0] + ldr q23, [x2, #0x3f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x7c0] + ldr q17, [x1, #0x7d0] + ldr q18, [x1, #0x7e0] + ldr q19, [x1, #0x7f0] + ldr q20, [x2, #0x7c0] + ldr q21, [x2, #0x7d0] + ldr q22, [x2, #0x7e0] + ldr q23, [x2, #0x7f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, 
v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xbc0] + ldr q17, [x1, #0xbd0] + ldr q18, [x1, #0xbe0] + ldr q19, [x1, #0xbf0] + ldr q20, [x2, #0xbc0] + ldr q21, [x2, #0xbd0] + ldr q22, [x2, #0xbe0] + ldr q23, [x2, #0xbf0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0xfc0] + ldr q17, [x1, #0xfd0] + ldr q18, [x1, #0xfe0] + ldr q19, [x1, #0xff0] + ldr q20, [x2, #0xfc0] + ldr q21, [x2, #0xfd0] + ldr q22, [x2, #0xfe0] + ldr q23, [x2, #0xff0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x13c0] + ldr q17, [x1, #0x13d0] + ldr q18, [x1, #0x13e0] + ldr q19, [x1, #0x13f0] + ldr q20, [x2, #0x13c0] + ldr q21, [x2, #0x13d0] + ldr q22, [x2, #0x13e0] + ldr q23, [x2, #0x13f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + ldr q16, [x1, #0x17c0] + ldr q17, [x1, #0x17d0] + ldr q18, [x1, #0x17e0] + ldr q19, [x1, #0x17f0] + ldr q20, [x2, #0x17c0] + ldr q21, [x2, #0x17d0] + ldr q22, [x2, #0x17e0] + ldr q23, [x2, #0x17f0] + smlal v24.2d, v16.2s, v20.2s + smlal2 v25.2d, v16.4s, v20.4s + smlal v26.2d, v17.2s, v21.2s + smlal2 v27.2d, v17.4s, v21.4s + smlal v28.2d, v18.2s, v22.2s + smlal2 v29.2d, v18.4s, v22.4s + smlal v30.2d, v19.2s, v23.2s + smlal2 v31.2d, v19.4s, v23.4s + uzp1 v16.4s, v24.4s, v25.4s + mul v16.4s, v16.4s, v1.4s + smlal v24.2d, v16.2s, v0.2s + smlal2 v25.2d, v16.4s, v0.4s + uzp2 v16.4s, v24.4s, v25.4s + uzp1 v17.4s, v26.4s, v27.4s + mul v17.4s, v17.4s, v1.4s + smlal v26.2d, v17.2s, v0.2s + smlal2 v27.2d, v17.4s, v0.4s + uzp2 v17.4s, v26.4s, v27.4s + uzp1 v18.4s, v28.4s, v29.4s + mul v18.4s, v18.4s, v1.4s + smlal v28.2d, v18.2s, v0.2s + smlal2 v29.2d, v18.4s, v0.4s + uzp2 v18.4s, v28.4s, v29.4s + uzp1 v19.4s, v30.4s, v31.4s + mul v19.4s, v19.4s, v1.4s + smlal v30.2d, v19.2s, v0.2s + smlal2 v31.2d, v19.4s, v0.4s + uzp2 v19.4s, v30.4s, v31.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x4 + cbnz x3, l7_loop_start + ret - ret #endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/ntt.S b/mldsa/native/aarch64/src/ntt.S index c9f091fbc..6c10617d9 100644 --- a/mldsa/native/aarch64/src/ntt.S +++ b/mldsa/native/aarch64/src/ntt.S @@ -26,280 +26,269 @@ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) -.macro mulmodq dst, src, const, idx0, idx1 - sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()] - mul \dst\().4s, \src\().4s, \const\().s[\idx0\()] - mls \dst\().4s, t2.4s, consts.s[0] -.endm - -.macro mulmod dst, src, const, const_twisted - sqrdmulh t2.4s, \src\().4s, \const_twisted\().4s - mul \dst\().4s, \src\().4s, \const\().4s - mls \dst\().4s, t2.4s, consts.s[0] -.endm - -.macro ct_butterfly a, b, root, idx0, idx1 - mulmodq tmp, \b, \root, \idx0, \idx1 - sub \b\().4s, \a\().4s, tmp.4s - add \a\().4s, \a\().4s, tmp.4s -.endm - -.macro ct_butterfly_v 
a, b, root, root_twisted - mulmod tmp, \b, \root, \root_twisted - sub \b\().4s, \a\().4s, tmp.4s - add \a\().4s, \a\().4s, tmp.4s -.endm - -.macro load_roots_123 - ldr q_root0, [r012345_ptr], #64 - ldr q_root1, [r012345_ptr, #(-64 + 16)] - ldr q_root2, [r012345_ptr, #(-64 + 32)] - ldr q_root3, [r012345_ptr, #(-64 + 48)] -.endm - -.macro load_roots_456 - ldr q_root0, [r012345_ptr], #64 - ldr q_root1, [r012345_ptr, #(-64 + 16)] - ldr q_root2, [r012345_ptr, #(-64 + 32)] - ldr q_root3, [r012345_ptr, #(-64 + 48)] -.endm - -.macro load_roots_78_part1 - ldr q_root0, [r67_ptr], #(12*16) - ldr q_root0_tw, [r67_ptr, #(-12*16 + 1*16)] - ldr q_root1, [r67_ptr, #(-12*16 + 2*16)] - ldr q_root1_tw, [r67_ptr, #(-12*16 + 3*16)] - ldr q_root2, [r67_ptr, #(-12*16 + 4*16)] - ldr q_root2_tw, [r67_ptr, #(-12*16 + 5*16)] -.endm - -.macro load_roots_78_part2 - ldr q_root0, [r67_ptr, (-12*16 + 6*16)] - ldr q_root0_tw, [r67_ptr, (-12*16 + 7*16)] - ldr q_root1, [r67_ptr, (-12*16 + 8*16)] - ldr q_root1_tw, [r67_ptr, (-12*16 + 9*16)] - ldr q_root2, [r67_ptr, (-12*16 + 10*16)] - ldr q_root2_tw, [r67_ptr, (-12*16 + 11*16)] -.endm - -.macro transpose4 data0, data1, data2, data3 - trn1 t0.4s, \data0\().4s, \data1\().4s - trn2 t1.4s, \data0\().4s, \data1\().4s - trn1 t2.4s, \data2\().4s, \data3\().4s - trn2 t3.4s, \data2\().4s, \data3\().4s - - trn2 \data2\().2d, t0.2d, t2.2d - trn2 \data3\().2d, t1.2d, t3.2d - trn1 \data0\().2d, t0.2d, t2.2d - trn1 \data1\().2d, t1.2d, t3.2d -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - - // Inputs - in .req x0 // Input/output buffer - r012345_ptr .req x1 // twiddles for layer 0,1,2,3,4,5 - r67_ptr .req x2 // twiddles for layer 6,7 - - count .req x3 - inp .req x4 - inpp .req x5 - xtmp .req x6 - wtmp .req w6 - - data0 .req v9 - data1 .req v10 - data2 .req v11 - data3 .req v12 - data4 .req v13 - data5 .req v14 - data6 .req v15 - data7 .req v16 - - q_data0 .req q9 - q_data1 .req q10 - q_data2 .req q11 - q_data3 .req q12 - q_data4 .req q13 - q_data5 .req q14 - q_data6 .req q15 - q_data7 .req q16 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root3 .req v3 - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root3 .req q3 - - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - root3_tw .req v7 - - q_root0_tw .req q4 - q_root1_tw .req q5 - q_root2_tw .req q6 - q_root3_tw .req q7 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/ntt.S using scripts/simpasm. Do not modify it directly. 
+ */ - tmp .req v24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - consts .req v8 - q_consts .req q8 .text -.global MLD_ASM_NAMESPACE(ntt_asm) .balign 4 +.global MLD_ASM_NAMESPACE(ntt_asm) MLD_ASM_FN_SYMBOL(ntt_asm) - push_stack - - // load q = 8380417 - movz wtmp, #57345 - movk wtmp, #127, lsl #16 - dup consts.4s, wtmp - - mov inp, in - mov count, #8 - load_roots_123 + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w6, #0xe001 // =57345 + movk w6, #0x7f, lsl #16 + dup v8.4s, w6 + mov x4, x0 + mov x3, #0x8 // =8 + ldr q0, [x1], #0x40 + ldur q1, [x1, #-0x30] + ldur q2, [x1, #-0x20] + ldur q3, [x1, #-0x10] - .p2align 2 layer123_start: - ldr q_data0, [in, #(0*(1024/8))] - ldr q_data1, [in, #(1*(1024/8))] - ldr q_data2, [in, #(2*(1024/8))] - ldr q_data3, [in, #(3*(1024/8))] - ldr q_data4, [in, #(4*(1024/8))] - ldr q_data5, [in, #(5*(1024/8))] - ldr q_data6, [in, #(6*(1024/8))] - ldr q_data7, [in, #(7*(1024/8))] - - ct_butterfly data0, data4, root0, 0, 1 - ct_butterfly data1, data5, root0, 0, 1 - ct_butterfly data2, data6, root0, 0, 1 - ct_butterfly data3, data7, root0, 0, 1 - - ct_butterfly data0, data2, root0, 2, 3 - ct_butterfly data1, data3, root0, 2, 3 - ct_butterfly data4, data6, root1, 0, 1 - ct_butterfly data5, data7, root1, 0, 1 - - ct_butterfly data0, data1, root1, 2, 3 - ct_butterfly data2, data3, root2, 0, 1 - ct_butterfly data4, data5, root2, 2, 3 - ct_butterfly data6, data7, root3, 0, 1 + ldr q9, [x0] + ldr q10, [x0, #0x80] + ldr q11, [x0, #0x100] + ldr q12, [x0, #0x180] + ldr q13, [x0, #0x200] + ldr q14, [x0, #0x280] + ldr q15, [x0, #0x300] + ldr q16, [x0, #0x380] + sqrdmulh v27.4s, v13.4s, v0.s[1] + mul v24.4s, v13.4s, v0.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v13.4s, v9.4s, v24.4s + add v9.4s, v9.4s, v24.4s + sqrdmulh v27.4s, v14.4s, v0.s[1] + mul v24.4s, v14.4s, v0.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v14.4s, v10.4s, v24.4s + add v10.4s, v10.4s, v24.4s + sqrdmulh v27.4s, v15.4s, v0.s[1] + mul v24.4s, v15.4s, v0.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v15.4s, v11.4s, v24.4s + add v11.4s, v11.4s, v24.4s + sqrdmulh v27.4s, v16.4s, v0.s[1] + mul v24.4s, v16.4s, v0.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v16.4s, v12.4s, v24.4s + add v12.4s, v12.4s, v24.4s + sqrdmulh v27.4s, v11.4s, v0.s[3] + mul v24.4s, v11.4s, v0.s[2] + mls v24.4s, v27.4s, v8.s[0] + sub v11.4s, v9.4s, v24.4s + add v9.4s, v9.4s, v24.4s + sqrdmulh v27.4s, v12.4s, v0.s[3] + mul v24.4s, v12.4s, v0.s[2] + mls v24.4s, v27.4s, v8.s[0] + sub v12.4s, v10.4s, v24.4s + add v10.4s, v10.4s, v24.4s + sqrdmulh v27.4s, v15.4s, v1.s[1] + mul v24.4s, v15.4s, v1.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v15.4s, v13.4s, v24.4s + add v13.4s, v13.4s, v24.4s + sqrdmulh v27.4s, v16.4s, v1.s[1] + mul v24.4s, v16.4s, v1.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v16.4s, v14.4s, v24.4s + add v14.4s, v14.4s, v24.4s + sqrdmulh v27.4s, v10.4s, v1.s[3] + mul v24.4s, v10.4s, v1.s[2] + mls v24.4s, v27.4s, v8.s[0] + sub v10.4s, v9.4s, v24.4s + add v9.4s, v9.4s, v24.4s + sqrdmulh v27.4s, v12.4s, v2.s[1] + mul v24.4s, v12.4s, v2.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v12.4s, v11.4s, v24.4s + add v11.4s, v11.4s, v24.4s + sqrdmulh v27.4s, v14.4s, v2.s[3] + mul v24.4s, v14.4s, v2.s[2] + mls v24.4s, v27.4s, v8.s[0] + sub v14.4s, v13.4s, v24.4s + add v13.4s, v13.4s, v24.4s + sqrdmulh v27.4s, v16.4s, v3.s[1] + mul v24.4s, v16.4s, v3.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v16.4s, v15.4s, v24.4s + add v15.4s, v15.4s, v24.4s + str q9, [x0], #0x10 + str q10, 
[x0, #0x70] + str q11, [x0, #0xf0] + str q12, [x0, #0x170] + str q13, [x0, #0x1f0] + str q14, [x0, #0x270] + str q15, [x0, #0x2f0] + str q16, [x0, #0x370] + subs x3, x3, #0x1 + cbnz x3, layer123_start + mov x0, x4 + add x5, x0, #0x40 + mov x3, #0x8 // =8 + sub x0, x0, #0x40 + sub x5, x5, #0x40 - str q_data0, [in], #16 - str q_data1, [in, #(-16 + 1*(1024/8))] - str q_data2, [in, #(-16 + 2*(1024/8))] - str q_data3, [in, #(-16 + 3*(1024/8))] - str q_data4, [in, #(-16 + 4*(1024/8))] - str q_data5, [in, #(-16 + 5*(1024/8))] - str q_data6, [in, #(-16 + 6*(1024/8))] - str q_data7, [in, #(-16 + 7*(1024/8))] - - subs count, count, #1 - cbnz count, layer123_start - - mov in, inp - add inpp, in, #64 - mov count, #8 - - // Use two data pointers and carefully arrange - // increments to facilitate reordering of loads - // and stores by SLOTHY. - // - // TODO: Think of alternatives here -- the start with `in` - // pointing to 64 byte below the actual data, which in theory - // could underflow. It's unclear how the CPU would behave in this case. - sub in, in, #64 - sub inpp, inpp, #64 - - .p2align 2 layer45678_start: - ldr q_data0, [in, #(64 + 16*0)] - ldr q_data1, [in, #(64 + 16*1)] - ldr q_data2, [in, #(64 + 16*2)] - ldr q_data3, [in, #(64 + 16*3)] - ldr q_data4, [inpp, #(64 + 16*0)] - ldr q_data5, [inpp, #(64 + 16*1)] - ldr q_data6, [inpp, #(64 + 16*2)] - ldr q_data7, [inpp, #(64 + 16*3)] - - add in, in, #64 - add inpp, inpp, #64 - - load_roots_456 - - ct_butterfly data0, data4, root0, 0, 1 - ct_butterfly data1, data5, root0, 0, 1 - ct_butterfly data2, data6, root0, 0, 1 - ct_butterfly data3, data7, root0, 0, 1 - - ct_butterfly data0, data2, root0, 2, 3 - ct_butterfly data1, data3, root0, 2, 3 - ct_butterfly data4, data6, root1, 0, 1 - ct_butterfly data5, data7, root1, 0, 1 - - ct_butterfly data0, data1, root1, 2, 3 - ct_butterfly data2, data3, root2, 0, 1 - ct_butterfly data4, data5, root2, 2, 3 - ct_butterfly data6, data7, root3, 0, 1 - - // Transpose using trn - transpose4 data0, data1, data2, data3 - transpose4 data4, data5, data6, data7 - - load_roots_78_part1 - - ct_butterfly_v data0, data2, root0, root0_tw - ct_butterfly_v data1, data3, root0, root0_tw - ct_butterfly_v data0, data1, root1, root1_tw - ct_butterfly_v data2, data3, root2, root2_tw - - load_roots_78_part2 - - ct_butterfly_v data4, data6, root0, root0_tw - ct_butterfly_v data5, data7, root0, root0_tw - ct_butterfly_v data4, data5, root1, root1_tw - ct_butterfly_v data6, data7, root2, root2_tw - - // Transpose as part of st4 - st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 - st4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp], #64 - - subs count, count, #1 - cbnz count, layer45678_start - - pop_stack - ret + ldr q9, [x0, #0x40] + ldr q10, [x0, #0x50] + ldr q11, [x0, #0x60] + ldr q12, [x0, #0x70] + ldr q13, [x5, #0x40] + ldr q14, [x5, #0x50] + ldr q15, [x5, #0x60] + ldr q16, [x5, #0x70] + add x0, x0, #0x40 + add x5, x5, #0x40 + ldr q0, [x1], #0x40 + ldur q1, [x1, #-0x30] + ldur q2, [x1, #-0x20] + ldur q3, [x1, #-0x10] + sqrdmulh v27.4s, v13.4s, v0.s[1] + mul v24.4s, v13.4s, v0.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v13.4s, v9.4s, v24.4s + add v9.4s, v9.4s, v24.4s + sqrdmulh v27.4s, v14.4s, v0.s[1] + mul v24.4s, v14.4s, v0.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v14.4s, v10.4s, v24.4s + add v10.4s, v10.4s, v24.4s + sqrdmulh v27.4s, v15.4s, v0.s[1] + mul v24.4s, v15.4s, v0.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v15.4s, v11.4s, v24.4s + add v11.4s, v11.4s, v24.4s + sqrdmulh v27.4s, v16.4s, v0.s[1] + mul v24.4s, v16.4s, 
v0.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v16.4s, v12.4s, v24.4s + add v12.4s, v12.4s, v24.4s + sqrdmulh v27.4s, v11.4s, v0.s[3] + mul v24.4s, v11.4s, v0.s[2] + mls v24.4s, v27.4s, v8.s[0] + sub v11.4s, v9.4s, v24.4s + add v9.4s, v9.4s, v24.4s + sqrdmulh v27.4s, v12.4s, v0.s[3] + mul v24.4s, v12.4s, v0.s[2] + mls v24.4s, v27.4s, v8.s[0] + sub v12.4s, v10.4s, v24.4s + add v10.4s, v10.4s, v24.4s + sqrdmulh v27.4s, v15.4s, v1.s[1] + mul v24.4s, v15.4s, v1.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v15.4s, v13.4s, v24.4s + add v13.4s, v13.4s, v24.4s + sqrdmulh v27.4s, v16.4s, v1.s[1] + mul v24.4s, v16.4s, v1.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v16.4s, v14.4s, v24.4s + add v14.4s, v14.4s, v24.4s + sqrdmulh v27.4s, v10.4s, v1.s[3] + mul v24.4s, v10.4s, v1.s[2] + mls v24.4s, v27.4s, v8.s[0] + sub v10.4s, v9.4s, v24.4s + add v9.4s, v9.4s, v24.4s + sqrdmulh v27.4s, v12.4s, v2.s[1] + mul v24.4s, v12.4s, v2.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v12.4s, v11.4s, v24.4s + add v11.4s, v11.4s, v24.4s + sqrdmulh v27.4s, v14.4s, v2.s[3] + mul v24.4s, v14.4s, v2.s[2] + mls v24.4s, v27.4s, v8.s[0] + sub v14.4s, v13.4s, v24.4s + add v13.4s, v13.4s, v24.4s + sqrdmulh v27.4s, v16.4s, v3.s[1] + mul v24.4s, v16.4s, v3.s[0] + mls v24.4s, v27.4s, v8.s[0] + sub v16.4s, v15.4s, v24.4s + add v15.4s, v15.4s, v24.4s + trn1 v25.4s, v9.4s, v10.4s + trn2 v26.4s, v9.4s, v10.4s + trn1 v27.4s, v11.4s, v12.4s + trn2 v28.4s, v11.4s, v12.4s + trn2 v11.2d, v25.2d, v27.2d + trn2 v12.2d, v26.2d, v28.2d + trn1 v9.2d, v25.2d, v27.2d + trn1 v10.2d, v26.2d, v28.2d + trn1 v25.4s, v13.4s, v14.4s + trn2 v26.4s, v13.4s, v14.4s + trn1 v27.4s, v15.4s, v16.4s + trn2 v28.4s, v15.4s, v16.4s + trn2 v15.2d, v25.2d, v27.2d + trn2 v16.2d, v26.2d, v28.2d + trn1 v13.2d, v25.2d, v27.2d + trn1 v14.2d, v26.2d, v28.2d + ldr q0, [x2], #0xc0 + ldur q4, [x2, #-0xb0] + ldur q1, [x2, #-0xa0] + ldur q5, [x2, #-0x90] + ldur q2, [x2, #-0x80] + ldur q6, [x2, #-0x70] + sqrdmulh v27.4s, v11.4s, v4.4s + mul v24.4s, v11.4s, v0.4s + mls v24.4s, v27.4s, v8.s[0] + sub v11.4s, v9.4s, v24.4s + add v9.4s, v9.4s, v24.4s + sqrdmulh v27.4s, v12.4s, v4.4s + mul v24.4s, v12.4s, v0.4s + mls v24.4s, v27.4s, v8.s[0] + sub v12.4s, v10.4s, v24.4s + add v10.4s, v10.4s, v24.4s + sqrdmulh v27.4s, v10.4s, v5.4s + mul v24.4s, v10.4s, v1.4s + mls v24.4s, v27.4s, v8.s[0] + sub v10.4s, v9.4s, v24.4s + add v9.4s, v9.4s, v24.4s + sqrdmulh v27.4s, v12.4s, v6.4s + mul v24.4s, v12.4s, v2.4s + mls v24.4s, v27.4s, v8.s[0] + sub v12.4s, v11.4s, v24.4s + add v11.4s, v11.4s, v24.4s + ldur q0, [x2, #-0x60] + ldur q4, [x2, #-0x50] + ldur q1, [x2, #-0x40] + ldur q5, [x2, #-0x30] + ldur q2, [x2, #-0x20] + ldur q6, [x2, #-0x10] + sqrdmulh v27.4s, v15.4s, v4.4s + mul v24.4s, v15.4s, v0.4s + mls v24.4s, v27.4s, v8.s[0] + sub v15.4s, v13.4s, v24.4s + add v13.4s, v13.4s, v24.4s + sqrdmulh v27.4s, v16.4s, v4.4s + mul v24.4s, v16.4s, v0.4s + mls v24.4s, v27.4s, v8.s[0] + sub v16.4s, v14.4s, v24.4s + add v14.4s, v14.4s, v24.4s + sqrdmulh v27.4s, v14.4s, v5.4s + mul v24.4s, v14.4s, v1.4s + mls v24.4s, v27.4s, v8.s[0] + sub v14.4s, v13.4s, v24.4s + add v13.4s, v13.4s, v24.4s + sqrdmulh v27.4s, v16.4s, v6.4s + mul v24.4s, v16.4s, v2.4s + mls v24.4s, v27.4s, v8.s[0] + sub v16.4s, v15.4s, v24.4s + add v15.4s, v15.4s, v24.4s + st4 { v9.4s, v10.4s, v11.4s, v12.4s }, [x0], #64 + st4 { v13.4s, v14.4s, v15.4s, v16.4s }, [x5], #64 + subs x3, x3, #0x1 + cbnz x3, layer45678_start + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret #endif /* 
MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/pointwise_montgomery.S b/mldsa/native/aarch64/src/pointwise_montgomery.S index 140fa2fb6..9a7d47288 100644 --- a/mldsa/native/aarch64/src/pointwise_montgomery.S +++ b/mldsa/native/aarch64/src/pointwise_montgomery.S @@ -5,72 +5,68 @@ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) -.macro montgomery_reduce_long res, inl, inh - uzp1 \res\().4s, \inl\().4s, \inh\().4s - mul \res\().4s, \res\().4s, modulus_twisted.4s - smlal \inl\().2d, \res\().2s, modulus.2s - smlal2 \inh\().2d, \res\().4s, modulus.4s - uzp2 \res\().4s, \inl\().4s, \inh\().4s -.endm - -.macro pmull dl, dh, a, b - smull \dl\().2d, \a\().2s, \b\().2s - smull2 \dh\().2d, \a\().4s, \b\().4s -.endm - -out_ptr .req x0 -a_ptr .req x1 -b_ptr .req x2 -count .req x3 -wtmp .req w3 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/pointwise_montgomery.S using scripts/simpasm. Do not modify it directly. + */ -modulus .req v0 -modulus_twisted .req v1 .text -.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm) .balign 4 +.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm) MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm) - // load q = 8380417 - movz wtmp, #57345 - movk wtmp, #127, lsl #16 - dup modulus.4s, wtmp - - // load -q^-1 = 4236238847 - movz wtmp, #57343 - movk wtmp, #64639, lsl #16 - dup modulus_twisted.4s, wtmp - mov count, #(MLDSA_N / 4) + mov w3, #0xe001 // =57345 + movk w3, #0x7f, lsl #16 + dup v0.4s, w3 + mov w3, #0xdfff // =57343 + movk w3, #0xfc7f, lsl #16 + dup v1.4s, w3 + mov x3, #0x40 // =64 loop_start: - ldr q17, [a_ptr, #1*16] - ldr q18, [a_ptr, #2*16] - ldr q19, [a_ptr, #3*16] - ldr q16, [a_ptr], #4*16 - - ldr q21, [b_ptr, #1*16] - ldr q22, [b_ptr, #2*16] - ldr q23, [b_ptr, #3*16] - ldr q20, [b_ptr], #4*16 - - pmull v24, v25, v16, v20 - pmull v26, v27, v17, v21 - pmull v28, v29, v18, v22 - pmull v30, v31, v19, v23 - - montgomery_reduce_long v16, v24, v25 - montgomery_reduce_long v17, v26, v27 - montgomery_reduce_long v18, v28, v29 - montgomery_reduce_long v19, v30, v31 - - str q17, [out_ptr, #1*16] - str q18, [out_ptr, #2*16] - str q19, [out_ptr, #3*16] - str q16, [out_ptr], #4*16 - - subs count, count, #4 - cbnz count, loop_start + ldr q17, [x1, #0x10] + ldr q18, [x1, #0x20] + ldr q19, [x1, #0x30] + ldr q16, [x1], #0x40 + ldr q21, [x2, #0x10] + ldr q22, [x2, #0x20] + ldr q23, [x2, #0x30] + ldr q20, [x2], #0x40 + smull v24.2d, v16.2s, v20.2s + smull2 v25.2d, v16.4s, v20.4s + smull v26.2d, v17.2s, v21.2s + smull2 v27.2d, v17.4s, v21.4s + smull v28.2d, v18.2s, v22.2s + smull2 v29.2d, v18.4s, v22.4s + smull v30.2d, v19.2s, v23.2s + smull2 v31.2d, v19.4s, v23.4s + uzp1 v16.4s, v24.4s, v25.4s + mul v16.4s, v16.4s, v1.4s + smlal v24.2d, v16.2s, v0.2s + smlal2 v25.2d, v16.4s, v0.4s + uzp2 v16.4s, v24.4s, v25.4s + uzp1 v17.4s, v26.4s, v27.4s + mul v17.4s, v17.4s, v1.4s + smlal v26.2d, v17.2s, v0.2s + smlal2 v27.2d, v17.4s, v0.4s + uzp2 v17.4s, v26.4s, v27.4s + uzp1 v18.4s, v28.4s, v29.4s + mul v18.4s, v18.4s, v1.4s + smlal v28.2d, v18.2s, v0.2s + smlal2 v29.2d, v18.4s, v0.4s + uzp2 v18.4s, v28.4s, v29.4s + uzp1 v19.4s, v30.4s, v31.4s + mul v19.4s, v19.4s, v1.4s + smlal v30.2d, v19.2s, v0.2s + smlal2 v31.2d, v19.4s, v0.4s + uzp2 v19.4s, v30.4s, v31.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x4 + cbnz x3, loop_start + ret - ret #endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git 
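For readers auditing the simplified listings: the ntt.S and pointwise_montgomery.S kernels above both come down to per-lane arithmetic modulo q = 8380417. A scalar C sketch of that arithmetic follows (illustrative names, not part of this patch; the assembly loads the negated inverse 4236238847 so the correction term can be accumulated with smlal, and the NTT realizes the twiddle multiplication with the sqrdmulh/mul/mls triple rather than an explicit Montgomery reduction).

#include <stdint.h>

#define MLDSA_Q 8380417
#define MLDSA_QINV 58728449 /* q^-1 mod 2^32; the assembly uses -q^-1 = 4236238847 */

/* Montgomery reduction: returns a * 2^-32 mod q; the result stays below q in
 * absolute value for the input ranges used here. */
int32_t montgomery_reduce_model(int64_t a)
{
  int32_t t = (int32_t)((uint32_t)a * MLDSA_QINV); /* low 32 bits of a * q^-1 */
  return (int32_t)((a - (int64_t)t * MLDSA_Q) >> 32);
}

/* Scalar model of poly_pointwise_montgomery_asm:
 * c[i] = a[i] * b[i] * 2^-32 mod q for all 256 coefficients. */
void pointwise_montgomery_model(int32_t c[256], const int32_t a[256],
                                const int32_t b[256])
{
  unsigned i;
  for (i = 0; i < 256; i++)
  {
    c[i] = montgomery_reduce_model((int64_t)a[i] * b[i]);
  }
}

/* Scalar model of one Cooley-Tukey butterfly from ntt_asm, with the twiddle
 * assumed to be stored in Montgomery form so the multiplication can be
 * expressed through montgomery_reduce_model. */
void ct_butterfly_model(int32_t *a, int32_t *b, int32_t zeta)
{
  int32_t t = montgomery_reduce_model((int64_t)(*b) * zeta);
  *b = *a - t;
  *a = *a + t;
}

The polyvecl_pointwise_acc_montgomery_l7 kernel earlier in the patch follows the same pattern but accumulates seven 64-bit products per lane (smull/smlal at 1024-byte strides, one per polynomial of the length-7 vector) before performing a single reduction.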
a/mldsa/native/aarch64/src/poly_caddq_asm.S b/mldsa/native/aarch64/src/poly_caddq_asm.S index 44d6d0fb5..d158872bc 100644 --- a/mldsa/native/aarch64/src/poly_caddq_asm.S +++ b/mldsa/native/aarch64/src/poly_caddq_asm.S @@ -6,54 +6,41 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -.macro caddq inout - ushr tmp.4s, \inout\().4s, #31 - mla \inout\().4s, tmp.4s, q_reg.4s -.endm +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/poly_caddq_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 .global MLD_ASM_NAMESPACE(poly_caddq_asm) -.balign 16 MLD_ASM_FN_SYMBOL(poly_caddq_asm) - // Function signature: void mld_poly_caddq_asm(int32_t *a) - // x0: pointer to polynomial coefficients - // Register assignments - a_ptr .req x0 - count .req x1 - q_reg .req v4 - tmp .req v5 + mov w9, #0xe001 // =57345 + movk w9, #0x7f, lsl #16 + dup v4.4s, w9 + mov x1, #0x10 // =16 - // Load constants - // MLDSA_Q = 8380417 = 0x7FE001 - movz w9, #0xE001 - movk w9, #0x7F, lsl #16 - dup q_reg.4s, w9 // Load Q values - - mov count, #64/4 poly_caddq_loop: - ldr q0, [a_ptr, #0*16] - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - - caddq v0 - caddq v1 - caddq v2 - caddq v3 - - str q1, [a_ptr, #1*16] - str q2, [a_ptr, #2*16] - str q3, [a_ptr, #3*16] - str q0, [a_ptr], #4*16 - - subs count, count, #1 - bne poly_caddq_loop - + ldr q0, [x0] + ldr q1, [x0, #0x10] + ldr q2, [x0, #0x20] + ldr q3, [x0, #0x30] + ushr v5.4s, v0.4s, #0x1f + mla v0.4s, v5.4s, v4.4s + ushr v5.4s, v1.4s, #0x1f + mla v1.4s, v5.4s, v4.4s + ushr v5.4s, v2.4s, #0x1f + mla v2.4s, v5.4s, v4.4s + ushr v5.4s, v3.4s, #0x1f + mla v3.4s, v5.4s, v4.4s + str q1, [x0, #0x10] + str q2, [x0, #0x20] + str q3, [x0, #0x30] + str q0, [x0], #0x40 + subs x1, x1, #0x1 + b.ne poly_caddq_loop ret - .unreq a_ptr - .unreq count - .unreq q_reg - .unreq tmp - #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/poly_chknorm_asm.S b/mldsa/native/aarch64/src/poly_chknorm_asm.S index 78009928f..e94dc48dc 100644 --- a/mldsa/native/aarch64/src/poly_chknorm_asm.S +++ b/mldsa/native/aarch64/src/poly_chknorm_asm.S @@ -6,57 +6,42 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -.macro chknorm a - abs \a\().4s, \a\().4s - cmge \a\().4s, \a\().4s, bound.4s - orr flags.16b, flags.16b, \a\().16b -.endm - - /* Parameters */ - a_ptr .req x0 // Input polynomial - B .req w1 // Input norm bound - - count .req x2 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/poly_chknorm_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ - /* Constant register assignments */ - bound .req v20 - flags .req v21 .text -.global MLD_ASM_NAMESPACE(poly_chknorm_asm) .balign 4 +.global MLD_ASM_NAMESPACE(poly_chknorm_asm) MLD_ASM_FN_SYMBOL(poly_chknorm_asm) - // Load constants - dup bound.4s, B - movi flags.4s, 0 - - mov count, #(64/4) + dup v20.4s, w1 + movi v21.4s, #0x0 + mov x2, #0x10 // =16 poly_chknorm_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 - - chknorm v1 - chknorm v2 - chknorm v3 - chknorm v0 - - subs count, count, #1 - bne poly_chknorm_loop - - // Return 0xffffffff if any of the 4 lanes is 0xffffffff - umaxv s21, flags.4s - fmov w0, s21 - + ldr q1, [x0, #0x10] + ldr q2, [x0, #0x20] + ldr q3, [x0, #0x30] + ldr q0, [x0], #0x40 + abs v1.4s, v1.4s + cmge v1.4s, v1.4s, v20.4s + orr v21.16b, v21.16b, v1.16b + abs v2.4s, v2.4s + cmge v2.4s, v2.4s, v20.4s + orr v21.16b, v21.16b, v2.16b + abs v3.4s, v3.4s + cmge v3.4s, v3.4s, v20.4s + orr v21.16b, v21.16b, v3.16b + abs v0.4s, v0.4s + cmge v0.4s, v0.4s, v20.4s + orr v21.16b, v21.16b, v0.16b + subs x2, x2, #0x1 + b.ne poly_chknorm_loop + umaxv s21, v21.4s + fmov w0, s21 ret - .unreq a_ptr - .unreq B - .unreq count - .unreq bound - .unreq flags - #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/poly_decompose_32_asm.S b/mldsa/native/aarch64/src/poly_decompose_32_asm.S index bddfef2c2..68bccf97f 100644 --- a/mldsa/native/aarch64/src/poly_decompose_32_asm.S +++ b/mldsa/native/aarch64/src/poly_decompose_32_asm.S @@ -6,99 +6,70 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -// a aliased with a0 -.macro decompose32 a1, a, temp - // Compute a1 = round-(a / 523776) ≈ round(a * 1074791425 / - // 2^49), where round-() denotes "round half down". This is - // exact for 0 <= a < Q. Note that half is rounded down since - // 1074791425 / 2^49 ≲ 1 / 523776. - sqdmulh \a1\().4s, \a\().4s, barrett_const.4s - srshr \a1\().4s, \a1\().4s, #18 - - // If a1 = 16, i.e. a > 31*GAMMA2, proceed as if a' = a - Q was - // given instead. (For a = 31*GAMMA2 + 1 thus a' = -GAMMA2, we - // still round it to 0 like other "wrapped around" cases.) - - // Check for wrap-around - cmgt \temp\().4s, \a\().4s, q_bound.4s - - // Compute remainder a0 - mls \a\().4s, \a1\().4s, gamma2_2x.4s - - // If wrap-around is required, set a1 = 0 and a0 -= 1 - bic \a1\().16b, \a1\().16b, \temp\().16b - add \a\().4s, \a\().4s, \temp\().4s -.endm - - /* Parameters */ - a1_ptr .req x0 // Output polynomial with coefficients c1 - a0_ptr .req x1 // Output polynomial with coefficients c0 - a_ptr .req x2 // Input polynomial - - count .req x3 - - /* Constant register assignments */ - q .req v20 // Q = 8380417 - q_bound .req v21 // 31*GAMMA2 = 8118528 - gamma2_2x .req v22 // 2*GAMMA2 = 523776 - barrett_const .req v23 // Barrett constant = 1074791425 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/poly_decompose_32_asm.S using scripts/simpasm. Do not modify it directly. 
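The poly_caddq_asm and poly_chknorm_asm kernels above are simple per-coefficient passes; a scalar sketch of what each lane computes (hedged, illustrative names, with branches used here where the vector code is branch-free):

#include <stdint.h>

#define MLDSA_Q 8380417

/* poly_caddq_asm: add q to every negative coefficient. The assembly forms the
 * condition bit with ushr #31 and applies it with mla against the broadcast q. */
void poly_caddq_model(int32_t a[256])
{
  unsigned i;
  for (i = 0; i < 256; i++)
  {
    a[i] += (int32_t)((uint32_t)a[i] >> 31) * MLDSA_Q;
  }
}

/* poly_chknorm_asm: return all-ones if any coefficient has |a[i]| >= B, else 0.
 * The assembly ORs the per-lane abs/cmge masks into one flag vector and
 * reduces it with umaxv at the end. */
uint32_t poly_chknorm_model(const int32_t a[256], int32_t B)
{
  uint32_t flags = 0;
  unsigned i;
  for (i = 0; i < 256; i++)
  {
    int32_t t = (a[i] < 0) ? -a[i] : a[i];
    flags |= (t >= B) ? 0xffffffffu : 0u;
  }
  return flags;
}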
+ */ .text -.global MLD_ASM_NAMESPACE(poly_decompose_32_asm) .balign 4 +.global MLD_ASM_NAMESPACE(poly_decompose_32_asm) MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) - // Load constants into SIMD registers - movz w4, #57345 - movk w4, #127, lsl #16 - dup q.4s, w4 - - movz w5, #0xe100 - movk w5, #0x7b, lsl #16 - dup q_bound.4s, w5 - - movz w7, #0xfe00 - movk w7, #7, lsl #16 - dup gamma2_2x.4s, w7 - movz w11, #0x0401 - movk w11, #0x4010, lsl #16 - dup barrett_const.4s, w11 - - mov count, #(64/4) + mov w4, #0xe001 // =57345 + movk w4, #0x7f, lsl #16 + dup v20.4s, w4 + mov w5, #0xe100 // =57600 + movk w5, #0x7b, lsl #16 + dup v21.4s, w5 + mov w7, #0xfe00 // =65024 + movk w7, #0x7, lsl #16 + dup v22.4s, w7 + mov w11, #0x401 // =1025 + movk w11, #0x4010, lsl #16 + dup v23.4s, w11 + mov x3, #0x10 // =16 poly_decompose_32_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 - - decompose32 v5, v1, v24 - decompose32 v6, v2, v24 - decompose32 v7, v3, v24 - decompose32 v4, v0, v24 - - str q5, [a1_ptr, #1*16] - str q6, [a1_ptr, #2*16] - str q7, [a1_ptr, #3*16] - str q4, [a1_ptr], #4*16 - str q1, [a0_ptr, #1*16] - str q2, [a0_ptr, #2*16] - str q3, [a0_ptr, #3*16] - str q0, [a0_ptr], #4*16 - - subs count, count, #1 - bne poly_decompose_32_loop - + ldr q1, [x2, #0x10] + ldr q2, [x2, #0x20] + ldr q3, [x2, #0x30] + ldr q0, [x2], #0x40 + sqdmulh v5.4s, v1.4s, v23.4s + srshr v5.4s, v5.4s, #0x12 + cmgt v24.4s, v1.4s, v21.4s + mls v1.4s, v5.4s, v22.4s + bic v5.16b, v5.16b, v24.16b + add v1.4s, v1.4s, v24.4s + sqdmulh v6.4s, v2.4s, v23.4s + srshr v6.4s, v6.4s, #0x12 + cmgt v24.4s, v2.4s, v21.4s + mls v2.4s, v6.4s, v22.4s + bic v6.16b, v6.16b, v24.16b + add v2.4s, v2.4s, v24.4s + sqdmulh v7.4s, v3.4s, v23.4s + srshr v7.4s, v7.4s, #0x12 + cmgt v24.4s, v3.4s, v21.4s + mls v3.4s, v7.4s, v22.4s + bic v7.16b, v7.16b, v24.16b + add v3.4s, v3.4s, v24.4s + sqdmulh v4.4s, v0.4s, v23.4s + srshr v4.4s, v4.4s, #0x12 + cmgt v24.4s, v0.4s, v21.4s + mls v0.4s, v4.4s, v22.4s + bic v4.16b, v4.16b, v24.16b + add v0.4s, v0.4s, v24.4s + str q5, [x0, #0x10] + str q6, [x0, #0x20] + str q7, [x0, #0x30] + str q4, [x0], #0x40 + str q1, [x1, #0x10] + str q2, [x1, #0x20] + str q3, [x1, #0x30] + str q0, [x1], #0x40 + subs x3, x3, #0x1 + b.ne poly_decompose_32_loop ret - .unreq a1_ptr - .unreq a0_ptr - .unreq a_ptr - .unreq count - .unreq q - .unreq q_bound - .unreq gamma2_2x - .unreq barrett_const - #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/poly_decompose_88_asm.S b/mldsa/native/aarch64/src/poly_decompose_88_asm.S index e36ce1f89..bc5c223a0 100644 --- a/mldsa/native/aarch64/src/poly_decompose_88_asm.S +++ b/mldsa/native/aarch64/src/poly_decompose_88_asm.S @@ -6,97 +6,70 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -// a aliased with a0 -.macro decompose88 a1, a, temp - // Compute a1 = round-(a / 190464) ≈ round(a * 1477838209 / - // 2^48), where round-() denotes "round half down". This is - // exact for 0 <= a < Q. Note that half is rounded down since - // 1477838209 / 2^48 ≲ 1 / 190464. - sqdmulh \a1\().4s, \a\().4s, barrett_const.4s - srshr \a1\().4s, \a1\().4s, #17 - - // If a1 = 44, i.e. a > 87*GAMMA2, proceed as if a' = a - Q was - // given instead. (For a = 87*GAMMA2 + 1 thus a' = -GAMMA2, we - // still round it to 0 like other "wrapped around" cases.) 
- - // Check for wrap-around - cmgt \temp\().4s, \a\().4s, q_bound.4s - - // Compute remainder a0 - mls \a\().4s, \a1\().4s, gamma2_2x.4s - - // If wrap-around is required, set a1 = 0 and a0 -= 1 - bic \a1\().16b, \a1\().16b, \temp\().16b - add \a\().4s, \a\().4s, \temp\().4s -.endm - - /* Parameters */ - a1_ptr .req x0 // Output polynomial with coefficients c1 - a0_ptr .req x1 // Output polynomial with coefficients c0 - a_ptr .req x2 // Input polynomial - - count .req x3 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/poly_decompose_88_asm.S using scripts/simpasm. Do not modify it directly. + */ - /* Constant register assignments */ - q .req v20 // Q = 8380417 - q_bound .req v21 // 87*GAMMA2 = 8285184 - gamma2_2x .req v22 // 2*GAMMA2 = 190464 - barrett_const .req v23 // Barrett constant = 1477838209 .text -.global MLD_ASM_NAMESPACE(poly_decompose_88_asm) .balign 4 +.global MLD_ASM_NAMESPACE(poly_decompose_88_asm) MLD_ASM_FN_SYMBOL(poly_decompose_88_asm) - // Load constants into SIMD registers - movz w4, #57345 - movk w4, #127, lsl #16 - dup q.4s, w4 - - movz w5, #0x6c00 - movk w5, #0x7e, lsl #16 - dup q_bound.4s, w5 - movz w7, #0xe800 - movk w7, #0x2, lsl #16 - dup gamma2_2x.4s, w7 + mov w4, #0xe001 // =57345 + movk w4, #0x7f, lsl #16 + dup v20.4s, w4 + mov w5, #0x6c00 // =27648 + movk w5, #0x7e, lsl #16 + dup v21.4s, w5 + mov w7, #0xe800 // =59392 + movk w7, #0x2, lsl #16 + dup v22.4s, w7 + mov w11, #0x581 // =1409 + movk w11, #0x5816, lsl #16 + dup v23.4s, w11 + mov x3, #0x10 // =16 - movz w11, #0x0581 - movk w11, #0x5816, lsl #16 - dup barrett_const.4s, w11 - - mov count, #(64/4) poly_decompose_88_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 - - decompose88 v5, v1, v24 - decompose88 v6, v2, v24 - decompose88 v7, v3, v24 - decompose88 v4, v0, v24 - - str q5, [a1_ptr, #1*16] - str q6, [a1_ptr, #2*16] - str q7, [a1_ptr, #3*16] - str q4, [a1_ptr], #4*16 - str q1, [a0_ptr, #1*16] - str q2, [a0_ptr, #2*16] - str q3, [a0_ptr, #3*16] - str q0, [a0_ptr], #4*16 - - subs count, count, #1 - bne poly_decompose_88_loop - + ldr q1, [x2, #0x10] + ldr q2, [x2, #0x20] + ldr q3, [x2, #0x30] + ldr q0, [x2], #0x40 + sqdmulh v5.4s, v1.4s, v23.4s + srshr v5.4s, v5.4s, #0x11 + cmgt v24.4s, v1.4s, v21.4s + mls v1.4s, v5.4s, v22.4s + bic v5.16b, v5.16b, v24.16b + add v1.4s, v1.4s, v24.4s + sqdmulh v6.4s, v2.4s, v23.4s + srshr v6.4s, v6.4s, #0x11 + cmgt v24.4s, v2.4s, v21.4s + mls v2.4s, v6.4s, v22.4s + bic v6.16b, v6.16b, v24.16b + add v2.4s, v2.4s, v24.4s + sqdmulh v7.4s, v3.4s, v23.4s + srshr v7.4s, v7.4s, #0x11 + cmgt v24.4s, v3.4s, v21.4s + mls v3.4s, v7.4s, v22.4s + bic v7.16b, v7.16b, v24.16b + add v3.4s, v3.4s, v24.4s + sqdmulh v4.4s, v0.4s, v23.4s + srshr v4.4s, v4.4s, #0x11 + cmgt v24.4s, v0.4s, v21.4s + mls v0.4s, v4.4s, v22.4s + bic v4.16b, v4.16b, v24.16b + add v0.4s, v0.4s, v24.4s + str q5, [x0, #0x10] + str q6, [x0, #0x20] + str q7, [x0, #0x30] + str q4, [x0], #0x40 + str q1, [x1, #0x10] + str q2, [x1, #0x20] + str q3, [x1, #0x30] + str q0, [x1], #0x40 + subs x3, x3, #0x1 + b.ne poly_decompose_88_loop ret - .unreq a1_ptr - .unreq a0_ptr - .unreq a_ptr - .unreq count - .unreq q - .unreq q_bound - .unreq gamma2_2x - .unreq barrett_const - #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/poly_use_hint_32_asm.S b/mldsa/native/aarch64/src/poly_use_hint_32_asm.S index e3c4730e3..e07d1536c 100644 --- 
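poly_decompose_32_asm and poly_decompose_88_asm share one structure and differ only in constants: gamma2 = (q-1)/32 with high parts in [0, 16) versus gamma2 = (q-1)/88 with high parts in [0, 44), reflected in the Barrett constants 1074791425 (shift 49) and 1477838209 (shift 48). A plain-division scalar model of the shared logic, assuming 0 <= a < q (illustrative, not the project's API):

#include <stdint.h>

#define MLDSA_Q 8380417

/* gamma2 is (q-1)/32 = 261888 or (q-1)/88 = 95232; nmax is 16 resp. 44. */
int32_t decompose_model(int32_t *a0, int32_t a, int32_t gamma2, int32_t nmax)
{
  /* a1 = round-half-down(a / (2*gamma2)); exact for 0 <= a < q */
  int32_t a1 = (a + gamma2 - 1) / (2 * gamma2);
  *a0 = a - a1 * 2 * gamma2;
  if (a1 == nmax) /* i.e. a > (2*nmax - 1)*gamma2: wrap as if a - q were given */
  {
    a1 = 0;
    *a0 = a - MLDSA_Q; /* one less than the remainder computed above */
  }
  return a1;
}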
a/mldsa/native/aarch64/src/poly_use_hint_32_asm.S +++ b/mldsa/native/aarch64/src/poly_use_hint_32_asm.S @@ -6,116 +6,87 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -// a aliased with a0 -.macro decompose32 a1, a, temp - // Compute a1 = round-(a / 523776) ≈ round(a * 1074791425 / - // 2^49), where round-() denotes "round half down". This is - // exact for 0 <= a < Q. Note that half is rounded down since - // 1074791425 / 2^49 ≲ 1 / 523776. - sqdmulh \a1\().4s, \a\().4s, barrett_const.4s - srshr \a1\().4s, \a1\().4s, #18 - - // If a1 = 16, i.e. a > 31*GAMMA2, proceed as if a' = a - Q was - // given instead. (For a = 31*GAMMA2 + 1 thus a' = -GAMMA2, we - // still round it to 0 like other "wrapped around" cases.) - - // Check for wrap-around - cmgt \temp\().4s, \a\().4s, q_bound.4s - - // Compute remainder a0 - mls \a\().4s, \a1\().4s, gamma2_2x.4s - - // If wrap-around is required, set a1 = 0 and a0 -= 1 - bic \a1\().16b, \a1\().16b, \temp\().16b - add \a\().4s, \a\().4s, \temp\().4s -.endm - -// a aliased with delta -.macro use_hint32 b, a, h, temp - decompose32 \b, \a, \temp - - // delta = (a0 <= 0) ? -1 : 1 - cmle \a\().4s, \a\().4s, #0 - orr \a\().4s, #1 - - // b = (b + delta * h) % 16 - mla \b\().4s, \a\().4s, \h\().4s - and \b\().16b, \b\().16b, mask_15.16b -.endm - - /* Parameters */ - b_ptr .req x0 // Output polynomial - a_ptr .req x1 // Input polynomial - h_ptr .req x2 // Input hints - - count .req x3 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/poly_use_hint_32_asm.S using scripts/simpasm. Do not modify it directly. + */ - /* Constant register assignments */ - q .req v20 // Q = 8380417 - q_bound .req v21 // 31*GAMMA2 = 8118528 - gamma2_2x .req v22 // 2*GAMMA2 = 523776 - barrett_const .req v23 // Barrett constant = 1074791425 - mask_15 .req v24 // mask = 15 .text -.global MLD_ASM_NAMESPACE(poly_use_hint_32_asm) .balign 4 +.global MLD_ASM_NAMESPACE(poly_use_hint_32_asm) MLD_ASM_FN_SYMBOL(poly_use_hint_32_asm) - // Load constants into SIMD registers - movz w4, #57345 - movk w4, #127, lsl #16 - dup q.4s, w4 - - movz w5, #0xe100 - movk w5, #0x7b, lsl #16 - dup q_bound.4s, w5 - - movz w7, #0xfe00 - movk w7, #7, lsl #16 - dup gamma2_2x.4s, w7 - movz w11, #0x0401 - movk w11, #0x4010, lsl #16 - dup barrett_const.4s, w11 - - movi mask_15.4s, #15 - - mov count, #(64/4) + mov w4, #0xe001 // =57345 + movk w4, #0x7f, lsl #16 + dup v20.4s, w4 + mov w5, #0xe100 // =57600 + movk w5, #0x7b, lsl #16 + dup v21.4s, w5 + mov w7, #0xfe00 // =65024 + movk w7, #0x7, lsl #16 + dup v22.4s, w7 + mov w11, #0x401 // =1025 + movk w11, #0x4010, lsl #16 + dup v23.4s, w11 + movi v24.4s, #0xf + mov x3, #0x10 // =16 poly_use_hint_32_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 - - ldr q5, [h_ptr, #1*16] - ldr q6, [h_ptr, #2*16] - ldr q7, [h_ptr, #3*16] - ldr q4, [h_ptr], #4*16 - - use_hint32 v17, v1, v5, v25 - use_hint32 v18, v2, v6, v25 - use_hint32 v19, v3, v7, v25 - use_hint32 v16, v0, v4, v25 - - str q17, [b_ptr, #1*16] - str q18, [b_ptr, #2*16] - str q19, [b_ptr, #3*16] - str q16, [b_ptr], #4*16 - - subs count, count, #1 - bne poly_use_hint_32_loop - + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q3, [x1, #0x30] + ldr q0, [x1], #0x40 + ldr q5, [x2, #0x10] + ldr q6, [x2, #0x20] + ldr q7, [x2, #0x30] + ldr q4, [x2], #0x40 + sqdmulh v17.4s, v1.4s, v23.4s + srshr v17.4s, v17.4s, #0x12 + cmgt v25.4s, v1.4s, v21.4s + mls v1.4s, v17.4s, v22.4s + bic v17.16b, 
v17.16b, v25.16b + add v1.4s, v1.4s, v25.4s + cmle v1.4s, v1.4s, #0 + orr v1.4s, #0x1 + mla v17.4s, v1.4s, v5.4s + and v17.16b, v17.16b, v24.16b + sqdmulh v18.4s, v2.4s, v23.4s + srshr v18.4s, v18.4s, #0x12 + cmgt v25.4s, v2.4s, v21.4s + mls v2.4s, v18.4s, v22.4s + bic v18.16b, v18.16b, v25.16b + add v2.4s, v2.4s, v25.4s + cmle v2.4s, v2.4s, #0 + orr v2.4s, #0x1 + mla v18.4s, v2.4s, v6.4s + and v18.16b, v18.16b, v24.16b + sqdmulh v19.4s, v3.4s, v23.4s + srshr v19.4s, v19.4s, #0x12 + cmgt v25.4s, v3.4s, v21.4s + mls v3.4s, v19.4s, v22.4s + bic v19.16b, v19.16b, v25.16b + add v3.4s, v3.4s, v25.4s + cmle v3.4s, v3.4s, #0 + orr v3.4s, #0x1 + mla v19.4s, v3.4s, v7.4s + and v19.16b, v19.16b, v24.16b + sqdmulh v16.4s, v0.4s, v23.4s + srshr v16.4s, v16.4s, #0x12 + cmgt v25.4s, v0.4s, v21.4s + mls v0.4s, v16.4s, v22.4s + bic v16.16b, v16.16b, v25.16b + add v0.4s, v0.4s, v25.4s + cmle v0.4s, v0.4s, #0 + orr v0.4s, #0x1 + mla v16.4s, v0.4s, v4.4s + and v16.16b, v16.16b, v24.16b + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x1 + b.ne poly_use_hint_32_loop ret - .unreq b_ptr - .unreq a_ptr - .unreq h_ptr - .unreq count - .unreq q - .unreq q_bound - .unreq gamma2_2x - .unreq barrett_const - .unreq mask_15 - #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/poly_use_hint_88_asm.S b/mldsa/native/aarch64/src/poly_use_hint_88_asm.S index 964f6e9f5..6f33129e4 100644 --- a/mldsa/native/aarch64/src/poly_use_hint_88_asm.S +++ b/mldsa/native/aarch64/src/poly_use_hint_88_asm.S @@ -6,118 +6,95 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -// a aliased with a0 -.macro decompose88 a1, a, temp - // Compute a1 = round-(a / 190464) ≈ round(a * 1477838209 / - // 2^48), where round-() denotes "round half down". This is - // exact for 0 <= a < Q. Note that half is rounded down since - // 1477838209 / 2^48 ≲ 1 / 190464. - sqdmulh \a1\().4s, \a\().4s, barrett_const.4s - srshr \a1\().4s, \a1\().4s, #17 - - // If a1 = 44, i.e. a > 87*GAMMA2, proceed as if a' = a - Q was - // given instead. (For a = 87*GAMMA2 + 1 thus a' = -GAMMA2, we - // still round it to 0 like other "wrapped around" cases.) - - // Check for wrap-around - cmgt \temp\().4s, \a\().4s, q_bound.4s - - // Compute remainder a0 - mls \a\().4s, \a1\().4s, gamma2_2x.4s - - // If wrap-around is required, set a1 = 0 and a0 -= 1 - bic \a1\().16b, \a1\().16b, \temp\().16b - add \a\().4s, \a\().4s, \temp\().4s -.endm - -// a aliased with delta -.macro use_hint88 b, a, h, temp - decompose88 \b, \a, \temp - - // delta = (a0 <= 0) ? -1 : 1 - cmle \a\().4s, \a\().4s, #0 - orr \a\().4s, #1 - - // b = (b + delta * h) % 44 - mla \b\().4s, \a\().4s, \h\().4s - cmgt \temp\().4s, \b\().4s, const_43.4s - bic \b\().16b, \b\().16b, \temp\().16b - umin \b\().4s, \b\().4s, const_43.4s -.endm - - /* Parameters */ - b_ptr .req x0 // Output polynomial - a_ptr .req x1 // Input polynomial - h_ptr .req x2 // Input hints - - count .req x3 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/poly_use_hint_88_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ - /* Constant register assignments */ - q .req v20 // Q = 8380417 - q_bound .req v21 // 87*GAMMA2 = 8285184 - gamma2_2x .req v22 // 2*GAMMA2 = 190464 - barrett_const .req v23 // Barrett constant = 1477838209 - const_43 .req v24 // mask = 43 .text -.global MLD_ASM_NAMESPACE(poly_use_hint_88_asm) .balign 4 +.global MLD_ASM_NAMESPACE(poly_use_hint_88_asm) MLD_ASM_FN_SYMBOL(poly_use_hint_88_asm) - // Load constants into SIMD registers - movz w4, #57345 - movk w4, #127, lsl #16 - dup q.4s, w4 - - movz w5, #0x6c00 - movk w5, #0x7e, lsl #16 - dup q_bound.4s, w5 - - movz w7, #0xe800 - movk w7, #0x2, lsl #16 - dup gamma2_2x.4s, w7 - movz w11, #0x0581 - movk w11, #0x5816, lsl #16 - dup barrett_const.4s, w11 - - movi const_43.4s, #43 - - mov count, #(64/4) + mov w4, #0xe001 // =57345 + movk w4, #0x7f, lsl #16 + dup v20.4s, w4 + mov w5, #0x6c00 // =27648 + movk w5, #0x7e, lsl #16 + dup v21.4s, w5 + mov w7, #0xe800 // =59392 + movk w7, #0x2, lsl #16 + dup v22.4s, w7 + mov w11, #0x581 // =1409 + movk w11, #0x5816, lsl #16 + dup v23.4s, w11 + movi v24.4s, #0x2b + mov x3, #0x10 // =16 poly_use_hint_88_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 - - ldr q5, [h_ptr, #1*16] - ldr q6, [h_ptr, #2*16] - ldr q7, [h_ptr, #3*16] - ldr q4, [h_ptr], #4*16 - - use_hint88 v17, v1, v5, v25 - use_hint88 v18, v2, v6, v25 - use_hint88 v19, v3, v7, v25 - use_hint88 v16, v0, v4, v25 - - str q17, [b_ptr, #1*16] - str q18, [b_ptr, #2*16] - str q19, [b_ptr, #3*16] - str q16, [b_ptr], #4*16 - - subs count, count, #1 - bne poly_use_hint_88_loop - + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q3, [x1, #0x30] + ldr q0, [x1], #0x40 + ldr q5, [x2, #0x10] + ldr q6, [x2, #0x20] + ldr q7, [x2, #0x30] + ldr q4, [x2], #0x40 + sqdmulh v17.4s, v1.4s, v23.4s + srshr v17.4s, v17.4s, #0x11 + cmgt v25.4s, v1.4s, v21.4s + mls v1.4s, v17.4s, v22.4s + bic v17.16b, v17.16b, v25.16b + add v1.4s, v1.4s, v25.4s + cmle v1.4s, v1.4s, #0 + orr v1.4s, #0x1 + mla v17.4s, v1.4s, v5.4s + cmgt v25.4s, v17.4s, v24.4s + bic v17.16b, v17.16b, v25.16b + umin v17.4s, v17.4s, v24.4s + sqdmulh v18.4s, v2.4s, v23.4s + srshr v18.4s, v18.4s, #0x11 + cmgt v25.4s, v2.4s, v21.4s + mls v2.4s, v18.4s, v22.4s + bic v18.16b, v18.16b, v25.16b + add v2.4s, v2.4s, v25.4s + cmle v2.4s, v2.4s, #0 + orr v2.4s, #0x1 + mla v18.4s, v2.4s, v6.4s + cmgt v25.4s, v18.4s, v24.4s + bic v18.16b, v18.16b, v25.16b + umin v18.4s, v18.4s, v24.4s + sqdmulh v19.4s, v3.4s, v23.4s + srshr v19.4s, v19.4s, #0x11 + cmgt v25.4s, v3.4s, v21.4s + mls v3.4s, v19.4s, v22.4s + bic v19.16b, v19.16b, v25.16b + add v3.4s, v3.4s, v25.4s + cmle v3.4s, v3.4s, #0 + orr v3.4s, #0x1 + mla v19.4s, v3.4s, v7.4s + cmgt v25.4s, v19.4s, v24.4s + bic v19.16b, v19.16b, v25.16b + umin v19.4s, v19.4s, v24.4s + sqdmulh v16.4s, v0.4s, v23.4s + srshr v16.4s, v16.4s, #0x11 + cmgt v25.4s, v0.4s, v21.4s + mls v0.4s, v16.4s, v22.4s + bic v16.16b, v16.16b, v25.16b + add v0.4s, v0.4s, v25.4s + cmle v0.4s, v0.4s, #0 + orr v0.4s, #0x1 + mla v16.4s, v0.4s, v4.4s + cmgt v25.4s, v16.4s, v24.4s + bic v16.16b, v16.16b, v25.16b + umin v16.4s, v16.4s, v24.4s + str q17, [x0, #0x10] + str q18, [x0, #0x20] + str q19, [x0, #0x30] + str q16, [x0], #0x40 + subs x3, x3, #0x1 + b.ne poly_use_hint_88_loop ret - .unreq b_ptr - .unreq a_ptr - .unreq h_ptr - .unreq count - .unreq q - .unreq q_bound - .unreq gamma2_2x - .unreq barrett_const - .unreq const_43 - #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git 
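Both use_hint kernels above run the same decompose step and then nudge the high part by the hint bit, with the direction chosen by the sign of the low part; the mod-16 case reduces with an and, the mod-44 case with the cmgt/bic/umin sequence. A scalar model on top of the decompose sketch above (illustrative names):

#include <stdint.h>

/* decompose_model() is the sketch given after the decompose diffs above. */
int32_t decompose_model(int32_t *a0, int32_t a, int32_t gamma2, int32_t nmax);

/* b = (a1 + delta * h) mod nmax, with delta = -1 if a0 <= 0 and +1 otherwise;
 * nmax is 16 for gamma2 = (q-1)/32 and 44 for gamma2 = (q-1)/88. */
int32_t use_hint_model(int32_t a, int32_t h, int32_t gamma2, int32_t nmax)
{
  int32_t a0;
  int32_t a1 = decompose_model(&a0, a, gamma2, nmax);
  int32_t delta = (a0 <= 0) ? -1 : 1;
  int32_t b = a1 + delta * h; /* h is 0 or 1, so b lies in [-1, nmax] */
  return ((b % nmax) + nmax) % nmax;
}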
a/mldsa/native/aarch64/src/polyz_unpack_17_asm.S b/mldsa/native/aarch64/src/polyz_unpack_17_asm.S index 10e6cbd96..139a9b2a1 100644 --- a/mldsa/native/aarch64/src/polyz_unpack_17_asm.S +++ b/mldsa/native/aarch64/src/polyz_unpack_17_asm.S @@ -7,97 +7,56 @@ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -.macro trim_map_17 a - // Keep only 18 out of 24 bits in each 32-bit lane - // Lane 0 1 2 3 - // Bits 0..23 16..39 32..55 48..71 - ushl \a\().4s, \a\().4s, shifts.4s - // Bits 0..23 18..39 36..55 54..71 - and \a\().16b, \a\().16b, mask.16b - // Bits 0..17 18..35 36..53 54..71 - - // Map [0, 1, ..., 2^18-1] to [2^17, 2^17-1, ..., -2^17+1] - sub \a\().4s, gamma1.4s, \a\().4s -.endm - - /* Parameters */ - output .req x0 - buf .req x1 - indices .req x2 - - xtmp .req x3 - count .req x9 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/polyz_unpack_17_asm.S using scripts/simpasm. Do not modify it directly. + */ - /* Constant register assignments */ - idx0 .req v24 - idx1 .req v25 - idx2 .req v26 - idx3 .req v27 - shifts .req v28 - mask .req v29 // 2^18 - 1 - gamma1 .req v30 // 2^17 .text -.global MLD_ASM_NAMESPACE(polyz_unpack_17_asm) .balign 4 +.global MLD_ASM_NAMESPACE(polyz_unpack_17_asm) MLD_ASM_FN_SYMBOL(polyz_unpack_17_asm) - // Load indices - ldr q24, [indices] - ldr q25, [indices, #1*16] - ldr q26, [indices, #2*16] - ldr q27, [indices, #3*16] - - // Load per-lane shifts 0, -2, -4, -6. (Negative means right shift.) - // The shifts for the 4 32-bit lanes are sign-extended from the lowest - // 8 bits, so it suffices to set up only byte 0, 4, 8, 12. - movz xtmp, 0xfe, lsl 32 - mov shifts.d[0], xtmp - movz xtmp, 0xfc - movk xtmp, 0xfa, lsl 32 - mov shifts.d[1], xtmp - movi mask.4s, 0x3, msl 16 - - movi gamma1.4s, 0x2, lsl 16 - - mov count, #(64/4) + ldr q24, [x2] + ldr q25, [x2, #0x10] + ldr q26, [x2, #0x20] + ldr q27, [x2, #0x30] + mov x3, #0xfe00000000 // =1090921693184 + mov v28.d[0], x3 + mov x3, #0xfc // =252 + movk x3, #0xfa, lsl #32 + mov v28.d[1], x3 + movi v29.4s, #0x3, msl #16 + movi v30.4s, #0x2, lsl #16 + mov x9, #0x10 // =16 polyz_unpack_17_loop: - ldr q1, [buf, #16] - ldr q2, [buf, #32] - ldr q0, [buf], #36 - - tbl v4.16b, {v0.16b}, idx0.16b - tbl v5.16b, {v0.16b - v1.16b}, idx1.16b - tbl v6.16b, {v1.16b}, idx2.16b - tbl v7.16b, {v1.16b - v2.16b}, idx3.16b - - trim_map_17 v4 - trim_map_17 v5 - trim_map_17 v6 - trim_map_17 v7 - - str q5, [output, #1*16] - str q6, [output, #2*16] - str q7, [output, #3*16] - str q4, [output], #4*16 - - subs count, count, #1 - bne polyz_unpack_17_loop - + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q0, [x1], #0x24 + tbl v4.16b, { v0.16b }, v24.16b + tbl v5.16b, { v0.16b, v1.16b }, v25.16b + tbl v6.16b, { v1.16b }, v26.16b + tbl v7.16b, { v1.16b, v2.16b }, v27.16b + ushl v4.4s, v4.4s, v28.4s + and v4.16b, v4.16b, v29.16b + sub v4.4s, v30.4s, v4.4s + ushl v5.4s, v5.4s, v28.4s + and v5.16b, v5.16b, v29.16b + sub v5.4s, v30.4s, v5.4s + ushl v6.4s, v6.4s, v28.4s + and v6.16b, v6.16b, v29.16b + sub v6.4s, v30.4s, v6.4s + ushl v7.4s, v7.4s, v28.4s + and v7.16b, v7.16b, v29.16b + sub v7.4s, v30.4s, v7.4s + str q5, [x0, #0x10] + str q6, [x0, #0x20] + str q7, [x0, #0x30] + str q4, [x0], #0x40 + subs x9, x9, #0x1 + b.ne polyz_unpack_17_loop ret - .unreq output - .unreq buf - .unreq indices - .unreq xtmp - .unreq count - .unreq idx0 - .unreq idx1 - .unreq idx2 - .unreq idx3 - .unreq shifts - .unreq mask - .unreq gamma1 - #endif /* 
MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/polyz_unpack_19_asm.S b/mldsa/native/aarch64/src/polyz_unpack_19_asm.S index 24155fd1d..fe758f572 100644 --- a/mldsa/native/aarch64/src/polyz_unpack_19_asm.S +++ b/mldsa/native/aarch64/src/polyz_unpack_19_asm.S @@ -7,94 +7,53 @@ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -.macro trim_map_19 a - // Keep only 20 out of 24 bits in each 32-bit lane - // Lane 0 1 2 3 - // Bits 0..23 16..39 40..63 56..79 - ushl \a\().4s, \a\().4s, shifts.4s - // Bits 0..23 20..39 40..63 60..79 - and \a\().16b, \a\().16b, mask.16b - // Bits 0..19 20..39 40..59 60..79 - - // Map [0, 1, ..., 2^20-1] to [2^19, 2^19-1, ..., -2^19+1] - sub \a\().4s, gamma1.4s, \a\().4s -.endm - - /* Parameters */ - output .req x0 - buf .req x1 - indices .req x2 - - xtmp .req x3 - count .req x9 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/polyz_unpack_19_asm.S using scripts/simpasm. Do not modify it directly. + */ - /* Constant register assignments */ - idx0 .req v24 - idx1 .req v25 - idx2 .req v26 - idx3 .req v27 - shifts .req v28 - mask .req v29 // 2^20 - 1 - gamma1 .req v30 // 2^19 .text -.global MLD_ASM_NAMESPACE(polyz_unpack_19_asm) .balign 4 +.global MLD_ASM_NAMESPACE(polyz_unpack_19_asm) MLD_ASM_FN_SYMBOL(polyz_unpack_19_asm) - // Load indices - ldr q24, [indices] - ldr q25, [indices, #1*16] - ldr q26, [indices, #2*16] - ldr q27, [indices, #3*16] - - // Load per-lane shifts 0, -4, 0, -4. (Negative means right shift.) - // The shifts for the 4 32-bit lanes are sign-extended from the lowest - // 8 bits, so it suffices to set up only byte 0, 4, 8, 12. - movz xtmp, 0xfc, lsl 32 - dup shifts.2d, xtmp - movi mask.4s, 0xf, msl 16 - - movi gamma1.4s, 0x8, lsl 16 - - mov count, #(64/4) + ldr q24, [x2] + ldr q25, [x2, #0x10] + ldr q26, [x2, #0x20] + ldr q27, [x2, #0x30] + mov x3, #0xfc00000000 // =1082331758592 + dup v28.2d, x3 + movi v29.4s, #0xf, msl #16 + movi v30.4s, #0x8, lsl #16 + mov x9, #0x10 // =16 polyz_unpack_19_loop: - ldr q1, [buf, #16] - ldr q2, [buf, #32] - ldr q0, [buf], #40 - - tbl v4.16b, {v0.16b}, idx0.16b - tbl v5.16b, {v0.16b - v1.16b}, idx1.16b - tbl v6.16b, {v1.16b}, idx2.16b - tbl v7.16b, {v1.16b - v2.16b}, idx3.16b - - trim_map_19 v4 - trim_map_19 v5 - trim_map_19 v6 - trim_map_19 v7 - - str q5, [output, #1*16] - str q6, [output, #2*16] - str q7, [output, #3*16] - str q4, [output], #4*16 - - subs count, count, #1 - bne polyz_unpack_19_loop - + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q0, [x1], #0x28 + tbl v4.16b, { v0.16b }, v24.16b + tbl v5.16b, { v0.16b, v1.16b }, v25.16b + tbl v6.16b, { v1.16b }, v26.16b + tbl v7.16b, { v1.16b, v2.16b }, v27.16b + ushl v4.4s, v4.4s, v28.4s + and v4.16b, v4.16b, v29.16b + sub v4.4s, v30.4s, v4.4s + ushl v5.4s, v5.4s, v28.4s + and v5.16b, v5.16b, v29.16b + sub v5.4s, v30.4s, v5.4s + ushl v6.4s, v6.4s, v28.4s + and v6.16b, v6.16b, v29.16b + sub v6.4s, v30.4s, v6.4s + ushl v7.4s, v7.4s, v28.4s + and v7.16b, v7.16b, v29.16b + sub v7.4s, v30.4s, v7.4s + str q5, [x0, #0x10] + str q6, [x0, #0x20] + str q7, [x0, #0x30] + str q4, [x0], #0x40 + subs x9, x9, #0x1 + b.ne polyz_unpack_19_loop ret - .unreq output - .unreq buf - .unreq indices - .unreq xtmp - .unreq count - .unreq idx0 - .unreq idx1 - .unreq idx2 - .unreq idx3 - .unreq shifts - .unreq mask - .unreq gamma1 - #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git 
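The two polyz_unpack kernels gather bytes with tbl, isolate the packed bit fields with per-lane ushl/and, and finally map t to gamma1 - t. A scalar model of one 4-coefficient group for gamma1 = 2^17 (18-bit fields, 9 bytes per group); the gamma1 = 2^19 variant is analogous with 20-bit fields in 10 bytes. Names are illustrative:

#include <stdint.h>

void polyz_unpack_17_group(int32_t r[4], const uint8_t a[9])
{
  const int32_t gamma1 = 1 << 17;
  uint32_t t;

  /* coefficient 0: bits 0..17 */
  t = a[0] | ((uint32_t)a[1] << 8) | ((uint32_t)a[2] << 16);
  r[0] = gamma1 - (int32_t)(t & 0x3ffff);

  /* coefficient 1: bits 18..35 */
  t = (a[2] >> 2) | ((uint32_t)a[3] << 6) | ((uint32_t)a[4] << 14);
  r[1] = gamma1 - (int32_t)(t & 0x3ffff);

  /* coefficient 2: bits 36..53 */
  t = (a[4] >> 4) | ((uint32_t)a[5] << 4) | ((uint32_t)a[6] << 12);
  r[2] = gamma1 - (int32_t)(t & 0x3ffff);

  /* coefficient 3: bits 54..71 */
  t = (a[6] >> 6) | ((uint32_t)a[7] << 2) | ((uint32_t)a[8] << 10);
  r[3] = gamma1 - (int32_t)(t & 0x3ffff);
}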
a/mldsa/native/aarch64/src/rej_uniform_asm.S b/mldsa/native/aarch64/src/rej_uniform_asm.S index 9e447ec6c..b5e1d388f 100644 --- a/mldsa/native/aarch64/src/rej_uniform_asm.S +++ b/mldsa/native/aarch64/src/rej_uniform_asm.S @@ -8,402 +8,173 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -// We save the output on the stack first, and copy to the actual -// output buffer only in the end. This is because the main loop can overwrite -// by up to 60 bytes, which we account for here (we use 64 bytes for alignment). -#define STACK_SIZE (4*MLDSA_N + 64) - -.macro push_stack - sub sp, sp, #STACK_SIZE -.endm - -.macro pop_stack - add sp, sp, #STACK_SIZE -.endm - - /* Parameters */ - output .req x0 - buf .req x1 - buflen .req x2 - table_idx .req x3 - - len .req x4 - - /* Temporary output on the stack */ - xtmp .req x7 - wtmp .req w7 - output_tmp .req x7 - output_tmp_base .req x8 - - /* Number of coefficients sampled so far */ - count .req x9 - - /* Temporary registers */ - initial_zero_count .req x11 - final_copy_count .req x11 - - rec_idx_0 .req x12 - rec_idx_1 .req x13 - rec_idx_2 .req x14 - rec_idx_3 .req x15 - - ctr0 .req x12 - ctr1 .req x13 - ctr2 .req x14 - ctr3 .req x15 - - ctr01 .req ctr0 - ctr23 .req ctr2 - - /* Vector registers */ - - buf0 .req v0 - buf1 .req v1 - buf2 .req v2 - - tmp0 .req v4 - tmp1 .req v5 - tmp2 .req v6 - tmp3 .req v7 - - sign0 .req v4 - sign1 .req v5 - sign2 .req v6 - sign3 .req v7 - - val0 .req v16 - val0q .req q16 - val1 .req v17 - val1q .req q17 - val2 .req v18 - val2q .req q18 - val3 .req v19 - val3q .req q19 - - t0 .req d20 - t1 .req d21 - t2 .req d22 - t3 .req d23 - - table0 .req v24 - table0q .req q24 - table1 .req v25 - table1q .req q25 - table2 .req v26 - table2q .req q26 - table3 .req v27 - table3q .req q27 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/rej_uniform_asm.S using scripts/simpasm. Do not modify it directly. + */ - mldsa_q .req v30 - bits .req v31 - .text - .global MLD_ASM_NAMESPACE(rej_uniform_asm) - .balign 4 +.text +.balign 4 +.global MLD_ASM_NAMESPACE(rej_uniform_asm) MLD_ASM_FN_SYMBOL(rej_uniform_asm) - push_stack - - // Load 0x1, 0x2, 0x4, 0x8 - movz xtmp, 0x1 - movk xtmp, 0x2, lsl 32 - mov bits.d[0], xtmp - - movz xtmp, 0x4 - movk xtmp, 0x8, lsl 32 - mov bits.d[1], xtmp - - // load q = 8380417 - movz wtmp, #57345 - movk wtmp, #127, lsl #16 - dup mldsa_q.4s, wtmp - mov output_tmp_base, sp - mov output_tmp, output_tmp_base + sub sp, sp, #0x440 + mov x7, #0x1 // =1 + movk x7, #0x2, lsl #32 + mov v31.d[0], x7 + mov x7, #0x4 // =4 + movk x7, #0x8, lsl #32 + mov v31.d[1], x7 + mov w7, #0xe001 // =57345 + movk w7, #0x7f, lsl #16 + dup v30.4s, w7 + mov x8, sp + mov x7, x8 + mov x11, #0x0 // =0 + eor v16.16b, v16.16b, v16.16b - // The entire temporary stack buffer is copied to the output buffer - // at the end of this routine. To avoid leaking original stack contents - // in case not enough bytes have been sampled, zero the temporary buffer. 
- mov initial_zero_count, #0 - eor val0.16b, val0.16b, val0.16b rej_uniform_initial_zero: - str val0q, [output_tmp], #64 - str val0q, [output_tmp, #-48] - str val0q, [output_tmp, #-32] - str val0q, [output_tmp, #-16] - add initial_zero_count, initial_zero_count, #16 - cmp initial_zero_count, #MLDSA_N - b.lt rej_uniform_initial_zero - - mov output_tmp, output_tmp_base - - mov count, #0 - mov len, #MLDSA_N - - cmp buflen, #48 - b.lo rej_uniform_loop48_end + str q16, [x7], #0x40 + stur q16, [x7, #-0x30] + stur q16, [x7, #-0x20] + stur q16, [x7, #-0x10] + add x11, x11, #0x10 + cmp x11, #0x100 + b.lt rej_uniform_initial_zero + mov x7, x8 + mov x9, #0x0 // =0 + mov x4, #0x100 // =256 + cmp x2, #0x30 + b.lo rej_uniform_loop48_end rej_uniform_loop48: - // Finish once we've generated sufficiently many coefficients - cmp count, len - b.hs rej_uniform_memory_copy - - // First, we unpack the byte stream into a stream of signed - // coefficients, interpreting each consecutive 3 bytes as one - // signed 24-bit coefficients, presented as 32-bit integers. - // The topmost bit is masked out making it a 23-bit coefficient. - // - // We handle 16 coefficients a time, and use ld3 for the required - // de-interleaving of the byte stream. - sub buflen, buflen, #48 - ld3 {buf0.16b, buf1.16b, buf2.16b}, [buf], #48 - - // Mask out top-most bit - movi tmp0.16b, #0x80 - bic buf2.16b, buf2.16b, tmp0.16b - - // Unpack 16 triples of bytes into 16 32-bit integers, - // represented as 4 vectors val0-val3. - zip1 tmp0.16b, buf0.16b, buf1.16b - zip2 tmp1.16b, buf0.16b, buf1.16b - uxtl tmp2.8h, buf2.8b - uxtl2 tmp3.8h, buf2.16b - - zip1 val0.8h, tmp0.8h, tmp2.8h - zip2 val1.8h, tmp0.8h, tmp2.8h - zip1 val2.8h, tmp1.8h, tmp3.8h - zip2 val3.8h, tmp1.8h, tmp3.8h - - // At this point, val0-val3 are the signed integers to do rejection - // sampling on. For each of them, do the following: - // - Check which coefficients are within range, and represent the set - // of lane-indices of those coefficients as an 4-bit bitmap. - // - Move the respective lanes to the front of the vector. This is the - // most complex part, and is done by interpreting the 4-bit bitmap as - // an index into a lookup table giving the lane-table to be use for - // the `tbl` instruction. - // - Write the vector to the output buffer, but merely increase the output - // buffer pointer by the number of valid coefficients. 
- - - // Set valid lanes to -1 (0b1...1) - cmhi sign0.4s, mldsa_q.4s, val0.4s - cmhi sign1.4s, mldsa_q.4s, val1.4s - cmhi sign2.4s, mldsa_q.4s, val2.4s - cmhi sign3.4s, mldsa_q.4s, val3.4s - - // If lane i is valid and has value -1, retain only i-th bit - and sign0.16b, sign0.16b, bits.16b - and sign1.16b, sign1.16b, bits.16b - and sign2.16b, sign2.16b, bits.16b - and sign3.16b, sign3.16b, bits.16b - - // Get 4-bit bitmap of valid lane indices by adding lanes - uaddlv t0, sign0.4s - uaddlv t1, sign1.4s - uaddlv t2, sign2.4s - uaddlv t3, sign3.4s + cmp x9, x4 + b.hs rej_uniform_memory_copy + sub x2, x2, #0x30 + ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48 + movi v4.16b, #0x80 + bic v2.16b, v2.16b, v4.16b + zip1 v4.16b, v0.16b, v1.16b + zip2 v5.16b, v0.16b, v1.16b + ushll v6.8h, v2.8b, #0x0 + ushll2 v7.8h, v2.16b, #0x0 + zip1 v16.8h, v4.8h, v6.8h + zip2 v17.8h, v4.8h, v6.8h + zip1 v18.8h, v5.8h, v7.8h + zip2 v19.8h, v5.8h, v7.8h + cmhi v4.4s, v30.4s, v16.4s + cmhi v5.4s, v30.4s, v17.4s + cmhi v6.4s, v30.4s, v18.4s + cmhi v7.4s, v30.4s, v19.4s + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + and v6.16b, v6.16b, v31.16b + and v7.16b, v7.16b, v31.16b + uaddlv d20, v4.4s + uaddlv d21, v5.4s + uaddlv d22, v6.4s + uaddlv d23, v7.4s + fmov x12, d20 + fmov x13, d21 + fmov x14, d22 + fmov x15, d23 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + ldr q26, [x3, x14, lsl #4] + ldr q27, [x3, x15, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + cnt v6.16b, v6.16b + cnt v7.16b, v7.16b + uaddlv d20, v4.4s + uaddlv d21, v5.4s + uaddlv d22, v6.4s + uaddlv d23, v7.4s + fmov x12, d20 + fmov x13, d21 + fmov x14, d22 + fmov x15, d23 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + tbl v18.16b, { v18.16b }, v26.16b + tbl v19.16b, { v19.16b }, v27.16b + str q16, [x7] + add x7, x7, x12, lsl #2 + str q17, [x7] + add x7, x7, x13, lsl #2 + str q18, [x7] + add x7, x7, x14, lsl #2 + str q19, [x7] + add x7, x7, x15, lsl #2 + add x12, x12, x13 + add x14, x14, x15 + add x9, x9, x12 + add x9, x9, x14 + cmp x2, #0x30 + b.hs rej_uniform_loop48 - fmov rec_idx_0, t0 - fmov rec_idx_1, t1 - fmov rec_idx_2, t2 - fmov rec_idx_3, t3 - - ldr table0q, [table_idx, rec_idx_0, lsl #4] - ldr table1q, [table_idx, rec_idx_1, lsl #4] - ldr table2q, [table_idx, rec_idx_2, lsl #4] - ldr table3q, [table_idx, rec_idx_3, lsl #4] - - // Compute number of valid coefficients. Recall that at this - // point, lane i has value 2^i (hence popcount 1) if its coefficient - // is valid, and 0 otherwise. 
- cnt sign0.16b, sign0.16b - cnt sign1.16b, sign1.16b - cnt sign2.16b, sign2.16b - cnt sign3.16b, sign3.16b - - // Extract number of valid coefficients - uaddlv t0, sign0.4s - uaddlv t1, sign1.4s - uaddlv t2, sign2.4s - uaddlv t3, sign3.4s - - fmov ctr0, t0 - fmov ctr1, t1 - fmov ctr2, t2 - fmov ctr3, t3 - - // Move valid coefficients to the front - tbl val0.16b, {val0.16b}, table0.16b - tbl val1.16b, {val1.16b}, table1.16b - tbl val2.16b, {val2.16b}, table2.16b - tbl val3.16b, {val3.16b}, table3.16b - - str val0q, [output_tmp] - add output_tmp, output_tmp, ctr0, lsl #2 - - str val1q, [output_tmp] - add output_tmp, output_tmp, ctr1, lsl #2 - - str val2q, [output_tmp] - add output_tmp, output_tmp, ctr2, lsl #2 - - str val3q, [output_tmp] - add output_tmp, output_tmp, ctr3, lsl #2 - - add ctr01, ctr0, ctr1 - add ctr23, ctr2, ctr3 - add count, count, ctr01 - add count, count, ctr23 - - cmp buflen, #48 - b.hs rej_uniform_loop48 rej_uniform_loop48_end: - - // Finish once we've generated sufficiently many coefficients - cmp count, len - b.hs rej_uniform_memory_copy - - cmp buflen, #24 - b.lo rej_uniform_memory_copy - - sub buflen, buflen, #24 - ld3 {buf0.8b, buf1.8b, buf2.8b}, [buf], #24 - - // mask out top-most bit - movi tmp0.16b, #0x80 - bic buf2.16b, buf2.16b, tmp0.16b - - zip1 tmp0.16b, buf0.16b, buf1.16b - uxtl tmp2.8h, buf2.8b - - zip1 val0.8h, tmp0.8h, tmp2.8h - zip2 val1.8h, tmp0.8h, tmp2.8h - - cmhi sign0.4s, mldsa_q.4s, val0.4s - cmhi sign1.4s, mldsa_q.4s, val1.4s - - and sign0.16b, sign0.16b, bits.16b - and sign1.16b, sign1.16b, bits.16b - - uaddlv t0, sign0.4s - uaddlv t1, sign1.4s - - fmov rec_idx_0, t0 - fmov rec_idx_1, t1 - - ldr table0q, [table_idx, rec_idx_0, lsl #4] - ldr table1q, [table_idx, rec_idx_1, lsl #4] - - cnt sign0.16b, sign0.16b - cnt sign1.16b, sign1.16b - - uaddlv t0, sign0.4s - uaddlv t1, sign1.4s - - fmov ctr0, t0 - fmov ctr1, t1 - - tbl val0.16b, {val0.16b}, table0.16b - tbl val1.16b, {val1.16b}, table1.16b - - str val0q, [output_tmp] - add output_tmp, output_tmp, ctr0, lsl #2 - - str val1q, [output_tmp] - add output_tmp, output_tmp, ctr1, lsl #2 - - add count, count, ctr0 - add count, count, ctr1 + cmp x9, x4 + b.hs rej_uniform_memory_copy + cmp x2, #0x18 + b.lo rej_uniform_memory_copy + sub x2, x2, #0x18 + ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24 + movi v4.16b, #0x80 + bic v2.16b, v2.16b, v4.16b + zip1 v4.16b, v0.16b, v1.16b + ushll v6.8h, v2.8b, #0x0 + zip1 v16.8h, v4.8h, v6.8h + zip2 v17.8h, v4.8h, v6.8h + cmhi v4.4s, v30.4s, v16.4s + cmhi v5.4s, v30.4s, v17.4s + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + uaddlv d20, v4.4s + uaddlv d21, v5.4s + fmov x12, d20 + fmov x13, d21 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + uaddlv d20, v4.4s + uaddlv d21, v5.4s + fmov x12, d20 + fmov x13, d21 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + str q16, [x7] + add x7, x7, x12, lsl #2 + str q17, [x7] + add x7, x7, x13, lsl #2 + add x9, x9, x12 + add x9, x9, x13 rej_uniform_memory_copy: - // min = min(count,len) - cmp count, len - csel count, count, len, lo + cmp x9, x4 + csel x9, x9, x4, lo + mov x11, #0x0 // =0 + mov x7, x8 - // Always copy MLDSA_N coefficients from the stack to the destination, - // even if not all of them may be valid. This simplifies the loop and - // allows us to stick to vectorized code. 
- mov final_copy_count, #0 - mov output_tmp, output_tmp_base rej_uniform_final_copy: - ldr val0q, [output_tmp], #64 - ldr val1q, [output_tmp, #-48] - ldr val2q, [output_tmp, #-32] - ldr val3q, [output_tmp, #-16] - str val0q, [output], #64 - str val1q, [output, #-48] - str val2q, [output, #-32] - str val3q, [output, #-16] - add final_copy_count, final_copy_count, #16 - cmp final_copy_count, #MLDSA_N - b.lt rej_uniform_final_copy - - mov x0, count - b rej_uniform_return - + ldr q16, [x7], #0x40 + ldur q17, [x7, #-0x30] + ldur q18, [x7, #-0x20] + ldur q19, [x7, #-0x10] + str q16, [x0], #0x40 + stur q17, [x0, #-0x30] + stur q18, [x0, #-0x20] + stur q19, [x0, #-0x10] + add x11, x11, #0x10 + cmp x11, #0x100 + b.lt rej_uniform_final_copy + mov x0, x9 + b rej_uniform_return rej_uniform_return: - pop_stack - ret - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq output - .unreq buf - .unreq buflen - .unreq table_idx - .unreq len - .unreq output_tmp - .unreq output_tmp_base - .unreq count - .unreq xtmp - .unreq wtmp - .unreq final_copy_count - .unreq initial_zero_count - .unreq rec_idx_0 - .unreq rec_idx_1 - .unreq rec_idx_2 - .unreq rec_idx_3 - .unreq ctr0 - .unreq ctr1 - .unreq ctr2 - .unreq ctr3 - .unreq ctr01 - .unreq ctr23 - .unreq buf0 - .unreq buf1 - .unreq buf2 - .unreq tmp0 - .unreq tmp1 - .unreq tmp2 - .unreq tmp3 - .unreq sign0 - .unreq sign1 - .unreq sign2 - .unreq sign3 - .unreq val0 - .unreq val0q - .unreq val1 - .unreq val1q - .unreq val2 - .unreq val2q - .unreq val3 - .unreq val3q - .unreq t0 - .unreq t1 - .unreq t2 - .unreq t3 - .unreq table0 - .unreq table0q - .unreq table1 - .unreq table1q - .unreq table2 - .unreq table2q - .unreq table3 - .unreq table3q - .unreq mldsa_q - .unreq bits - -/* TODO: autogenerate this in autogen */ -#undef STACK_SIZE + add sp, sp, #0x440 + ret #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/rej_uniform_eta2_asm.S b/mldsa/native/aarch64/src/rej_uniform_eta2_asm.S index c218693cc..dbf96e4a7 100644 --- a/mldsa/native/aarch64/src/rej_uniform_eta2_asm.S +++ b/mldsa/native/aarch64/src/rej_uniform_eta2_asm.S @@ -8,325 +8,117 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -// We save the output on the stack first, and copy to the actual -// output buffer only in the end. This is because the main loop can overwrite -// by up to 60 bytes, which we account for here (we use 64 bytes for alignment). 
-#define STACK_SIZE (2*MLDSA_N + 64) - -.macro push_stack - sub sp, sp, #STACK_SIZE -.endm - -.macro pop_stack - add sp, sp, #STACK_SIZE -.endm - - /* Parameters */ - output .req x0 - buf .req x1 - buflen .req x2 - table_idx .req x3 - - len .req x4 - - /* Temporary output on the stack */ - xtmp .req x7 - wtmp .req w7 - output_tmp .req x7 - output_tmp_base .req x8 - - /* Number of coefficients sampled so far */ - count .req x9 - buf_consumed .req x10 - - /* Temporary registers */ - tmp .req w11 - initial_zero_count .req x11 - final_copy_count .req x11 - - rec_idx_0 .req x12 - rec_idx_1 .req x13 - - rec_idx_0_w .req w12 - rec_idx_1_w .req w13 - - ctr0 .req x12 - ctr1 .req x13 - - ctr0_w .req w12 - ctr1_w .req w13 - - ctr01 .req ctr0 - - /* Vector registers */ - buf0 .req v0 - - tmp0 .req v26 - tmp1 .req v27 - tmp2 .req v28 - tmp3 .req v29 - - sign0 .req v4 - sign1 .req v5 - const2 .req v7 - - // Barrett reduction constants - barrett_const .req v26 - modulus5 .req v27 - barrett_tmp .req v28 - - val0 .req v16 - val0q .req q16 - val1 .req v17 - val1q .req q17 - val2 .req v18 - val2q .req q18 - val3 .req v19 - val3q .req q19 - - t0 .req s20 - t1 .req s21 - - table0 .req v24 - table0q .req q24 - table1 .req v25 - table1q .req q25 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/rej_uniform_eta2_asm.S using scripts/simpasm. Do not modify it directly. + */ - eta_bound .req v30 - bits .req v31 - .text - .global MLD_ASM_NAMESPACE(rej_uniform_eta2_asm) - .balign 4 +.text +.balign 4 +.global MLD_ASM_NAMESPACE(rej_uniform_eta2_asm) MLD_ASM_FN_SYMBOL(rej_uniform_eta2_asm) - push_stack - // Load 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - movz xtmp, 0x1 - movk xtmp, 0x2, lsl 16 - movk xtmp, 0x4, lsl 32 - movk xtmp, 0x8, lsl 48 - mov bits.d[0], xtmp + sub sp, sp, #0x240 + mov x7, #0x1 // =1 + movk x7, #0x2, lsl #16 + movk x7, #0x4, lsl #32 + movk x7, #0x8, lsl #48 + mov v31.d[0], x7 + mov x7, #0x10 // =16 + movk x7, #0x20, lsl #16 + movk x7, #0x40, lsl #32 + movk x7, #0x80, lsl #48 + mov v31.d[1], x7 + movi v30.8h, #0xf + mov x8, sp + mov x7, x8 + mov x11, #0x0 // =0 + eor v16.16b, v16.16b, v16.16b - movz xtmp, 0x10 - movk xtmp, 0x20, lsl 16 - movk xtmp, 0x40, lsl 32 - movk xtmp, 0x80, lsl 48 - mov bits.d[1], xtmp - - // Load eta2 bound = 15 - movi eta_bound.8h, #15 - - mov output_tmp_base, sp - mov output_tmp, output_tmp_base - - // The entire temporary stack buffer is copied to the output buffer - // at the end of this routine. To avoid leaking original stack contents - // in case not enough bytes have been sampled, zero the temporary buffer. 
- // The temporary buffer holds 16-bit values that are expanded to 32-bit - // on copy out - mov initial_zero_count, #0 - eor val0.16b, val0.16b, val0.16b rej_uniform_eta2_initial_zero: - str val0q, [output_tmp], #64 - str val0q, [output_tmp, #-48] - str val0q, [output_tmp, #-32] - str val0q, [output_tmp, #-16] - add initial_zero_count, initial_zero_count, #32 - cmp initial_zero_count, #MLDSA_N - b.lt rej_uniform_eta2_initial_zero - - mov output_tmp, output_tmp_base - - mov count, #0 - mov len, #MLDSA_N + str q16, [x7], #0x40 + stur q16, [x7, #-0x30] + stur q16, [x7, #-0x20] + stur q16, [x7, #-0x10] + add x11, x11, #0x20 + cmp x11, #0x100 + b.lt rej_uniform_eta2_initial_zero + mov x7, x8 + mov x9, #0x0 // =0 + mov x4, #0x100 // =256 rej_uniform_eta2_loop8: - // Finish once we've generated sufficiently many coefficients - cmp count, len - b.hs rej_uniform_eta2_memory_copy - - // Load 8 bytes and extract nibbles to get 16 4-bit values - sub buflen, buflen, #8 - ld1 {buf0.8b}, [buf], #8 - - // Extract nibbles - movi tmp0.8b, #0x0F - and tmp1.8b, buf0.8b, tmp0.8b // Low nibbles [L0, L1, L2, L3, L4, L5, L6, L7] - ushr tmp2.8b, buf0.8b, #4 // High nibbles [H0, H1, H2, H3, H4, H5, H6, H7] + cmp x9, x4 + b.hs rej_uniform_eta2_memory_copy + sub x2, x2, #0x8 + ld1 { v0.8b }, [x1], #8 + movi v26.8b, #0xf + and v27.8b, v0.8b, v26.8b + ushr v28.8b, v0.8b, #0x4 + zip1 v26.8b, v27.8b, v28.8b + zip2 v29.8b, v27.8b, v28.8b + ushll v16.8h, v26.8b, #0x0 + ushll v17.8h, v29.8b, #0x0 + cmhi v4.8h, v30.8h, v16.8h + cmhi v5.8h, v30.8h, v17.8h + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + str q16, [x7] + add x7, x7, x12, lsl #1 + str q17, [x7] + add x7, x7, x13, lsl #1 + add x12, x12, x13 + add x9, x9, x12 + cmp x2, #0x8 + b.hs rej_uniform_eta2_loop8 - // Interleave low and high nibbles: L0,H0,L1,H1,L2,H2,L3,H3,... - zip1 tmp0.8b, tmp1.8b, tmp2.8b // First 8 nibbles interleaved [L0,H0,L1,H1,L2,H2,L3,H3] - zip2 tmp3.8b, tmp1.8b, tmp2.8b // Next 8 nibbles interleaved [L4,H4,L5,H5,L6,H6,L7,H7] - - // Convert to 16-bit values - uxtl val0.8h, tmp0.8b - uxtl val1.8h, tmp3.8b - - // At this point, val0-val1 are the signed integers to do rejection - // sampling on. For each of them, do the following: - // - Check which coefficients are within range, and represent the set - // of lane-indices of those coefficients as an 8-bit bitmap. - // - Move the respective lanes to the front of the vector. This is the - // most complex part, and is done by interpreting the 8-bit bitmap as - // an index into a lookup table giving the lane-table to be use for - // the `tbl` instruction. - // - Write the vector to the output buffer, but merely increase the output - // buffer pointer by the number of valid coefficients. 
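The removed comment block above is the heart of these rejection-sampling kernels: a per-lane comparison yields a validity mask, masking with the bit constants plus a horizontal add turns it into a bitmap, the bitmap indexes a precomputed shuffle table for tbl, and the popcount of the bitmap says how far to advance the output pointer. A scalar C sketch of the same idea follows; the table layout and helper names are assumptions for illustration, whereas the assembly uses the byte-shuffle table passed via table_idx.

#include <stdint.h>

/* For every 8-bit validity bitmap, record the indices of its set bits;
 * this is the scalar analogue of the byte-shuffle table used with tbl. */
static void build_index_table(uint8_t table[256][8])
{
  for (unsigned bitmap = 0; bitmap < 256; bitmap++)
  {
    unsigned k = 0;
    for (unsigned i = 0; i < 8; i++)
    {
      if (bitmap & (1u << i))
      {
        table[bitmap][k++] = (uint8_t)i;
      }
    }
  }
}

/* Move the lanes with value < bound to the front of out[]; return how many
 * were kept (the popcount of the bitmap). */
static unsigned compact8(int16_t out[8], const uint16_t lanes[8],
                         uint16_t bound, const uint8_t table[256][8])
{
  unsigned bitmap = 0, count = 0, i;
  for (i = 0; i < 8; i++)
  {
    if (lanes[i] < bound)
    {
      bitmap |= 1u << i; /* lane i contributes bit 2^i (cmhi + and + uaddlv) */
      count++;           /* number of valid lanes (cnt + uaddlv)             */
    }
  }
  for (i = 0; i < count; i++)
  {
    out[i] = (int16_t)lanes[table[bitmap][i]]; /* tbl-style compaction */
  }
  return count;
}

The caller stores all eight compacted values but advances its output pointer only by the returned count, just as the assembly stores a full q register and then adds ctr0/ctr1 (scaled by the element size) to output_tmp.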
- - // Check which coefficients are within range (< 15) - cmhi sign0.8h, eta_bound.8h, val0.8h - cmhi sign1.8h, eta_bound.8h, val1.8h - - // If lane i is valid and has value -1, retain only i-th bit - and sign0.16b, sign0.16b, bits.16b - and sign1.16b, sign1.16b, bits.16b - - // Get 8-bit bitmap of valid lane indices by adding lanes - uaddlv t0, sign0.8h - uaddlv t1, sign1.8h - - fmov rec_idx_0_w, t0 - fmov rec_idx_1_w, t1 - - ldr table0q, [table_idx, rec_idx_0, lsl #4] - ldr table1q, [table_idx, rec_idx_1, lsl #4] - - // Compute number of valid coefficients. Recall that at this - // point, lane i has value 2^i (hence popcount 1) if its coefficient - // is valid, and 0 otherwise. - cnt sign0.16b, sign0.16b - cnt sign1.16b, sign1.16b - - // Extract number of valid coefficients - uaddlv t0, sign0.8h - uaddlv t1, sign1.8h - - fmov ctr0_w, t0 - fmov ctr1_w, t1 - - // Move valid coefficients to the front - tbl val0.16b, {val0.16b}, table0.16b - tbl val1.16b, {val1.16b}, table1.16b - - - // We store 16-bit coefficients here. They will be expanded to 32-bit - // on copy out - str val0q, [output_tmp] - add output_tmp, output_tmp, ctr0, lsl #1 - - str val1q, [output_tmp] - add output_tmp, output_tmp, ctr1, lsl #1 - - add ctr01, ctr0, ctr1 - add count, count, ctr01 - - cmp buflen, #8 - b.hs rej_uniform_eta2_loop8 -rej_uniform_eta2_loop8_end: rej_uniform_eta2_memory_copy: - // min = min(count,len) - cmp count, len - csel count, count, len, lo + cmp x9, x4 + csel x9, x9, x4, lo + mov w7, #0x199a // =6554 + dup v26.8h, w7 + movi v27.8h, #0x5 + movi v7.8h, #0x2 + mov x11, #0x0 // =0 + mov x7, x8 - // Initialize constant vectors for Barrett reduction - movz wtmp, #6554 // round((2**15)/5) - dup barrett_const.8h, wtmp - movi modulus5.8h, #5 - movi const2.8h, #2 - - // Always copy MLDSA_N coefficients from the stack to the destination - mov final_copy_count, #0 - mov output_tmp, output_tmp_base rej_uniform_eta2_final_copy: - ldr val0q, [output_tmp], #32 - ldr val2q, [output_tmp, #-16] - - // Reference: - // Barrett reduction: t0 = t0 - (205 * t0 >> 10) * 5; - - // To make efficient use of sqdmulh, we use the equivalent - // t0 = t0 - (13108 * t0 >> 16) * 5; - - sqdmulh barrett_tmp.8h, val0.8h, barrett_const.8h - mls val0.8h, barrett_tmp.8h, modulus5.8h - - sqdmulh barrett_tmp.8h, val2.8h, barrett_const.8h - mls val2.8h, barrett_tmp.8h, modulus5.8h - - sub val0.8h, const2.8h, val0.8h - sub val2.8h, const2.8h, val2.8h - - // Expand from 16-bit to 32-bit - sxtl2 val1.4s, val0.8h - sxtl val0.4s, val0.4h - - sxtl2 val3.4s, val2.8h - sxtl val2.4s, val2.4h - - str val0q, [output], #64 - str val1q, [output, #-48] - str val2q, [output, #-32] - str val3q, [output, #-16] - add final_copy_count, final_copy_count, #16 - cmp final_copy_count, #MLDSA_N - b.lt rej_uniform_eta2_final_copy - - mov x0, count - pop_stack - ret - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq output - .unreq buf - .unreq buflen - .unreq table_idx - .unreq len - .unreq output_tmp - .unreq output_tmp_base - .unreq count - .unreq buf_consumed - .unreq tmp - .unreq xtmp - .unreq final_copy_count - .unreq initial_zero_count - .unreq rec_idx_0 - .unreq rec_idx_1 - .unreq rec_idx_0_w - .unreq rec_idx_1_w - .unreq ctr0 - .unreq ctr1 - .unreq ctr0_w - .unreq ctr1_w - .unreq ctr01 - .unreq buf0 - .unreq tmp0 - .unreq tmp1 - .unreq tmp2 - .unreq tmp3 - .unreq sign0 - .unreq sign1 - .unreq val0 - .unreq val0q - .unreq val1 - .unreq val1q - .unreq val2 - .unreq val2q - .unreq val3 - .unreq val3q - .unreq t0 - .unreq t1 - .unreq 
table0 - .unreq table0q - .unreq table1 - .unreq table1q - .unreq eta_bound - .unreq bits - .unreq const2 - .unreq barrett_const - .unreq modulus5 - .unreq barrett_tmp - -#undef STACK_SIZE + ldr q16, [x7], #0x20 + ldur q18, [x7, #-0x10] + sqdmulh v28.8h, v16.8h, v26.8h + mls v16.8h, v28.8h, v27.8h + sqdmulh v28.8h, v18.8h, v26.8h + mls v18.8h, v28.8h, v27.8h + sub v16.8h, v7.8h, v16.8h + sub v18.8h, v7.8h, v18.8h + sshll2 v17.4s, v16.8h, #0x0 + sshll v16.4s, v16.4h, #0x0 + sshll2 v19.4s, v18.8h, #0x0 + sshll v18.4s, v18.4h, #0x0 + str q16, [x0], #0x40 + stur q17, [x0, #-0x30] + stur q18, [x0, #-0x20] + stur q19, [x0, #-0x10] + add x11, x11, #0x10 + cmp x11, #0x100 + b.lt rej_uniform_eta2_final_copy + mov x0, x9 + add sp, sp, #0x240 + ret #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/rej_uniform_eta4_asm.S b/mldsa/native/aarch64/src/rej_uniform_eta4_asm.S index cb7baee58..4c031dfe4 100644 --- a/mldsa/native/aarch64/src/rej_uniform_eta4_asm.S +++ b/mldsa/native/aarch64/src/rej_uniform_eta4_asm.S @@ -8,301 +8,110 @@ #if defined(MLD_ARITH_BACKEND_AARCH64) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -// We save the output on the stack first, and copy to the actual -// output buffer only in the end. This is because the main loop can overwrite -// by up to 60 bytes, which we account for here (we use 64 bytes for alignment). -#define STACK_SIZE (2*MLDSA_N + 64) - -.macro push_stack - sub sp, sp, #STACK_SIZE -.endm - -.macro pop_stack - add sp, sp, #STACK_SIZE -.endm - - /* Parameters */ - output .req x0 - buf .req x1 - buflen .req x2 - table_idx .req x3 - - len .req x4 - - /* Temporary output on the stack */ - xtmp .req x7 - wtmp .req w7 - output_tmp .req x7 - output_tmp_base .req x8 - - /* Number of coefficients sampled so far */ - count .req x9 - buf_consumed .req x10 - - /* Temporary registers */ - tmp .req w11 - initial_zero_count .req x11 - final_copy_count .req x11 - - rec_idx_0 .req x12 - rec_idx_1 .req x13 - - rec_idx_0_w .req w12 - rec_idx_1_w .req w13 - - ctr0 .req x12 - ctr1 .req x13 - - ctr0_w .req w12 - ctr1_w .req w13 - - ctr01 .req ctr0 - - /* Vector registers */ - buf0 .req v0 - - tmp0 .req v26 - tmp1 .req v27 - tmp2 .req v28 - tmp3 .req v29 - - sign0 .req v4 - sign1 .req v5 - const4 .req v7 - - - val0 .req v16 - val0q .req q16 - val1 .req v17 - val1q .req q17 - val2 .req v18 - val2q .req q18 - val3 .req v19 - val3q .req q19 - - t0 .req s20 - t1 .req s21 - - table0 .req v24 - table0q .req q24 - table1 .req v25 - table1q .req q25 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_clean/src/rej_uniform_eta4_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ - eta_bound .req v30 - bits .req v31 - .text - .global MLD_ASM_NAMESPACE(rej_uniform_eta4_asm) - .balign 4 +.text +.balign 4 +.global MLD_ASM_NAMESPACE(rej_uniform_eta4_asm) MLD_ASM_FN_SYMBOL(rej_uniform_eta4_asm) - push_stack - - // Load 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - movz xtmp, 0x1 - movk xtmp, 0x2, lsl 16 - movk xtmp, 0x4, lsl 32 - movk xtmp, 0x8, lsl 48 - mov bits.d[0], xtmp - - movz xtmp, 0x10 - movk xtmp, 0x20, lsl 16 - movk xtmp, 0x40, lsl 32 - movk xtmp, 0x80, lsl 48 - mov bits.d[1], xtmp - // Load eta4 bound = 9 - movi eta_bound.8h, #9 - movi const4.8h, #4 + sub sp, sp, #0x240 + mov x7, #0x1 // =1 + movk x7, #0x2, lsl #16 + movk x7, #0x4, lsl #32 + movk x7, #0x8, lsl #48 + mov v31.d[0], x7 + mov x7, #0x10 // =16 + movk x7, #0x20, lsl #16 + movk x7, #0x40, lsl #32 + movk x7, #0x80, lsl #48 + mov v31.d[1], x7 + movi v30.8h, #0x9 + movi v7.8h, #0x4 + mov x8, sp + mov x7, x8 + mov x11, #0x0 // =0 + eor v16.16b, v16.16b, v16.16b - mov output_tmp_base, sp - mov output_tmp, output_tmp_base - - // The entire temporary stack buffer is copied to the output buffer - // at the end of this routine. To avoid leaking original stack contents - // in case not enough bytes have been sampled, zero the temporary buffer. - // The temporary buffer holds 16-bit values that are expanded to 32-bit - // on copy out - mov initial_zero_count, #0 - eor val0.16b, val0.16b, val0.16b rej_uniform_eta4_initial_zero: - str val0q, [output_tmp], #64 - str val0q, [output_tmp, #-48] - str val0q, [output_tmp, #-32] - str val0q, [output_tmp, #-16] - add initial_zero_count, initial_zero_count, #32 - cmp initial_zero_count, #MLDSA_N - b.lt rej_uniform_eta4_initial_zero - - mov output_tmp, output_tmp_base - - mov count, #0 - mov len, #MLDSA_N + str q16, [x7], #0x40 + stur q16, [x7, #-0x30] + stur q16, [x7, #-0x20] + stur q16, [x7, #-0x10] + add x11, x11, #0x20 + cmp x11, #0x100 + b.lt rej_uniform_eta4_initial_zero + mov x7, x8 + mov x9, #0x0 // =0 + mov x4, #0x100 // =256 rej_uniform_eta4_loop8: - // Finish once we've generated sufficiently many coefficients - cmp count, len - b.hs rej_uniform_eta4_memory_copy + cmp x9, x4 + b.hs rej_uniform_eta4_memory_copy + sub x2, x2, #0x8 + ld1 { v0.8b }, [x1], #8 + movi v26.8b, #0xf + and v27.8b, v0.8b, v26.8b + ushr v28.8b, v0.8b, #0x4 + zip1 v26.8b, v27.8b, v28.8b + zip2 v29.8b, v27.8b, v28.8b + ushll v16.8h, v26.8b, #0x0 + ushll v17.8h, v29.8b, #0x0 + cmhi v4.8h, v30.8h, v16.8h + cmhi v5.8h, v30.8h, v17.8h + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + str q16, [x7] + add x7, x7, x12, lsl #1 + str q17, [x7] + add x7, x7, x13, lsl #1 + add x12, x12, x13 + add x9, x9, x12 + cmp x2, #0x8 + b.hs rej_uniform_eta4_loop8 - // Load 8 bytes and extract nibbles to get 16 4-bit values - sub buflen, buflen, #8 - ld1 {buf0.8b}, [buf], #8 - - // Extract nibbles - movi tmp0.8b, #0x0F - and tmp1.8b, buf0.8b, tmp0.8b // Low nibbles [L0, L1, L2, L3, L4, L5, L6, L7] - ushr tmp2.8b, buf0.8b, #4 // High nibbles [H0, H1, H2, H3, H4, H5, H6, H7] - - // Interleave low and high nibbles: L0,H0,L1,H1,L2,H2,L3,H3,... 
- zip1 tmp0.8b, tmp1.8b, tmp2.8b // First 8 nibbles interleaved [L0,H0,L1,H1,L2,H2,L3,H3] - zip2 tmp3.8b, tmp1.8b, tmp2.8b // Next 8 nibbles interleaved [L4,H4,L5,H5,L6,H6,L7,H7] - - // Convert to 16-bit values - uxtl val0.8h, tmp0.8b - uxtl val1.8h, tmp3.8b - - // At this point, val0-val1 are the signed integers to do rejection - // sampling on. For each of them, do the following: - // - Check which coefficients are within range, and represent the set - // of lane-indices of those coefficients as an 8-bit bitmap. - // - Move the respective lanes to the front of the vector. This is the - // most complex part, and is done by interpreting the 8-bit bitmap as - // an index into a lookup table giving the lane-table to be use for - // the `tbl` instruction. - // - Write the vector to the output buffer, but merely increase the output - // buffer pointer by the number of valid coefficients. - - // Check which coefficients are within range (< 9) - cmhi sign0.8h, eta_bound.8h, val0.8h - cmhi sign1.8h, eta_bound.8h, val1.8h - - // If lane i is valid and has value -1, retain only i-th bit - and sign0.16b, sign0.16b, bits.16b - and sign1.16b, sign1.16b, bits.16b - - // Get 8-bit bitmap of valid lane indices by adding lanes - uaddlv t0, sign0.8h - uaddlv t1, sign1.8h - - fmov rec_idx_0_w, t0 - fmov rec_idx_1_w, t1 - - ldr table0q, [table_idx, rec_idx_0, lsl #4] - ldr table1q, [table_idx, rec_idx_1, lsl #4] - - // Compute number of valid coefficients. Recall that at this - // point, lane i has value 2^i (hence popcount 1) if its coefficient - // is valid, and 0 otherwise. - cnt sign0.16b, sign0.16b - cnt sign1.16b, sign1.16b - - // Extract number of valid coefficients - uaddlv t0, sign0.8h - uaddlv t1, sign1.8h - - fmov ctr0_w, t0 - fmov ctr1_w, t1 - - // Move valid coefficients to the front - tbl val0.16b, {val0.16b}, table0.16b - tbl val1.16b, {val1.16b}, table1.16b - - - // We store 16-bit coefficients here. 
They will be expanded to 32-bit - // on copy out - str val0q, [output_tmp] - add output_tmp, output_tmp, ctr0, lsl #1 - - str val1q, [output_tmp] - add output_tmp, output_tmp, ctr1, lsl #1 - - add ctr01, ctr0, ctr1 - add count, count, ctr01 - - cmp buflen, #8 - b.hs rej_uniform_eta4_loop8 -rej_uniform_eta4_loop8_end: rej_uniform_eta4_memory_copy: - // min = min(count,len) - cmp count, len - csel count, count, len, lo + cmp x9, x4 + csel x9, x9, x4, lo + mov x11, #0x0 // =0 + mov x7, x8 - // Always copy MLDSA_N coefficients from the stack to the destination - mov final_copy_count, #0 - mov output_tmp, output_tmp_base rej_uniform_eta4_final_copy: - ldr val0q, [output_tmp], #32 - ldr val2q, [output_tmp, #-16] - - // Apply eta4 transformation: 4 - nibble - sub val0.8h, const4.8h, val0.8h - sub val2.8h, const4.8h, val2.8h - - // Expand from 16-bit to 32-bit - sxtl2 val1.4s, val0.8h - sxtl val0.4s, val0.4h - - sxtl2 val3.4s, val2.8h - sxtl val2.4s, val2.4h - - str val0q, [output], #64 - str val1q, [output, #-48] - str val2q, [output, #-32] - str val3q, [output, #-16] - add final_copy_count, final_copy_count, #16 - cmp final_copy_count, #MLDSA_N - b.lt rej_uniform_eta4_final_copy - - mov x0, count - pop_stack - ret - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq output - .unreq buf - .unreq buflen - .unreq table_idx - .unreq len - .unreq output_tmp - .unreq output_tmp_base - .unreq count - .unreq buf_consumed - .unreq tmp - .unreq xtmp - .unreq final_copy_count - .unreq initial_zero_count - .unreq rec_idx_0 - .unreq rec_idx_1 - .unreq rec_idx_0_w - .unreq rec_idx_1_w - .unreq ctr0 - .unreq ctr1 - .unreq ctr0_w - .unreq ctr1_w - .unreq ctr01 - .unreq buf0 - .unreq tmp0 - .unreq tmp1 - .unreq tmp2 - .unreq tmp3 - .unreq sign0 - .unreq sign1 - .unreq val0 - .unreq val0q - .unreq val1 - .unreq val1q - .unreq val2 - .unreq val2q - .unreq val3 - .unreq val3q - .unreq t0 - .unreq t1 - .unreq table0 - .unreq table0q - .unreq table1 - .unreq table1q - .unreq eta_bound - .unreq bits - -#undef STACK_SIZE + ldr q16, [x7], #0x20 + ldur q18, [x7, #-0x10] + sub v16.8h, v7.8h, v16.8h + sub v18.8h, v7.8h, v18.8h + sshll2 v17.4s, v16.8h, #0x0 + sshll v16.4s, v16.4h, #0x0 + sshll2 v19.4s, v18.8h, #0x0 + sshll v18.4s, v18.4h, #0x0 + str q16, [x0], #0x40 + stur q17, [x0, #-0x30] + stur q18, [x0, #-0x20] + stur q19, [x0, #-0x10] + add x11, x11, #0x10 + cmp x11, #0x100 + b.lt rej_uniform_eta4_final_copy + mov x0, x9 + add sp, sp, #0x240 + ret #endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/x86_64/src/intt.S b/mldsa/native/x86_64/src/intt.S index beebdd642..f2c577566 100644 --- a/mldsa/native/x86_64/src/intt.S +++ b/mldsa/native/x86_64/src/intt.S @@ -21,264 +21,2283 @@ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" - -.macro shuffle8 r0,r1,r2,r3 -vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle4 r0,r1,r2,r3 -vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 -vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle2 r0,r1,r2,r3 -#vpsllq $32,%ymm\r1,%ymm\r2 -vmovsldup %ymm\r1,%ymm\r2 -vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 -vpsrlq $32,%ymm\r0,%ymm\r0 -#vmovshdup %ymm\r0,%ymm\r0 -vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 -vpsubd %ymm\l,%ymm\h,%ymm12 -vpaddd %ymm\h,%ymm\l,%ymm\l - -vpmuldq %ymm\zl0,%ymm12,%ymm13 -vmovshdup %ymm12,%ymm\h -vpmuldq %ymm\zl1,%ymm\h,%ymm14 - -vpmuldq 
%ymm\zh0,%ymm12,%ymm12 -vpmuldq %ymm\zh1,%ymm\h,%ymm\h - -vpmuldq %ymm0,%ymm13,%ymm13 -vpmuldq %ymm0,%ymm14,%ymm14 - -vpsubd %ymm13,%ymm12,%ymm12 -vpsubd %ymm14,%ymm\h,%ymm\h - -vmovshdup %ymm12,%ymm12 -vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h -.endm - -.macro levels0t5 off -vmovdqa 256*\off+ 0(%rdi),%ymm4 -vmovdqa 256*\off+ 32(%rdi),%ymm5 -vmovdqa 256*\off+ 64(%rdi),%ymm6 -vmovdqa 256*\off+ 96(%rdi),%ymm7 -vmovdqa 256*\off+128(%rdi),%ymm8 -vmovdqa 256*\off+160(%rdi),%ymm9 -vmovdqa 256*\off+192(%rdi),%ymm10 -vmovdqa 256*\off+224(%rdi),%ymm11 - -/* level 0 */ -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 4,5,1,3,2,15 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 6,7,1,3,2,15 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 8,9,1,3,2,15 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 10,11,1,3,2,15 - -/* level 1 */ -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 4,6,1,3,2,15 -butterfly 5,7,1,3,2,15 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 8,10,1,3,2,15 -butterfly 9,11,1,3,2,15 - -/* level 2 */ -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 -vmovshdup %ymm3,%ymm1 -vmovshdup %ymm15,%ymm2 -butterfly 4,8,1,3,2,15 -butterfly 5,9,1,3,2,15 -butterfly 6,10,1,3,2,15 -butterfly 7,11,1,3,2,15 - -/* level 3 */ -shuffle2 4,5,3,5 -shuffle2 6,7,4,7 -shuffle2 8,9,6,9 -shuffle2 10,11,8,11 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 -butterfly 3,5 -butterfly 4,7 -butterfly 6,9 -butterfly 8,11 - -/* level 4 */ -shuffle4 3,4,10,4 -shuffle4 6,8,3,8 -shuffle4 5,7,6,7 -shuffle4 9,11,5,11 - -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 -vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 -butterfly 10,4 -butterfly 3,8 -butterfly 6,7 -butterfly 5,11 - -/* level 5 */ -shuffle8 10,3,9,3 -shuffle8 6,5,10,5 -shuffle8 4,8,6,8 -shuffle8 7,11,4,11 - -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+7-\off)*4(%rsi),%ymm2 -butterfly 9,3 -butterfly 10,5 -butterfly 6,8 -butterfly 4,11 - -vmovdqa %ymm9,256*\off+ 0(%rdi) -vmovdqa %ymm10,256*\off+ 32(%rdi) -vmovdqa %ymm6,256*\off+ 64(%rdi) -vmovdqa %ymm4,256*\off+ 96(%rdi) -vmovdqa %ymm3,256*\off+128(%rdi) -vmovdqa %ymm5,256*\off+160(%rdi) -vmovdqa %ymm8,256*\off+192(%rdi) -vmovdqa 
%ymm11,256*\off+224(%rdi) -.endm - -.macro levels6t7 off -vmovdqa 0+32*\off(%rdi),%ymm4 -vmovdqa 128+32*\off(%rdi),%ymm5 -vmovdqa 256+32*\off(%rdi),%ymm6 -vmovdqa 384+32*\off(%rdi),%ymm7 -vmovdqa 512+32*\off(%rdi),%ymm8 -vmovdqa 640+32*\off(%rdi),%ymm9 -vmovdqa 768+32*\off(%rdi),%ymm10 -vmovdqa 896+32*\off(%rdi),%ymm11 - -/* level 6 */ -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 -butterfly 4,6 -butterfly 5,7 - -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 -butterfly 8,10 -butterfly 9,11 - -/* level 7 */ -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2 - -butterfly 4,8 -butterfly 5,9 -butterfly 6,10 -butterfly 7,11 - -vmovdqa %ymm8,512+32*\off(%rdi) -vmovdqa %ymm9,640+32*\off(%rdi) -vmovdqa %ymm10,768+32*\off(%rdi) -vmovdqa %ymm11,896+32*\off(%rdi) - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV)*4(%rsi),%ymm2 -vpmuldq %ymm1,%ymm4,%ymm12 -vpmuldq %ymm1,%ymm5,%ymm13 -vmovshdup %ymm4,%ymm8 -vmovshdup %ymm5,%ymm9 -vpmuldq %ymm1,%ymm8,%ymm14 -vpmuldq %ymm1,%ymm9,%ymm15 -vpmuldq %ymm2,%ymm4,%ymm4 -vpmuldq %ymm2,%ymm5,%ymm5 -vpmuldq %ymm2,%ymm8,%ymm8 -vpmuldq %ymm2,%ymm9,%ymm9 -vpmuldq %ymm0,%ymm12,%ymm12 -vpmuldq %ymm0,%ymm13,%ymm13 -vpmuldq %ymm0,%ymm14,%ymm14 -vpmuldq %ymm0,%ymm15,%ymm15 -vpsubd %ymm12,%ymm4,%ymm4 -vpsubd %ymm13,%ymm5,%ymm5 -vpsubd %ymm14,%ymm8,%ymm8 -vpsubd %ymm15,%ymm9,%ymm9 -vmovshdup %ymm4,%ymm4 -vmovshdup %ymm5,%ymm5 -vpblendd $0xAA,%ymm8,%ymm4,%ymm4 -vpblendd $0xAA,%ymm9,%ymm5,%ymm5 - -vpmuldq %ymm1,%ymm6,%ymm12 -vpmuldq %ymm1,%ymm7,%ymm13 -vmovshdup %ymm6,%ymm8 -vmovshdup %ymm7,%ymm9 -vpmuldq %ymm1,%ymm8,%ymm14 -vpmuldq %ymm1,%ymm9,%ymm15 -vpmuldq %ymm2,%ymm6,%ymm6 -vpmuldq %ymm2,%ymm7,%ymm7 -vpmuldq %ymm2,%ymm8,%ymm8 -vpmuldq %ymm2,%ymm9,%ymm9 -vpmuldq %ymm0,%ymm12,%ymm12 -vpmuldq %ymm0,%ymm13,%ymm13 -vpmuldq %ymm0,%ymm14,%ymm14 -vpmuldq %ymm0,%ymm15,%ymm15 -vpsubd %ymm12,%ymm6,%ymm6 -vpsubd %ymm13,%ymm7,%ymm7 -vpsubd %ymm14,%ymm8,%ymm8 -vpsubd %ymm15,%ymm9,%ymm9 -vmovshdup %ymm6,%ymm6 -vmovshdup %ymm7,%ymm7 -vpblendd $0xAA,%ymm8,%ymm6,%ymm6 -vpblendd $0xAA,%ymm9,%ymm7,%ymm7 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/intt.S using scripts/simpasm. Do not modify it directly. 
+ */ -vmovdqa %ymm4, 0+32*\off(%rdi) -vmovdqa %ymm5,128+32*\off(%rdi) -vmovdqa %ymm6,256+32*\off(%rdi) -vmovdqa %ymm7,384+32*\off(%rdi) -.endm .text .balign 4 .global MLD_ASM_NAMESPACE(invntt_avx2) MLD_ASM_FN_SYMBOL(invntt_avx2) -vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi),%ymm0 - -levels0t5 0 -levels0t5 1 -levels0t5 2 -levels0t5 3 - -levels6t7 0 -levels6t7 1 -levels6t7 2 -levels6t7 3 - -ret + vmovdqa (%rsi), %ymm0 + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vpermq $0x1b, 0x500(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x9a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x480(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x920(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x400(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x380(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x820(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + 
vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x300(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x280(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x720(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x200(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd 
$0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x180(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x620(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = 
ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm9, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0x100(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm8, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = 
ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm11, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x9c(%rsi), %ymm1 + vpbroadcastd 0x53c(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm8, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm11, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, 
%ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm10, 0x20(%rdi) + vmovdqa %ymm6, 0x40(%rdi) + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm3, 0x80(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm8, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vpermq $0x1b, 0x4e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x980(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x460(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x900(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x880(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x360(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x800(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 
+ vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x780(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x260(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x700(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x680(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + 
vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x160(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x600(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq 
%ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm9, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xe0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x580(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm8, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq 
%ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm11, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x98(%rsi), %ymm1 + vpbroadcastd 0x538(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm8, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm11, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x100(%rdi) + vmovdqa %ymm10, 0x120(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm4, 0x160(%rdi) + vmovdqa %ymm3, 0x180(%rdi) + vmovdqa %ymm5, 0x1a0(%rdi) + vmovdqa %ymm8, 0x1c0(%rdi) + vmovdqa %ymm11, 0x1e0(%rdi) + vmovdqa 0x200(%rdi), %ymm4 + vmovdqa 0x220(%rdi), %ymm5 + vmovdqa 0x240(%rdi), %ymm6 + vmovdqa 0x260(%rdi), %ymm7 + vmovdqa 0x280(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x2c0(%rdi), %ymm10 + vmovdqa 0x2e0(%rdi), %ymm11 + vpermq $0x1b, 0x4c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x960(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x440(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x860(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x340(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7e0(%rsi), 
%ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x760(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x240(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x660(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + 
vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x140(%rsi), %ymm1 # 
ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5e0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm9, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xc0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x560(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, 
%ymm8, %ymm12 + vpaddd %ymm8, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm11, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x94(%rsi), %ymm1 + vpbroadcastd 0x534(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm8, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, 
%ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm11, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x200(%rdi) + vmovdqa %ymm10, 0x220(%rdi) + vmovdqa %ymm6, 0x240(%rdi) + vmovdqa %ymm4, 0x260(%rdi) + vmovdqa %ymm3, 0x280(%rdi) + vmovdqa %ymm5, 0x2a0(%rdi) + vmovdqa %ymm8, 0x2c0(%rdi) + vmovdqa %ymm11, 0x2e0(%rdi) + vmovdqa 0x300(%rdi), %ymm4 + vmovdqa 0x320(%rdi), %ymm5 + vmovdqa 0x340(%rdi), %ymm6 + vmovdqa 0x360(%rdi), %ymm7 + vmovdqa 0x380(%rdi), %ymm8 + vmovdqa 0x3a0(%rdi), %ymm9 + vmovdqa 0x3c0(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpermq $0x1b, 0x4a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x940(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x420(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x840(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # 
ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x320(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x740(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x220(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = 
ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x640(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = 
ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x120(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5c0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm9, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xa0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x540(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, 
%ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm8, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm11, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x90(%rsi), %ymm1 + vpbroadcastd 0x530(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = 
ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm8, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm11, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm6, 0x340(%rdi) + vmovdqa %ymm4, 0x360(%rdi) + vmovdqa %ymm3, 0x380(%rdi) + vmovdqa %ymm5, 0x3a0(%rdi) + vmovdqa %ymm8, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x80(%rdi), %ymm5 + vmovdqa 0x100(%rdi), %ymm6 + vmovdqa 0x180(%rdi), %ymm7 + vmovdqa 0x200(%rdi), %ymm8 + vmovdqa 0x280(%rdi), %ymm9 + vmovdqa 0x300(%rdi), %ymm10 + vmovdqa 0x380(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = 
ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x200(%rdi) + vmovdqa %ymm9, 0x280(%rdi) + vmovdqa %ymm10, 0x300(%rdi) + vmovdqa %ymm11, 0x380(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup 
%ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, (%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm6, 0x100(%rdi) + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0xa0(%rdi), %ymm5 + vmovdqa 0x120(%rdi), %ymm6 + vmovdqa 0x1a0(%rdi), %ymm7 + vmovdqa 0x220(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x320(%rdi), %ymm10 + vmovdqa 0x3a0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq 
%ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm9, 0x2a0(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm11, 0x3a0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = 
ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm6, 0x120(%rdi) + vmovdqa %ymm7, 0x1a0(%rdi) + vmovdqa 0x40(%rdi), %ymm4 + vmovdqa 0xc0(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x1c0(%rdi), %ymm7 + vmovdqa 0x240(%rdi), %ymm8 + vmovdqa 0x2c0(%rdi), %ymm9 + vmovdqa 0x340(%rdi), %ymm10 + vmovdqa 0x3c0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 
+ vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x240(%rdi) + vmovdqa %ymm9, 0x2c0(%rdi) + vmovdqa %ymm10, 0x340(%rdi) + vmovdqa %ymm11, 0x3c0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, 
%ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x40(%rdi) + vmovdqa %ymm5, 0xc0(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa 0x60(%rdi), %ymm4 + vmovdqa 0xe0(%rdi), %ymm5 + vmovdqa 0x160(%rdi), %ymm6 + vmovdqa 0x1e0(%rdi), %ymm7 + vmovdqa 0x260(%rdi), %ymm8 + vmovdqa 0x2e0(%rdi), %ymm9 + vmovdqa 0x360(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # 
ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x260(%rdi) + vmovdqa %ymm9, 0x2e0(%rdi) + vmovdqa %ymm10, 0x360(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq 
%ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm5, 0xe0(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm7, 0x1e0(%rdi) + retq #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ */ diff --git a/mldsa/native/x86_64/src/ntt.S b/mldsa/native/x86_64/src/ntt.S index d1916dbeb..df8d970c1 100644 --- a/mldsa/native/x86_64/src/ntt.S +++ b/mldsa/native/x86_64/src/ntt.S @@ -22,220 +22,2355 @@ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" - -.macro shuffle8 r0,r1,r2,r3 -vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle4 r0,r1,r2,r3 -vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 -vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle2 r0,r1,r2,r3 -#vpsllq $32,%ymm\r1,%ymm\r2 -vmovsldup %ymm\r1,%ymm\r2 -vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 -vpsrlq $32,%ymm\r0,%ymm\r0 -#vmovshdup %ymm\r0,%ymm\r0 -vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 -vpmuldq %ymm\zl0,%ymm\h,%ymm13 -vmovshdup %ymm\h,%ymm12 -vpmuldq %ymm\zl1,%ymm12,%ymm14 - -vpmuldq %ymm\zh0,%ymm\h,%ymm\h -vpmuldq %ymm\zh1,%ymm12,%ymm12 - -vpmuldq %ymm0,%ymm13,%ymm13 -vpmuldq %ymm0,%ymm14,%ymm14 - -vmovshdup %ymm\h,%ymm\h -vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h - -vpsubd %ymm\h,%ymm\l,%ymm12 -vpaddd %ymm\h,%ymm\l,%ymm\l - -vmovshdup %ymm13,%ymm13 -vpblendd $0xAA,%ymm14,%ymm13,%ymm13 - -vpaddd %ymm13,%ymm12,%ymm\h -vpsubd %ymm13,%ymm\l,%ymm\l -.endm - -.macro levels0t1 off -/* level 0 */ -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1)*4(%rsi),%ymm2 - -vmovdqa 0+32*\off(%rdi),%ymm4 -vmovdqa 128+32*\off(%rdi),%ymm5 -vmovdqa 256+32*\off(%rdi),%ymm6 -vmovdqa 384+32*\off(%rdi),%ymm7 -vmovdqa 512+32*\off(%rdi),%ymm8 -vmovdqa 640+32*\off(%rdi),%ymm9 -vmovdqa 768+32*\off(%rdi),%ymm10 -vmovdqa 896+32*\off(%rdi),%ymm11 - -butterfly 4,8 -butterfly 5,9 -butterfly 6,10 -butterfly 7,11 - -/* level 1 */ -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 -butterfly 4,6 -butterfly 5,7 - -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 -butterfly 8,10 -butterfly 9,11 - -vmovdqa %ymm4, 0+32*\off(%rdi) -vmovdqa %ymm5,128+32*\off(%rdi) -vmovdqa %ymm6,256+32*\off(%rdi) -vmovdqa %ymm7,384+32*\off(%rdi) -vmovdqa %ymm8,512+32*\off(%rdi) -vmovdqa %ymm9,640+32*\off(%rdi) -vmovdqa %ymm10,768+32*\off(%rdi) -vmovdqa 
%ymm11,896+32*\off(%rdi) -.endm - -.macro levels2t7 off -/* level 2 */ -vmovdqa 256*\off+ 0(%rdi),%ymm4 -vmovdqa 256*\off+ 32(%rdi),%ymm5 -vmovdqa 256*\off+ 64(%rdi),%ymm6 -vmovdqa 256*\off+ 96(%rdi),%ymm7 -vmovdqa 256*\off+128(%rdi),%ymm8 -vmovdqa 256*\off+160(%rdi),%ymm9 -vmovdqa 256*\off+192(%rdi),%ymm10 -vmovdqa 256*\off+224(%rdi),%ymm11 - -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 -vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4+\off)*4(%rsi),%ymm2 - -butterfly 4,8 -butterfly 5,9 -butterfly 6,10 -butterfly 7,11 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -/* level 3 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2 - -butterfly 3,5 -butterfly 8,10 -butterfly 4,6 -butterfly 9,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -/* level 4 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2 - -butterfly 7,8 -butterfly 5,6 -butterfly 3,4 -butterfly 10,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -/* level 5 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 - -butterfly 9,5,1,10,2,15 -butterfly 8,4,1,10,2,15 -butterfly 7,3,1,10,2,15 -butterfly 6,11,1,10,2,15 - -/* level 6 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 9,7,1,10,2,15 -butterfly 8,6,1,10,2,15 - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 5,3,1,10,2,15 -butterfly 4,11,1,10,2,15 - -/* level 7 */ -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 9,8,1,10,2,15 - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 7,6,1,10,2,15 - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 5,4,1,10,2,15 - -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 -vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 -vpsrlq $32,%ymm1,%ymm10 -vmovshdup %ymm2,%ymm15 -butterfly 3,11,1,10,2,15 +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/ntt.S using scripts/simpasm. Do not modify it directly. 
+ */ -vmovdqa %ymm9,256*\off+ 0(%rdi) -vmovdqa %ymm8,256*\off+ 32(%rdi) -vmovdqa %ymm7,256*\off+ 64(%rdi) -vmovdqa %ymm6,256*\off+ 96(%rdi) -vmovdqa %ymm5,256*\off+128(%rdi) -vmovdqa %ymm4,256*\off+160(%rdi) -vmovdqa %ymm3,256*\off+192(%rdi) -vmovdqa %ymm11,256*\off+224(%rdi) -.endm .text .balign 4 .global MLD_ASM_NAMESPACE(ntt_avx2) MLD_ASM_FN_SYMBOL(ntt_avx2) -vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi),%ymm0 - -levels0t1 0 -levels0t1 1 -levels0t1 2 -levels0t1 3 -levels2t7 0 -levels2t7 1 -levels2t7 2 -levels2t7 3 + vmovdqa (%rsi), %ymm0 + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa (%rdi), %ymm4 + vmovdqa 0x80(%rdi), %ymm5 + vmovdqa 0x100(%rdi), %ymm6 + vmovdqa 0x180(%rdi), %ymm7 + vmovdqa 0x200(%rdi), %ymm8 + vmovdqa 0x280(%rdi), %ymm9 + vmovdqa 0x300(%rdi), %ymm10 + vmovdqa 0x380(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # 
ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, (%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm6, 0x100(%rdi) + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm8, 0x200(%rdi) + vmovdqa %ymm9, 0x280(%rdi) + vmovdqa %ymm10, 0x300(%rdi) + vmovdqa %ymm11, 0x380(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 
0x524(%rsi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0xa0(%rdi), %ymm5 + vmovdqa 0x120(%rdi), %ymm6 + vmovdqa 0x1a0(%rdi), %ymm7 + vmovdqa 0x220(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x320(%rdi), %ymm10 + vmovdqa 0x3a0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm6, 0x120(%rdi) + vmovdqa %ymm7, 0x1a0(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm9, 0x2a0(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm11, 0x3a0(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x40(%rdi), %ymm4 + vmovdqa 0xc0(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x1c0(%rdi), %ymm7 + vmovdqa 0x240(%rdi), %ymm8 + vmovdqa 0x2c0(%rdi), %ymm9 + vmovdqa 0x340(%rdi), %ymm10 + vmovdqa 0x3c0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, 
%ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, 
%ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x40(%rdi) + vmovdqa %ymm5, 0xc0(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm8, 0x240(%rdi) + vmovdqa %ymm9, 0x2c0(%rdi) + vmovdqa %ymm10, 0x340(%rdi) + vmovdqa %ymm11, 0x3c0(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x60(%rdi), %ymm4 + vmovdqa 0xe0(%rdi), %ymm5 + vmovdqa 0x160(%rdi), %ymm6 + vmovdqa 0x1e0(%rdi), %ymm7 + vmovdqa 0x260(%rdi), %ymm8 + vmovdqa 0x2e0(%rdi), %ymm9 + vmovdqa 0x360(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, 
%ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + 
vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm5, 0xe0(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm7, 0x1e0(%rdi) + vmovdqa %ymm8, 0x260(%rdi) + vmovdqa %ymm9, 0x2e0(%rdi) + vmovdqa %ymm10, 0x360(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vpbroadcastd 0x90(%rsi), %ymm1 + vpbroadcastd 0x530(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = 
ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xa0(%rsi), %ymm1 + vmovdqa 0x540(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 
# ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x120(%rsi), %ymm1 + vmovdqa 0x5c0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm8, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 
+ vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1a0(%rsi), %ymm1 + vmovdqa 0x640(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, 
%ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm11, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x220(%rsi), %ymm1 + vmovdqa 0x6c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + 
vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2a0(%rsi), %ymm1 + vmovdqa 0x740(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm11, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x320(%rsi), %ymm1 + vmovdqa 0x7c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3a0(%rsi), %ymm1 + vmovdqa 0x840(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, 
%ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x420(%rsi), %ymm1 + vmovdqa 0x8c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4a0(%rsi), %ymm1 + vmovdqa 0x940(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm11, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm8, 0x20(%rdi) + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm6, 0x60(%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm4, 0xa0(%rdi) + vmovdqa %ymm3, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vpbroadcastd 0x94(%rsi), %ymm1 + vpbroadcastd 0x534(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # 
ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xc0(%rsi), %ymm1 + vmovdqa 0x560(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup 
%ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x140(%rsi), %ymm1 + vmovdqa 0x5e0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm8, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd 
%ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1c0(%rsi), %ymm1 + vmovdqa 0x660(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + 
vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm11, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x240(%rsi), %ymm1 + vmovdqa 0x6e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + 
vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2c0(%rsi), %ymm1 + vmovdqa 0x760(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm11, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x340(%rsi), %ymm1 + vmovdqa 0x7e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3c0(%rsi), %ymm1 + vmovdqa 0x860(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = 
ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x440(%rsi), %ymm1 + vmovdqa 0x8e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4c0(%rsi), %ymm1 + vmovdqa 0x960(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm11, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x100(%rdi) + vmovdqa %ymm8, 0x120(%rdi) + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm5, 0x180(%rdi) + vmovdqa %ymm4, 0x1a0(%rdi) + vmovdqa %ymm3, 0x1c0(%rdi) + vmovdqa %ymm11, 0x1e0(%rdi) + vmovdqa 0x200(%rdi), %ymm4 + vmovdqa 0x220(%rdi), %ymm5 + vmovdqa 0x240(%rdi), %ymm6 + vmovdqa 0x260(%rdi), %ymm7 + vmovdqa 0x280(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x2c0(%rdi), %ymm10 + vmovdqa 0x2e0(%rdi), %ymm11 + vpbroadcastd 0x98(%rsi), %ymm1 + vpbroadcastd 0x538(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, 
%ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xe0(%rsi), %ymm1 + vmovdqa 0x580(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, 
%ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x160(%rsi), %ymm1 + vmovdqa 0x600(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm8, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + 
vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1e0(%rsi), %ymm1 + vmovdqa 0x680(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + 
vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm11, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x260(%rsi), %ymm1 + vmovdqa 0x700(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # 
ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2e0(%rsi), %ymm1 + vmovdqa 0x780(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm11, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x360(%rsi), %ymm1 + vmovdqa 0x800(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3e0(%rsi), %ymm1 + vmovdqa 0x880(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # 
ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x460(%rsi), %ymm1 + vmovdqa 0x900(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4e0(%rsi), %ymm1 + vmovdqa 0x980(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm11, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x200(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm7, 0x240(%rdi) + vmovdqa %ymm6, 0x260(%rdi) + vmovdqa %ymm5, 0x280(%rdi) + vmovdqa %ymm4, 0x2a0(%rdi) + vmovdqa %ymm3, 0x2c0(%rdi) + vmovdqa %ymm11, 0x2e0(%rdi) + vmovdqa 0x300(%rdi), %ymm4 + vmovdqa 0x320(%rdi), %ymm5 + vmovdqa 0x340(%rdi), %ymm6 + vmovdqa 0x360(%rdi), %ymm7 + vmovdqa 0x380(%rdi), %ymm8 + vmovdqa 0x3a0(%rdi), %ymm9 + vmovdqa 0x3c0(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpbroadcastd 0x9c(%rsi), %ymm1 + vpbroadcastd 0x53c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm8, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm9, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm10, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm11, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0x100(%rsi), %ymm1 + vmovdqa 0x5a0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x180(%rsi), %ymm1 + vmovdqa 0x620(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm8, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = 
ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x200(%rsi), %ymm1 + vmovdqa 0x6a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm11, %ymm6, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x280(%rsi), %ymm1 + vmovdqa 0x720(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # 
ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x300(%rsi), %ymm1 + vmovdqa 0x7a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm11, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x380(%rsi), %ymm1 + vmovdqa 0x820(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x400(%rsi), %ymm1 + vmovdqa 0x8a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x480(%rsi), %ymm1 + vmovdqa 0x920(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x500(%rsi), %ymm1 + vmovdqa 0x9a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm11, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm8, 0x320(%rdi) + vmovdqa %ymm7, 0x340(%rdi) + vmovdqa %ymm6, 0x360(%rdi) + vmovdqa %ymm5, 0x380(%rdi) + vmovdqa %ymm4, 0x3a0(%rdi) + vmovdqa %ymm3, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + retq -ret #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ */ diff --git a/mldsa/native/x86_64/src/nttunpack.S b/mldsa/native/x86_64/src/nttunpack.S index 101c9543e..29c22a922 100644 --- a/mldsa/native/x86_64/src/nttunpack.S +++ b/mldsa/native/x86_64/src/nttunpack.S @@ -23,24 +23,10 @@ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -.macro shuffle8 r0,r1,r2,r3 -vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle4 
r0,r1,r2,r3 -vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 -vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle2 r0,r1,r2,r3 -#vpsllq $32,%ymm\r1,%ymm\r2 -vmovsldup %ymm\r1,%ymm\r2 -vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 -vpsrlq $32,%ymm\r0,%ymm\r0 -#vmovshdup %ymm\r0,%ymm\r0 -vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 -.endm +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/nttunpack.S using scripts/simpasm. Do not modify it directly. + */ .text @@ -48,53 +34,65 @@ vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .global MLD_ASM_NAMESPACE(nttunpack_avx2) MLD_ASM_FN_SYMBOL(nttunpack_avx2) -call nttunpack128_avx -add $256,%rdi -call nttunpack128_avx -add $256,%rdi -call nttunpack128_avx -add $256,%rdi -call nttunpack128_avx -ret + callq nttunpack128_avx + addq $0x100, %rdi # imm = 0x100 + callq nttunpack128_avx + addq $0x100, %rdi # imm = 0x100 + callq nttunpack128_avx + addq $0x100, %rdi # imm = 0x100 + callq nttunpack128_avx + retq nttunpack128_avx: -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -#store -vmovdqa %ymm9,(%rdi) -vmovdqa %ymm8,32(%rdi) -vmovdqa %ymm7,64(%rdi) -vmovdqa %ymm6,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm3,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + 
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm8, 0x20(%rdi) + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm6, 0x60(%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm4, 0xa0(%rdi) + vmovdqa %ymm3, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + retq #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ */ diff --git a/mldsa/native/x86_64/src/pointwise.S b/mldsa/native/x86_64/src/pointwise.S index 19c381ab5..67e31fb76 100644 --- a/mldsa/native/x86_64/src/pointwise.S +++ b/mldsa/native/x86_64/src/pointwise.S @@ -21,130 +21,104 @@ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" - - .intel_syntax noprefix - .text - /* - * void mld_pointwise_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) - * - * Pointwise multiplication of polynomials in NTT domain with Montgomery reduction - * - * Arguments: - * rdi: pointer to output polynomial c - * rsi: pointer to input polynomial a - * rdx: pointer to input polynomial b - * rcx: pointer to qdata constants + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/pointwise.S using scripts/simpasm. Do not modify it directly. 
*/ - .balign 4 - .global MLD_ASM_NAMESPACE(pointwise_avx2) -MLD_ASM_FN_SYMBOL(pointwise_avx2) - -// Load constants - vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] - vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] - - xor eax, eax -_looptop1: -// Load - vmovdqa ymm2, [rsi] - vmovdqa ymm4, [rsi + 32] - vmovdqa ymm6, [rsi + 64] - vmovdqa ymm10, [rdx] - vmovdqa ymm12, [rdx + 32] - vmovdqa ymm14, [rdx + 64] - vpsrlq ymm3, ymm2, 32 - vpsrlq ymm5, ymm4, 32 - vmovshdup ymm7, ymm6 - vpsrlq ymm11, ymm10, 32 - vpsrlq ymm13, ymm12, 32 - vmovshdup ymm15, ymm14 -// Multiply - vpmuldq ymm2, ymm2, ymm10 - vpmuldq ymm3, ymm3, ymm11 - vpmuldq ymm4, ymm4, ymm12 - vpmuldq ymm5, ymm5, ymm13 - vpmuldq ymm6, ymm6, ymm14 - vpmuldq ymm7, ymm7, ymm15 -// Reduce - vpmuldq ymm10, ymm0, ymm2 - vpmuldq ymm11, ymm0, ymm3 - vpmuldq ymm12, ymm0, ymm4 - vpmuldq ymm13, ymm0, ymm5 - vpmuldq ymm14, ymm0, ymm6 - vpmuldq ymm15, ymm0, ymm7 - vpmuldq ymm10, ymm1, ymm10 - vpmuldq ymm11, ymm1, ymm11 - vpmuldq ymm12, ymm1, ymm12 - vpmuldq ymm13, ymm1, ymm13 - vpmuldq ymm14, ymm1, ymm14 - vpmuldq ymm15, ymm1, ymm15 - vpsubq ymm2, ymm2, ymm10 - vpsubq ymm3, ymm3, ymm11 - vpsubq ymm4, ymm4, ymm12 - vpsubq ymm5, ymm5, ymm13 - vpsubq ymm6, ymm6, ymm14 - vpsubq ymm7, ymm7, ymm15 - vpsrlq ymm2, ymm2, 32 - vpsrlq ymm4, ymm4, 32 - vmovshdup ymm6, ymm6 - -// Store - vpblendd ymm2, ymm2, ymm3, 0xAA - vpblendd ymm4, ymm4, ymm5, 0xAA - vpblendd ymm6, ymm6, ymm7, 0xAA - vmovdqa [rdi], ymm2 - vmovdqa [rdi + 32], ymm4 - vmovdqa [rdi + 64], ymm6 - - add rdi, 96 - add rsi, 96 - add rdx, 96 - add eax, 1 - cmp eax, 10 - jb _looptop1 - - vmovdqa ymm2, [rsi] - vmovdqa ymm4, [rsi + 32] - vmovdqa ymm10, [rdx] - vmovdqa ymm12, [rdx + 32] - vpsrlq ymm3, ymm2, 32 - vpsrlq ymm5, ymm4, 32 - vmovshdup ymm11, ymm10 - vmovshdup ymm13, ymm12 - -// Multiply - vpmuldq ymm2, ymm2, ymm10 - vpmuldq ymm3, ymm3, ymm11 - vpmuldq ymm4, ymm4, ymm12 - vpmuldq ymm5, ymm5, ymm13 - -// Reduce - vpmuldq ymm10, ymm0, ymm2 - vpmuldq ymm11, ymm0, ymm3 - vpmuldq ymm12, ymm0, ymm4 - vpmuldq ymm13, ymm0, ymm5 - vpmuldq ymm10, ymm1, ymm10 - vpmuldq ymm11, ymm1, ymm11 - vpmuldq ymm12, ymm1, ymm12 - vpmuldq ymm13, ymm1, ymm13 - vpsubq ymm2, ymm2, ymm10 - vpsubq ymm3, ymm3, ymm11 - vpsubq ymm4, ymm4, ymm12 - vpsubq ymm5, ymm5, ymm13 - vpsrlq ymm2, ymm2, 32 - vmovshdup ymm4, ymm4 +.text +.balign 4 +.global MLD_ASM_NAMESPACE(pointwise_avx2) +MLD_ASM_FN_SYMBOL(pointwise_avx2) -// Store - vpblendd ymm2, ymm3, ymm2, 0x55 - vpblendd ymm4, ymm5, ymm4, 0x55 - vmovdqa [rdi], ymm2 - vmovdqa [rdi + 32], ymm4 + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax - ret +_looptop1: + vmovdqa (%rsi), %ymm2 + vmovdqa 0x20(%rsi), %ymm4 + vmovdqa 0x40(%rsi), %ymm6 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vmovdqa 0x40(%rdx), %ymm14 + vpsrlq $0x20, %ymm2, %ymm3 + vpsrlq $0x20, %ymm4, %ymm5 + vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7] + vpsrlq $0x20, %ymm10, %ymm11 + vpsrlq $0x20, %ymm12, %ymm13 + vmovshdup %ymm14, %ymm15 # ymm15 = ymm14[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm2, %ymm2 + vpmuldq %ymm11, %ymm3, %ymm3 + vpmuldq %ymm12, %ymm4, %ymm4 + vpmuldq %ymm13, %ymm5, %ymm5 + vpmuldq %ymm14, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm0, %ymm10 + vpmuldq %ymm3, %ymm0, %ymm11 + vpmuldq %ymm4, %ymm0, %ymm12 + vpmuldq %ymm5, %ymm0, %ymm13 + vpmuldq %ymm6, %ymm0, %ymm14 + vpmuldq %ymm7, %ymm0, %ymm15 + vpmuldq %ymm10, %ymm1, %ymm10 + vpmuldq %ymm11, %ymm1, %ymm11 + vpmuldq %ymm12, %ymm1, %ymm12 + vpmuldq %ymm13, %ymm1, 
%ymm13 + vpmuldq %ymm14, %ymm1, %ymm14 + vpmuldq %ymm15, %ymm1, %ymm15 + vpsubq %ymm10, %ymm2, %ymm2 + vpsubq %ymm11, %ymm3, %ymm3 + vpsubq %ymm12, %ymm4, %ymm4 + vpsubq %ymm13, %ymm5, %ymm5 + vpsubq %ymm14, %ymm6, %ymm6 + vpsubq %ymm15, %ymm7, %ymm7 + vpsrlq $0x20, %ymm2, %ymm2 + vpsrlq $0x20, %ymm4, %ymm4 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vpblendd $0xaa, %ymm7, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm6, 0x40(%rdi) + addq $0x60, %rdi + addq $0x60, %rsi + addq $0x60, %rdx + addl $0x1, %eax + cmpl $0xa, %eax + jb _looptop1 + vmovdqa (%rsi), %ymm2 + vmovdqa 0x20(%rsi), %ymm4 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm2, %ymm3 + vpsrlq $0x20, %ymm4, %ymm5 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm2, %ymm2 + vpmuldq %ymm11, %ymm3, %ymm3 + vpmuldq %ymm12, %ymm4, %ymm4 + vpmuldq %ymm13, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm10 + vpmuldq %ymm3, %ymm0, %ymm11 + vpmuldq %ymm4, %ymm0, %ymm12 + vpmuldq %ymm5, %ymm0, %ymm13 + vpmuldq %ymm10, %ymm1, %ymm10 + vpmuldq %ymm11, %ymm1, %ymm11 + vpmuldq %ymm12, %ymm1, %ymm12 + vpmuldq %ymm13, %ymm1, %ymm13 + vpsubq %ymm10, %ymm2, %ymm2 + vpsubq %ymm11, %ymm3, %ymm3 + vpsubq %ymm12, %ymm4, %ymm4 + vpsubq %ymm13, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0x55, %ymm2, %ymm3, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0x55, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + retq #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ */ diff --git a/mldsa/native/x86_64/src/pointwise_acc_l4.S b/mldsa/native/x86_64/src/pointwise_acc_l4.S index 3b2c45fd6..2fa61ab29 100644 --- a/mldsa/native/x86_64/src/pointwise_acc_l4.S +++ b/mldsa/native/x86_64/src/pointwise_acc_l4.S @@ -21,105 +21,111 @@ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" - - .intel_syntax noprefix - .text - -.macro pointwise off -// Load - vmovdqa ymm6, [rsi + \off] - vmovdqa ymm8, [rsi + \off + 32] - vmovdqa ymm10, [rdx + \off] - vmovdqa ymm12, [rdx + \off + 32] - vpsrlq ymm7, ymm6, 32 - vpsrlq ymm9, ymm8, 32 - vmovshdup ymm11, ymm10 - vmovshdup ymm13, ymm12 - -// Multiply - vpmuldq ymm6, ymm6, ymm10 - vpmuldq ymm7, ymm7, ymm11 - vpmuldq ymm8, ymm8, ymm12 - vpmuldq ymm9, ymm9, ymm13 -.endm - -.macro acc - vpaddq ymm2, ymm6, ymm2 - vpaddq ymm3, ymm7, ymm3 - vpaddq ymm4, ymm8, ymm4 - vpaddq ymm5, ymm9, ymm5 -.endm - /* - * void mld_pointwise_acc_l4_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) - * - * Pointwise multiplication with accumulation across multiple polynomial vectors - * - * Arguments: - * rdi: pointer to output polynomial c - * rsi: pointer to input polynomial a (multiple vectors) - * rdx: pointer to input polynomial b (multiple vectors) - * rcx: pointer to qdata constants + * WARNING: This file is auto-derived from the mldsa-native source file + * 
dev/x86_64/src/pointwise_acc_l4.S using scripts/simpasm. Do not modify it directly. */ - .balign 4 - .global MLD_ASM_NAMESPACE(pointwise_acc_l4_avx2) -MLD_ASM_FN_SYMBOL(pointwise_acc_l4_avx2) - -// Load constants - vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] - vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] - xor eax, eax -_looptop2: - pointwise 0 - -// Move - vmovdqa ymm2, ymm6 - vmovdqa ymm3, ymm7 - vmovdqa ymm4, ymm8 - vmovdqa ymm5, ymm9 - - pointwise 1024 - acc - - pointwise 2048 - acc - - pointwise 3072 - acc -// Reduce - vpmuldq ymm6, ymm0, ymm2 - vpmuldq ymm7, ymm0, ymm3 - vpmuldq ymm8, ymm0, ymm4 - vpmuldq ymm9, ymm0, ymm5 - vpmuldq ymm6, ymm1, ymm6 - vpmuldq ymm7, ymm1, ymm7 - vpmuldq ymm8, ymm1, ymm8 - vpmuldq ymm9, ymm1, ymm9 - vpsubq ymm2, ymm2, ymm6 - vpsubq ymm3, ymm3, ymm7 - vpsubq ymm4, ymm4, ymm8 - vpsubq ymm5, ymm5, ymm9 - vpsrlq ymm2, ymm2, 32 - vmovshdup ymm4, ymm4 - -// Store - vpblendd ymm2, ymm2, ymm3, 0xAA - vpblendd ymm4, ymm4, ymm5, 0xAA - - vmovdqa [rdi], ymm2 - vmovdqa [rdi + 32], ymm4 +.text +.balign 4 +.global MLD_ASM_NAMESPACE(pointwise_acc_l4_avx2) +MLD_ASM_FN_SYMBOL(pointwise_acc_l4_avx2) - add rsi, 64 - add rdx, 64 - add rdi, 64 - add eax, 1 - cmp eax, 16 - jb _looptop2 + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax - ret +_looptop2: + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa 0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, 
%ymm0, %ymm9 + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi + addl $0x1, %eax + cmpl $0x10, %eax + jb _looptop2 + retq #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ */ diff --git a/mldsa/native/x86_64/src/pointwise_acc_l5.S b/mldsa/native/x86_64/src/pointwise_acc_l5.S index 46d5e7222..c15ba0d8e 100644 --- a/mldsa/native/x86_64/src/pointwise_acc_l5.S +++ b/mldsa/native/x86_64/src/pointwise_acc_l5.S @@ -21,108 +21,127 @@ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" - - .intel_syntax noprefix - .text - -.macro pointwise off -// Load - vmovdqa ymm6, [rsi + \off] - vmovdqa ymm8, [rsi + \off + 32] - vmovdqa ymm10, [rdx + \off] - vmovdqa ymm12, [rdx + \off + 32] - vpsrlq ymm7, ymm6, 32 - vpsrlq ymm9, ymm8, 32 - vmovshdup ymm11, ymm10 - vmovshdup ymm13, ymm12 - -// Multiply - vpmuldq ymm6, ymm6, ymm10 - vpmuldq ymm7, ymm7, ymm11 - vpmuldq ymm8, ymm8, ymm12 - vpmuldq ymm9, ymm9, ymm13 -.endm - -.macro acc - vpaddq ymm2, ymm6, ymm2 - vpaddq ymm3, ymm7, ymm3 - vpaddq ymm4, ymm8, ymm4 - vpaddq ymm5, ymm9, ymm5 -.endm - /* - * void mld_pointwise_acc_l5_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) - * - * Pointwise multiplication with accumulation across multiple polynomial vectors - * - * Arguments: - * rdi: pointer to output polynomial c - * rsi: pointer to input polynomial a (multiple vectors) - * rdx: pointer to input polynomial b (multiple vectors) - * rcx: pointer to qdata constants + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/pointwise_acc_l5.S using scripts/simpasm. Do not modify it directly. 
*/ - .balign 4 - .global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2) -MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2) -// Load constants - vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] - vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] - - xor eax, eax -_looptop2: - pointwise 0 -// Move - vmovdqa ymm2, ymm6 - vmovdqa ymm3, ymm7 - vmovdqa ymm4, ymm8 - vmovdqa ymm5, ymm9 - - pointwise 1024 - acc - - pointwise 2048 - acc - - pointwise 3072 - acc - - pointwise 4096 - acc - -// Reduce - vpmuldq ymm6, ymm0, ymm2 - vpmuldq ymm7, ymm0, ymm3 - vpmuldq ymm8, ymm0, ymm4 - vpmuldq ymm9, ymm0, ymm5 - vpmuldq ymm6, ymm1, ymm6 - vpmuldq ymm7, ymm1, ymm7 - vpmuldq ymm8, ymm1, ymm8 - vpmuldq ymm9, ymm1, ymm9 - vpsubq ymm2, ymm2, ymm6 - vpsubq ymm3, ymm3, ymm7 - vpsubq ymm4, ymm4, ymm8 - vpsubq ymm5, ymm5, ymm9 - vpsrlq ymm2, ymm2, 32 - vmovshdup ymm4, ymm4 - -// Store - vpblendd ymm2, ymm2, ymm3, 0xAA - vpblendd ymm4, ymm4, ymm5, 0xAA - - vmovdqa [rdi], ymm2 - vmovdqa [rdi + 32], ymm4 +.text +.balign 4 +.global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2) +MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2) - add rsi, 64 - add rdx, 64 - add rdi, 64 - add eax, 1 - cmp eax, 16 - jb _looptop2 + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax - ret +_looptop2: + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa 0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1000(%rsi), %ymm6 + vmovdqa 0x1020(%rsi), %ymm8 + vmovdqa 0x1000(%rdx), %ymm10 + vmovdqa 0x1020(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq 
$0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, %ymm0, %ymm9 + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi + addl $0x1, %eax + cmpl $0x10, %eax + jb _looptop2 + retq #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ */ diff --git a/mldsa/native/x86_64/src/pointwise_acc_l7.S b/mldsa/native/x86_64/src/pointwise_acc_l7.S index 0c375efc8..d4ccb0af6 100644 --- a/mldsa/native/x86_64/src/pointwise_acc_l7.S +++ b/mldsa/native/x86_64/src/pointwise_acc_l7.S @@ -21,114 +21,159 @@ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" - - .intel_syntax noprefix - .text - -.macro pointwise off -// Load - vmovdqa ymm6, [rsi + \off] - vmovdqa ymm8, [rsi + \off + 32] - vmovdqa ymm10, [rdx + \off] - vmovdqa ymm12, [rdx + \off + 32] - vpsrlq ymm7, ymm6, 32 - vpsrlq ymm9, ymm8, 32 - vmovshdup ymm11, ymm10 - vmovshdup ymm13, ymm12 - -// Multiply - vpmuldq ymm6, ymm6, ymm10 - vpmuldq ymm7, ymm7, ymm11 - vpmuldq ymm8, ymm8, ymm12 - vpmuldq ymm9, ymm9, ymm13 -.endm - -.macro acc - vpaddq ymm2, ymm6, ymm2 - vpaddq ymm3, ymm7, ymm3 - vpaddq ymm4, ymm8, ymm4 - vpaddq ymm5, ymm9, ymm5 -.endm - /* - * void mld_pointwise_acc_l7_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) - * - * Pointwise multiplication with accumulation across multiple polynomial vectors - * - * Arguments: - * rdi: pointer to output polynomial c - * rsi: pointer to input polynomial a (multiple vectors) - * rdx: pointer to input polynomial b (multiple vectors) - * rcx: pointer to qdata constants + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/pointwise_acc_l7.S using scripts/simpasm. Do not modify it directly. 
*/ - .balign 4 - .global MLD_ASM_NAMESPACE(pointwise_acc_l7_avx2) -MLD_ASM_FN_SYMBOL(pointwise_acc_l7_avx2) - -// Load constants - vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] - vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] - - xor eax, eax -_looptop2: - pointwise 0 -// Move - vmovdqa ymm2, ymm6 - vmovdqa ymm3, ymm7 - vmovdqa ymm4, ymm8 - vmovdqa ymm5, ymm9 - pointwise 1024 - acc - - pointwise 2048 - acc - - pointwise 3072 - acc - - pointwise 4096 - acc - - pointwise 5120 - acc - - pointwise 6144 - acc - -// Reduce - vpmuldq ymm6, ymm0, ymm2 - vpmuldq ymm7, ymm0, ymm3 - vpmuldq ymm8, ymm0, ymm4 - vpmuldq ymm9, ymm0, ymm5 - vpmuldq ymm6, ymm1, ymm6 - vpmuldq ymm7, ymm1, ymm7 - vpmuldq ymm8, ymm1, ymm8 - vpmuldq ymm9, ymm1, ymm9 - vpsubq ymm2, ymm2, ymm6 - vpsubq ymm3, ymm3, ymm7 - vpsubq ymm4, ymm4, ymm8 - vpsubq ymm5, ymm5, ymm9 - vpsrlq ymm2, ymm2, 32 - vmovshdup ymm4, ymm4 - -// Store - vpblendd ymm2, ymm2, ymm3, 0xAA - vpblendd ymm4, ymm4, ymm5, 0xAA - - vmovdqa [rdi], ymm2 - vmovdqa [rdi + 32], ymm4 +.text +.balign 4 +.global MLD_ASM_NAMESPACE(pointwise_acc_l7_avx2) +MLD_ASM_FN_SYMBOL(pointwise_acc_l7_avx2) - add rsi, 64 - add rdx, 64 - add rdi, 64 - add eax, 1 - cmp eax, 16 - jb _looptop2 + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax - ret +_looptop2: + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa 0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1000(%rsi), %ymm6 + vmovdqa 0x1020(%rsi), %ymm8 + vmovdqa 0x1000(%rdx), %ymm10 + vmovdqa 0x1020(%rdx), 
%ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1400(%rsi), %ymm6 + vmovdqa 0x1420(%rsi), %ymm8 + vmovdqa 0x1400(%rdx), %ymm10 + vmovdqa 0x1420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1800(%rsi), %ymm6 + vmovdqa 0x1820(%rsi), %ymm8 + vmovdqa 0x1800(%rdx), %ymm10 + vmovdqa 0x1820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, %ymm0, %ymm9 + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi + addl $0x1, %eax + cmpl $0x10, %eax + jb _looptop2 + retq #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ */ diff --git a/scripts/autogen b/scripts/autogen index 3654cb9a6..6c8e4aef0 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -1244,6 +1244,252 @@ class BibliographyEntry: return authors +def update_via_simpasm( + infile_full, + outdir, + outfile=None, + cflags=None, + preserve_header=True, + dry_run=False, + force_cross=False, +): + status_update("simpasm", infile_full) + + _, infile = os.path.split(infile_full) + if outfile is None: + outfile = infile + outfile_full = os.path.join(outdir, outfile) + + # Check if we need to use a cross-compiler + if "aarch64" in infile_full: + source_arch = "aarch64" + elif "x86_64" in infile_full: + source_arch = "x86_64" + else: + raise Exception(f"Could not detect architecture of source file {infile_full}.") + # Check native architecture + if platform.machine().lower() in ["arm64", "aarch64"]: + native_arch = "aarch64" + else: + native_arch = "x86_64" + + if native_arch != source_arch: + cross_prefix = f"{source_arch}-unknown-linux-gnu-" + cross_gcc = cross_prefix + "gcc" + # Check if cross-compiler is present + if shutil.which(cross_gcc) is None: + if force_cross is False: + 
return + raise Exception(f"Could not find cross toolchain {cross_prefix}") + else: + cross_prefix = None + + with tempfile.NamedTemporaryFile(suffix=".S") as tmp: + try: + # Determine architecture from filename + arch = "aarch64" if "aarch64" in infile_full else "x86_64" + + # TODO: Temporarily remove the "--cfify" option; add it back once the CFI script is added. + cmd = [ + "./scripts/simpasm", + "--objdump=llvm-objdump", + # "--cfify", + "--arch=" + arch, + "-i", + infile_full, + "-o", + tmp.name, + ] + if cross_prefix is not None: + # Stick with llvm-objdump for disassembly + cmd += ["--cc", cross_prefix + "gcc"] + cmd += ["--nm", cross_prefix + "nm"] + if cflags is not None: + cmd += [f'--cflags="{cflags}"'] + if preserve_header is True: + cmd += ["-p"] + r = subprocess.run( + cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Command failed: {' '.join(cmd)}") + print(f"Exit code: {e.returncode}") + print(f"stderr: {e.stderr}") + raise Exception("Failed to run simpasm") from e + tmp.seek(0) + new_contents = tmp.read().decode() + + update_file(outfile_full, new_contents, dry_run=dry_run) + + +def update_via_copy(infile_full, outfile_full, dry_run=False, transform=None): + status_update("copy", f"{infile_full} -> {outfile_full}") + + with open(infile_full, "r") as f: + content = f.read() + + if transform is not None: + content = transform(content) + + update_file(outfile_full, content, dry_run=dry_run) + + +def update_via_remove(filename, dry_run=False): + if dry_run is True: + print( + f"Autogenerated file {filename} needs removing. Have you called scripts/autogen?", + file=sys.stderr, + ) + exit(1) + + # Remove the file + os.remove(filename) + + +def synchronize_file( + f, in_dir, out_dir, dry_run=False, delete=False, no_simplify=False, **kwargs +): + + # Only synchronize sources, but not README.md, Makefile and so on + extensions = (".c", ".h", ".i", ".inc", ".S") + + if not f.endswith(extensions): + return None + + basename = os.path.basename(f) + + if delete is True: + return basename + + if no_simplify is False and f.endswith(".S"): + update_via_simpasm(f, out_dir, dry_run=dry_run, **kwargs) + else: + # Update via copy + _, infile = os.path.split(f) + outfile_full = os.path.join(out_dir, infile) + # The header guards will also be checked later, but if we + # don't do it here, the dry-run would fail because of a + # mismatching intermediate file + if f.endswith(".h"): + transform = lambda c: adjust_header_guard_for_filename(c, outfile_full) + else: + transform = None + update_via_copy(f, outfile_full, dry_run=dry_run, transform=transform) + + return basename + + +def synchronize_backend( + in_dir, out_dir, dry_run=False, delete=False, no_simplify=False, **kwargs +): + copied = [] + + with ThreadPoolExecutor() as executor: + pool_results = list( + executor.map( + partial( + synchronize_file, + in_dir=in_dir, + out_dir=out_dir, + dry_run=dry_run, + delete=delete, + no_simplify=no_simplify, + **kwargs, + ), + get_files(os.path.join(in_dir, "*")), + ) + ) + + copied = [r for r in pool_results if r is not None] + + if delete is False: + return + + # Check for files in the target directory that have not been copied + for f in get_files(os.path.join(out_dir, "*")): + if os.path.basename(f) in copied: + continue + # Otherwise, remove it + update_via_remove(f, dry_run=dry_run) + + +def synchronize_backends( + *, dry_run=False, force_cross=False, clean=False, delete=False, no_simplify=False +): + if clean is False:
ty = "opt" + else: + ty = "clean" + + if delete is False: + # We may switch the AArch64 arithmetic backend, so adjust the metadata file + update_via_copy( + f"dev/aarch64_{ty}/meta.h", + "mldsa/native/aarch64/meta.h", + dry_run=dry_run, + transform=lambda c: adjust_header_guard_for_filename( + c, "mldsa/native/aarch64/meta.h" + ), + ) + + update_via_copy( + "dev/x86_64/meta.h", + "mldsa/native/x86_64/meta.h", + dry_run=dry_run, + transform=lambda c: adjust_header_guard_for_filename( + c, "mldsa/native/x86_64/meta.h" + ), + ) + + synchronize_backend( + f"dev/aarch64_{ty}/src", + "mldsa/native/aarch64/src", + dry_run=dry_run, + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags="-Imldsa/native/aarch64/src", + ) + synchronize_backend( + "dev/fips202/aarch64/src", + "mldsa/fips202/native/aarch64/src", + dry_run=dry_run, + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags="-Imldsa/fips202/native/aarch64/src -march=armv8.4-a+sha3", + ) + synchronize_backend( + "dev/fips202/aarch64", + "mldsa/fips202/native/aarch64", + dry_run=dry_run, + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags="-Imldsa/fips202/native/aarch64 -march=armv8.4-a+sha3", + ) + synchronize_backend( + "dev/x86_64/src", + "mldsa/native/x86_64/src", + dry_run=dry_run, + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + # Turn off control-flow protection (CET) explicitly. Newer versions of + # clang turn it on by default and insert endbr64 instructions at every + # global symbol. + # We insert the endbr64 instruction manually via the MLD_ASM_FN_SYMBOL + # macro. + # This leads to duplicate endbr64 instructions causing a failure when + # comparing the object code before and after simplification.
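+ # Illustrative sketch only (not literal build output; symbol name abbreviated): + # pointwise_avx2: + # endbr64 ; emitted via MLD_ASM_FN_SYMBOL + # endbr64 ; inserted by the toolchain when CET is enabled + # Hence -fcf-protection=none below.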
+ cflags="-Imldsa/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", + ) + + def gen_markdown_citations_for(filename, bibliography, dry_run=False): # Skip BIBLIOGRAPHY.md @@ -1473,6 +1719,9 @@ def _main(): formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument("--dry-run", default=False, action="store_true") + parser.add_argument("--aarch64-clean", default=True, action="store_true") + parser.add_argument("--no-simplify", default=False, action="store_true") + parser.add_argument("--force-cross", default=False, action="store_true") args = parser.parse_args() @@ -1489,11 +1738,29 @@ def _main(): gen_avx2_zeta_file(args.dry_run) gen_avx2_rej_uniform_table(args.dry_run) high_level_status("Generated zeta and lookup tables") + + synchronize_backends( + dry_run=args.dry_run, + clean=args.aarch64_clean, + no_simplify=args.no_simplify, + force_cross=args.force_cross, + ) + high_level_status("Synchronized backends") + gen_header_guards(args.dry_run) high_level_status("Generated header guards") gen_preprocessor_comments(args.dry_run) high_level_status("Generated preprocessor comments") + synchronize_backends( + dry_run=args.dry_run, + clean=args.aarch64_clean, + delete=True, + force_cross=args.force_cross, + no_simplify=args.no_simplify, + ) + high_level_status("Completed final backend synchronization") + if __name__ == "__main__": _main() diff --git a/scripts/lint b/scripts/lint index 432da1459..7ad2db191 100755 --- a/scripts/lint +++ b/scripts/lint @@ -167,7 +167,9 @@ check-spdx() for file in $(git ls-files -- "*.[chsSi]" | grep "^dev/\|^mldsa/"); do # TODO: Temporarily exclude AArch64 (i)NTT pending license resolution (see issue #381) if [[ $file == "mldsa/native/aarch64/src/ntt.S" ]] || - [[ $file == "mldsa/native/aarch64/src/intt.S" ]]; then + [[ $file == "mldsa/native/aarch64/src/intt.S" ]] || + [[ $file == "dev/aarch64_clean/src/ntt.S" ]] || + [[ $file == "dev/aarch64_clean/src/intt.S" ]]; then continue fi # Ignore symlinks diff --git a/scripts/simpasm b/scripts/simpasm new file mode 100755 index 000000000..1826711ad --- /dev/null +++ b/scripts/simpasm @@ -0,0 +1,446 @@ +#!/usr/bin/env python3 +# Copyright (c) The mlkem-native project authors +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +import subprocess +import argparse +import logging +import pathlib +import tempfile +import platform +import sys +import os +import re + + +def patchup_disasm(asm, cfify=False): + asm = asm.split("\n") + indentation = 8 + + def decode_label(asm_line): + r = re.search(r"^\s*[0-9a-fA-F]+\s*<([a-zA-Z0-9_]+)>:\s*$", asm_line) + if r is None: + return None + return r.group(1) + + def make_label(lbl): + if cfify: + return "L" + lbl + ":" + return lbl + ":" + + # Find first label + for i, l in enumerate(asm): + if decode_label(l) is not None: + break + + asm = asm[i + 1 :] + + def gen(asm): + for l in asm: + if l.strip() == "": + yield "" + continue + lbl = decode_label(l) + # Re-format labels as assembly labels + if lbl is not None: + yield make_label(lbl) + continue + # Drop comments + l = l.split(";")[0] + # Re-format references to labels + # Those are assumed to have the form `0xDEADBEEF