diff --git a/dev/gen_arm_aors.jl b/dev/gen_arm_aors.jl new file mode 100644 index 0000000000..465b90e64a --- /dev/null +++ b/dev/gen_arm_aors.jl @@ -0,0 +1,94 @@
#
#   Copyright (C) 2024 Albin Ahlbäck
#
#   This file is part of FLINT.
#
#   FLINT is free software: you can redistribute it and/or modify it under
#   the terms of the GNU Lesser General Public License (LGPL) as published
#   by the Free Software Foundation; either version 3 of the License, or
#   (at your option) any later version.  See <https://www.gnu.org/licenses/>.
#

# Generating routines for r <- a OP b, where OP is either + or -.
#
# This generation was constructed with processors with Apple silicon in mind.
# Processors decoding fewer than 6 operations per cycle, or with few load and
# store units, may have worse performance.

# Symbolic names of the three pointer arguments; M4 maps them to registers.
r = "rp"
a = "ap"
b = "bp"

# Memory operand addressing limb number `ix` of the respective operand.
rp(ix::Int) = "[$r,#$ix*8]"
ap(ix::Int) = "[$a,#$ix*8]"
bp(ix::Int) = "[$b,#$ix*8]"

sx = "sx" # Return value for carry or borrow
CC = "CC" # Condition-code placeholder consumed by `cset`

sp = ["s$ix" for ix in 0:14] # Scrap registers

"""
    aors(n::Int) -> String

Return the text of one hard-coded `n`-limb routine computing `r <- a OP b`.

The text is assembly that should be preprocessed by M4: `OP`, `OPC`, `CC`,
`flint_mpn_aors`, the pointer names and the scrap registers `s0`, ..., `s14`
are placeholders, not real mnemonics or registers.

Limbs are processed two at a time with `ldp`/`stp`; an odd trailing limb is
handled with `ldr`/`str`.  Every pair takes the next four scrap registers,
wrapping around once the pool is exhausted.

# Throws
- `ArgumentError` if `n < 2`.  The generator always starts with a two-limb
  load/store, so a smaller `n` would produce code that touches memory outside
  the operands.
"""
function aors(n::Int)
    n >= 2 || throw(ArgumentError("aors requires n >= 2, got n = $n"))

    io = IOBuffer()

    # One instruction per line: "\t<mnemonic>\t<operand>, <operand>, ...".
    emit(op::String, args::String...) = println(io, '\t', op, '\t', join(args, ", "))

    # k-th scrap register, counted cyclically over the pool.
    nregs = length(sp)
    reg(k::Int) = sp[mod(k, nregs) + 1]

    println(io, "PROLOGUE(flint_mpn_aors($n))")

    npairs = n ÷ 2
    for p in 0:(npairs - 1)
        base = 4 * p
        alo, ahi = reg(base), reg(base + 2)     # limbs of a, overwritten by the result
        blo, bhi = reg(base + 1), reg(base + 3) # limbs of b

        emit("ldp", alo, ahi, ap(2 * p))
        emit("ldp", blo, bhi, bp(2 * p))
        # Only the very first limb starts a fresh carry chain.
        emit(p == 0 ? "OP" : "OPC", alo, alo, blo)
        emit("OPC", ahi, ahi, bhi)
        emit("stp", alo, ahi, rp(2 * p))
    end

    if isodd(n)
        # The leftover limb uses the two registers following the last pair.
        base = 4 * npairs
        alast, blast = reg(base), reg(base + 1)

        emit("ldr", alast, ap(n - 1))
        emit("ldr", blast, bp(n - 1))
        emit("OPC", alast, alast, blast)
        emit("str", alast, rp(n - 1))
    end

    emit("cset", sx, CC)
    print(io, "\tret\nEPILOGUE()\n")

    return String(take!(io))
end

"""
    print_all_aors([io::IO = stdout,] nmax::Int = 16)

Print the routines `aors(2)`, ..., `aors(nmax)` to `io`, each followed by a
blank line.  Nothing is printed when `nmax < 2`.
"""
function print_all_aors(io::IO, nmax::Int = 16)
    for n in 2:nmax
        println(io, aors(n))
    end
end

print_all_aors(nmax::Int = 16) = print_all_aors(stdout, nmax)
diff --git a/dev/gen_x86_aors.jl b/dev/gen_x86_aors.jl new file mode 100644 index 0000000000..0db9110cbd --- /dev/null +++ b/dev/gen_x86_aors.jl @@ -0,0 +1,83 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. See . +# + +# Generating routines for r <- a OP b, where OP is either + or -. 
#
# This generation was constructed with processors with decent schedulers in
# mind.

# Symbolic names of the three pointer arguments; M4 maps them to registers.
r = "rp"
a = "ap"
b = "bp"

# Memory operand addressing limb number `ix` of the respective operand.
rp(ix::Int) = "$ix*8($r)"
ap(ix::Int) = "$ix*8($a)"
bp(ix::Int) = "$ix*8($b)"

sx = "sx" # Return value for carry or borrow, i.e. %rax

# Wrap a register name in the M4 macros selecting its 32-bit / 8-bit part.
R32(sx::String) = "R32($sx)"
R8(sx::String) = "R8($sx)"

sp = ["s$ix" for ix in 0:4] # Scrap registers

"""
    aors(n::Int) -> String

Return the text of one hard-coded `n`-limb routine computing `r <- a OP b`.

The text is assembly that should be preprocessed by M4: `OP`, `OPC`,
`flint_mpn_aors`, `R32`, `R8`, the pointer names and the scrap registers are
placeholders, not real mnemonics or registers.

The load of limb `i + 1` of `a` is emitted before the `OP`/`OPC` and store of
limb `i`.  Limb `i` lives in scrap register number `i` counted cyclically over
the pool.

# Throws
- `ArgumentError` if `n < 2`.  The generator unconditionally loads limbs 0 and
  1, so a smaller `n` would produce code that touches memory outside the
  operands.
"""
function aors(n::Int)
    n >= 2 || throw(ArgumentError("aors requires n >= 2, got n = $n"))

    io = IOBuffer()

    # One instruction per line: "\t<mnemonic>\t<operand>, <operand>".
    emit(op::String, args::String...) = println(io, '\t', op, '\t', join(args, ", "))

    # Scrap register holding limb number k.
    nregs = length(sp)
    reg(k::Int) = sp[mod(k, nregs) + 1]

    print(io, "\tALIGN(16)\nPROLOGUE(flint_mpn_aors($n))\n")

    emit("mov", ap(0), reg(0))
    emit("mov", ap(1), reg(1))
    # Zero the return register up front: `mov` leaves the flags alone, so only
    # OP/OPC touch the carry from here until the final `setc`.
    emit("xor", R32(sx), R32(sx))
    emit("OP", bp(0), reg(0))
    emit("mov", reg(0), rp(0))

    for ix in 1:(n - 2)
        emit("mov", ap(ix + 1), reg(ix + 1))
        emit("OPC", bp(ix), reg(ix))
        emit("mov", reg(ix), rp(ix))
    end

    emit("OPC", bp(n - 1), reg(n - 1))
    emit("mov", reg(n - 1), rp(n - 1))
    emit("setc", R8(sx))

    print(io, "\tret\nEPILOGUE()\n")

    return String(take!(io))
end

"""
    print_all_aors([io::IO = stdout,] nmax::Int = 16)

Print the routines `aors(2)`, ..., `aors(nmax)` to `io`, each followed by a
blank line.  Nothing is printed when `nmax < 2`.
"""
function print_all_aors(io::IO, nmax::Int = 16)
    for n in 2:nmax
        println(io, aors(n))
    end
end

print_all_aors(nmax::Int = 16) = print_all_aors(stdout, nmax)
diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 90fc8e6436..9d4ba63c0c 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -462,25 +462,34 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); /* multiplication (general) **************************************************/ +/* NOTE: This is getting a bit messy. How can we clean this up? 
*/ #if FLINT_HAVE_ASSEMBLY_x86_64_adx +# define FLINT_MPN_AORS_FUNC_TAB_WIDTH 17 # define FLINT_MPN_MUL_FUNC_TAB_WIDTH 17 # define FLINT_MPN_SQR_FUNC_TAB_WIDTH 14 +# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) # define FLINT_HAVE_MUL_FUNC(n, m) ((n) <= 16) # define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= 16) # define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH) +# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) (flint_mpn_add_func_tab[n](rp, xp, yp)) +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) (flint_mpn_sub_func_tab[n](rp, xp, yp)) # define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_tab[xn][yn](rp, xp, yp)) # define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_n_func_tab[n](rp, xp, yp)) # define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp)) #elif FLINT_HAVE_ASSEMBLY_armv8 +# define FLINT_MPN_AORS_FUNC_TAB_WIDTH 17 # define FLINT_MPN_MUL_FUNC_N_TAB_WIDTH 15 # define FLINT_MPN_SQR_FUNC_TAB_WIDTH 9 +# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) # define FLINT_HAVE_MUL_FUNC(n, m) FLINT_HAVE_MUL_N_FUNC(n) # define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= FLINT_MPN_MUL_FUNC_N_TAB_WIDTH) # define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH) +# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) (flint_mpn_add_func_tab[n](rp, xp, yp)) +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) (flint_mpn_sub_func_tab[n](rp, xp, yp)) # define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_n_tab[xn](rp, xp, yp, yn)) # define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_func_n_tab[n](rp, xp, yp, n)) # define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp)) @@ -506,6 +515,16 @@ typedef mp_limb_t (* flint_mpn_mul_func_t)(mp_ptr, mp_srcptr, mp_srcptr); typedef mp_limb_t (* flint_mpn_mul_func_n_t)(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); typedef mp_limb_t (* flint_mpn_sqr_func_t)(mp_ptr, mp_srcptr); +#ifdef FLINT_MPN_AORS_FUNC_TAB_WIDTH +# define FLINT_USE_AORS_FUNC_TAB 
1 +FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_add_func_tab[]; +FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_sub_func_tab[]; +#else +# define FLINT_HAVE_AORS_FUNC(n) 0 +# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) 0 +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) 0 +#endif + #ifdef FLINT_MPN_MUL_FUNC_N_TAB_WIDTH FLINT_DLL extern const flint_mpn_mul_func_n_t flint_mpn_mul_func_n_tab[]; #else @@ -522,6 +541,28 @@ mp_limb_t _flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_si void _flint_mpn_mul_n(mp_ptr r, mp_srcptr x, mp_srcptr y, mp_size_t n); mp_limb_t _flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n); +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_add_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORS_FUNC(n)) + return FLINT_MPN_ADD_HARD(rp, xp, yp, n); + else + return mpn_add_n(rp, xp, yp, n); +} + +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_sub_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORS_FUNC(n)) + return FLINT_MPN_SUB_HARD(rp, xp, yp, n); + else + return mpn_sub_n(rp, xp, yp, n); +} + MPN_EXTRAS_INLINE mp_limb_t flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_size_t yn) { diff --git a/src/mpn_extras/aors_n.c b/src/mpn_extras/aors_n.c new file mode 100644 index 0000000000..ee9231aecd --- /dev/null +++ b/src/mpn_extras/aors_n.c @@ -0,0 +1,88 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "mpn_extras.h" + +#define DECL_AORS(n) _DECL_AORS(n) +#define _DECL_AORS(n) \ +mp_limb_t flint_mpn_add_##n(mp_ptr, mp_srcptr, mp_srcptr); \ +mp_limb_t flint_mpn_sub_##n(mp_ptr, mp_srcptr, mp_srcptr) + +#define ADD(n) _ADD(n) +#define _ADD(n) flint_mpn_add_##n +#define SUB(n) _SUB(n) +#define _SUB(n) flint_mpn_sub_##n + +/* Herein we assume that x86 and ARM are equivalent. */ +#if FLINT_HAVE_ASSEMBLY_x86_64_adx || FLINT_HAVE_ASSEMBLY_armv8 +DECL_AORS(1); +DECL_AORS(2); +DECL_AORS(3); +DECL_AORS(4); +DECL_AORS(5); +DECL_AORS(6); +DECL_AORS(7); +DECL_AORS(8); +DECL_AORS(9); +DECL_AORS(10); +DECL_AORS(11); +DECL_AORS(12); +DECL_AORS(13); +DECL_AORS(14); +DECL_AORS(15); +DECL_AORS(16); + +/* TODO: Should probably rename these types so to not have two different types. + * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */ +const flint_mpn_mul_func_t flint_mpn_add_func_tab[] = +{ + NULL, + ADD(1), + ADD(2), + ADD(3), + ADD(4), + ADD(5), + ADD(6), + ADD(7), + ADD(8), + ADD(9), + ADD(10), + ADD(11), + ADD(12), + ADD(13), + ADD(14), + ADD(15), + ADD(16) +}; + +const flint_mpn_mul_func_t flint_mpn_sub_func_tab[] = +{ + NULL, + SUB(1), + SUB(2), + SUB(3), + SUB(4), + SUB(5), + SUB(6), + SUB(7), + SUB(8), + SUB(9), + SUB(10), + SUB(11), + SUB(12), + SUB(13), + SUB(14), + SUB(15), + SUB(16) +}; +#else +typedef int this_file_is_empty; +#endif diff --git a/src/mpn_extras/arm64/aors_hard.asm b/src/mpn_extras/arm64/aors_hard.asm new file mode 100644 index 0000000000..ed9cc2a0e0 --- /dev/null +++ b/src/mpn_extras/arm64/aors_hard.asm @@ -0,0 +1,492 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . 
+dnl + +include(`config.m4') + +dnl Everything from n = 2 and onwards is generated by +dnl $topdir/dev/gen_arm_aors.jl. +dnl +dnl This generation was constructed with processors with Apple silicon in mind. +dnl Processors decoding less than 6 operations per cycle, or few store and load +dnl units may have worse performance. + +define(`rp', `x0') +define(`ap', `x1') +define(`bp', `x2') + +define(`sx', `x0') C Beware that this is synonymous with rp +define(`s0', `x3') +define(`s1', `x4') +define(`s2', `x5') +define(`s3', `x6') +define(`s4', `x7') +define(`s5', `x8') +define(`s6', `x9') +define(`s7', `x10') +define(`s8', `x11') +define(`s9', `x12') +define(`s10', `x13') +define(`s11', `x14') +define(`s12', `x15') +define(`s13', `x16') +define(`s14', `x17') + +define(ALL_AORS,` +PROLOGUE(flint_mpn_aors(1)) + ldr s0, [ap,#0*8] + ldr s1, [bp,#0*8] + OP s0, s0, s1 + str s0, [rp,#0*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(2)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(3)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldr s4, [ap,#2*8] + ldr s5, [bp,#2*8] + OPC s4, s4, s5 + str s4, [rp,#2*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(4)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(5)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldr s8, [ap,#4*8] + ldr s9, [bp,#4*8] + OPC s8, s8, s9 + str s8, [rp,#4*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(6)) + ldp 
s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(7)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldr s12, [ap,#6*8] + ldr s13, [bp,#6*8] + OPC s12, s12, s13 + str s12, [rp,#6*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(8)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(9)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldr s1, [ap,#8*8] + ldr s2, [bp,#8*8] + OPC s1, s1, s2 + str s1, [rp,#8*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(10)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + 
OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(11)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldr s5, [ap,#10*8] + ldr s6, [bp,#10*8] + OPC s5, s5, s6 + str s5, [rp,#10*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(12)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(13)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + 
OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldr s9, [ap,#12*8] + ldr s10, [bp,#12*8] + OPC s9, s9, s10 + str s9, [rp,#12*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(14)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(15)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + 
stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + ldr s13, [ap,#14*8] + ldr s14, [bp,#14*8] + OPC s13, s13, s14 + str s13, [rp,#14*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(16)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + ldp s13, s0, [ap,#14*8] + ldp s14, s1, [bp,#14*8] + OPC s13, s13, s14 + OPC s0, s0, s1 + stp s13, s0, [rp,#14*8] + cset sx, CC + ret +EPILOGUE() +') + +define(`flint_mpn_aors',`flint_mpn_add_$1') +define(`OP',`adds') +define(`OPC',`adcs') +define(`CC',`cs') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') + +define(`flint_mpn_aors',`flint_mpn_sub_$1') +define(`OP',`subs') +define(`OPC',`sbcs') +define(`CC',`cc') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index f39d688fdd..171a9b7342 100644 --- a/src/mpn_extras/test/main.c +++ b/src/mpn_extras/test/main.c @@ -12,6 +12,7 @@ /* Include functions 
*********************************************************/ #include "t-2add_n_inplace.c" +#include "t-aors_n.c" #include "t-divides.c" #include "t-divrem_preinv1.c" #include "t-divrem_preinvn.c" @@ -38,6 +39,7 @@ test_struct tests[] = { TEST_FUNCTION(flint_mpn_2add_n_inplace), + TEST_FUNCTION(flint_mpn_aors_n), TEST_FUNCTION(flint_mpn_divides), TEST_FUNCTION(flint_mpn_divrem_preinv1), TEST_FUNCTION(flint_mpn_divrem_preinvn), diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c new file mode 100644 index 0000000000..0af210d94c --- /dev/null +++ b/src/mpn_extras/test/t-aors_n.c @@ -0,0 +1,85 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "mpn_extras.h" + +#define N_MIN 1 +#define N_MAX (FLINT_MPN_AORS_FUNC_TAB_WIDTH - 1) +#define N_STOR (FLINT_MPN_AORS_FUNC_TAB_WIDTH + 10) + +TEST_FUNCTION_START(flint_mpn_aors_n, state) +{ +#if FLINT_USE_AORS_FUNC_TAB + slong ix; + + for (ix = 0; ix < 10000 * flint_test_multiplier(); ix++) + { + int result; + int type; + mp_limb_t cf, cg; + mp_size_t n; + mp_ptr fp, gp, xp, yp; + + n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); + if (n_randint(state, 1 << 10) == UWORD(0)) + n += N_STOR; + + fp = flint_malloc(sizeof(mp_limb_t) * n); + gp = flint_malloc(sizeof(mp_limb_t) * n); + xp = flint_malloc(sizeof(mp_limb_t) * n); + yp = flint_malloc(sizeof(mp_limb_t) * n); + + flint_mpn_rrandom(xp, state, n); + flint_mpn_rrandom(yp, state, n); + + type = n_randint(state, 2); + + if (type == 0) + { + cf = flint_mpn_add_n(fp, xp, yp, n); + cg = mpn_add_n(gp, xp, yp, n); + } + else + { + cf = flint_mpn_sub_n(fp, xp, yp, n); + cg = mpn_sub_n(gp, xp, yp, n); + 
} + + result = (cf == cg && mpn_cmp(fp, gp, n) == 0); + if (!result) + TEST_FUNCTION_FAIL( + "%s:\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "FLINT (cy = %wu): %{ulong*}\n" + "GMP (cy = %wu): %{ulong*}\n", + type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", + ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + + flint_free(fp); + flint_free(gp); + flint_free(xp); + flint_free(yp); + } + + TEST_FUNCTION_END(state); +#else + TEST_FUNCTION_END_SKIPPED(state); +#endif +} + +#undef N_MIN +#undef N_MAX +#undef N_STOR diff --git a/src/mpn_extras/x86_64/broadwell/aors_hard.asm b/src/mpn_extras/x86_64/broadwell/aors_hard.asm new file mode 100644 index 0000000000..390ee036ec --- /dev/null +++ b/src/mpn_extras/x86_64/broadwell/aors_hard.asm @@ -0,0 +1,565 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +dnl Everything from n = 2 and onwards is generated by +dnl $topdir/dev/gen_x86_aors.jl. 
+ +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp', `%rdx') + +define(`sx', `%rax') +define(`s0', `%rcx') +define(`s1', `%r8') +define(`s2', `%r9') +define(`s3', `%r10') +define(`s4', `%r11') + +define(ALL_AORS,` + ALIGN(16) +PROLOGUE(flint_mpn_aors(1)) + mov 0*8(ap), s0 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(2)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(3)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(4)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(5)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(6)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(7)) + mov 0*8(ap), s0 + 
mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(8)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(9)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(10)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 
8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(11)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(12)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(13)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 
5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(14)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(15)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + 
OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + mov 14*8(ap), s4 + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + OPC 14*8(bp), s4 + mov s4, 14*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(16)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + mov 14*8(ap), s4 + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + mov 15*8(ap), s0 + OPC 14*8(bp), s4 + mov s4, 14*8(rp) + OPC 15*8(bp), s0 + mov s0, 15*8(rp) + setc R8(sx) + ret +EPILOGUE() +') + + TEXT +define(`flint_mpn_aors',`flint_mpn_add_$1') +define(`OP',`add') +define(`OPC',`adc') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') + +define(`flint_mpn_aors',`flint_mpn_sub_$1') +define(`OP',`sub') +define(`OPC',`sbb') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC')