diff --git a/dev/gen_arm_aors.jl b/dev/gen_arm_aors.jl
new file mode 100644
index 0000000000..465b90e64a
--- /dev/null
+++ b/dev/gen_arm_aors.jl
@@ -0,0 +1,94 @@
+#
+# Copyright (C) 2024 Albin Ahlbäck
+#
+# This file is part of FLINT.
+#
+# FLINT is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License (LGPL) as published
+# by the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+#
+
+# Generating routines for r <- a OP b, where OP is either + or -.
+#
+# This generation was constructed with Apple silicon processors in mind.
+# Processors that decode fewer than 6 operations per cycle, or that have few
+# load and store units, may perform worse.
+
+# M4 names of the result and operand pointers.
+r, a, b = "rp", "ap", "bp"
+# Memory operand addressing limb `ix` (8 bytes per limb) of each pointer.
+rp(ix::Int) = string("[", r, ",#", ix, "*8]")
+ap(ix::Int) = string("[", a, ",#", ix, "*8]")
+bp(ix::Int) = string("[", b, ",#", ix, "*8]")
+
+# M4 names of the carry/borrow return register and of its condition code.
+sx, CC = "sx", "CC"
+
+sp = map(ix -> string("s", ix), 0:14) # Scratch registers s0, ..., s14
+
+"""
+    aors(n::Int) -> String
+
+Return the ARM64 assembly, to be preprocessed by M4, of the hardcoded routine
+`flint_mpn_aors(n)` that computes `r <- a OP b` on `n` limbs and returns the
+carry or borrow in `sx`.  `OP`, `OPC` and `CC` are left as M4 macros, so the
+same text serves for both addition and subtraction.
+
+Limbs are processed in pairs with `ldp`/`stp`; an odd `n` ends with a single
+`ldr`/`str`.
+
+# Arguments
+- `n::Int`: number of limbs handled by the routine; must be at least 1.
+
+# Throws
+- `ArgumentError`: if `n < 1`.
+"""
+function aors(n::Int)
+    # n = 1 used to fall through to the pairwise code and emit a routine that
+    # touches two limbs; n < 1 cannot describe a routine at all.
+    n >= 1 || throw(ArgumentError("n must be positive, got $n"))
+
+    io = IOBuffer()
+    # Emit one tab-separated instruction line.
+    emit(mnemonic::String, operands::String...) =
+        print(io, "\t", mnemonic, "\t", join(operands, ", "), "\n")
+
+    # Scratch registers are handed out round-robin: every pair of limbs takes
+    # the next four, so a register is only reused after the limb it held has
+    # been stored.
+    reg(ix::Int) = sp[mod(ix, length(sp)) + 1]
+
+    print(io, "PROLOGUE(flint_mpn_aors($n))\n")
+
+    npairs = n ÷ 2
+    for pair in 0:(npairs - 1)
+        a0, b0, a1, b1 = (reg(4 * pair + k) for k in 0:3)
+        emit("ldp", a0, a1, ap(2 * pair))
+        emit("ldp", b0, b1, bp(2 * pair))
+        # Only the lowest limb starts a fresh carry chain.
+        emit(pair == 0 ? "OP" : "OPC", a0, a0, b0)
+        emit("OPC", a1, a1, b1)
+        emit("stp", a0, a1, rp(2 * pair))
+    end
+
+    if isodd(n)
+        a0, b0 = reg(4 * npairs), reg(4 * npairs + 1)
+        emit("ldr", a0, ap(n - 1))
+        emit("ldr", b0, bp(n - 1))
+        emit(npairs == 0 ? "OP" : "OPC", a0, a0, b0)
+        emit("str", a0, rp(n - 1))
+    end
+
+    # Must come last: sx names the same register as rp in the assembly file.
+    emit("cset", sx, CC)
+    print(io, "\tret\nEPILOGUE()\n")
+
+    return String(take!(io))
+end
+
+# Print the routines for n = 2, ..., nmax to stdout, each followed by a blank line.
+function print_all_aors(nmax::Int = 16)
+    foreach(n -> println(aors(n)), 2:nmax)
+    return nothing
+end
diff --git a/dev/gen_x86_aors.jl b/dev/gen_x86_aors.jl
new file mode 100644
index 0000000000..0db9110cbd
--- /dev/null
+++ b/dev/gen_x86_aors.jl
@@ -0,0 +1,83 @@
+#
+# Copyright (C) 2024 Albin Ahlbäck
+#
+# This file is part of FLINT.
+#
+# FLINT is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License (LGPL) as published
+# by the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+#
+
+# Generating routines for r <- a OP b, where OP is either + or -.
+#
+# This generation was constructed with processors with decent schedulers in
+# mind.
+
+# M4 names of the result and operand pointers.
+r, a, b = "rp", "ap", "bp"
+# Memory operand addressing limb `ix` (8 bytes per limb) of each pointer.
+rp(ix::Int) = string(ix, "*8(", r, ")")
+ap(ix::Int) = string(ix, "*8(", a, ")")
+bp(ix::Int) = string(ix, "*8(", b, ")")
+
+sx = "sx" # Return value for carry or borrow, i.e. %rax
+# Wrap a register name in the M4 macro calls `R32(...)` and `R8(...)`.
+R32(sx::String) = string("R32(", sx, ")")
+R8(sx::String) = string("R8(", sx, ")")
+
+sp = map(ix -> string("s", ix), 0:4) # Scratch registers s0, ..., s4
+
+"""
+    aors(n::Int) -> String
+
+Return the x86-64 assembly, to be preprocessed by M4, of the hardcoded routine
+`flint_mpn_aors(n)` that computes `r <- a OP b` on `n` limbs and returns the
+carry or borrow in `sx`.  `OP` and `OPC` are left as M4 macros, so the same
+text serves for both addition and subtraction.
+
+The load of limb `k + 1` is emitted ahead of the arithmetic on limb `k`.
+
+# Arguments
+- `n::Int`: number of limbs handled by the routine; must be at least 1.
+
+# Throws
+- `ArgumentError`: if `n < 1`.
+"""
+function aors(n::Int)
+    # n = 1 used to fall through to the multi-limb code, which loads a second
+    # limb of `ap` and stores twice; n < 1 cannot describe a routine at all.
+    n >= 1 || throw(ArgumentError("n must be positive, got $n"))
+
+    io = IOBuffer()
+    # Emit one tab-separated instruction line.
+    emit(mnemonic::String, operands::String...) =
+        print(io, "\t", mnemonic, "\t", join(operands, ", "), "\n")
+
+    # Limb k lives in scratch register k modulo the number of registers; by
+    # the time a register comes round again its previous limb is stored.
+    reg(k::Int) = sp[mod(k, length(sp)) + 1]
+
+    print(io, "\tALIGN(16)\nPROLOGUE(flint_mpn_aors($n))\n")
+    emit("mov", ap(0), reg(0))
+    n > 1 && emit("mov", ap(1), reg(1))
+    # xor clobbers the carry flag, so the return register is zeroed before
+    # the carry chain starts.
+    emit("xor", R32(sx), R32(sx))
+    for k in 0:(n - 1)
+        0 < k < n - 1 && emit("mov", ap(k + 1), reg(k + 1))
+        # Only the lowest limb starts a fresh carry chain.
+        emit(k == 0 ? "OP" : "OPC", bp(k), reg(k))
+        emit("mov", reg(k), rp(k))
+    end
+    emit("setc", R8(sx))
+    print(io, "\tret\nEPILOGUE()\n")
+
+    return String(take!(io))
+end
+
+# Print the routines for n = 2, ..., nmax to stdout, each followed by a blank line.
+function print_all_aors(nmax::Int = 16)
+    foreach(n -> println(aors(n)), 2:nmax)
+    return nothing
+end
diff --git a/src/mpn_extras.h b/src/mpn_extras.h
index 90fc8e6436..9d4ba63c0c 100644
--- a/src/mpn_extras.h
+++ b/src/mpn_extras.h
@@ -462,25 +462,34 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
/* multiplication (general) **************************************************/
+/* NOTE: This is getting a bit messy. How can we clean this up? */
#if FLINT_HAVE_ASSEMBLY_x86_64_adx
+# define FLINT_MPN_AORS_FUNC_TAB_WIDTH 17
# define FLINT_MPN_MUL_FUNC_TAB_WIDTH 17
# define FLINT_MPN_SQR_FUNC_TAB_WIDTH 14
+# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH)
# define FLINT_HAVE_MUL_FUNC(n, m) ((n) <= 16)
# define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= 16)
# define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH)
+# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) (flint_mpn_add_func_tab[n](rp, xp, yp))
+# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) (flint_mpn_sub_func_tab[n](rp, xp, yp))
# define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_tab[xn][yn](rp, xp, yp))
# define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_n_func_tab[n](rp, xp, yp))
# define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp))
#elif FLINT_HAVE_ASSEMBLY_armv8
+# define FLINT_MPN_AORS_FUNC_TAB_WIDTH 17
# define FLINT_MPN_MUL_FUNC_N_TAB_WIDTH 15
# define FLINT_MPN_SQR_FUNC_TAB_WIDTH 9
+# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH)
# define FLINT_HAVE_MUL_FUNC(n, m) FLINT_HAVE_MUL_N_FUNC(n)
# define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= FLINT_MPN_MUL_FUNC_N_TAB_WIDTH)
# define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH)
+# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) (flint_mpn_add_func_tab[n](rp, xp, yp))
+# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) (flint_mpn_sub_func_tab[n](rp, xp, yp))
# define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_n_tab[xn](rp, xp, yp, yn))
# define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_func_n_tab[n](rp, xp, yp, n))
# define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp))
@@ -506,6 +515,16 @@ typedef mp_limb_t (* flint_mpn_mul_func_t)(mp_ptr, mp_srcptr, mp_srcptr);
typedef mp_limb_t (* flint_mpn_mul_func_n_t)(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
typedef mp_limb_t (* flint_mpn_sqr_func_t)(mp_ptr, mp_srcptr);
+#ifdef FLINT_MPN_AORS_FUNC_TAB_WIDTH /* hardcoded add/sub routines exist for this target */
+# define FLINT_USE_AORS_FUNC_TAB 1
+FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_add_func_tab[]; /* indexed by the number of limbs */
+FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_sub_func_tab[]; /* indexed by the number of limbs */
+#else /* no tables: FLINT_HAVE_AORS_FUNC is always false, so the HARD macros are only placeholders */
+# define FLINT_HAVE_AORS_FUNC(n) 0
+# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) 0
+# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) 0
+#endif /* FLINT_MPN_AORS_FUNC_TAB_WIDTH */
+
#ifdef FLINT_MPN_MUL_FUNC_N_TAB_WIDTH
FLINT_DLL extern const flint_mpn_mul_func_n_t flint_mpn_mul_func_n_tab[];
#else
@@ -522,6 +541,28 @@ mp_limb_t _flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_si
void _flint_mpn_mul_n(mp_ptr r, mp_srcptr x, mp_srcptr y, mp_size_t n);
mp_limb_t _flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n);
+/* rp <- xp + yp on n >= 1 limbs, returning the carry; uses a hardcoded routine when one exists. */
+MPN_EXTRAS_INLINE
+mp_limb_t flint_mpn_add_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+    FLINT_ASSERT(n >= 1);
+
+    return FLINT_HAVE_AORS_FUNC(n)
+        ? FLINT_MPN_ADD_HARD(rp, xp, yp, n)
+        : mpn_add_n(rp, xp, yp, n);
+}
+
+/* rp <- xp - yp on n >= 1 limbs, returning the borrow; uses a hardcoded routine when one exists. */
+MPN_EXTRAS_INLINE
+mp_limb_t flint_mpn_sub_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+    FLINT_ASSERT(n >= 1);
+
+    return FLINT_HAVE_AORS_FUNC(n)
+        ? FLINT_MPN_SUB_HARD(rp, xp, yp, n)
+        : mpn_sub_n(rp, xp, yp, n);
+}
+
MPN_EXTRAS_INLINE mp_limb_t
flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_size_t yn)
{
diff --git a/src/mpn_extras/aors_n.c b/src/mpn_extras/aors_n.c
new file mode 100644
index 0000000000..ee9231aecd
--- /dev/null
+++ b/src/mpn_extras/aors_n.c
@@ -0,0 +1,88 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "mpn_extras.h"
+
+#define DECL_AORS(n) _DECL_AORS(n) /* declares flint_mpn_add_<n> and flint_mpn_sub_<n> */
+#define _DECL_AORS(n) \
+mp_limb_t flint_mpn_add_##n(mp_ptr, mp_srcptr, mp_srcptr); \
+mp_limb_t flint_mpn_sub_##n(mp_ptr, mp_srcptr, mp_srcptr)
+/* ADD(n) and SUB(n) name the hardcoded n-limb addition and subtraction routines. */
+#define ADD(n) _ADD(n)
+#define _ADD(n) flint_mpn_add_##n
+#define SUB(n) _SUB(n)
+#define _SUB(n) flint_mpn_sub_##n
+
+/* Herein we assume that x86 and ARM are equivalent, i.e. that both provide the routines for n = 1, ..., 16. */
+#if FLINT_HAVE_ASSEMBLY_x86_64_adx || FLINT_HAVE_ASSEMBLY_armv8
+DECL_AORS(1);
+DECL_AORS(2);
+DECL_AORS(3);
+DECL_AORS(4);
+DECL_AORS(5);
+DECL_AORS(6);
+DECL_AORS(7);
+DECL_AORS(8);
+DECL_AORS(9);
+DECL_AORS(10);
+DECL_AORS(11);
+DECL_AORS(12);
+DECL_AORS(13);
+DECL_AORS(14);
+DECL_AORS(15);
+DECL_AORS(16);
+
+/* TODO: Should probably rename these types so to not have two different types.
+ * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */
+const flint_mpn_mul_func_t flint_mpn_add_func_tab[] =
+{
+ NULL, /* n = 0: no routine; flint_mpn_add_n asserts n >= 1 */
+ ADD(1),
+ ADD(2),
+ ADD(3),
+ ADD(4),
+ ADD(5),
+ ADD(6),
+ ADD(7),
+ ADD(8),
+ ADD(9),
+ ADD(10),
+ ADD(11),
+ ADD(12),
+ ADD(13),
+ ADD(14),
+ ADD(15),
+ ADD(16)
+};
+
+const flint_mpn_mul_func_t flint_mpn_sub_func_tab[] =
+{
+ NULL, /* n = 0: no routine; flint_mpn_sub_n asserts n >= 1 */
+ SUB(1),
+ SUB(2),
+ SUB(3),
+ SUB(4),
+ SUB(5),
+ SUB(6),
+ SUB(7),
+ SUB(8),
+ SUB(9),
+ SUB(10),
+ SUB(11),
+ SUB(12),
+ SUB(13),
+ SUB(14),
+ SUB(15),
+ SUB(16)
+};
+#else
+typedef int this_file_is_empty; /* placeholder declaration when no tables are built */
+#endif /* FLINT_HAVE_ASSEMBLY_x86_64_adx || FLINT_HAVE_ASSEMBLY_armv8 */
diff --git a/src/mpn_extras/arm64/aors_hard.asm b/src/mpn_extras/arm64/aors_hard.asm
new file mode 100644
index 0000000000..ed9cc2a0e0
--- /dev/null
+++ b/src/mpn_extras/arm64/aors_hard.asm
@@ -0,0 +1,492 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+dnl
+
+include(`config.m4')
+
+dnl Everything from n = 2 and onwards is generated by
+dnl $topdir/dev/gen_arm_aors.jl.
+dnl
+dnl This generation was constructed with Apple silicon processors in mind.
+dnl Processors that decode fewer than 6 operations per cycle, or that have few
+dnl load and store units, may perform worse.
+
+define(`rp', `x0') C Result pointer
+define(`ap', `x1') C First operand pointer
+define(`bp', `x2') C Second operand pointer
+
+define(`sx', `x0') C Beware that this is synonymous with rp
+define(`s0', `x3') C Scratch registers s0, ..., s14 follow
+define(`s1', `x4')
+define(`s2', `x5')
+define(`s3', `x6')
+define(`s4', `x7')
+define(`s5', `x8')
+define(`s6', `x9')
+define(`s7', `x10')
+define(`s8', `x11')
+define(`s9', `x12')
+define(`s10', `x13')
+define(`s11', `x14')
+define(`s12', `x15')
+define(`s13', `x16')
+define(`s14', `x17')
+
+define(ALL_AORS,`
+PROLOGUE(flint_mpn_aors(1))
+ ldr s0, [ap,#0*8] C Load the only limb of the first operand
+ ldr s1, [bp,#0*8] C Load the only limb of the second operand
+ OP s0, s0, s1 C Single limb, so there is no incoming carry
+ str s0, [rp,#0*8]
+ cset sx, CC C Return the carry or borrow, overwriting the result pointer
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(2))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(3))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldr s4, [ap,#2*8]
+ ldr s5, [bp,#2*8]
+ OPC s4, s4, s5
+ str s4, [rp,#2*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(4))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(5))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldr s8, [ap,#4*8]
+ ldr s9, [bp,#4*8]
+ OPC s8, s8, s9
+ str s8, [rp,#4*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(6))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(7))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldr s12, [ap,#6*8]
+ ldr s13, [bp,#6*8]
+ OPC s12, s12, s13
+ str s12, [rp,#6*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(8))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(9))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ ldr s1, [ap,#8*8]
+ ldr s2, [bp,#8*8]
+ OPC s1, s1, s2
+ str s1, [rp,#8*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(10))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ ldp s1, s3, [ap,#8*8]
+ ldp s2, s4, [bp,#8*8]
+ OPC s1, s1, s2
+ OPC s3, s3, s4
+ stp s1, s3, [rp,#8*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(11))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ ldp s1, s3, [ap,#8*8]
+ ldp s2, s4, [bp,#8*8]
+ OPC s1, s1, s2
+ OPC s3, s3, s4
+ stp s1, s3, [rp,#8*8]
+ ldr s5, [ap,#10*8]
+ ldr s6, [bp,#10*8]
+ OPC s5, s5, s6
+ str s5, [rp,#10*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(12))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ ldp s1, s3, [ap,#8*8]
+ ldp s2, s4, [bp,#8*8]
+ OPC s1, s1, s2
+ OPC s3, s3, s4
+ stp s1, s3, [rp,#8*8]
+ ldp s5, s7, [ap,#10*8]
+ ldp s6, s8, [bp,#10*8]
+ OPC s5, s5, s6
+ OPC s7, s7, s8
+ stp s5, s7, [rp,#10*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(13))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ ldp s1, s3, [ap,#8*8]
+ ldp s2, s4, [bp,#8*8]
+ OPC s1, s1, s2
+ OPC s3, s3, s4
+ stp s1, s3, [rp,#8*8]
+ ldp s5, s7, [ap,#10*8]
+ ldp s6, s8, [bp,#10*8]
+ OPC s5, s5, s6
+ OPC s7, s7, s8
+ stp s5, s7, [rp,#10*8]
+ ldr s9, [ap,#12*8]
+ ldr s10, [bp,#12*8]
+ OPC s9, s9, s10
+ str s9, [rp,#12*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(14))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ ldp s1, s3, [ap,#8*8]
+ ldp s2, s4, [bp,#8*8]
+ OPC s1, s1, s2
+ OPC s3, s3, s4
+ stp s1, s3, [rp,#8*8]
+ ldp s5, s7, [ap,#10*8]
+ ldp s6, s8, [bp,#10*8]
+ OPC s5, s5, s6
+ OPC s7, s7, s8
+ stp s5, s7, [rp,#10*8]
+ ldp s9, s11, [ap,#12*8]
+ ldp s10, s12, [bp,#12*8]
+ OPC s9, s9, s10
+ OPC s11, s11, s12
+ stp s9, s11, [rp,#12*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(15))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ ldp s1, s3, [ap,#8*8]
+ ldp s2, s4, [bp,#8*8]
+ OPC s1, s1, s2
+ OPC s3, s3, s4
+ stp s1, s3, [rp,#8*8]
+ ldp s5, s7, [ap,#10*8]
+ ldp s6, s8, [bp,#10*8]
+ OPC s5, s5, s6
+ OPC s7, s7, s8
+ stp s5, s7, [rp,#10*8]
+ ldp s9, s11, [ap,#12*8]
+ ldp s10, s12, [bp,#12*8]
+ OPC s9, s9, s10
+ OPC s11, s11, s12
+ stp s9, s11, [rp,#12*8]
+ ldr s13, [ap,#14*8]
+ ldr s14, [bp,#14*8]
+ OPC s13, s13, s14
+ str s13, [rp,#14*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors(16))
+ ldp s0, s2, [ap,#0*8]
+ ldp s1, s3, [bp,#0*8]
+ OP s0, s0, s1
+ OPC s2, s2, s3
+ stp s0, s2, [rp,#0*8]
+ ldp s4, s6, [ap,#2*8]
+ ldp s5, s7, [bp,#2*8]
+ OPC s4, s4, s5
+ OPC s6, s6, s7
+ stp s4, s6, [rp,#2*8]
+ ldp s8, s10, [ap,#4*8]
+ ldp s9, s11, [bp,#4*8]
+ OPC s8, s8, s9
+ OPC s10, s10, s11
+ stp s8, s10, [rp,#4*8]
+ ldp s12, s14, [ap,#6*8]
+ ldp s13, s0, [bp,#6*8]
+ OPC s12, s12, s13
+ OPC s14, s14, s0
+ stp s12, s14, [rp,#6*8]
+ ldp s1, s3, [ap,#8*8]
+ ldp s2, s4, [bp,#8*8]
+ OPC s1, s1, s2
+ OPC s3, s3, s4
+ stp s1, s3, [rp,#8*8]
+ ldp s5, s7, [ap,#10*8]
+ ldp s6, s8, [bp,#10*8]
+ OPC s5, s5, s6
+ OPC s7, s7, s8
+ stp s5, s7, [rp,#10*8]
+ ldp s9, s11, [ap,#12*8]
+ ldp s10, s12, [bp,#12*8]
+ OPC s9, s9, s10
+ OPC s11, s11, s12
+ stp s9, s11, [rp,#12*8]
+ ldp s13, s0, [ap,#14*8]
+ ldp s14, s1, [bp,#14*8]
+ OPC s13, s13, s14
+ OPC s0, s0, s1
+ stp s13, s0, [rp,#14*8]
+ cset sx, CC
+ ret
+EPILOGUE()
+')
+
+define(`flint_mpn_aors',`flint_mpn_add_$1') C Addition: the expansion below defines flint_mpn_add_1, ..., flint_mpn_add_16
+define(`OP',`adds')
+define(`OPC',`adcs')
+define(`CC',`cs') C Carry set means a carry out of the top limb
+ALL_AORS
+undefine(`flint_mpn_aors')
+undefine(`OP')
+undefine(`OPC')
+
+define(`flint_mpn_aors',`flint_mpn_sub_$1') C Subtraction: the expansion below defines flint_mpn_sub_1, ..., flint_mpn_sub_16
+define(`OP',`subs')
+define(`OPC',`sbcs')
+define(`CC',`cc') C Replaces the addition condition, which was not undefined; on AArch64 carry clear after subs or sbcs means a borrow
+ALL_AORS
+undefine(`flint_mpn_aors')
+undefine(`OP')
+undefine(`OPC')
diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c
index f39d688fdd..171a9b7342 100644
--- a/src/mpn_extras/test/main.c
+++ b/src/mpn_extras/test/main.c
@@ -12,6 +12,7 @@
/* Include functions *********************************************************/
#include "t-2add_n_inplace.c"
+#include "t-aors_n.c"
#include "t-divides.c"
#include "t-divrem_preinv1.c"
#include "t-divrem_preinvn.c"
@@ -38,6 +39,7 @@
test_struct tests[] =
{
TEST_FUNCTION(flint_mpn_2add_n_inplace),
+ TEST_FUNCTION(flint_mpn_aors_n),
TEST_FUNCTION(flint_mpn_divides),
TEST_FUNCTION(flint_mpn_divrem_preinv1),
TEST_FUNCTION(flint_mpn_divrem_preinvn),
diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c
new file mode 100644
index 0000000000..0af210d94c
--- /dev/null
+++ b/src/mpn_extras/test/t-aors_n.c
@@ -0,0 +1,85 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+ Copyright (C) 2024 Fredrik Johansson
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "mpn_extras.h"
+
+#define N_MIN 1 /* smallest size tested */
+#define N_MAX (FLINT_MPN_AORS_FUNC_TAB_WIDTH - 1) /* largest size with a table entry */
+#define N_STOR (FLINT_MPN_AORS_FUNC_TAB_WIDTH + 10) /* added to n to leave the table range */
+/* Compares flint_mpn_add_n and flint_mpn_sub_n with mpn_add_n and mpn_sub_n on random input. */
+TEST_FUNCTION_START(flint_mpn_aors_n, state)
+{
+#if FLINT_USE_AORS_FUNC_TAB
+    slong ix;
+
+    for (ix = 0; ix < 10000 * flint_test_multiplier(); ix++)
+    {
+        int result;
+        int type;
+        mp_limb_t cf, cg;
+        mp_size_t n;
+        mp_ptr fp, gp, xp, yp;
+        /* Mostly sizes with a hardcoded routine; occasionally one beyond the table (GMP fallback). */
+        n = N_MIN + n_randint(state, N_MAX - N_MIN + 1);
+        if (n_randint(state, 1 << 10) == UWORD(0))
+            n += N_STOR;
+
+        fp = flint_malloc(sizeof(mp_limb_t) * n);
+        gp = flint_malloc(sizeof(mp_limb_t) * n);
+        xp = flint_malloc(sizeof(mp_limb_t) * n);
+        yp = flint_malloc(sizeof(mp_limb_t) * n);
+
+        flint_mpn_rrandom(xp, state, n);
+        flint_mpn_rrandom(yp, state, n);
+
+        type = n_randint(state, 2);
+
+        if (type == 0)
+        {
+            cf = flint_mpn_add_n(fp, xp, yp, n);
+            cg = mpn_add_n(gp, xp, yp, n);
+        }
+        else
+        {
+            cf = flint_mpn_sub_n(fp, xp, yp, n);
+            cg = mpn_sub_n(gp, xp, yp, n);
+        }
+        /* Both the returned carry or borrow and all n result limbs must agree. */
+        result = (cf == cg && mpn_cmp(fp, gp, n) == 0);
+        if (!result)
+            TEST_FUNCTION_FAIL(
+                    "%s:\n"
+                    "ix = %wd\n"
+                    "n = %wd\n"
+                    "xp = %{ulong*}\n"
+                    "yp = %{ulong*}\n"
+                    "FLINT (cy = %wu): %{ulong*}\n"
+                    "GMP (cy = %wu): %{ulong*}\n",
+                    type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n",
+                    ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n); /* gp holds n limbs, not n + 1 */
+
+        flint_free(fp);
+        flint_free(gp);
+        flint_free(xp);
+        flint_free(yp);
+    }
+
+    TEST_FUNCTION_END(state);
+#else
+    TEST_FUNCTION_END_SKIPPED(state);
+#endif
+}
+
+#undef N_MIN
+#undef N_MAX
+#undef N_STOR
diff --git a/src/mpn_extras/x86_64/broadwell/aors_hard.asm b/src/mpn_extras/x86_64/broadwell/aors_hard.asm
new file mode 100644
index 0000000000..390ee036ec
--- /dev/null
+++ b/src/mpn_extras/x86_64/broadwell/aors_hard.asm
@@ -0,0 +1,565 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+dnl
+
+include(`config.m4')
+
+dnl Everything from n = 2 and onwards is generated by
+dnl $topdir/dev/gen_x86_aors.jl.
+
+define(`rp', `%rdi') C Result pointer
+define(`ap', `%rsi') C First operand pointer
+define(`bp', `%rdx') C Second operand pointer
+
+define(`sx', `%rax') C Return value for carry or borrow
+define(`s0', `%rcx') C Scratch registers s0, ..., s4 follow
+define(`s1', `%r8')
+define(`s2', `%r9')
+define(`s3', `%r10')
+define(`s4', `%r11')
+
+define(ALL_AORS,`
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(1))
+ mov 0*8(ap), s0 C Load the only limb of the first operand
+ xor R32(sx), R32(sx) C Zero the return value before the arithmetic sets the carry flag
+ OP 0*8(bp), s0 C Single limb, so there is no incoming carry
+ mov s0, 0*8(rp)
+ setc R8(sx) C Return the carry or borrow
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(2))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(3))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(4))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(5))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(6))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(7))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(8))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(9))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ mov 8*8(ap), s3
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ OPC 8*8(bp), s3
+ mov s3, 8*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(10))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ mov 8*8(ap), s3
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ mov 9*8(ap), s4
+ OPC 8*8(bp), s3
+ mov s3, 8*8(rp)
+ OPC 9*8(bp), s4
+ mov s4, 9*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(11))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ mov 8*8(ap), s3
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ mov 9*8(ap), s4
+ OPC 8*8(bp), s3
+ mov s3, 8*8(rp)
+ mov 10*8(ap), s0
+ OPC 9*8(bp), s4
+ mov s4, 9*8(rp)
+ OPC 10*8(bp), s0
+ mov s0, 10*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(12))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ mov 8*8(ap), s3
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ mov 9*8(ap), s4
+ OPC 8*8(bp), s3
+ mov s3, 8*8(rp)
+ mov 10*8(ap), s0
+ OPC 9*8(bp), s4
+ mov s4, 9*8(rp)
+ mov 11*8(ap), s1
+ OPC 10*8(bp), s0
+ mov s0, 10*8(rp)
+ OPC 11*8(bp), s1
+ mov s1, 11*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(13))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ mov 8*8(ap), s3
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ mov 9*8(ap), s4
+ OPC 8*8(bp), s3
+ mov s3, 8*8(rp)
+ mov 10*8(ap), s0
+ OPC 9*8(bp), s4
+ mov s4, 9*8(rp)
+ mov 11*8(ap), s1
+ OPC 10*8(bp), s0
+ mov s0, 10*8(rp)
+ mov 12*8(ap), s2
+ OPC 11*8(bp), s1
+ mov s1, 11*8(rp)
+ OPC 12*8(bp), s2
+ mov s2, 12*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(14))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ mov 8*8(ap), s3
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ mov 9*8(ap), s4
+ OPC 8*8(bp), s3
+ mov s3, 8*8(rp)
+ mov 10*8(ap), s0
+ OPC 9*8(bp), s4
+ mov s4, 9*8(rp)
+ mov 11*8(ap), s1
+ OPC 10*8(bp), s0
+ mov s0, 10*8(rp)
+ mov 12*8(ap), s2
+ OPC 11*8(bp), s1
+ mov s1, 11*8(rp)
+ mov 13*8(ap), s3
+ OPC 12*8(bp), s2
+ mov s2, 12*8(rp)
+ OPC 13*8(bp), s3
+ mov s3, 13*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(15))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ mov 8*8(ap), s3
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ mov 9*8(ap), s4
+ OPC 8*8(bp), s3
+ mov s3, 8*8(rp)
+ mov 10*8(ap), s0
+ OPC 9*8(bp), s4
+ mov s4, 9*8(rp)
+ mov 11*8(ap), s1
+ OPC 10*8(bp), s0
+ mov s0, 10*8(rp)
+ mov 12*8(ap), s2
+ OPC 11*8(bp), s1
+ mov s1, 11*8(rp)
+ mov 13*8(ap), s3
+ OPC 12*8(bp), s2
+ mov s2, 12*8(rp)
+ mov 14*8(ap), s4
+ OPC 13*8(bp), s3
+ mov s3, 13*8(rp)
+ OPC 14*8(bp), s4
+ mov s4, 14*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_aors(16))
+ mov 0*8(ap), s0
+ mov 1*8(ap), s1
+ xor R32(sx), R32(sx)
+ OP 0*8(bp), s0
+ mov s0, 0*8(rp)
+ mov 2*8(ap), s2
+ OPC 1*8(bp), s1
+ mov s1, 1*8(rp)
+ mov 3*8(ap), s3
+ OPC 2*8(bp), s2
+ mov s2, 2*8(rp)
+ mov 4*8(ap), s4
+ OPC 3*8(bp), s3
+ mov s3, 3*8(rp)
+ mov 5*8(ap), s0
+ OPC 4*8(bp), s4
+ mov s4, 4*8(rp)
+ mov 6*8(ap), s1
+ OPC 5*8(bp), s0
+ mov s0, 5*8(rp)
+ mov 7*8(ap), s2
+ OPC 6*8(bp), s1
+ mov s1, 6*8(rp)
+ mov 8*8(ap), s3
+ OPC 7*8(bp), s2
+ mov s2, 7*8(rp)
+ mov 9*8(ap), s4
+ OPC 8*8(bp), s3
+ mov s3, 8*8(rp)
+ mov 10*8(ap), s0
+ OPC 9*8(bp), s4
+ mov s4, 9*8(rp)
+ mov 11*8(ap), s1
+ OPC 10*8(bp), s0
+ mov s0, 10*8(rp)
+ mov 12*8(ap), s2
+ OPC 11*8(bp), s1
+ mov s1, 11*8(rp)
+ mov 13*8(ap), s3
+ OPC 12*8(bp), s2
+ mov s2, 12*8(rp)
+ mov 14*8(ap), s4
+ OPC 13*8(bp), s3
+ mov s3, 13*8(rp)
+ mov 15*8(ap), s0
+ OPC 14*8(bp), s4
+ mov s4, 14*8(rp)
+ OPC 15*8(bp), s0
+ mov s0, 15*8(rp)
+ setc R8(sx)
+ ret
+EPILOGUE()
+')
+
+ TEXT
+define(`flint_mpn_aors',`flint_mpn_add_$1') C Addition: the expansion below defines flint_mpn_add_1, ..., flint_mpn_add_16
+define(`OP',`add')
+define(`OPC',`adc')
+ALL_AORS
+undefine(`flint_mpn_aors')
+undefine(`OP')
+undefine(`OPC')
+
+define(`flint_mpn_aors',`flint_mpn_sub_$1') C Subtraction: the expansion below defines flint_mpn_sub_1, ..., flint_mpn_sub_16
+define(`OP',`sub')
+define(`OPC',`sbb')
+ALL_AORS
+undefine(`flint_mpn_aors')
+undefine(`OP')
+undefine(`OPC')