diff --git a/yafu/Makefile b/yafu/Makefile
index 04374ee..d4c65bd 100644
--- a/yafu/Makefile
+++ b/yafu/Makefile
@@ -16,7 +16,7 @@
 # code to the public domain.
 #        				   --bbuhrow@gmail.com 7/28/09
 # ----------------------------------------------------------------------*/
-COMPILER = gcc
+
 CC = gcc
 CFLAGS = -g -m64 -DUSE_SSE2
 #CFLAGS += -march=core2 -mtune=core2
@@ -40,11 +40,11 @@ LIBS = -L.
 INC += -I../ysieve -I../ytools
 LIBS += -L../ysieve -L../ytools
 
-INC += -I../gmp-6.2.1/
-LIBS += -L../gmp-6.2.1/
+INC += -I../gmp_install/gmp-6.2.0/include
+LIBS += -L../gmp_install/gmp-6.2.0/lib
 
-INC += -I../avx-ecm/
-LIBS += -L../avx-ecm/
+INC += -I../ecm_install/include/
+LIBS += -L../ecm_install/lib/
 
 INC += -I../msieve/zlib 
 LIBS += -L../msieve/
@@ -145,7 +145,7 @@ ifeq ($(FORCE_GENERIC),1)
 endif
 
 # make sure we get the correct libgmp linked by using an absolute path
-LIBS += -lecm ../gmp-6.2.1/.libs/libgmp.a -lytools -lysieve
+LIBS += -lecm ../gmp_install/gmp-6.2.0/lib/libgmp.a -lytools -lysieve
 #LIBS += -lecm -lgmp -lytools -lysieve
 
 ifeq ($(SKYLAKEX),1)
@@ -173,7 +173,7 @@ ifeq ($(COMPILER),icc)
 	LIBS +=  -lsvml
 endif
 
-CFLAGS += -static $(OPT_FLAGS) $(WARN_FLAGS) $(INC)
+CFLAGS += $(OPT_FLAGS) $(WARN_FLAGS) $(INC)
 
 x86: CFLAGS += -m32
 
diff --git a/yafu/bin/x64/Release/yafu-x64.exe b/yafu/bin/x64/Release/yafu-x64.exe
new file mode 100644
index 0000000..91533ad
Binary files /dev/null and b/yafu/bin/x64/Release/yafu-x64.exe differ
diff --git a/yafu/factor/autofactor.c b/yafu/factor/autofactor.c
index aebe69a..d0c5a97 100644
--- a/yafu/factor/autofactor.c
+++ b/yafu/factor/autofactor.c
@@ -563,7 +563,10 @@ int check_if_done(fact_obj_t *fobj, mpz_t N)
 						// load the new fobj with this number
 						fobj_refactor = (fact_obj_t *)malloc(sizeof(fact_obj_t));
 						init_factobj(fobj_refactor);
+                        copy_factobj(fobj_refactor, fobj);
+
 						mpz_set(fobj_refactor->N, fobj->factors->factors[i].factor);
+                        fobj_refactor->refactor_depth = fobj->refactor_depth;
 
 						// recurse on factor
 						factor(fobj_refactor);
@@ -1823,12 +1826,12 @@ void factor(fact_obj_t *fobj)
 			if (fobj->VFLAG > 0)
 				printf("fac: found siqs savefile, resuming siqs\n");
 
-			// remove any common factor so the input exactly matches
-			// the file
-			// mpz_tdiv_q(b, b, g);
-			// mpz_set(fobj->N, b);
-			// mpz_set(origN, b);
-			// mpz_set(copyN, b);
+            // if the inputs don't match exactly, resume siqs on the exact
+            // number in the savefile and put the cofactor (prime or composite)
+            // into the factor list.  If composite it will get refactored.
+            add_to_factor_list(fobj->factors, g, fobj->VFLAG, fobj->NUM_WITNESSES);
+
+            mpz_set(b, tmpz);
 
 			//override default choice
 			fact_state = state_qs;
diff --git a/yafu/factor/avx-ecm/avx_ecm_main.c b/yafu/factor/avx-ecm/avx_ecm_main.c
index b5e8166..eb5d2fe 100644
--- a/yafu/factor/avx-ecm/avx_ecm_main.c
+++ b/yafu/factor/avx-ecm/avx_ecm_main.c
@@ -164,12 +164,14 @@ void vec_ecm_main(fact_obj_t* fobj, uint32_t numcurves, uint64_t B1,
     mpz_init(r);
     mpz_init(N);
 
-    mpz_set(N, fobj->ecm_obj.gmp_n);
+    // set N equal to the original input, so we can
+    // detect Mersenne inputs correctly.
+    mpz_set(N, fobj->N);
 
     // check for Mersenne inputs
     size_n = mpz_sizeinbase(N, 2);
 
-    for (i = size_n; i < 2048; i++)
+    for (i = 31; i <= size_n; i++)
     {
         mpz_set_ui(r, 1);
         mpz_mul_2exp(r, r, i);
@@ -197,13 +199,18 @@ void vec_ecm_main(fact_obj_t* fobj, uint32_t numcurves, uint64_t B1,
         mpz_set_ui(r, 1);
         mpz_mul_2exp(r, r, i);
         mpz_mod(g, r, N);
-        if (mpz_sizeinbase(g, 2) < DIGITBITS)
+        if (mpz_sizeinbase(g, 2) < (DIGITBITS/2))
         {
             size_n = i;
             isMersenne = mpz_get_ui(g);
             break;
         }
     }
+    //printf("found isMersenne = 2^%d %d\n", size_n, isMersenne);
+
+    // now set N equal to the actual input, which may have had factors removed
+    // by previous factoring routines.
+    mpz_set(N, fobj->ecm_obj.gmp_n);
 
     // if the input is Mersenne and still contains algebraic factors, remove them.
     if (abs(isMersenne) == 1)
@@ -293,7 +300,7 @@ void vec_ecm_main(fact_obj_t* fobj, uint32_t numcurves, uint64_t B1,
         gmp_printf("commencing parallel ecm on %Zd with %d threads\n", N, threads);
     }
 
-    if ((double)nwords / ((double)maxbits / (double)DIGITBITS) < 0.7)
+    if ((isMersenne != 0) && ((double)nwords / ((double)maxbits / (double)DIGITBITS) < 0.7))
     {
         if (verbose > 1)
         {
@@ -615,7 +622,6 @@ void vec_ecm_main(fact_obj_t* fobj, uint32_t numcurves, uint64_t B1,
             }
             vecaddmod_ptr = &vecaddmod52;
             vecsubmod_ptr = &vecsubmod52;
-            
         }
     }
     else
diff --git a/yafu/factor/avx-ecm/vecarith52.c b/yafu/factor/avx-ecm/vecarith52.c
index 0ace340..c31733c 100644
--- a/yafu/factor/avx-ecm/vecarith52.c
+++ b/yafu/factor/avx-ecm/vecarith52.c
@@ -52,8 +52,17 @@ This file is a snapshot of a work in progress, originated by Mayo
 #ifdef USE_AVX512F
 #include <immintrin.h>
 
-//#define USE_AMM 1
+#define USE_AMM 1
 
+static __inline __m512i _mm512_mask_sbb_src_epi52(__m512i src, __m512i a, __mmask8 m, __mmask8 c, __m512i b, __mmask8* cout) // masked subtract-with-borrow on 52-bit words: lanes in m get (a - b - c) & (2^52 - 1), other lanes get src & (2^52 - 1); borrow-out mask is written to *cout. 'static' gives the helper internal linkage so a non-inlined call still links.
+{
+    __m512i t = _mm512_mask_sub_epi64(src, m, a, b);    // t = a - b in the selected lanes, src elsewhere
+    *cout = _mm512_mask_cmpgt_epu64_mask(m, b, a);      // a borrow is generated wherever b > a (unsigned)
+    __m512i t2 = _mm512_mask_sub_epi64(src, m, t, _mm512_maskz_set1_epi64(c, 1));    // subtract the incoming borrow (1 in lanes set in c)
+    *cout = _mm512_kor(*cout, _mm512_mask_cmpgt_epu64_mask(m, t2, t));    // also a borrow if subtracting the carry-in wrapped (t2 > t)
+    t2 = _mm512_and_epi64(t2, _mm512_set1_epi64(0xfffffffffffffULL));     // keep only the low 52 bits of each lane
+    return t2;
+}
 
 void vecmul52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata)
 {
@@ -4078,10 +4087,15 @@ void vecmulmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t*
             // i = 3, a3..0 = c[0..3]
             // i = 4, a3..0 = c[4..7]
             // i = 5, a3..0 = c[8..11]
-            _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3);
-            _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2);
-            _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1);
-            _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0);
+            //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3);
+            //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2);
+            //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1);
+            //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0);
+
+            c00 = a3;
+            c01 = a2;
+            c02 = a1;
+            c03 = a0;
 
         }
     }
@@ -4145,15 +4159,159 @@ void vecmulmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t*
             // i = 3, a3..0 = c[0..3]
             // i = 4, a3..0 = c[4..7]
             // i = 5, a3..0 = c[8..11]
-            _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3);
-            _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2);
-            _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1);
-            _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0);
+            c04 = a3;
+            c05 = a2;
+            c06 = a1;
+            c07 = a0;
+
+
+            //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3);
+            //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2);
+            //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1);
+            //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0);
 
         }
     }
 
+#ifdef USE_AMM
+    _mm512_store_epi64(c->data + 0 * VECLEN, c00);
+    _mm512_store_epi64(c->data + 1 * VECLEN, c01);
+    _mm512_store_epi64(c->data + 2 * VECLEN, c02);
+    _mm512_store_epi64(c->data + 3 * VECLEN, c03);
+    _mm512_store_epi64(c->data + 4 * VECLEN, c04);
+    _mm512_store_epi64(c->data + 5 * VECLEN, c05);
+    _mm512_store_epi64(c->data + 6 * VECLEN, c06);
+    _mm512_store_epi64(c->data + 7 * VECLEN, c07);
     _mm512_store_epi64(c->data + NWORDS * VECLEN, zero);
+#else
+
+    __m512i cvec;
+    __m512i nvec;
+    __m512i bvec;
+
+    // compare
+    scarry = 0;	    // sub mask
+    scarry2 = 0;	// keep looking mask
+
+    cvec = c07;
+    nvec = _mm512_load_epi64(mdata->n->data + 7 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c06;
+    nvec = _mm512_load_epi64(mdata->n->data + 6 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c05;
+    nvec = _mm512_load_epi64(mdata->n->data + 5 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c04;
+    nvec = _mm512_load_epi64(mdata->n->data + 4 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c03;
+    nvec = _mm512_load_epi64(mdata->n->data + 3 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c02;
+    nvec = _mm512_load_epi64(mdata->n->data + 2 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c01;
+    nvec = _mm512_load_epi64(mdata->n->data + 1 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c00;
+    nvec = _mm512_load_epi64(mdata->n->data + 0 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    // check for equal as well by flipping mask bits that have still
+    // not been decided (i.e., are equal)
+    scarry |= (~scarry2);
+
+sub:
+
+    if (scarry == 0) goto done;
+    
+    // subtract n from c when c is not less than n, as indicated by a 1 bit in mask
+    scarry2 = 0;
+
+    nvec = _mm512_load_epi64(mdata->n->data + 0 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c00, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 0 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 1 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c01, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 1 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 2 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c02, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 2 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 3 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c03, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 3 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 4 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c04, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 4 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 5 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c05, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 5 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 6 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c06, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 6 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 7 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c07, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 7 * VECLEN, bvec);
+
+done:
+
+#endif
+    
 
     c->size = NWORDS;
     return;
@@ -7230,7 +7388,7 @@ void vecsqrmod52_fixed624_bfips_ifma(vec_bignum_t* a, vec_bignum_t* c, vec_bignu
 
 #endif
 
-void vecmulmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata)
+void vecmulmod52_avxecm(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata)
 {
     int i, j, k;
     int NWORDS = mdata->NWORDS;
@@ -7870,7 +8028,7 @@ void vecmulmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t
     return;
 }
 
-void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata)
+void vecmulmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata)
 {
     int i, j, k;
     uint32_t NWORDS = mdata->NWORDS;
@@ -7901,6 +8059,12 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
     __mmask8 scarry2;
     __mmask8 scarry;
 
+#ifdef USE_AMM
+    uint64_t* outdata = c->data;
+#else
+    uint64_t* outdata = s->data;
+#endif
+
     // deal with the sign
     c->size = NWORDS;
     c->signmask = a->signmask ^ b->signmask;
@@ -7910,8 +8074,6 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
     acc_e1 = zero;
     acc_e2 = zero;
 
-    //uint64_t* outdata = s->data;
-
     // first half mul
     for (i = 0; i < NBLOCKS; i++)
     {
@@ -8074,10 +8236,10 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
         for (j = 0; j < i; j++)
         {
             // accumulate s * n
-            a0 = _mm512_load_epi64(s->data + ((j + 1) * BLOCKWORDS - 1) * VECLEN);
-            a1 = _mm512_load_epi64(s->data + ((j + 1) * BLOCKWORDS - 2) * VECLEN);
-            a2 = _mm512_load_epi64(s->data + ((j + 1) * BLOCKWORDS - 3) * VECLEN);
-            a3 = _mm512_load_epi64(s->data + ((j + 1) * BLOCKWORDS - 4) * VECLEN);
+            a0 = _mm512_load_epi64(outdata + ((j + 1) * BLOCKWORDS - 1) * VECLEN);
+            a1 = _mm512_load_epi64(outdata + ((j + 1) * BLOCKWORDS - 2) * VECLEN);
+            a2 = _mm512_load_epi64(outdata + ((j + 1) * BLOCKWORDS - 3) * VECLEN);
+            a3 = _mm512_load_epi64(outdata + ((j + 1) * BLOCKWORDS - 4) * VECLEN);
 
             b0 = _mm512_load_epi64(n->data + ((i - j - 1) * BLOCKWORDS + 1) * VECLEN);
             b1 = _mm512_load_epi64(n->data + ((i - j - 1) * BLOCKWORDS + 2) * VECLEN);
@@ -8124,7 +8286,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
             //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0));
             _mm512_mullo_epi52(a0, nhatvec_e, acc_e0);
 
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0);
             b0 = _mm512_load_epi64(n->data + 0 * VECLEN);
 
             // add in the final product
@@ -8148,7 +8310,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
 
             for (k = 0; k < j; k++)
             {
-                a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN);
+                a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN);
                 b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN);
 
                 VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1);
@@ -8156,7 +8318,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
 
             //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0));
             _mm512_mullo_epi52(a0, nhatvec_e, acc_e0);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0);
 
             // add in the final product
             VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1);
@@ -8179,7 +8341,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
 
             for (k = 0; k < j; k++)
             {
-                a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN);
+                a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN);
                 b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN);
 
                 VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1);
@@ -8189,7 +8351,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
             // not sure what can be done about it.
             //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0));
             _mm512_mullo_epi52(a0, nhatvec_e, acc_e0);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0);
 
             // add in the final product
             VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1);
@@ -8212,7 +8374,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
 
             for (k = 0; k < j; k++)
             {
-                a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN);
+                a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN);
                 b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN);
 
                 VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1);
@@ -8220,7 +8382,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
 
             //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0));
             _mm512_mullo_epi52(a0, nhatvec_e, acc_e0);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0);
 
             // add in the final product
             VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1);
@@ -8264,10 +8426,10 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
             // i = 3, j = 1, a0..3 = c[11..8]
             // i = 3, j = 2, a0..3 = c[7..4]
             // i = 4, j = 2, a0..3 = c[11..8]
-            a0 = _mm512_load_epi64(s->data + ((i - j) * BLOCKWORDS + 3) * VECLEN);
-            a1 = _mm512_load_epi64(s->data + ((i - j) * BLOCKWORDS + 2) * VECLEN);
-            a2 = _mm512_load_epi64(s->data + ((i - j) * BLOCKWORDS + 1) * VECLEN);
-            a3 = _mm512_load_epi64(s->data + ((i - j) * BLOCKWORDS + 0) * VECLEN);
+            a0 = _mm512_load_epi64(outdata + ((i - j) * BLOCKWORDS + 3) * VECLEN);
+            a1 = _mm512_load_epi64(outdata + ((i - j) * BLOCKWORDS + 2) * VECLEN);
+            a2 = _mm512_load_epi64(outdata + ((i - j) * BLOCKWORDS + 1) * VECLEN);
+            a3 = _mm512_load_epi64(outdata + ((i - j) * BLOCKWORDS + 0) * VECLEN);
 
             b0 = _mm512_load_epi64(n->data + ((j - 1) * BLOCKWORDS + 1) * VECLEN);
             b1 = _mm512_load_epi64(n->data + ((j - 1) * BLOCKWORDS + 2) * VECLEN);
@@ -8369,9 +8531,9 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
         // i = 3, a1..3 = c[1..3]
         // i = 4, a1..3 = c[5..7]
         // i = 5, a1..3 = c[9..11]
-        a1 = _mm512_load_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN);
-        a2 = _mm512_load_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN);
-        a3 = _mm512_load_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN);
+        a1 = _mm512_load_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN);
+        a2 = _mm512_load_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN);
+        a3 = _mm512_load_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN);
 
         b0 = _mm512_load_epi64(n->data + (NWORDS - 1) * VECLEN);
         b1 = _mm512_load_epi64(n->data + (NWORDS - 2) * VECLEN);
@@ -8537,16 +8699,21 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign
             // i = 3, a3..0 = c[0..3]
             // i = 4, a3..0 = c[4..7]
             // i = 5, a3..0 = c[8..11]
-            _mm512_store_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3);
-            _mm512_store_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2);
-            _mm512_store_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1);
-            _mm512_store_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0);
-
+            _mm512_store_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3);
+            _mm512_store_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2);
+            _mm512_store_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1);
+            _mm512_store_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0);
         }
     }
 
 
-#ifndef USE_AMM
+#ifdef USE_AMM
+    //for (i = NWORDS - 1; i >= 0; i--)
+    //{
+    //    b0 = _mm512_load_epi64(s->data + i * VECLEN);
+    //    _mm512_store_epi64(c->data + i * VECLEN, b0);
+    //}
+#else
     a0 = acc_e0;
     scarry2 = _mm512_cmp_epu64_mask(a0, zero, _MM_CMPINT_EQ);
 
@@ -12476,10 +12643,14 @@ void vecsqrmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t*
         a1 = _mm512_and_epi64(vlmask, a1);
         a0 = _mm512_and_epi64(vlmask, a0);
 
-        _mm512_store_epi64(c->data + (i * BLOCKWORDS + 0) * VECLEN, a3);
-        _mm512_store_epi64(c->data + (i * BLOCKWORDS + 1) * VECLEN, a2);
-        _mm512_store_epi64(c->data + (i * BLOCKWORDS + 2) * VECLEN, a1);
-        _mm512_store_epi64(c->data + (i * BLOCKWORDS + 3) * VECLEN, a0);
+        //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 0) * VECLEN, a3);
+        //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 1) * VECLEN, a2);
+        //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 2) * VECLEN, a1);
+        //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 3) * VECLEN, a0);
+        c00 = a3;
+        c01 = a2;
+        c02 = a1;
+        c03 = a0;
     }
 
     i = 1;
@@ -12593,10 +12764,14 @@ void vecsqrmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t*
         a1 = _mm512_and_epi64(vlmask, a1);
         a0 = _mm512_and_epi64(vlmask, a0);
 
-        _mm512_store_epi64(c->data + (i * BLOCKWORDS + 0) * VECLEN, a3);
-        _mm512_store_epi64(c->data + (i * BLOCKWORDS + 1) * VECLEN, a2);
-        _mm512_store_epi64(c->data + (i * BLOCKWORDS + 2) * VECLEN, a1);
-        _mm512_store_epi64(c->data + (i * BLOCKWORDS + 3) * VECLEN, a0);
+        //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 0) * VECLEN, a3);
+        //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 1) * VECLEN, a2);
+        //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 2) * VECLEN, a1);
+        //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 3) * VECLEN, a0);
+        c04 = a3;
+        c05 = a2;
+        c06 = a1;
+        c07 = a0;
     }
 
     //printf("fixed416: \n");
@@ -12606,6 +12781,145 @@ void vecsqrmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t*
     //}
     //printf("\n");
 
+#ifdef USE_AMM
+    _mm512_store_epi64(c->data + 0 * VECLEN, c00);
+    _mm512_store_epi64(c->data + 1 * VECLEN, c01);
+    _mm512_store_epi64(c->data + 2 * VECLEN, c02);
+    _mm512_store_epi64(c->data + 3 * VECLEN, c03);
+    _mm512_store_epi64(c->data + 4 * VECLEN, c04);
+    _mm512_store_epi64(c->data + 5 * VECLEN, c05);
+    _mm512_store_epi64(c->data + 6 * VECLEN, c06);
+    _mm512_store_epi64(c->data + 7 * VECLEN, c07);
+    _mm512_store_epi64(c->data + NWORDS * VECLEN, zero);
+#else
+
+    __m512i cvec;
+    __m512i nvec;
+    __m512i bvec;
+
+    // compare
+    scarry = 0;	    // sub mask
+    scarry2 = 0;	// keep looking mask
+
+    cvec = c07;
+    nvec = _mm512_load_epi64(mdata->n->data + 7 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c06;
+    nvec = _mm512_load_epi64(mdata->n->data + 6 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c05;
+    nvec = _mm512_load_epi64(mdata->n->data + 5 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c04;
+    nvec = _mm512_load_epi64(mdata->n->data + 4 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c03;
+    nvec = _mm512_load_epi64(mdata->n->data + 3 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c02;
+    nvec = _mm512_load_epi64(mdata->n->data + 2 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c01;
+    nvec = _mm512_load_epi64(mdata->n->data + 1 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    cvec = c00;
+    nvec = _mm512_load_epi64(mdata->n->data + 0 * VECLEN);
+    // compare those that have not already been decided using the mask
+    scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT);
+    scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT);
+
+    // decided all of them, stop comparing.
+    if (scarry2 == 0xff) goto sub;
+
+    // check for equal as well by flipping mask bits that have still
+    // not been decided (i.e., are equal)
+    scarry |= (~scarry2);
+
+sub:
+
+    if (scarry == 0) goto done;
+
+    // subtract n from c when c is not less than n, as indicated by a 1 bit in mask
+    scarry2 = 0;
+
+    nvec = _mm512_load_epi64(mdata->n->data + 0 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c00, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 0 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 1 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c01, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 1 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 2 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c02, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 2 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 3 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c03, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 3 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 4 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c04, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 4 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 5 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c05, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 5 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 6 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c06, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 6 * VECLEN, bvec);
+
+    nvec = _mm512_load_epi64(mdata->n->data + 7 * VECLEN);
+    bvec = _mm512_mask_sbb_src_epi52(zero, c07, scarry, scarry2, nvec, &scarry2);
+    _mm512_store_epi64(c->data + 7 * VECLEN, bvec);
+
+done:
+
+#endif
+
     c->size = NWORDS;
     return;
 }
@@ -12616,7 +12930,7 @@ void vecsqrmod52_mul(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* n, vec_bign
     return;
 }
 
-void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bignum_t *s, vec_monty_t*mdata)
+void vecsqrmod52(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bignum_t *s, vec_monty_t*mdata)
 {
     // 8x sqr:
     // input 8 bignums in the even lanes of a.
@@ -12651,6 +12965,12 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
     __mmask8 scarry2;
     __mmask8 scarry;
 
+#ifdef USE_AMM
+    uint64_t* outdata = c->data;
+#else
+    uint64_t* outdata = s->data;
+#endif
+
     // deal with the sign
     c->size = NWORDS;
     c->signmask = 0;
@@ -13054,10 +13374,10 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
         for (j = 0; j < i; j++)
         {
             // accumulate s * n
-            a0 = _mm512_load_epi32(s->data + ((j + 1) * BLOCKWORDS - 1) * VECLEN);
-            a1 = _mm512_load_epi32(s->data + ((j + 1) * BLOCKWORDS - 2) * VECLEN);
-            a2 = _mm512_load_epi32(s->data + ((j + 1) * BLOCKWORDS - 3) * VECLEN);
-            a3 = _mm512_load_epi32(s->data + ((j + 1) * BLOCKWORDS - 4) * VECLEN);
+            a0 = _mm512_load_epi32(outdata + ((j + 1) * BLOCKWORDS - 1) * VECLEN);
+            a1 = _mm512_load_epi32(outdata + ((j + 1) * BLOCKWORDS - 2) * VECLEN);
+            a2 = _mm512_load_epi32(outdata + ((j + 1) * BLOCKWORDS - 3) * VECLEN);
+            a3 = _mm512_load_epi32(outdata + ((j + 1) * BLOCKWORDS - 4) * VECLEN);
 
             b0 = _mm512_load_epi32(n->data + ((i - j - 1) * BLOCKWORDS + 1) * VECLEN);
             b1 = _mm512_load_epi32(n->data + ((i - j - 1) * BLOCKWORDS + 2) * VECLEN);
@@ -13104,7 +13424,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
 
             //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0));
             _mm512_mullo_epi52(a0, nhatvec_e, acc_e0);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0);
             b0 = _mm512_load_epi64(n->data + 0 * VECLEN);
 
             // add in the final product
@@ -13128,7 +13448,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
 
             for (k = 0; k < j; k++)
             {
-                a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN);
+                a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN);
                 b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN);
 
                 VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1);
@@ -13136,7 +13456,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
 
             //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0));
             _mm512_mullo_epi52(a0, nhatvec_e, acc_e0);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0);
 
             // add in the final product
             VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1);
@@ -13159,7 +13479,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
 
             for (k = 0; k < j; k++)
             {
-                a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN);
+                a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN);
                 b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN);
 
                 VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1);
@@ -13167,7 +13487,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
 
             //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0));
             _mm512_mullo_epi52(a0, nhatvec_e, acc_e0);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0);
 
             // add in the final product
             VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1);
@@ -13190,7 +13510,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
 
             for (k = 0; k < j; k++)
             {
-                a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN);
+                a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN);
                 b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN);
 
                 VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1);
@@ -13198,7 +13518,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
 
             //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0));
             _mm512_mullo_epi52(a0, nhatvec_e, acc_e0);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0);
 
             // add in the final product
             VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1);
@@ -13769,10 +14089,10 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
         // the s*n terms.  No more doubling past here.
         for (j = 0; j < NBLOCKS - 1 - i; j++)
         {
-            a0 = _mm512_load_epi64(s->data + (NWORDS - 1 - j * BLOCKWORDS) * VECLEN);
-            a1 = _mm512_load_epi64(s->data + (NWORDS - 2 - j * BLOCKWORDS) * VECLEN);
-            a2 = _mm512_load_epi64(s->data + (NWORDS - 3 - j * BLOCKWORDS) * VECLEN);
-            a3 = _mm512_load_epi64(s->data + (NWORDS - 4 - j * BLOCKWORDS) * VECLEN);
+            a0 = _mm512_load_epi64(outdata + (NWORDS - 1 - j * BLOCKWORDS) * VECLEN);
+            a1 = _mm512_load_epi64(outdata + (NWORDS - 2 - j * BLOCKWORDS) * VECLEN);
+            a2 = _mm512_load_epi64(outdata + (NWORDS - 3 - j * BLOCKWORDS) * VECLEN);
+            a3 = _mm512_load_epi64(outdata + (NWORDS - 4 - j * BLOCKWORDS) * VECLEN);
 
             b0 = _mm512_load_epi64(n->data + ((i + j) * BLOCKWORDS + 1) * VECLEN);
             b1 = _mm512_load_epi64(n->data + ((i + j) * BLOCKWORDS + 2) * VECLEN);
@@ -13789,9 +14109,9 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
         }
 
         // finish each triangluar shaped column sum (s * n)
-        a1 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + 1) * VECLEN);
-        a2 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + 2) * VECLEN);
-        a3 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + 3) * VECLEN);
+        a1 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + 1) * VECLEN);
+        a2 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + 2) * VECLEN);
+        a3 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + 3) * VECLEN);
 
         b0 = _mm512_load_epi64(n->data + (NWORDS - 1) * VECLEN);
         b1 = _mm512_load_epi64(n->data + (NWORDS - 2) * VECLEN);
@@ -13955,16 +14275,22 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
             a1 = _mm512_and_epi64(vlmask, a1);
             a0 = _mm512_and_epi64(vlmask, a0);
 
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + 0) * VECLEN, a3);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + 1) * VECLEN, a2);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + 2) * VECLEN, a1);
-            _mm512_store_epi64(s->data + (i * BLOCKWORDS + 3) * VECLEN, a0);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + 0) * VECLEN, a3);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + 1) * VECLEN, a2);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + 2) * VECLEN, a1);
+            _mm512_store_epi64(outdata + (i * BLOCKWORDS + 3) * VECLEN, a0);
         }
 
 
     }
 
-#ifndef USE_AMM
+#ifdef USE_AMM
+    //for (i = NWORDS - 1; i >= 0; i--)
+    //{
+    //    b0 = _mm512_load_epi64(s->data + i * VECLEN);
+    //    _mm512_store_epi64(c->data + i * VECLEN, b0);
+    //}
+#else
     a0 = acc_e0;
     scarry2 = _mm512_cmp_epu64_mask(a0, zero, _MM_CMPINT_EQ);
 
@@ -14002,7 +14328,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign
     return;
 }
 
-void vecsqrmod52(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata)
+void vecsqrmod52_avxecm(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata)
 {
     // 8x sqr:
     // input 8 bignums in the even lanes of a.
@@ -15237,7 +15563,7 @@ void vecsqrmod52(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t
     return;
 }
 
-void vecaddmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata)
+void vecaddmod52_avxecm(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata)
 {
     // assumptions:
     // a, b, c are of length VECLEN * NWORDS
@@ -15300,7 +15626,7 @@ void vecaddmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t*
     return;
 }
 
-void vecaddmod52_207(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_monty_t* mdata)
+void vecaddmod52(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_monty_t* mdata)
 {
     // assumptions:
     // a, b, c are of length VECLEN * NWORDS
@@ -15380,7 +15706,7 @@ void vecaddmod52_207(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_mont
     return;
 }
 
-void vecsubmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata)
+void vecsubmod52_avxecm(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata)
 {
     // assumptions:
     // a, b, c are of length VECLEN * NWORDS
@@ -15422,7 +15748,7 @@ void vecsubmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t*
     return;
 }
 
-void vecsubmod52_207(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_monty_t* mdata)
+void vecsubmod52(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_monty_t* mdata)
 {
     // assumptions:
     // a, b, c are of length VECLEN * NWORDS
@@ -15553,7 +15879,8 @@ void vecsignedaddmod52(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_bi
     return;
 }
 
-void vec_simul_addsub52_fixed1040(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* sum, vec_bignum_t* diff,
+void vec_simul_addsub52_fixed1040(vec_bignum_t* a, vec_bignum_t* b, 
+    vec_bignum_t* sum, vec_bignum_t* diff,
     vec_monty_t* mdata)
 {
     // assumptions:
@@ -15878,7 +16205,7 @@ void vec_simul_addsub52_fixed1040(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t
     return;
 }
 
-void vec_simul_addsub52(vec_bignum_t* a, vec_bignum_t* b, 
+void vec_simul_addsub52_avxecm(vec_bignum_t* a, vec_bignum_t* b, 
     vec_bignum_t* sum, vec_bignum_t* diff, vec_monty_t* mdata)
 {
     // assumptions:
@@ -15973,7 +16300,8 @@ void vec_simul_addsub52(vec_bignum_t* a, vec_bignum_t* b,
     return;
 }
 
-void vec_simul_addsub52_207(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *sum, vec_bignum_t *diff, 
+void vec_simul_addsub52(vec_bignum_t *a, vec_bignum_t *b, 
+    vec_bignum_t *sum, vec_bignum_t *diff, 
     vec_monty_t* mdata)
 {
     // assumptions:
diff --git a/yafu/factor/factor_common.c b/yafu/factor/factor_common.c
index 1248204..38b0ac3 100644
--- a/yafu/factor/factor_common.c
+++ b/yafu/factor/factor_common.c
@@ -321,6 +321,198 @@ void alloc_factobj(fact_obj_t *fobj)
 	return;
 }
 
+// copy_factobj: copy the user-configurable settings and the detected machine
+// info of src into dest.  dest must already be initialized (the caller in
+// autofactor.c runs init_factobj on it first): dest->factors is dereferenced
+// and dest->ecm_obj.lcg_state is passed to xrealloc below.
+// dest->N is not assigned here; the caller sets the number to factor
+// afterwards.
+void copy_factobj(fact_obj_t* dest, fact_obj_t* src)
+{
+    int i;
+
+    // general settings
+    dest->seed1 = src->seed1;
+    dest->seed2 = src->seed2;
+    dest->lcg_state = src->lcg_state;
+    dest->flags = src->flags;
+    dest->num_threads = src->num_threads;
+    strcpy(dest->flogname, src->flogname);
+    dest->do_logging = src->do_logging;   // not used...
+    dest->LOGFLAG = src->LOGFLAG;
+    dest->NUM_WITNESSES = src->NUM_WITNESSES;
+
+    // copy settings for rho
+    dest->rho_obj.iterations = src->rho_obj.iterations;
+    dest->rho_obj.curr_poly = src->rho_obj.curr_poly;
+
+    // copy settings for pm1
+    dest->pm1_obj.B1 = src->pm1_obj.B1;
+    dest->pm1_obj.B2 = src->pm1_obj.B2;
+    dest->pm1_obj.stg2_is_default = src->pm1_obj.stg2_is_default;
+    dest->pm1_obj.pm1_exponent = src->pm1_obj.pm1_exponent;
+    dest->pm1_obj.pm1_multiplier = src->pm1_obj.pm1_multiplier;
+    dest->pm1_obj.pm1_tune_freq = src->pm1_obj.pm1_tune_freq;
+    dest->pm1_obj.vecnum = src->pm1_obj.vecnum;
+
+    // copy settings for pp1
+    dest->pp1_obj.B1 = src->pp1_obj.B1;
+    dest->pp1_obj.B2 = src->pp1_obj.B2;
+    dest->pp1_obj.stg2_is_default = src->pp1_obj.stg2_is_default;
+    dest->pp1_obj.pp1_exponent = src->pp1_obj.pp1_exponent;
+    dest->pp1_obj.pp1_multiplier = src->pp1_obj.pp1_multiplier;
+    dest->pp1_obj.pp1_tune_freq = src->pp1_obj.pp1_tune_freq;
+    dest->pp1_obj.vecnum = src->pp1_obj.vecnum;
+
+    // copy settings for ecm
+    dest->ecm_obj.B1 = src->ecm_obj.B1;
+    dest->ecm_obj.B2 = src->ecm_obj.B2;
+    dest->ecm_obj.stg2_is_default = src->ecm_obj.stg2_is_default;
+    dest->ecm_obj.sigma = src->ecm_obj.sigma;
+    dest->ecm_obj.num_curves = src->ecm_obj.num_curves;
+    dest->ecm_obj.curves_run = src->ecm_obj.curves_run;
+    dest->ecm_obj.ecm_exponent = src->ecm_obj.ecm_exponent;
+    dest->ecm_obj.ecm_multiplier = src->ecm_obj.ecm_multiplier;
+    dest->ecm_obj.ecm_tune_freq = src->ecm_obj.ecm_tune_freq;
+    dest->ecm_obj.bail_on_factor = src->ecm_obj.bail_on_factor;
+    dest->ecm_obj.save_b1 = src->ecm_obj.save_b1;
+
+    // unlike ggnfs, ecm does not *require* external binaries.
+    // an empty string indicates the use of the built-in GMP-ECM hooks, while
+    // a non-empty string (filled in by the user) will indicate the use of
+    // an external binary
+    strcpy(dest->ecm_obj.ecm_path, src->ecm_obj.ecm_path);
+    dest->ecm_obj.use_external = src->ecm_obj.use_external;
+    dest->ecm_obj.prefer_gmpecm = src->ecm_obj.prefer_gmpecm;
+    dest->ecm_obj.ecm_ext_xover = src->ecm_obj.ecm_ext_xover;
+
+    // the per-thread ecm generator states are not copied from src: the array
+    // is resized to src->num_threads entries and every entry is regenerated
+    // by handing &dest->lcg_state (copied from src above) to lcg_rand_64.
+    dest->ecm_obj.lcg_state = (uint64_t*)xrealloc(dest->ecm_obj.lcg_state,
+        src->num_threads * sizeof(uint64_t));
+    for (i = 0; i < (int)src->num_threads; i++)
+    {
+        dest->ecm_obj.lcg_state[i] =
+            hash64(lcg_rand_64(&dest->lcg_state));
+    }
+
+    // copy settings for squfof
+    dest->squfof_obj.num_factors = src->squfof_obj.num_factors;
+
+    // copy settings for qs
+    dest->qs_obj.gbl_override_B_flag = src->qs_obj.gbl_override_B_flag;
+    dest->qs_obj.gbl_override_B = src->qs_obj.gbl_override_B;
+    dest->qs_obj.gbl_override_small_cutoff_flag = src->qs_obj.gbl_override_small_cutoff_flag;
+    dest->qs_obj.gbl_override_small_cutoff = src->qs_obj.gbl_override_small_cutoff;
+    dest->qs_obj.gbl_override_blocks_flag = src->qs_obj.gbl_override_blocks_flag;
+    dest->qs_obj.gbl_override_blocks = src->qs_obj.gbl_override_blocks;
+    dest->qs_obj.gbl_override_lpmult_flag = src->qs_obj.gbl_override_lpmult_flag;
+    dest->qs_obj.gbl_override_lpmult = src->qs_obj.gbl_override_lpmult;
+    dest->qs_obj.gbl_override_rel_flag = src->qs_obj.gbl_override_rel_flag;
+    dest->qs_obj.gbl_override_rel = src->qs_obj.gbl_override_rel;
+    dest->qs_obj.gbl_override_tf_flag = src->qs_obj.gbl_override_tf_flag;
+    dest->qs_obj.gbl_override_tf = src->qs_obj.gbl_override_tf;
+    dest->qs_obj.gbl_override_time_flag = src->qs_obj.gbl_override_time_flag;
+    dest->qs_obj.gbl_override_time = src->qs_obj.gbl_override_time;
+    dest->qs_obj.gbl_override_mfbd = src->qs_obj.gbl_override_mfbd;
+    dest->qs_obj.gbl_override_mfbt = src->qs_obj.gbl_override_mfbt;
+    dest->qs_obj.gbl_override_lpb = src->qs_obj.gbl_override_lpb;
+    dest->qs_obj.gbl_override_bdiv_flag = src->qs_obj.gbl_override_bdiv_flag;
+    dest->qs_obj.gbl_override_bdiv = src->qs_obj.gbl_override_bdiv;
+    dest->qs_obj.gbl_override_3lp_bat = src->qs_obj.gbl_override_3lp_bat;
+    dest->qs_obj.gbl_btarget = src->qs_obj.gbl_btarget;
+    dest->qs_obj.flags = src->qs_obj.flags;
+    dest->qs_obj.gbl_force_DLP = src->qs_obj.gbl_force_DLP;
+    dest->qs_obj.gbl_force_TLP = src->qs_obj.gbl_force_TLP;
+    dest->qs_obj.qs_exponent = src->qs_obj.qs_exponent;
+    dest->qs_obj.qs_multiplier = src->qs_obj.qs_multiplier;
+    dest->qs_obj.qs_tune_freq = src->qs_obj.qs_tune_freq;
+    dest->qs_obj.no_small_cutoff_opt = src->qs_obj.no_small_cutoff_opt;
+    strcpy(dest->qs_obj.siqs_savefile, src->qs_obj.siqs_savefile);
+
+    // NOTE(review): presumably mirrors the init path; confirm that this call
+    // is still needed when only copying settings.
+    init_lehman();
+
+    // copy settings for trial division
+    dest->div_obj.print = src->div_obj.print;
+    dest->div_obj.limit = src->div_obj.limit;
+    dest->div_obj.fmtlimit = src->div_obj.fmtlimit;
+
+    // copy settings for nfs
+    dest->nfs_obj.snfs = src->nfs_obj.snfs;
+    dest->nfs_obj.gnfs = src->nfs_obj.gnfs;
+    dest->nfs_obj.gnfs_exponent = src->nfs_obj.gnfs_exponent;
+    dest->nfs_obj.gnfs_multiplier = src->nfs_obj.gnfs_multiplier;
+    dest->nfs_obj.gnfs_tune_freq = src->nfs_obj.gnfs_tune_freq;
+    dest->nfs_obj.min_digits = src->nfs_obj.min_digits;
+    dest->nfs_obj.filter_min_rels_nudge = src->nfs_obj.filter_min_rels_nudge;
+    dest->nfs_obj.siever = src->nfs_obj.siever;
+    dest->nfs_obj.startq = src->nfs_obj.startq;
+    dest->nfs_obj.rangeq = src->nfs_obj.rangeq;
+    dest->nfs_obj.polystart = src->nfs_obj.polystart;
+    dest->nfs_obj.polyrange = src->nfs_obj.polyrange;
+    strcpy(dest->nfs_obj.outputfile, src->nfs_obj.outputfile);
+    strcpy(dest->nfs_obj.logfile, src->nfs_obj.logfile);
+    strcpy(dest->nfs_obj.fbfile, src->nfs_obj.fbfile);
+    dest->nfs_obj.sq_side = src->nfs_obj.sq_side;
+    dest->nfs_obj.timeout = src->nfs_obj.timeout;
+    strcpy(dest->nfs_obj.job_infile, src->nfs_obj.job_infile);
+    dest->nfs_obj.poly_option = src->nfs_obj.poly_option;
+    dest->nfs_obj.restart_flag = src->nfs_obj.restart_flag;
+    dest->nfs_obj.nfs_phases = src->nfs_obj.nfs_phases;
+    dest->nfs_obj.snfs_testsieve_threshold = src->nfs_obj.snfs_testsieve_threshold;
+    strcpy(dest->nfs_obj.filearg, src->nfs_obj.filearg);
+
+    dest->nfs_obj.polybatch = src->nfs_obj.polybatch;
+    // the copy is the same on every platform, so no per-OS branches are needed
+    strcpy(dest->nfs_obj.ggnfs_dir, src->nfs_obj.ggnfs_dir);
+
+    // copy settings for autofactor
+    // whether we want to output certain info to their own files...
+    dest->autofact_obj.want_output_primes = src->autofact_obj.want_output_primes;
+    dest->autofact_obj.want_output_factors = src->autofact_obj.want_output_factors;
+    dest->autofact_obj.want_output_unfactored = src->autofact_obj.want_output_unfactored;
+    dest->autofact_obj.want_output_expressions = src->autofact_obj.want_output_expressions;
+    dest->autofact_obj.qs_gnfs_xover = src->autofact_obj.qs_gnfs_xover;
+    dest->autofact_obj.qs_snfs_xover = src->autofact_obj.qs_snfs_xover;
+    // use xover even when timing info is available
+    dest->autofact_obj.prefer_xover = src->autofact_obj.prefer_xover;
+    dest->autofact_obj.want_only_1_factor = src->autofact_obj.want_only_1_factor;
+    dest->autofact_obj.no_ecm = src->autofact_obj.no_ecm;
+    dest->autofact_obj.target_pretest_ratio = src->autofact_obj.target_pretest_ratio;
+    dest->autofact_obj.initial_work = src->autofact_obj.initial_work;
+    dest->autofact_obj.has_snfs_form = src->autofact_obj.has_snfs_form;
+
+    // pretesting plan used by factor()
+    dest->autofact_obj.yafu_pretest_plan = src->autofact_obj.yafu_pretest_plan;
+    strcpy(dest->autofact_obj.plan_str, src->autofact_obj.plan_str);
+    dest->autofact_obj.only_pretest = src->autofact_obj.only_pretest;
+    dest->autofact_obj.autofact_active = src->autofact_obj.autofact_active;
+
+    // if a number is <= aprcl_prove_cutoff, we will prove it prime or composite
+    dest->factors->aprcl_prove_cutoff = src->factors->aprcl_prove_cutoff;
+    // if a number is >= aprcl_display_cutoff, we will show the APRCL progress
+    dest->factors->aprcl_display_cutoff = src->factors->aprcl_display_cutoff;
+
+    // machine info
+    // (NUM_WITNESSES and LOGFLAG are copied with the general settings above)
+    dest->MEAS_CPU_FREQUENCY = 42;  // not used anymore
+    strcpy(dest->CPU_ID_STR, src->CPU_ID_STR);
+    dest->HAS_AVX2 = src->HAS_AVX2;
+    dest->HAS_AVX = src->HAS_AVX;
+    dest->HAS_SSE41 = src->HAS_SSE41;
+    dest->cache_size1 = src->cache_size1;
+    dest->cache_size2 = src->cache_size2;
+    dest->THREADS = src->THREADS;
+    dest->HAS_BMI2 = src->HAS_BMI2;
+    dest->HAS_AVX512F = src->HAS_AVX512F;
+    dest->HAS_AVX512BW = src->HAS_AVX512BW;
+
+    return;
+}
+
 void reset_factobj(fact_obj_t *fobj)
 {
 	// keep all of the settings in fobj, but do an init/free cycle on all
diff --git a/yafu/factor/nfs/nfs.c b/yafu/factor/nfs/nfs.c
index 1010f90..66f9bd7 100644
--- a/yafu/factor/nfs/nfs.c
+++ b/yafu/factor/nfs/nfs.c
@@ -1207,7 +1207,7 @@ int get_ggnfs_params(fact_obj_t *fobj, nfs_job_t *job)
 
         d = job->snfs->sdifficulty;
 
-        int do_skew_opt = 1;
+        int do_skew_opt = 0;
         
         if (do_skew_opt && (job->snfs->poly->skew > 0))
         {
@@ -1226,6 +1226,7 @@ int get_ggnfs_params(fact_obj_t *fobj, nfs_job_t *job)
             do
             {                
                 job->snfs->poly->skew += skew1percent;
+                
                 //printf("on iteration %d trying skew %lf: ", i++, job->snfs->poly->skew);
                 analyze_one_poly_xface(job->snfs);
                 //printf("murphy = %le\n", job->snfs->poly->murphy);
diff --git a/yafu/factor/nfs/nfs.o b/yafu/factor/nfs/nfs.o
deleted file mode 100644
index 0485ef0..0000000
Binary files a/yafu/factor/nfs/nfs.o and /dev/null differ
diff --git a/yafu/factor/nfs/nfs_filemanip.o b/yafu/factor/nfs/nfs_filemanip.o
deleted file mode 100644
index 4074755..0000000
Binary files a/yafu/factor/nfs/nfs_filemanip.o and /dev/null differ
diff --git a/yafu/factor/nfs/nfs_poly.c b/yafu/factor/nfs/nfs_poly.c
index 910cd64..65a0f52 100644
--- a/yafu/factor/nfs/nfs_poly.c
+++ b/yafu/factor/nfs/nfs_poly.c
@@ -314,7 +314,7 @@ int snfs_choose_poly(fact_obj_t* fobj, nfs_job_t* job)
         for (i = 0; i < 100; i++)
         {
             best->poly->skew = origskew * (1 + (0.4 * (rand() / RAND_MAX) - 0.2));
-            printf("on iteration %d trying skew %lf: ", i, best->poly->skew);
+            //printf("on iteration %d trying skew %lf: ", i, best->poly->skew);
             analyze_one_poly_xface(best->snfs);
             if (best->poly->murphy > bestmurph)
             {
diff --git a/yafu/factor/nfs/nfs_poly.o b/yafu/factor/nfs/nfs_poly.o
deleted file mode 100644
index 0efa788..0000000
Binary files a/yafu/factor/nfs/nfs_poly.o and /dev/null differ
diff --git a/yafu/factor/nfs/nfs_postproc.o b/yafu/factor/nfs/nfs_postproc.o
deleted file mode 100644
index 469c235..0000000
Binary files a/yafu/factor/nfs/nfs_postproc.o and /dev/null differ
diff --git a/yafu/factor/nfs/nfs_sieving.o b/yafu/factor/nfs/nfs_sieving.o
deleted file mode 100644
index 4b15dc9..0000000
Binary files a/yafu/factor/nfs/nfs_sieving.o and /dev/null differ
diff --git a/yafu/factor/nfs/nfs_threading.o b/yafu/factor/nfs/nfs_threading.o
deleted file mode 100644
index 1bfc503..0000000
Binary files a/yafu/factor/nfs/nfs_threading.o and /dev/null differ
diff --git a/yafu/factor/nfs/snfs.c b/yafu/factor/nfs/snfs.c
index 947a6c5..3960fc8 100644
--- a/yafu/factor/nfs/snfs.c
+++ b/yafu/factor/nfs/snfs.c
@@ -1016,7 +1016,7 @@ void find_xyyxf_form(fact_obj_t *fobj, snfs_t *form)
 void find_direct_form(fact_obj_t* fobj, snfs_t* form)
 {
     int b, p, found = 0, i, deg;
-    int c[7];
+    int c[10];
     mpz_t t, r, g, n, m;
 
     // find the following forms:
@@ -1107,7 +1107,7 @@ void find_direct_form(fact_obj_t* fobj, snfs_t* form)
                 form->form_type = SNFS_DIRECT;
                 mpz_set_ui(form->base1, b);
                 form->exp1 = p;
-                for (i = 6; i >= 0; i--)
+                for (i = 8; i >= 0; i--)
                 {
                     mpz_set_si(form->c[i], c[i]);
                 }
diff --git a/yafu/factor/nfs/snfs.o b/yafu/factor/nfs/snfs.o
deleted file mode 100644
index a6e4ddf..0000000
Binary files a/yafu/factor/nfs/snfs.o and /dev/null differ
diff --git a/yafu/factor/qs/SIQS.c b/yafu/factor/qs/SIQS.c
index b530761..718a51d 100644
--- a/yafu/factor/qs/SIQS.c
+++ b/yafu/factor/qs/SIQS.c
@@ -685,7 +685,7 @@ void SIQS(fact_obj_t *fobj)
 
 	// fill in the factorization object	
 	fobj->qs_obj.savefile.name = (char *)malloc(80 * sizeof(char));
-	strcpy(fobj->savefile_name, fobj->qs_obj.siqs_savefile);
+	strncpy(fobj->savefile_name, fobj->qs_obj.siqs_savefile, 80);
 
 	// initialize the data objects both shared (static) and 
 	// per-thread (dynamic)
diff --git a/yafu/factor/qs/med_sieve_32k_avx2.o b/yafu/factor/qs/med_sieve_32k_avx2.o
deleted file mode 100644
index 35cefe9..0000000
Binary files a/yafu/factor/qs/med_sieve_32k_avx2.o and /dev/null differ
diff --git a/yafu/factor/qs/med_sieve_32k_sse4.1.o b/yafu/factor/qs/med_sieve_32k_sse4.1.o
deleted file mode 100644
index ec3a51a..0000000
Binary files a/yafu/factor/qs/med_sieve_32k_sse4.1.o and /dev/null differ
diff --git a/yafu/factor/qs/tdiv_med_32k_avx2.o b/yafu/factor/qs/tdiv_med_32k_avx2.o
deleted file mode 100644
index 172e081..0000000
Binary files a/yafu/factor/qs/tdiv_med_32k_avx2.o and /dev/null differ
diff --git a/yafu/factor/qs/tdiv_resieve_32k_avx2.o b/yafu/factor/qs/tdiv_resieve_32k_avx2.o
deleted file mode 100644
index 17fd7d9..0000000
Binary files a/yafu/factor/qs/tdiv_resieve_32k_avx2.o and /dev/null differ
diff --git a/yafu/factor/qs/update_poly_roots_32k_avx2.o b/yafu/factor/qs/update_poly_roots_32k_avx2.o
deleted file mode 100644
index 6c67a53..0000000
Binary files a/yafu/factor/qs/update_poly_roots_32k_avx2.o and /dev/null differ
diff --git a/yafu/factor/qs/update_poly_roots_32k_knl.o b/yafu/factor/qs/update_poly_roots_32k_knl.o
deleted file mode 100644
index cb4c999..0000000
Binary files a/yafu/factor/qs/update_poly_roots_32k_knl.o and /dev/null differ
diff --git a/yafu/factor/qs/update_poly_roots_32k_sse4.1.o b/yafu/factor/qs/update_poly_roots_32k_sse4.1.o
deleted file mode 100644
index 39793c9..0000000
Binary files a/yafu/factor/qs/update_poly_roots_32k_sse4.1.o and /dev/null differ
diff --git a/yafu/include/factor.h b/yafu/include/factor.h
index bb7d263..e2df154 100644
--- a/yafu/include/factor.h
+++ b/yafu/include/factor.h
@@ -468,6 +468,7 @@ typedef struct
 void init_factobj(fact_obj_t *fobj);
 void free_factobj(fact_obj_t *fobj);
 void reset_factobj(fact_obj_t *fobj);
+void copy_factobj(fact_obj_t* dest, fact_obj_t* src);
 void alloc_factobj(fact_obj_t *fobj);
 
 //#if defined(WIN32)
diff --git a/yafu/include/yafu.h b/yafu/include/yafu.h
index 4180f6a..66ad028 100644
--- a/yafu/include/yafu.h
+++ b/yafu/include/yafu.h
@@ -22,7 +22,7 @@ code to the public domain.
 #ifndef _YAFU_HEAD_DEF
 #define _YAFU_HEAD_DEF
 
-#define YAFU_VERSION_STRING "2.07"
+#define YAFU_VERSION_STRING "2.08"
 
 // default maximum size for strings/buffers
 #define GSTR_MAXSIZE 1024
@@ -62,6 +62,7 @@ typedef struct
     int NO_CLK_TEST;
 
     // machine info
+    char CWD[1024];
     double MEAS_CPU_FREQUENCY;
     int VERBOSE_PROC_INFO;
     char CPU_ID_STR[256];
diff --git a/yafu/libyecm.a b/yafu/libyecm.a
deleted file mode 100644
index 04e6c27..0000000
Binary files a/yafu/libyecm.a and /dev/null differ
diff --git a/yafu/libynfs.a b/yafu/libynfs.a
deleted file mode 100644
index 0bb8037..0000000
Binary files a/yafu/libynfs.a and /dev/null differ
diff --git a/yafu/libysiqs.a b/yafu/libysiqs.a
deleted file mode 100644
index 71ea137..0000000
Binary files a/yafu/libysiqs.a and /dev/null differ
diff --git a/yafu/session.log b/yafu/session.log
deleted file mode 100644
index fe2598d..0000000
--- a/yafu/session.log
+++ /dev/null
@@ -1,22 +0,0 @@
-01/11/22 22:17:20, =====================================
-01/11/22 22:17:20, System/Build Info: 
-01/11/22 22:17:20, YAFU Version 2.07
-01/11/22 22:17:20, Built with GCC 9
-01/11/22 22:17:20, Using GMP-ECM 7.0.5-dev, Powered by GMP 6.2.1
-01/11/22 22:17:20, detected Intel Xeon Processor (Cascadelake)
-detected L1 = 32768 bytes, L2 = 16777216 bytes, CL = 64 bytes
-01/11/22 22:17:20, using 1 random witness for Rabin-Miller PRP checks
-01/11/22 22:17:20, Cached 664579 primes: max prime is 9999991
-
-01/11/22 22:17:20, Random seed: 17372657678477452430
-01/11/22 22:25:17, =====================================
-01/11/22 22:25:17, System/Build Info: 
-01/11/22 22:25:17, YAFU Version 2.07
-01/11/22 22:25:17, Built with GCC 9
-01/11/22 22:25:17, Using GMP-ECM 7.0.5-dev, Powered by GMP 6.2.1
-01/11/22 22:25:17, detected Intel Xeon Processor (Cascadelake)
-detected L1 = 32768 bytes, L2 = 16777216 bytes, CL = 64 bytes
-01/11/22 22:25:17, using 1 random witness for Rabin-Miller PRP checks
-01/11/22 22:25:17, Cached 664579 primes: max prime is 9999991
-
-01/11/22 22:25:17, Random seed: 14069056675023856505
diff --git a/yafu/top/cmdParser/cmdOptions.c b/yafu/top/cmdParser/cmdOptions.c
index e6afe73..b52a62d 100644
--- a/yafu/top/cmdParser/cmdOptions.c
+++ b/yafu/top/cmdParser/cmdOptions.c
@@ -1359,7 +1359,7 @@ void printUsage(options_t* options)
 // this function should not need to be changed:
 // parse options from a .ini (or other) file.
 // ========================================================================
-void readINI(const char* filename, options_t* options)
+int readINI(const char* filename, options_t* options)
 {
     FILE* doc;
     char* str;
@@ -1372,7 +1372,7 @@ void readINI(const char* filename, options_t* options)
     if (doc == NULL)
     {
         printf("warning: could not open %s, no options parsed\n", filename);
-        return;
+        return 0;
     }
 
     str = (char*)malloc(1024 * sizeof(char));
@@ -1424,5 +1424,5 @@ void readINI(const char* filename, options_t* options)
     fclose(doc);
     free(str);
 
-    return;
+    return 1;
 }
diff --git a/yafu/top/cmdParser/cmdOptions.h b/yafu/top/cmdParser/cmdOptions.h
index 1ec4dfa..7aff3b0 100644
--- a/yafu/top/cmdParser/cmdOptions.h
+++ b/yafu/top/cmdParser/cmdOptions.h
@@ -191,7 +191,7 @@ typedef struct
 extern options_t* initOpt(void);
 extern void applyOpt(char* opt, char* arg, options_t* options);
 extern int processOpts(int argc, char** argv, options_t* options);
-extern void readINI(const char* filename, options_t* options);
+extern int readINI(const char* filename, options_t* options);
 
 
 #ifdef __cplusplus
diff --git a/yafu/top/driver.c b/yafu/top/driver.c
index 4c38b9b..eb46c14 100644
--- a/yafu/top/driver.c
+++ b/yafu/top/driver.c
@@ -41,7 +41,7 @@ void apply_tuneinfo(yafu_obj_t* yobj, fact_obj_t *fobj, char *arg);
 
 // function to print the splash screen to file/screen
 void print_splash(fact_obj_t* fobj, info_t* comp_info, int is_cmdline_run,
-    FILE* logfile, int VFLAG, double freq, int numwit);
+    FILE* logfile, int VFLAG, double freq, int numwit, char *cwd);
 void helpfunc(char* s);
 
 // functions to make a batchfile ready to execute, and to process batchfile lines
@@ -79,6 +79,7 @@ int main(int argc, char *argv[])
     //soe_staticdata_t* sdata;
     info_t comp_info;
     int i;
+    int ini_success;
 
 #if defined(__unix__)
 
@@ -106,7 +107,7 @@ int main(int argc, char *argv[])
 	// set defaults for various things and read the .ini file, if any.
 	yafu_init(&yafu_obj);
     options = initOpt();
-    readINI("yafu.ini", options);
+    ini_success = readINI("yafu.ini", options);
 
     // then process the command line, overriding any .ini settings.
     processOpts(argc, argv, options);
@@ -137,6 +138,18 @@ int main(int argc, char *argv[])
         yafu_obj.USERSEED = 1;
     }
 
+    if (yafu_obj.VFLAG > 0)
+    {
+        if (ini_success)
+        {
+            getcwd(yafu_obj.CWD, 1024);
+        }
+        else
+        {
+            strcpy(yafu_obj.CWD, "");
+        }
+    }
+
 #if !defined(__APPLE__)
     // get the computer name, cache sizes, etc.  store in globals
     // we need to have the cpu id string before calling apply_tuneinfo so that
@@ -282,7 +295,7 @@ int main(int argc, char *argv[])
 		
 	// print the splash screen, to the logfile and depending on options, to the screen
 	print_splash(fobj, &comp_info, is_cmdline_run, logfile, yafu_obj.VFLAG, 
-        yafu_obj.MEAS_CPU_FREQUENCY, yafu_obj.NUM_WITNESSES);
+        yafu_obj.MEAS_CPU_FREQUENCY, yafu_obj.NUM_WITNESSES, yafu_obj.CWD);
 	
 	// start the calculator
 	// right now this just allocates room for user variables
@@ -886,7 +899,7 @@ int check_expression(options_t* options)
 }
 
 void print_splash(fact_obj_t *fobj, info_t *comp_info, int is_cmdline_run, 
-    FILE* logfile, int VFLAG, double freq, int numwit)
+    FILE* logfile, int VFLAG, double freq, int numwit, char *cwd)
 {
 	if (VFLAG >= 0)
 		printf("\n\n");
@@ -962,7 +975,17 @@ void print_splash(fact_obj_t *fobj, info_t *comp_info, int is_cmdline_run,
         logprint(logfile,"using %u random witness for Rabin-Miller PRP checks\n", numwit);
     else
         logprint(logfile, "using %u random witnesses for Rabin-Miller PRP checks\n", numwit);
-    logprint(logfile, "Cached %lu primes: max prime is %lu\n\n", fobj->num_p, fobj->max_p);
+    logprint(logfile, "Cached %lu primes: max prime is %lu\n", fobj->num_p, fobj->max_p);
+    if (strlen(cwd) == 0)   // caller passes an empty cwd when yafu.ini was not parsed
+    {
+        char buf[1024];
+        if (getcwd(buf, sizeof(buf)) == NULL) buf[0] = '\0';   // undefined on failure
+        logprint(logfile, "Could not parse yafu.ini from %s\n\n", buf);
+    }
+    else
+    {
+        logprint(logfile, "Parsed yafu.ini from %s\n\n", cwd);
+    }
 
 	if (VFLAG > 0 || !is_cmdline_run)
 	{		
@@ -975,7 +998,17 @@ void print_splash(fact_obj_t *fobj, info_t *comp_info, int is_cmdline_run,
         else
             printf("Using %u random witnesses for Rabin-Miller PRP checks\n", numwit);
 
-        printf("Cached %lu primes; max prime is %lu\n\n", fobj->num_p, fobj->max_p);
+        printf("Cached %lu primes; max prime is %lu\n", fobj->num_p, fobj->max_p);
+        if (strlen(cwd) == 0)   // caller passes an empty cwd when yafu.ini was not parsed
+        {
+            char buf[1024];
+            if (getcwd(buf, sizeof(buf)) == NULL) buf[0] = '\0';   // undefined on failure
+            printf("Could not parse yafu.ini from %s\n\n", buf);
+        }
+        else
+        {
+            printf("Parsed yafu.ini from %s\n\n", cwd);
+        }
 
 		printf("===============================================================\n");
 		printf("======= Welcome to YAFU (Yet Another Factoring Utility) =======\n");
diff --git a/yafu/yafu b/yafu/yafu
deleted file mode 100755
index 9e0545f..0000000
Binary files a/yafu/yafu and /dev/null differ
diff --git a/yafu/yafu-x64.exe b/yafu/yafu-x64.exe
new file mode 100644
index 0000000..e1c0e4c
Binary files /dev/null and b/yafu/yafu-x64.exe differ
diff --git a/yafu/yafu.ini b/yafu/yafu.ini
index 0c2185a..8594575 100644
--- a/yafu/yafu.ini
+++ b/yafu/yafu.ini
@@ -16,7 +16,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 % Run all multi-threaded algorithms with specified thread count
-threads=64
+% threads=1
 
 % the number of Rabin-Miller witnesses to use during PRP checks
 % nprp=1
@@ -168,7 +168,7 @@ aprcl_d=200
 
 % Set a threshold below which siqs will not use a savefile (all relations are
 % processed in-memory)
-inmem=700
+inmem=70
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % NFS options
@@ -176,7 +176,7 @@ inmem=700
 
 % relative or absolute path to a directory containing ggnfs-lasieve4I* executables.
 % without these yafu will not use NFS
-%ggnfs_dir=..\..\ggnfs-bin\x64\
+ggnfs_dir=..\..\ggnfs-bin\x64\
 %ggnfs_dir=../../ggnfs-bin/
 
 % comma delimited list of poly files to test sieve
@@ -263,7 +263,7 @@ B1ecm=11000
 % B2ecm=1100000
 
 % specify a path to an ECM executable file
-ecm_path=../avx-ecm/avx-ecm
+ecm_path=..\gmp-ecm\bin\x64\Release\ecm.exe
 %ecm_path=../gmp-ecm/install/mingw/bin/ecm.exe
 %ecm_path=../gmp-ecm/bin/ecm
 
@@ -288,12 +288,12 @@ ext_ecm=1000000000
 % P-1 options
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Set the B1 level for P-1.
-B1pm1=100000
+% B1pm1=100000
 
 % Set the B2 level for P-1.  
 % Only needed if you want something other than the default
 % for the current B1.
-B2pm1=10000000
+% B2pm1=10000000
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % P+1 options
@@ -304,17 +304,17 @@ B1pm1=20000
 % Set the B2 level for P+1.  
 % Only needed if you want something other than the default
 % for the current B1.
-B2pm1=2000000
+% B2pp1=2000000
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Brent-Pollard Rho options
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-rhomax=500
+% rhomax=200
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Fermat options
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-fmtmax=10000000
+% fmtmax=1000000
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Eratosthenes options
@@ -335,4 +335,4 @@ fmtmax=10000000
 % appear below here
 
 
-%tune_info=Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz,LINUX64,1.59078e-05,0.196092,0.299688,0.0999245,102.36,42
+tune_info=Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz,LINUX64,1.59078e-05,0.196092,0.299688,0.0999245,102.36,42