diff --git a/yafu/Makefile b/yafu/Makefile index 04374ee..d4c65bd 100644 --- a/yafu/Makefile +++ b/yafu/Makefile @@ -16,7 +16,7 @@ # code to the public domain. # --bbuhrow@gmail.com 7/28/09 # ----------------------------------------------------------------------*/ -COMPILER = gcc + CC = gcc CFLAGS = -g -m64 -DUSE_SSE2 #CFLAGS += -march=core2 -mtune=core2 @@ -40,11 +40,11 @@ LIBS = -L. INC += -I../ysieve -I../ytools LIBS += -L../ysieve -L../ytools -INC += -I../gmp-6.2.1/ -LIBS += -L../gmp-6.2.1/ +INC += -I../gmp_install/gmp-6.2.0/include +LIBS += -L../gmp_install/gmp-6.2.0/lib -INC += -I../avx-ecm/ -LIBS += -L../avx-ecm/ +INC += -I../ecm_install/include/ +LIBS += -L../ecm_install/lib/ INC += -I../msieve/zlib LIBS += -L../msieve/ @@ -145,7 +145,7 @@ ifeq ($(FORCE_GENERIC),1) endif # make sure we get the correct libgmp linked by using an absolute path -LIBS += -lecm ../gmp-6.2.1/.libs/libgmp.a -lytools -lysieve +LIBS += -lecm /users/buhrow/src/c/gmp_install/gmp-6.2.0/lib/libgmp.a -lytools -lysieve #LIBS += -lecm -lgmp -lytools -lysieve ifeq ($(SKYLAKEX),1) @@ -173,7 +173,7 @@ ifeq ($(COMPILER),icc) LIBS += -lsvml endif -CFLAGS += -static $(OPT_FLAGS) $(WARN_FLAGS) $(INC) +CFLAGS += $(OPT_FLAGS) $(WARN_FLAGS) $(INC) x86: CFLAGS += -m32 diff --git a/yafu/bin/x64/Release/yafu-x64.exe b/yafu/bin/x64/Release/yafu-x64.exe new file mode 100644 index 0000000..91533ad Binary files /dev/null and b/yafu/bin/x64/Release/yafu-x64.exe differ diff --git a/yafu/factor/autofactor.c b/yafu/factor/autofactor.c index aebe69a..d0c5a97 100644 --- a/yafu/factor/autofactor.c +++ b/yafu/factor/autofactor.c @@ -563,7 +563,10 @@ int check_if_done(fact_obj_t *fobj, mpz_t N) // load the new fobj with this number fobj_refactor = (fact_obj_t *)malloc(sizeof(fact_obj_t)); init_factobj(fobj_refactor); + copy_factobj(fobj_refactor, fobj); + mpz_set(fobj_refactor->N, fobj->factors->factors[i].factor); + fobj_refactor->refactor_depth = fobj->refactor_depth; // recurse on factor factor(fobj_refactor); @@ -1823,12 +1826,12 @@ void factor(fact_obj_t *fobj) if (fobj->VFLAG > 0) printf("fac: found siqs savefile, resuming siqs\n"); - // remove any common factor so the input exactly matches - // the file - // mpz_tdiv_q(b, b, g); - // mpz_set(fobj->N, b); - // mpz_set(origN, b); - // mpz_set(copyN, b); + // if the inputs don't match exactly, resume siqs on the exact + // number in the savefile and put the cofactor (prime or composite) + // into the factor list. If composite it will get refactored. + add_to_factor_list(fobj->factors, g, fobj->VFLAG, fobj->NUM_WITNESSES); + + mpz_set(b, tmpz); //override default choice fact_state = state_qs; diff --git a/yafu/factor/avx-ecm/avx_ecm_main.c b/yafu/factor/avx-ecm/avx_ecm_main.c index b5e8166..eb5d2fe 100644 --- a/yafu/factor/avx-ecm/avx_ecm_main.c +++ b/yafu/factor/avx-ecm/avx_ecm_main.c @@ -164,12 +164,14 @@ void vec_ecm_main(fact_obj_t* fobj, uint32_t numcurves, uint64_t B1, mpz_init(r); mpz_init(N); - mpz_set(N, fobj->ecm_obj.gmp_n); + // set N equal to the original input, so we can + // detect Mersenne inputs correctly. + mpz_set(N, fobj->N); // check for Mersenne inputs size_n = mpz_sizeinbase(N, 2); - for (i = size_n; i < 2048; i++) + for (i = 31; i <= size_n; i++) { mpz_set_ui(r, 1); mpz_mul_2exp(r, r, i); @@ -197,13 +199,18 @@ void vec_ecm_main(fact_obj_t* fobj, uint32_t numcurves, uint64_t B1, mpz_set_ui(r, 1); mpz_mul_2exp(r, r, i); mpz_mod(g, r, N); - if (mpz_sizeinbase(g, 2) < DIGITBITS) + if (mpz_sizeinbase(g, 2) < (DIGITBITS/2)) { size_n = i; isMersenne = mpz_get_ui(g); break; } } + //printf("found isMersenne = 2^%d %d\n", size_n, isMersenne); + + // now set N equal to the actual input, which may have had factors removed + // by previous factoring routines. + mpz_set(N, fobj->ecm_obj.gmp_n); // if the input is Mersenne and still contains algebraic factors, remove them. if (abs(isMersenne) == 1) @@ -293,7 +300,7 @@ void vec_ecm_main(fact_obj_t* fobj, uint32_t numcurves, uint64_t B1, gmp_printf("commencing parallel ecm on %Zd with %d threads\n", N, threads); } - if ((double)nwords / ((double)maxbits / (double)DIGITBITS) < 0.7) + if ((isMersenne != 0) && ((double)nwords / ((double)maxbits / (double)DIGITBITS) < 0.7)) { if (verbose > 1) { @@ -615,7 +622,6 @@ void vec_ecm_main(fact_obj_t* fobj, uint32_t numcurves, uint64_t B1, } vecaddmod_ptr = &vecaddmod52; vecsubmod_ptr = &vecsubmod52; - } } else diff --git a/yafu/factor/avx-ecm/vecarith52.c b/yafu/factor/avx-ecm/vecarith52.c index 0ace340..c31733c 100644 --- a/yafu/factor/avx-ecm/vecarith52.c +++ b/yafu/factor/avx-ecm/vecarith52.c @@ -52,8 +52,17 @@ This file is a snapshot of a work in progress, originated by Mayo #ifdef USE_AVX512F #include -//#define USE_AMM 1 +#define USE_AMM 1 +__m512i __inline _mm512_mask_sbb_src_epi52(__m512i src, __m512i a, __mmask8 m, __mmask8 c, __m512i b, __mmask8* cout) +{ + __m512i t = _mm512_mask_sub_epi64(src, m, a, b); + *cout = _mm512_mask_cmpgt_epu64_mask(m, b, a); + __m512i t2 = _mm512_mask_sub_epi64(src, m, t, _mm512_maskz_set1_epi64(c, 1)); + *cout = _mm512_kor(*cout, _mm512_mask_cmpgt_epu64_mask(m, t2, t)); + t2 = _mm512_and_epi64(t2, _mm512_set1_epi64(0xfffffffffffffULL)); + return t2; +} void vecmul52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata) { @@ -4078,10 +4087,15 @@ void vecmulmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* // i = 3, a3..0 = c[0..3] // i = 4, a3..0 = c[4..7] // i = 5, a3..0 = c[8..11] - _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3); - _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2); - _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1); - _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0); + //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3); + //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2); + //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1); + //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0); + + c00 = a3; + c01 = a2; + c02 = a1; + c03 = a0; } } @@ -4145,15 +4159,159 @@ void vecmulmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* // i = 3, a3..0 = c[0..3] // i = 4, a3..0 = c[4..7] // i = 5, a3..0 = c[8..11] - _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3); - _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2); - _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1); - _mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0); + c04 = a3; + c05 = a2; + c06 = a1; + c07 = a0; + + + //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3); + //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2); + //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1); + //_mm512_store_epi64(c->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0); } } +#ifdef USE_AMM + _mm512_store_epi64(c->data + 0 * VECLEN, c00); + _mm512_store_epi64(c->data + 1 * VECLEN, c01); + _mm512_store_epi64(c->data + 2 * VECLEN, c02); + _mm512_store_epi64(c->data + 3 * VECLEN, c03); + _mm512_store_epi64(c->data + 4 * VECLEN, c04); + _mm512_store_epi64(c->data + 5 * VECLEN, c05); + _mm512_store_epi64(c->data + 6 * VECLEN, c06); + _mm512_store_epi64(c->data + 7 * VECLEN, c07); _mm512_store_epi64(c->data + NWORDS * VECLEN, zero); +#else + + __m512i cvec; + __m512i nvec; + __m512i bvec; + + // compare + scarry = 0; // sub mask + scarry2 = 0; // keep looking mask + + cvec = c07; + nvec = _mm512_load_epi64(mdata->n->data + 7 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c06; + nvec = _mm512_load_epi64(mdata->n->data + 6 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c05; + nvec = _mm512_load_epi64(mdata->n->data + 5 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c04; + nvec = _mm512_load_epi64(mdata->n->data + 4 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c03; + nvec = _mm512_load_epi64(mdata->n->data + 3 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c02; + nvec = _mm512_load_epi64(mdata->n->data + 2 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c01; + nvec = _mm512_load_epi64(mdata->n->data + 1 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c00; + nvec = _mm512_load_epi64(mdata->n->data + 0 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + // check for equal as well by flipping mask bits that have still + // not been decided (i.e., are equal) + scarry |= (~scarry2); + +sub: + + if (scarry == 0) goto done; + + // subtract n from c when c is not less than n, as indicated by a 1 bit in mask + scarry2 = 0; + + nvec = _mm512_load_epi64(mdata->n->data + 0 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c00, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 0 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 1 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c01, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 1 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 2 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c02, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 2 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 3 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c03, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 3 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 4 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c04, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 4 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 5 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c05, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 5 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 6 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c06, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 6 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 7 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c07, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 7 * VECLEN, bvec); + +done: + +#endif + c->size = NWORDS; return; @@ -7230,7 +7388,7 @@ void vecsqrmod52_fixed624_bfips_ifma(vec_bignum_t* a, vec_bignum_t* c, vec_bignu #endif -void vecmulmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata) +void vecmulmod52_avxecm(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata) { int i, j, k; int NWORDS = mdata->NWORDS; @@ -7870,7 +8028,7 @@ void vecmulmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t return; } -void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata) +void vecmulmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata) { int i, j, k; uint32_t NWORDS = mdata->NWORDS; @@ -7901,6 +8059,12 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign __mmask8 scarry2; __mmask8 scarry; +#ifdef USE_AMM + uint64_t* outdata = c->data; +#else + uint64_t* outdata = s->data; +#endif + // deal with the sign c->size = NWORDS; c->signmask = a->signmask ^ b->signmask; @@ -7910,8 +8074,6 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign acc_e1 = zero; acc_e2 = zero; - //uint64_t* outdata = s->data; - // first half mul for (i = 0; i < NBLOCKS; i++) { @@ -8074,10 +8236,10 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign for (j = 0; j < i; j++) { // accumulate s * n - a0 = _mm512_load_epi64(s->data + ((j + 1) * BLOCKWORDS - 1) * VECLEN); - a1 = _mm512_load_epi64(s->data + ((j + 1) * BLOCKWORDS - 2) * VECLEN); - a2 = _mm512_load_epi64(s->data + ((j + 1) * BLOCKWORDS - 3) * VECLEN); - a3 = _mm512_load_epi64(s->data + ((j + 1) * BLOCKWORDS - 4) * VECLEN); + a0 = _mm512_load_epi64(outdata + ((j + 1) * BLOCKWORDS - 1) * VECLEN); + a1 = _mm512_load_epi64(outdata + ((j + 1) * BLOCKWORDS - 2) * VECLEN); + a2 = _mm512_load_epi64(outdata + ((j + 1) * BLOCKWORDS - 3) * VECLEN); + a3 = _mm512_load_epi64(outdata + ((j + 1) * BLOCKWORDS - 4) * VECLEN); b0 = _mm512_load_epi64(n->data + ((i - j - 1) * BLOCKWORDS + 1) * VECLEN); b1 = _mm512_load_epi64(n->data + ((i - j - 1) * BLOCKWORDS + 2) * VECLEN); @@ -8124,7 +8286,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0)); _mm512_mullo_epi52(a0, nhatvec_e, acc_e0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0); b0 = _mm512_load_epi64(n->data + 0 * VECLEN); // add in the final product @@ -8148,7 +8310,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign for (k = 0; k < j; k++) { - a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN); + a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN); b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN); VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1); @@ -8156,7 +8318,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0)); _mm512_mullo_epi52(a0, nhatvec_e, acc_e0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0); // add in the final product VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1); @@ -8179,7 +8341,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign for (k = 0; k < j; k++) { - a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN); + a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN); b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN); VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1); @@ -8189,7 +8351,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign // not sure what can be done about it. //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0)); _mm512_mullo_epi52(a0, nhatvec_e, acc_e0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0); // add in the final product VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1); @@ -8212,7 +8374,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign for (k = 0; k < j; k++) { - a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN); + a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN); b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN); VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1); @@ -8220,7 +8382,7 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0)); _mm512_mullo_epi52(a0, nhatvec_e, acc_e0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0); // add in the final product VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1); @@ -8264,10 +8426,10 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign // i = 3, j = 1, a0..3 = c[11..8] // i = 3, j = 2, a0..3 = c[7..4] // i = 4, j = 2, a0..3 = c[11..8] - a0 = _mm512_load_epi64(s->data + ((i - j) * BLOCKWORDS + 3) * VECLEN); - a1 = _mm512_load_epi64(s->data + ((i - j) * BLOCKWORDS + 2) * VECLEN); - a2 = _mm512_load_epi64(s->data + ((i - j) * BLOCKWORDS + 1) * VECLEN); - a3 = _mm512_load_epi64(s->data + ((i - j) * BLOCKWORDS + 0) * VECLEN); + a0 = _mm512_load_epi64(outdata + ((i - j) * BLOCKWORDS + 3) * VECLEN); + a1 = _mm512_load_epi64(outdata + ((i - j) * BLOCKWORDS + 2) * VECLEN); + a2 = _mm512_load_epi64(outdata + ((i - j) * BLOCKWORDS + 1) * VECLEN); + a3 = _mm512_load_epi64(outdata + ((i - j) * BLOCKWORDS + 0) * VECLEN); b0 = _mm512_load_epi64(n->data + ((j - 1) * BLOCKWORDS + 1) * VECLEN); b1 = _mm512_load_epi64(n->data + ((j - 1) * BLOCKWORDS + 2) * VECLEN); @@ -8369,9 +8531,9 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign // i = 3, a1..3 = c[1..3] // i = 4, a1..3 = c[5..7] // i = 5, a1..3 = c[9..11] - a1 = _mm512_load_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN); - a2 = _mm512_load_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN); - a3 = _mm512_load_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN); + a1 = _mm512_load_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN); + a2 = _mm512_load_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN); + a3 = _mm512_load_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN); b0 = _mm512_load_epi64(n->data + (NWORDS - 1) * VECLEN); b1 = _mm512_load_epi64(n->data + (NWORDS - 2) * VECLEN); @@ -8537,16 +8699,21 @@ void vecmulmod52_207(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_bign // i = 3, a3..0 = c[0..3] // i = 4, a3..0 = c[4..7] // i = 5, a3..0 = c[8..11] - _mm512_store_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3); - _mm512_store_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2); - _mm512_store_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1); - _mm512_store_epi64(s->data + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0); - + _mm512_store_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 0) * VECLEN, a3); + _mm512_store_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 1) * VECLEN, a2); + _mm512_store_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 2) * VECLEN, a1); + _mm512_store_epi64(outdata + ((i - NBLOCKS) * BLOCKWORDS + 3) * VECLEN, a0); } } -#ifndef USE_AMM +#ifdef USE_AMM + //for (i = NWORDS - 1; i >= 0; i--) + //{ + // b0 = _mm512_load_epi64(s->data + i * VECLEN); + // _mm512_store_epi64(c->data + i * VECLEN, b0); + //} +#else a0 = acc_e0; scarry2 = _mm512_cmp_epu64_mask(a0, zero, _MM_CMPINT_EQ); @@ -12476,10 +12643,14 @@ void vecsqrmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* a1 = _mm512_and_epi64(vlmask, a1); a0 = _mm512_and_epi64(vlmask, a0); - _mm512_store_epi64(c->data + (i * BLOCKWORDS + 0) * VECLEN, a3); - _mm512_store_epi64(c->data + (i * BLOCKWORDS + 1) * VECLEN, a2); - _mm512_store_epi64(c->data + (i * BLOCKWORDS + 2) * VECLEN, a1); - _mm512_store_epi64(c->data + (i * BLOCKWORDS + 3) * VECLEN, a0); + //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 0) * VECLEN, a3); + //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 1) * VECLEN, a2); + //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 2) * VECLEN, a1); + //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 3) * VECLEN, a0); + c00 = a3; + c01 = a2; + c02 = a1; + c03 = a0; } i = 1; @@ -12593,10 +12764,14 @@ void vecsqrmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* a1 = _mm512_and_epi64(vlmask, a1); a0 = _mm512_and_epi64(vlmask, a0); - _mm512_store_epi64(c->data + (i * BLOCKWORDS + 0) * VECLEN, a3); - _mm512_store_epi64(c->data + (i * BLOCKWORDS + 1) * VECLEN, a2); - _mm512_store_epi64(c->data + (i * BLOCKWORDS + 2) * VECLEN, a1); - _mm512_store_epi64(c->data + (i * BLOCKWORDS + 3) * VECLEN, a0); + //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 0) * VECLEN, a3); + //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 1) * VECLEN, a2); + //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 2) * VECLEN, a1); + //_mm512_store_epi64(c->data + (i * BLOCKWORDS + 3) * VECLEN, a0); + c04 = a3; + c05 = a2; + c06 = a1; + c07 = a0; } //printf("fixed416: \n"); @@ -12606,6 +12781,145 @@ void vecsqrmod52_fixed416_bfips(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* //} //printf("\n"); +#ifdef USE_AMM + _mm512_store_epi64(c->data + 0 * VECLEN, c00); + _mm512_store_epi64(c->data + 1 * VECLEN, c01); + _mm512_store_epi64(c->data + 2 * VECLEN, c02); + _mm512_store_epi64(c->data + 3 * VECLEN, c03); + _mm512_store_epi64(c->data + 4 * VECLEN, c04); + _mm512_store_epi64(c->data + 5 * VECLEN, c05); + _mm512_store_epi64(c->data + 6 * VECLEN, c06); + _mm512_store_epi64(c->data + 7 * VECLEN, c07); + _mm512_store_epi64(c->data + NWORDS * VECLEN, zero); +#else + + __m512i cvec; + __m512i nvec; + __m512i bvec; + + // compare + scarry = 0; // sub mask + scarry2 = 0; // keep looking mask + + cvec = c07; + nvec = _mm512_load_epi64(mdata->n->data + 7 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c06; + nvec = _mm512_load_epi64(mdata->n->data + 6 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c05; + nvec = _mm512_load_epi64(mdata->n->data + 5 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c04; + nvec = _mm512_load_epi64(mdata->n->data + 4 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c03; + nvec = _mm512_load_epi64(mdata->n->data + 3 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c02; + nvec = _mm512_load_epi64(mdata->n->data + 2 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c01; + nvec = _mm512_load_epi64(mdata->n->data + 1 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + cvec = c00; + nvec = _mm512_load_epi64(mdata->n->data + 0 * VECLEN); + // compare those that have not already been decided using the mask + scarry |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_GT); + scarry2 |= _mm512_mask_cmp_epu64_mask(~scarry2, cvec, nvec, _MM_CMPINT_LT); + + // decided all of them, stop comparing. + if (scarry2 == 0xff) goto sub; + + // check for equal as well by flipping mask bits that have still + // not been decided (i.e., are equal) + scarry |= (~scarry2); + +sub: + + if (scarry == 0) goto done; + + // subtract n from c when c is not less than n, as indicated by a 1 bit in mask + scarry2 = 0; + + nvec = _mm512_load_epi64(mdata->n->data + 0 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c00, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 0 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 1 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c01, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 1 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 2 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c02, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 2 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 3 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c03, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 3 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 4 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c04, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 4 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 5 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c05, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 5 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 6 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c06, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 6 * VECLEN, bvec); + + nvec = _mm512_load_epi64(mdata->n->data + 7 * VECLEN); + bvec = _mm512_mask_sbb_src_epi52(zero, c07, scarry, scarry2, nvec, &scarry2); + _mm512_store_epi64(c->data + 7 * VECLEN, bvec); + +done: + +#endif + c->size = NWORDS; return; } @@ -12616,7 +12930,7 @@ void vecsqrmod52_mul(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* n, vec_bign return; } -void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bignum_t *s, vec_monty_t*mdata) +void vecsqrmod52(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bignum_t *s, vec_monty_t*mdata) { // 8x sqr: // input 8 bignums in the even lanes of a. @@ -12651,6 +12965,12 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign __mmask8 scarry2; __mmask8 scarry; +#ifdef USE_AMM + uint64_t* outdata = c->data; +#else + uint64_t* outdata = s->data; +#endif + // deal with the sign c->size = NWORDS; c->signmask = 0; @@ -13054,10 +13374,10 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign for (j = 0; j < i; j++) { // accumulate s * n - a0 = _mm512_load_epi32(s->data + ((j + 1) * BLOCKWORDS - 1) * VECLEN); - a1 = _mm512_load_epi32(s->data + ((j + 1) * BLOCKWORDS - 2) * VECLEN); - a2 = _mm512_load_epi32(s->data + ((j + 1) * BLOCKWORDS - 3) * VECLEN); - a3 = _mm512_load_epi32(s->data + ((j + 1) * BLOCKWORDS - 4) * VECLEN); + a0 = _mm512_load_epi32(outdata + ((j + 1) * BLOCKWORDS - 1) * VECLEN); + a1 = _mm512_load_epi32(outdata + ((j + 1) * BLOCKWORDS - 2) * VECLEN); + a2 = _mm512_load_epi32(outdata + ((j + 1) * BLOCKWORDS - 3) * VECLEN); + a3 = _mm512_load_epi32(outdata + ((j + 1) * BLOCKWORDS - 4) * VECLEN); b0 = _mm512_load_epi32(n->data + ((i - j - 1) * BLOCKWORDS + 1) * VECLEN); b1 = _mm512_load_epi32(n->data + ((i - j - 1) * BLOCKWORDS + 2) * VECLEN); @@ -13104,7 +13424,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0)); _mm512_mullo_epi52(a0, nhatvec_e, acc_e0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0); b0 = _mm512_load_epi64(n->data + 0 * VECLEN); // add in the final product @@ -13128,7 +13448,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign for (k = 0; k < j; k++) { - a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN); + a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN); b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN); VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1); @@ -13136,7 +13456,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0)); _mm512_mullo_epi52(a0, nhatvec_e, acc_e0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0); // add in the final product VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1); @@ -13159,7 +13479,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign for (k = 0; k < j; k++) { - a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN); + a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN); b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN); VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1); @@ -13167,7 +13487,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0)); _mm512_mullo_epi52(a0, nhatvec_e, acc_e0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0); // add in the final product VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1); @@ -13190,7 +13510,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign for (k = 0; k < j; k++) { - a0 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + k) * VECLEN); + a0 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + k) * VECLEN); b1 = _mm512_load_epi64(n->data + (j - k) * VECLEN); VEC_MUL_ACCUM_LOHI_PD(a0, b1, acc_e0, acc_e1); @@ -13198,7 +13518,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign //a0 = _mm512_and_epi64(vlmask, _mm512_mullo_epi64(nhatvec_e, acc_e0)); _mm512_mullo_epi52(a0, nhatvec_e, acc_e0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + j) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + j) * VECLEN, a0); // add in the final product VEC_MUL_ACCUM_LOHI_PD(a0, b0, acc_e0, acc_e1); @@ -13769,10 +14089,10 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign // the s*n terms. No more doubling past here. for (j = 0; j < NBLOCKS - 1 - i; j++) { - a0 = _mm512_load_epi64(s->data + (NWORDS - 1 - j * BLOCKWORDS) * VECLEN); - a1 = _mm512_load_epi64(s->data + (NWORDS - 2 - j * BLOCKWORDS) * VECLEN); - a2 = _mm512_load_epi64(s->data + (NWORDS - 3 - j * BLOCKWORDS) * VECLEN); - a3 = _mm512_load_epi64(s->data + (NWORDS - 4 - j * BLOCKWORDS) * VECLEN); + a0 = _mm512_load_epi64(outdata + (NWORDS - 1 - j * BLOCKWORDS) * VECLEN); + a1 = _mm512_load_epi64(outdata + (NWORDS - 2 - j * BLOCKWORDS) * VECLEN); + a2 = _mm512_load_epi64(outdata + (NWORDS - 3 - j * BLOCKWORDS) * VECLEN); + a3 = _mm512_load_epi64(outdata + (NWORDS - 4 - j * BLOCKWORDS) * VECLEN); b0 = _mm512_load_epi64(n->data + ((i + j) * BLOCKWORDS + 1) * VECLEN); b1 = _mm512_load_epi64(n->data + ((i + j) * BLOCKWORDS + 2) * VECLEN); @@ -13789,9 +14109,9 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign } // finish each triangluar shaped column sum (s * n) - a1 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + 1) * VECLEN); - a2 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + 2) * VECLEN); - a3 = _mm512_load_epi64(s->data + (i * BLOCKWORDS + 3) * VECLEN); + a1 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + 1) * VECLEN); + a2 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + 2) * VECLEN); + a3 = _mm512_load_epi64(outdata + (i * BLOCKWORDS + 3) * VECLEN); b0 = _mm512_load_epi64(n->data + (NWORDS - 1) * VECLEN); b1 = _mm512_load_epi64(n->data + (NWORDS - 2) * VECLEN); @@ -13955,16 +14275,22 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign a1 = _mm512_and_epi64(vlmask, a1); a0 = _mm512_and_epi64(vlmask, a0); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + 0) * VECLEN, a3); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + 1) * VECLEN, a2); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + 2) * VECLEN, a1); - _mm512_store_epi64(s->data + (i * BLOCKWORDS + 3) * VECLEN, a0); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + 0) * VECLEN, a3); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + 1) * VECLEN, a2); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + 2) * VECLEN, a1); + _mm512_store_epi64(outdata + (i * BLOCKWORDS + 3) * VECLEN, a0); } } -#ifndef USE_AMM +#ifdef USE_AMM + //for (i = NWORDS - 1; i >= 0; i--) + //{ + // b0 = _mm512_load_epi64(s->data + i * VECLEN); + // _mm512_store_epi64(c->data + i * VECLEN, b0); + //} +#else a0 = acc_e0; scarry2 = _mm512_cmp_epu64_mask(a0, zero, _MM_CMPINT_EQ); @@ -14002,7 +14328,7 @@ void vecsqrmod52_207(vec_bignum_t *a, vec_bignum_t *c, vec_bignum_t *n, vec_bign return; } -void vecsqrmod52(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata) +void vecsqrmod52_avxecm(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t* s, vec_monty_t* mdata) { // 8x sqr: // input 8 bignums in the even lanes of a. @@ -15237,7 +15563,7 @@ void vecsqrmod52(vec_bignum_t* a, vec_bignum_t* c, vec_bignum_t* n, vec_bignum_t return; } -void vecaddmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata) +void vecaddmod52_avxecm(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata) { // assumptions: // a, b, c are of length VECLEN * NWORDS @@ -15300,7 +15626,7 @@ void vecaddmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* return; } -void vecaddmod52_207(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_monty_t* mdata) +void vecaddmod52(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_monty_t* mdata) { // assumptions: // a, b, c are of length VECLEN * NWORDS @@ -15380,7 +15706,7 @@ void vecaddmod52_207(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_mont return; } -void vecsubmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata) +void vecsubmod52_avxecm(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* mdata) { // assumptions: // a, b, c are of length VECLEN * NWORDS @@ -15422,7 +15748,7 @@ void vecsubmod52(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* c, vec_monty_t* return; } -void vecsubmod52_207(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_monty_t* mdata) +void vecsubmod52(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_monty_t* mdata) { // assumptions: // a, b, c are of length VECLEN * NWORDS @@ -15553,7 +15879,8 @@ void vecsignedaddmod52(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *c, vec_bi return; } -void vec_simul_addsub52_fixed1040(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* sum, vec_bignum_t* diff, +void vec_simul_addsub52_fixed1040(vec_bignum_t* a, vec_bignum_t* b, + vec_bignum_t* sum, vec_bignum_t* diff, vec_monty_t* mdata) { // assumptions: @@ -15878,7 +16205,7 @@ void vec_simul_addsub52_fixed1040(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t return; } -void vec_simul_addsub52(vec_bignum_t* a, vec_bignum_t* b, +void vec_simul_addsub52_avxecm(vec_bignum_t* a, vec_bignum_t* b, vec_bignum_t* sum, vec_bignum_t* diff, vec_monty_t* mdata) { // assumptions: @@ -15973,7 +16300,8 @@ void vec_simul_addsub52(vec_bignum_t* a, vec_bignum_t* b, return; } -void vec_simul_addsub52_207(vec_bignum_t *a, vec_bignum_t *b, vec_bignum_t *sum, vec_bignum_t *diff, +void vec_simul_addsub52(vec_bignum_t *a, vec_bignum_t *b, + vec_bignum_t *sum, vec_bignum_t *diff, vec_monty_t* mdata) { // assumptions: diff --git a/yafu/factor/factor_common.c b/yafu/factor/factor_common.c index 1248204..38b0ac3 100644 --- a/yafu/factor/factor_common.c +++ b/yafu/factor/factor_common.c @@ -321,6 +321,198 @@ void alloc_factobj(fact_obj_t *fobj) return; } +void copy_factobj(fact_obj_t* dest, fact_obj_t* src) +{ + uint32_t seed1, seed2; + int i; + + + dest->seed1 = src->seed1; + dest->seed2 = src->seed2; + dest->lcg_state = src->lcg_state; + dest->flags = src->flags; + dest->num_threads = src->num_threads; + strcpy(dest->flogname, src->flogname); + dest->do_logging = src->do_logging; // not used... + dest->LOGFLAG = src->LOGFLAG; + dest->NUM_WITNESSES = src->NUM_WITNESSES; + + // initialize stuff for rho + dest->rho_obj.iterations = src->rho_obj.iterations; + dest->rho_obj.curr_poly = src->rho_obj.curr_poly; + + // initialize stuff for pm1 + dest->pm1_obj.B1 = src->pm1_obj.B1; + dest->pm1_obj.B2 = src->pm1_obj.B2; + dest->pm1_obj.stg2_is_default = src->pm1_obj.stg2_is_default; + dest->pm1_obj.pm1_exponent = src->pm1_obj.pm1_exponent; + dest->pm1_obj.pm1_multiplier = src->pm1_obj.pm1_multiplier; + dest->pm1_obj.pm1_tune_freq = src->pm1_obj.pm1_tune_freq; + dest->pm1_obj.vecnum = src->pm1_obj.vecnum; + + // initialize stuff for pp1 + dest->pp1_obj.B1 = src->pp1_obj.B1; + dest->pp1_obj.B2 = src->pp1_obj.B2; + dest->pp1_obj.stg2_is_default = src->pp1_obj.stg2_is_default; + dest->pp1_obj.pp1_exponent = src->pp1_obj.pp1_exponent; + dest->pp1_obj.pp1_multiplier = src->pp1_obj.pp1_multiplier; + dest->pp1_obj.pp1_tune_freq = src->pp1_obj.pp1_tune_freq; + dest->pp1_obj.vecnum = src->pp1_obj.vecnum; + + // initialize stuff for ecm + dest->ecm_obj.B1 = src->ecm_obj.B1; + dest->ecm_obj.B2 = src->ecm_obj.B2; + dest->ecm_obj.stg2_is_default = src->ecm_obj.stg2_is_default; + dest->ecm_obj.sigma = src->ecm_obj.sigma; + dest->ecm_obj.num_curves = src->ecm_obj.num_curves; + dest->ecm_obj.curves_run = src->ecm_obj.curves_run; + dest->ecm_obj.ecm_exponent = src->ecm_obj.ecm_exponent; + dest->ecm_obj.ecm_multiplier = src->ecm_obj.ecm_multiplier; + dest->ecm_obj.ecm_tune_freq = src->ecm_obj.ecm_tune_freq; + dest->ecm_obj.bail_on_factor = src->ecm_obj.bail_on_factor; + dest->ecm_obj.save_b1 = src->ecm_obj.save_b1; + + // unlike ggnfs, ecm does not *require* external binaries. + // an empty string indicates the use of the built-in GMP-ECM hooks, while + // a non-empty string (filled in by the user) will indicate the use of + // an external binary + strcpy(dest->ecm_obj.ecm_path, src->ecm_obj.ecm_path); + dest->ecm_obj.use_external = src->ecm_obj.use_external; +#ifdef USE_AVX512F + dest->ecm_obj.prefer_gmpecm = src->ecm_obj.prefer_gmpecm; + dest->ecm_obj.ecm_ext_xover = src->ecm_obj.ecm_ext_xover; +#else + dest->ecm_obj.prefer_gmpecm = src->ecm_obj.prefer_gmpecm; + dest->ecm_obj.ecm_ext_xover = src->ecm_obj.ecm_ext_xover; +#endif + + dest->ecm_obj.lcg_state = (uint64_t*)xrealloc(dest->ecm_obj.lcg_state, + src->num_threads * sizeof(uint64_t)); + for (i = 0; i < (int)src->num_threads; i++) + { + dest->ecm_obj.lcg_state[i] = + hash64(lcg_rand_64(&dest->lcg_state)); + } + + + // initialize stuff for squfof + dest->squfof_obj.num_factors = src->squfof_obj.num_factors; + + // initialize stuff for qs + dest->qs_obj.gbl_override_B_flag = src->qs_obj.gbl_override_B_flag; + dest->qs_obj.gbl_override_B = src->qs_obj.gbl_override_B; + dest->qs_obj.gbl_override_small_cutoff_flag = src->qs_obj.gbl_override_small_cutoff_flag; + dest->qs_obj.gbl_override_small_cutoff = src->qs_obj.gbl_override_small_cutoff; + dest->qs_obj.gbl_override_blocks_flag = src->qs_obj.gbl_override_blocks_flag; + dest->qs_obj.gbl_override_blocks = src->qs_obj.gbl_override_blocks; + dest->qs_obj.gbl_override_lpmult_flag = src->qs_obj.gbl_override_lpmult_flag; + dest->qs_obj.gbl_override_lpmult = src->qs_obj.gbl_override_lpmult; + dest->qs_obj.gbl_override_rel_flag = src->qs_obj.gbl_override_rel_flag; + dest->qs_obj.gbl_override_rel = src->qs_obj.gbl_override_rel; + dest->qs_obj.gbl_override_tf_flag = src->qs_obj.gbl_override_tf_flag; + dest->qs_obj.gbl_override_tf = src->qs_obj.gbl_override_tf; + dest->qs_obj.gbl_override_time_flag = src->qs_obj.gbl_override_time_flag; + dest->qs_obj.gbl_override_time = src->qs_obj.gbl_override_time; + dest->qs_obj.gbl_override_mfbd = src->qs_obj.gbl_override_mfbd; + dest->qs_obj.gbl_override_mfbt = src->qs_obj.gbl_override_mfbt; + dest->qs_obj.gbl_override_lpb = src->qs_obj.gbl_override_lpb; + dest->qs_obj.gbl_override_bdiv_flag = src->qs_obj.gbl_override_bdiv_flag; + dest->qs_obj.gbl_override_bdiv = src->qs_obj.gbl_override_bdiv; + dest->qs_obj.gbl_override_3lp_bat = src->qs_obj.gbl_override_3lp_bat; + dest->qs_obj.gbl_btarget = src->qs_obj.gbl_btarget; + dest->qs_obj.flags = src->qs_obj.flags; + dest->qs_obj.gbl_force_DLP = src->qs_obj.gbl_force_DLP; + dest->qs_obj.gbl_force_TLP = src->qs_obj.gbl_force_TLP; + dest->qs_obj.qs_exponent = src->qs_obj.qs_exponent; + dest->qs_obj.qs_multiplier = src->qs_obj.qs_multiplier; + dest->qs_obj.qs_tune_freq = src->qs_obj.qs_tune_freq; + dest->qs_obj.no_small_cutoff_opt = src->qs_obj.no_small_cutoff_opt; + strcpy(dest->qs_obj.siqs_savefile, src->qs_obj.siqs_savefile); + init_lehman(); + + // initialize stuff for trial division + dest->div_obj.print = src->div_obj.print; + dest->div_obj.limit = src->div_obj.limit; + dest->div_obj.fmtlimit = src->div_obj.fmtlimit; + + //initialize stuff for nfs + dest->nfs_obj.snfs = src->nfs_obj.snfs; + dest->nfs_obj.gnfs = src->nfs_obj.gnfs; + dest->nfs_obj.gnfs_exponent = src->nfs_obj.gnfs_exponent; + dest->nfs_obj.gnfs_multiplier = src->nfs_obj.gnfs_multiplier; + dest->nfs_obj.gnfs_tune_freq = src->nfs_obj.gnfs_tune_freq; + dest->nfs_obj.min_digits = src->nfs_obj.min_digits; + dest->nfs_obj.filter_min_rels_nudge = src->nfs_obj.filter_min_rels_nudge; + dest->nfs_obj.siever = src->nfs_obj.siever; + dest->nfs_obj.startq = src->nfs_obj.startq; + dest->nfs_obj.rangeq = src->nfs_obj.rangeq; + dest->nfs_obj.polystart = src->nfs_obj.polystart; + dest->nfs_obj.polyrange = src->nfs_obj.polyrange; + strcpy(dest->nfs_obj.outputfile, src->nfs_obj.outputfile); + strcpy(dest->nfs_obj.logfile, src->nfs_obj.logfile); + strcpy(dest->nfs_obj.fbfile, src->nfs_obj.fbfile); + dest->nfs_obj.sq_side = src->nfs_obj.sq_side; + dest->nfs_obj.timeout = src->nfs_obj.timeout; + strcpy(dest->nfs_obj.job_infile, src->nfs_obj.job_infile); + dest->nfs_obj.poly_option = src->nfs_obj.poly_option; + dest->nfs_obj.restart_flag = src->nfs_obj.restart_flag; + dest->nfs_obj.nfs_phases = src->nfs_obj.nfs_phases; + dest->nfs_obj.snfs_testsieve_threshold = src->nfs_obj.snfs_testsieve_threshold; + strcpy(dest->nfs_obj.filearg, src->nfs_obj.filearg); + + dest->nfs_obj.polybatch = src->nfs_obj.polybatch; +#if defined(_WIN64) + strcpy(dest->nfs_obj.ggnfs_dir, src->nfs_obj.ggnfs_dir); +#elif defined(WIN32) + strcpy(dest->nfs_obj.ggnfs_dir, src->nfs_obj.ggnfs_dir); +#else + strcpy(dest->nfs_obj.ggnfs_dir, src->nfs_obj.ggnfs_dir); +#endif + + //initialize autofactor object + //whether we want to output certain info to their own files... + dest->autofact_obj.want_output_primes = src->autofact_obj.want_output_primes; + dest->autofact_obj.want_output_factors = src->autofact_obj.want_output_factors; + dest->autofact_obj.want_output_unfactored = src->autofact_obj.want_output_unfactored; + dest->autofact_obj.want_output_expressions = src->autofact_obj.want_output_expressions; + dest->autofact_obj.qs_gnfs_xover = src->autofact_obj.qs_gnfs_xover; + dest->autofact_obj.qs_snfs_xover = src->autofact_obj.qs_snfs_xover; + // use xover even when timing info is available + dest->autofact_obj.prefer_xover = src->autofact_obj.prefer_xover; + dest->autofact_obj.want_only_1_factor = src->autofact_obj.want_only_1_factor; + dest->autofact_obj.no_ecm = src->autofact_obj.no_ecm; + dest->autofact_obj.target_pretest_ratio = src->autofact_obj.target_pretest_ratio; + dest->autofact_obj.initial_work = src->autofact_obj.initial_work; + dest->autofact_obj.has_snfs_form = src->autofact_obj.has_snfs_form; + + //pretesting plan used by factor() + dest->autofact_obj.yafu_pretest_plan = src->autofact_obj.yafu_pretest_plan; + strcpy(dest->autofact_obj.plan_str, src->autofact_obj.plan_str); + dest->autofact_obj.only_pretest = src->autofact_obj.only_pretest; + dest->autofact_obj.autofact_active = src->autofact_obj.autofact_active; + + // if a number is <= aprcl_prove_cutoff, we will prove it prime or composite + dest->factors->aprcl_prove_cutoff = src->factors->aprcl_prove_cutoff; + // if a number is >= aprcl_display_cutoff, we will show the APRCL progress + dest->factors->aprcl_display_cutoff = src->factors->aprcl_display_cutoff; + + dest->MEAS_CPU_FREQUENCY = 42; // not used anymore + strcpy(dest->CPU_ID_STR, src->CPU_ID_STR); + dest->HAS_AVX2 = src->HAS_AVX2; + dest->HAS_AVX = src->HAS_AVX; + dest->HAS_SSE41 = src->HAS_SSE41; + dest->NUM_WITNESSES = src->NUM_WITNESSES; + dest->cache_size1 = src->cache_size1; + dest->cache_size2 = src->cache_size2; + dest->LOGFLAG = src->LOGFLAG; + dest->THREADS = src->THREADS; + dest->HAS_BMI2 = src->HAS_BMI2; + dest->HAS_AVX512F = src->HAS_AVX512F; + dest->HAS_AVX512BW = src->HAS_AVX512BW; + + return; +} + void reset_factobj(fact_obj_t *fobj) { // keep all of the settings in fobj, but do an init/free cycle on all diff --git a/yafu/factor/nfs/nfs.c b/yafu/factor/nfs/nfs.c index 1010f90..66f9bd7 100644 --- a/yafu/factor/nfs/nfs.c +++ b/yafu/factor/nfs/nfs.c @@ -1207,7 +1207,7 @@ int get_ggnfs_params(fact_obj_t *fobj, nfs_job_t *job) d = job->snfs->sdifficulty; - int do_skew_opt = 1; + int do_skew_opt = 0; if (do_skew_opt && (job->snfs->poly->skew > 0)) { @@ -1226,6 +1226,7 @@ int get_ggnfs_params(fact_obj_t *fobj, nfs_job_t *job) do { job->snfs->poly->skew += skew1percent; + //printf("on iteration %d trying skew %lf: ", i++, job->snfs->poly->skew); analyze_one_poly_xface(job->snfs); //printf("murphy = %le\n", job->snfs->poly->murphy); diff --git a/yafu/factor/nfs/nfs.o b/yafu/factor/nfs/nfs.o deleted file mode 100644 index 0485ef0..0000000 Binary files a/yafu/factor/nfs/nfs.o and /dev/null differ diff --git a/yafu/factor/nfs/nfs_filemanip.o b/yafu/factor/nfs/nfs_filemanip.o deleted file mode 100644 index 4074755..0000000 Binary files a/yafu/factor/nfs/nfs_filemanip.o and /dev/null differ diff --git a/yafu/factor/nfs/nfs_poly.c b/yafu/factor/nfs/nfs_poly.c index 910cd64..65a0f52 100644 --- a/yafu/factor/nfs/nfs_poly.c +++ b/yafu/factor/nfs/nfs_poly.c @@ -314,7 +314,7 @@ int snfs_choose_poly(fact_obj_t* fobj, nfs_job_t* job) for (i = 0; i < 100; i++) { best->poly->skew = origskew * (1 + (0.4 * (rand() / RAND_MAX) - 0.2)); - printf("on iteration %d trying skew %lf: ", i, best->poly->skew); + //printf("on iteration %d trying skew %lf: ", i, best->poly->skew); analyze_one_poly_xface(best->snfs); if (best->poly->murphy > bestmurph) { diff --git a/yafu/factor/nfs/nfs_poly.o b/yafu/factor/nfs/nfs_poly.o deleted file mode 100644 index 0efa788..0000000 Binary files a/yafu/factor/nfs/nfs_poly.o and /dev/null differ diff --git a/yafu/factor/nfs/nfs_postproc.o b/yafu/factor/nfs/nfs_postproc.o deleted file mode 100644 index 469c235..0000000 Binary files a/yafu/factor/nfs/nfs_postproc.o and /dev/null differ diff --git a/yafu/factor/nfs/nfs_sieving.o b/yafu/factor/nfs/nfs_sieving.o deleted file mode 100644 index 4b15dc9..0000000 Binary files a/yafu/factor/nfs/nfs_sieving.o and /dev/null differ diff --git a/yafu/factor/nfs/nfs_threading.o b/yafu/factor/nfs/nfs_threading.o deleted file mode 100644 index 1bfc503..0000000 Binary files a/yafu/factor/nfs/nfs_threading.o and /dev/null differ diff --git a/yafu/factor/nfs/snfs.c b/yafu/factor/nfs/snfs.c index 947a6c5..3960fc8 100644 --- a/yafu/factor/nfs/snfs.c +++ b/yafu/factor/nfs/snfs.c @@ -1016,7 +1016,7 @@ void find_xyyxf_form(fact_obj_t *fobj, snfs_t *form) void find_direct_form(fact_obj_t* fobj, snfs_t* form) { int b, p, found = 0, i, deg; - int c[7]; + int c[10]; mpz_t t, r, g, n, m; // find the following forms: @@ -1107,7 +1107,7 @@ void find_direct_form(fact_obj_t* fobj, snfs_t* form) form->form_type = SNFS_DIRECT; mpz_set_ui(form->base1, b); form->exp1 = p; - for (i = 6; i >= 0; i--) + for (i = 8; i >= 0; i--) { mpz_set_si(form->c[i], c[i]); } diff --git a/yafu/factor/nfs/snfs.o b/yafu/factor/nfs/snfs.o deleted file mode 100644 index a6e4ddf..0000000 Binary files a/yafu/factor/nfs/snfs.o and /dev/null differ diff --git a/yafu/factor/qs/SIQS.c b/yafu/factor/qs/SIQS.c index b530761..718a51d 100644 --- a/yafu/factor/qs/SIQS.c +++ b/yafu/factor/qs/SIQS.c @@ -685,7 +685,7 @@ void SIQS(fact_obj_t *fobj) // fill in the factorization object fobj->qs_obj.savefile.name = (char *)malloc(80 * sizeof(char)); - strcpy(fobj->savefile_name, fobj->qs_obj.siqs_savefile); + strncpy(fobj->savefile_name, fobj->qs_obj.siqs_savefile, 80); // initialize the data objects both shared (static) and // per-thread (dynamic) diff --git a/yafu/factor/qs/med_sieve_32k_avx2.o b/yafu/factor/qs/med_sieve_32k_avx2.o deleted file mode 100644 index 35cefe9..0000000 Binary files a/yafu/factor/qs/med_sieve_32k_avx2.o and /dev/null differ diff --git a/yafu/factor/qs/med_sieve_32k_sse4.1.o b/yafu/factor/qs/med_sieve_32k_sse4.1.o deleted file mode 100644 index ec3a51a..0000000 Binary files a/yafu/factor/qs/med_sieve_32k_sse4.1.o and /dev/null differ diff --git a/yafu/factor/qs/tdiv_med_32k_avx2.o b/yafu/factor/qs/tdiv_med_32k_avx2.o deleted file mode 100644 index 172e081..0000000 Binary files a/yafu/factor/qs/tdiv_med_32k_avx2.o and /dev/null differ diff --git a/yafu/factor/qs/tdiv_resieve_32k_avx2.o b/yafu/factor/qs/tdiv_resieve_32k_avx2.o deleted file mode 100644 index 17fd7d9..0000000 Binary files a/yafu/factor/qs/tdiv_resieve_32k_avx2.o and /dev/null differ diff --git a/yafu/factor/qs/update_poly_roots_32k_avx2.o b/yafu/factor/qs/update_poly_roots_32k_avx2.o deleted file mode 100644 index 6c67a53..0000000 Binary files a/yafu/factor/qs/update_poly_roots_32k_avx2.o and /dev/null differ diff --git a/yafu/factor/qs/update_poly_roots_32k_knl.o b/yafu/factor/qs/update_poly_roots_32k_knl.o deleted file mode 100644 index cb4c999..0000000 Binary files a/yafu/factor/qs/update_poly_roots_32k_knl.o and /dev/null differ diff --git a/yafu/factor/qs/update_poly_roots_32k_sse4.1.o b/yafu/factor/qs/update_poly_roots_32k_sse4.1.o deleted file mode 100644 index 39793c9..0000000 Binary files a/yafu/factor/qs/update_poly_roots_32k_sse4.1.o and /dev/null differ diff --git a/yafu/include/factor.h b/yafu/include/factor.h index bb7d263..e2df154 100644 --- a/yafu/include/factor.h +++ b/yafu/include/factor.h @@ -468,6 +468,7 @@ typedef struct void init_factobj(fact_obj_t *fobj); void free_factobj(fact_obj_t *fobj); void reset_factobj(fact_obj_t *fobj); +void copy_factobj(fact_obj_t* dest, fact_obj_t* src); void alloc_factobj(fact_obj_t *fobj); //#if defined(WIN32) diff --git a/yafu/include/yafu.h b/yafu/include/yafu.h index 4180f6a..66ad028 100644 --- a/yafu/include/yafu.h +++ b/yafu/include/yafu.h @@ -22,7 +22,7 @@ code to the public domain. #ifndef _YAFU_HEAD_DEF #define _YAFU_HEAD_DEF -#define YAFU_VERSION_STRING "2.07" +#define YAFU_VERSION_STRING "2.08" // default maximum size for strings/buffers #define GSTR_MAXSIZE 1024 @@ -62,6 +62,7 @@ typedef struct int NO_CLK_TEST; // machine info + char CWD[1024]; double MEAS_CPU_FREQUENCY; int VERBOSE_PROC_INFO; char CPU_ID_STR[256]; diff --git a/yafu/libyecm.a b/yafu/libyecm.a deleted file mode 100644 index 04e6c27..0000000 Binary files a/yafu/libyecm.a and /dev/null differ diff --git a/yafu/libynfs.a b/yafu/libynfs.a deleted file mode 100644 index 0bb8037..0000000 Binary files a/yafu/libynfs.a and /dev/null differ diff --git a/yafu/libysiqs.a b/yafu/libysiqs.a deleted file mode 100644 index 71ea137..0000000 Binary files a/yafu/libysiqs.a and /dev/null differ diff --git a/yafu/session.log b/yafu/session.log deleted file mode 100644 index fe2598d..0000000 --- a/yafu/session.log +++ /dev/null @@ -1,22 +0,0 @@ -01/11/22 22:17:20, ===================================== -01/11/22 22:17:20, System/Build Info: -01/11/22 22:17:20, YAFU Version 2.07 -01/11/22 22:17:20, Built with GCC 9 -01/11/22 22:17:20, Using GMP-ECM 7.0.5-dev, Powered by GMP 6.2.1 -01/11/22 22:17:20, detected Intel Xeon Processor (Cascadelake) -detected L1 = 32768 bytes, L2 = 16777216 bytes, CL = 64 bytes -01/11/22 22:17:20, using 1 random witness for Rabin-Miller PRP checks -01/11/22 22:17:20, Cached 664579 primes: max prime is 9999991 - -01/11/22 22:17:20, Random seed: 17372657678477452430 -01/11/22 22:25:17, ===================================== -01/11/22 22:25:17, System/Build Info: -01/11/22 22:25:17, YAFU Version 2.07 -01/11/22 22:25:17, Built with GCC 9 -01/11/22 22:25:17, Using GMP-ECM 7.0.5-dev, Powered by GMP 6.2.1 -01/11/22 22:25:17, detected Intel Xeon Processor (Cascadelake) -detected L1 = 32768 bytes, L2 = 16777216 bytes, CL = 64 bytes -01/11/22 22:25:17, using 1 random witness for Rabin-Miller PRP checks -01/11/22 22:25:17, Cached 664579 primes: max prime is 9999991 - -01/11/22 22:25:17, Random seed: 14069056675023856505 diff --git a/yafu/top/cmdParser/cmdOptions.c b/yafu/top/cmdParser/cmdOptions.c index e6afe73..b52a62d 100644 --- a/yafu/top/cmdParser/cmdOptions.c +++ b/yafu/top/cmdParser/cmdOptions.c @@ -1359,7 +1359,7 @@ void printUsage(options_t* options) // this function should not need to be changed: // parse options from a .ini (or other) file. // ======================================================================== -void readINI(const char* filename, options_t* options) +int readINI(const char* filename, options_t* options) { FILE* doc; char* str; @@ -1372,7 +1372,7 @@ void readINI(const char* filename, options_t* options) if (doc == NULL) { printf("warning: could not open %s, no options parsed\n", filename); - return; + return 0; } str = (char*)malloc(1024 * sizeof(char)); @@ -1424,5 +1424,5 @@ void readINI(const char* filename, options_t* options) fclose(doc); free(str); - return; + return 1; } diff --git a/yafu/top/cmdParser/cmdOptions.h b/yafu/top/cmdParser/cmdOptions.h index 1ec4dfa..7aff3b0 100644 --- a/yafu/top/cmdParser/cmdOptions.h +++ b/yafu/top/cmdParser/cmdOptions.h @@ -191,7 +191,7 @@ typedef struct extern options_t* initOpt(void); extern void applyOpt(char* opt, char* arg, options_t* options); extern int processOpts(int argc, char** argv, options_t* options); -extern void readINI(const char* filename, options_t* options); +extern int readINI(const char* filename, options_t* options); #ifdef __cplusplus diff --git a/yafu/top/driver.c b/yafu/top/driver.c index 4c38b9b..eb46c14 100644 --- a/yafu/top/driver.c +++ b/yafu/top/driver.c @@ -41,7 +41,7 @@ void apply_tuneinfo(yafu_obj_t* yobj, fact_obj_t *fobj, char *arg); // function to print the splash screen to file/screen void print_splash(fact_obj_t* fobj, info_t* comp_info, int is_cmdline_run, - FILE* logfile, int VFLAG, double freq, int numwit); + FILE* logfile, int VFLAG, double freq, int numwit, char *cwd); void helpfunc(char* s); // functions to make a batchfile ready to execute, and to process batchfile lines @@ -79,6 +79,7 @@ int main(int argc, char *argv[]) //soe_staticdata_t* sdata; info_t comp_info; int i; + int ini_success; #if defined(__unix__) @@ -106,7 +107,7 @@ int main(int argc, char *argv[]) // set defaults for various things and read the .ini file, if any. yafu_init(&yafu_obj); options = initOpt(); - readINI("yafu.ini", options); + ini_success = readINI("yafu.ini", options); // then process the command line, overriding any .ini settings. processOpts(argc, argv, options); @@ -137,6 +138,18 @@ int main(int argc, char *argv[]) yafu_obj.USERSEED = 1; } + if (yafu_obj.VFLAG > 0) + { + if (ini_success) + { + getcwd(yafu_obj.CWD, 1024); + } + else + { + strcpy(yafu_obj.CWD, ""); + } + } + #if !defined(__APPLE__) // get the computer name, cache sizes, etc. store in globals // we need to have the cpu id string before calling apply_tuneinfo so that @@ -282,7 +295,7 @@ int main(int argc, char *argv[]) // print the splash screen, to the logfile and depending on options, to the screen print_splash(fobj, &comp_info, is_cmdline_run, logfile, yafu_obj.VFLAG, - yafu_obj.MEAS_CPU_FREQUENCY, yafu_obj.NUM_WITNESSES); + yafu_obj.MEAS_CPU_FREQUENCY, yafu_obj.NUM_WITNESSES, yafu_obj.CWD); // start the calculator // right now this just allocates room for user variables @@ -886,7 +899,7 @@ int check_expression(options_t* options) } void print_splash(fact_obj_t *fobj, info_t *comp_info, int is_cmdline_run, - FILE* logfile, int VFLAG, double freq, int numwit) + FILE* logfile, int VFLAG, double freq, int numwit, char *cwd) { if (VFLAG >= 0) printf("\n\n"); @@ -962,7 +975,17 @@ void print_splash(fact_obj_t *fobj, info_t *comp_info, int is_cmdline_run, logprint(logfile,"using %u random witness for Rabin-Miller PRP checks\n", numwit); else logprint(logfile, "using %u random witnesses for Rabin-Miller PRP checks\n", numwit); - logprint(logfile, "Cached %lu primes: max prime is %lu\n\n", fobj->num_p, fobj->max_p); + logprint(logfile, "Cached %lu primes: max prime is %lu\n", fobj->num_p, fobj->max_p); + if (strlen(cwd) == 0) + { + char buf[1024]; + getcwd(buf, 1024); + logprint(logfile, "Could not parse yafu.ini from %s\n\n", buf); + } + else + { + logprint(logfile, "Parsed yafu.ini from %s\n\n", cwd); + } if (VFLAG > 0 || !is_cmdline_run) { @@ -975,7 +998,17 @@ void print_splash(fact_obj_t *fobj, info_t *comp_info, int is_cmdline_run, else printf("Using %u random witnesses for Rabin-Miller PRP checks\n", numwit); - printf("Cached %lu primes; max prime is %lu\n\n", fobj->num_p, fobj->max_p); + printf("Cached %lu primes; max prime is %lu\n", fobj->num_p, fobj->max_p); + if (strlen(cwd) == 0) + { + char buf[1024]; + getcwd(buf, 1024); + printf("Could not parse yafu.ini from %s\n\n", buf); + } + else + { + printf("Parsed yafu.ini from %s\n\n", cwd); + } printf("===============================================================\n"); printf("======= Welcome to YAFU (Yet Another Factoring Utility) =======\n"); diff --git a/yafu/yafu b/yafu/yafu deleted file mode 100755 index 9e0545f..0000000 Binary files a/yafu/yafu and /dev/null differ diff --git a/yafu/yafu-x64.exe b/yafu/yafu-x64.exe new file mode 100644 index 0000000..e1c0e4c Binary files /dev/null and b/yafu/yafu-x64.exe differ diff --git a/yafu/yafu.ini b/yafu/yafu.ini index 0c2185a..8594575 100644 --- a/yafu/yafu.ini +++ b/yafu/yafu.ini @@ -16,7 +16,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Run all multi-threaded algorithms with specified thread count -threads=64 +% threads=1 % the number of Rabin-Miller witnesses to use during PRP checks % nprp=1 @@ -168,7 +168,7 @@ aprcl_d=200 % Set a threshold below which siqs will not use a savefile (all relations are % processed in-memory) -inmem=700 +inmem=70 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % NFS options @@ -176,7 +176,7 @@ inmem=700 % relative or absolute path to a directory containing ggnfs-lasieve4I* executables. % without these yafu will not use NFS -%ggnfs_dir=..\..\ggnfs-bin\x64\ +ggnfs_dir=..\..\ggnfs-bin\x64\ %ggnfs_dir=../../ggnfs-bin/ % comma delimited list of poly files to test sieve @@ -263,7 +263,7 @@ B1ecm=11000 % B2ecm=1100000 % specify a path to an ECM executable file -ecm_path=../avx-ecm/avx-ecm +ecm_path=..\gmp-ecm\bin\x64\Release\ecm.exe %ecm_path=../gmp-ecm/install/mingw/bin/ecm.exe %ecm_path=../gmp-ecm/bin/ecm @@ -288,12 +288,12 @@ ext_ecm=1000000000 % P-1 options %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Set the B1 level for P-1. -B1pm1=100000 +% B1pm1=100000 % Set the B2 level for P-1. % Only needed if you want something other than the default % for the current B1. -B2pm1=10000000 +% B2pm1=10000000 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % P+1 options @@ -304,17 +304,17 @@ B1pm1=20000 % Set the B2 level for P+1. % Only needed if you want something other than the default % for the current B1. -B2pm1=2000000 +% B2pm1=2000000 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Brent-Pollard Rho options %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -rhomax=500 +% rhomax=200 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Fermat options %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -fmtmax=10000000 +% fmtmax=1000000 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Eratosthenes options @@ -335,4 +335,4 @@ fmtmax=10000000 % appear below here -%tune_info=Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz,LINUX64,1.59078e-05,0.196092,0.299688,0.0999245,102.36,42 +tune_info=Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz,LINUX64,1.59078e-05,0.196092,0.299688,0.0999245,102.36,42