Optimize Karatsuba split in fft::mul (big_alloc-backed vectors); simplify montgomery_reduce to accumulate into x

adamant-pwn · adamant-pwn · commit 1fbc92f94d84 · 2025-04-28T13:37:18.000+02:00
diff --git a/cp-algo/math/fft.hpp b/cp-algo/math/fft.hpp
@@ -201,16 +201,17 @@ namespace cp_algo::math::fft {
     void mul(auto &a, auto const& b) {
         size_t N = size(a) + size(b) - 1;
         if(std::max(size(a), size(b)) > (1 << 23)) {
+            using T = std::decay_t<decltype(a[0])>;
             // do karatsuba to save memory
             auto n = (std::max(size(a), size(b)) + 1) / 2;
-            auto a0 = to<std::vector>(a | std::views::take(n));
-            auto a1 = to<std::vector>(a | std::views::drop(n));
-            auto b0 = to<std::vector>(b | std::views::take(n));
-            auto b1 = to<std::vector>(b | std::views::drop(n));
+            auto a0 = to<std::vector<T, big_alloc<T>>>(a | std::views::take(n));
+            auto a1 = to<std::vector<T, big_alloc<T>>>(a | std::views::drop(n));
+            auto b0 = to<std::vector<T, big_alloc<T>>>(b | std::views::take(n));
+            auto b1 = to<std::vector<T, big_alloc<T>>>(b | std::views::drop(n));
             a0.resize(n); a1.resize(n);
             b0.resize(n); b1.resize(n);
-            auto a01 = to<std::vector>(std::views::zip_transform(std::plus{}, a0, a1));
-            auto b01 = to<std::vector>(std::views::zip_transform(std::plus{}, b0, b1));
+            auto a01 = to<std::vector<T, big_alloc<T>>>(std::views::zip_transform(std::plus{}, a0, a1));
+            auto b01 = to<std::vector<T, big_alloc<T>>>(std::views::zip_transform(std::plus{}, b0, b1));
             checkpoint("karatsuba split");
             mul(a0, b0);
             mul(a1, b1);
diff --git a/cp-algo/util/simd.hpp b/cp-algo/util/simd.hpp
@@ -33,11 +33,11 @@ namespace cp_algo {
     [[gnu::always_inline]] inline u64x4 montgomery_reduce(u64x4 x, u64x4 mod, u64x4 imod) {
         auto x_ninv = u64x4(u32x8(x) * u32x8(imod));
 #ifdef __AVX2__
-        auto x_res = __m256i(x) + _mm256_mul_epu32(__m256i(x_ninv), __m256i(mod));
+        x += u64x4(_mm256_mul_epu32(__m256i(x_ninv), __m256i(mod)));
 #else
-        auto x_res = x + x_ninv * mod;
+        x += x_ninv * mod;
 #endif
-        return u64x4(x_res) >> 32;
+        return x >> 32;
     }
 
     [[gnu::always_inline]] inline u64x4 montgomery_mul(u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {