From 48261296c2916d6e65cf90c3b0d8384033c05100 Mon Sep 17 00:00:00 2001
From: Martun Karapetyan
Date: Tue, 30 Jul 2024 15:22:47 +0000
Subject: [PATCH] Optimize memory consumption a bit.

---
 .../detail/placeholder_scoped_profiler.hpp    |  2 +
 .../plonk/placeholder/gates_argument.hpp      |  2 +
 .../plonk/placeholder/lookup_argument.hpp     | 78 ++++++++++---------
 .../nil/actor/core/parallelization_utils.hpp  | 19 ++++-
 4 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/detail/placeholder_scoped_profiler.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/detail/placeholder_scoped_profiler.hpp
index b68e16cb..1a6d6eb1 100644
--- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/detail/placeholder_scoped_profiler.hpp
+++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/detail/placeholder_scoped_profiler.hpp
@@ -116,6 +116,8 @@ namespace nil {
 } // namespace crypto3
 } // namespace nil
 
+#define ZK_PLACEHOLDER_PROFILING_ENABLED
+
 #ifdef ZK_PLACEHOLDER_PROFILING_ENABLED
 #define PROFILE_PLACEHOLDER_SCOPE(name) \
     nil::crypto3::zk::snark::detail::placeholder_scoped_profiler profiler(name);
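For readers unfamiliar with the macro being toggled above: PROFILE_PLACEHOLDER_SCOPE declares an RAII object that times the enclosing scope. A minimal sketch of the idiom follows; this illustrative scoped_profiler is not the actual implementation in this header, just the pattern it relies on.

    #include <chrono>
    #include <iostream>
    #include <string>

    // Starts a timer on construction and prints the elapsed time on destruction,
    // so a single declaration at the top of a scope profiles the whole scope.
    class scoped_profiler {
    public:
        explicit scoped_profiler(std::string name)
            : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}

        ~scoped_profiler() {
            auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
                std::chrono::steady_clock::now() - start_);
            std::cout << name_ << ": " << elapsed.count() << " ms" << std::endl;
        }

    private:
        std::string name_;
        std::chrono::steady_clock::time_point start_;
    };

This is why a bare PROFILE_PLACEHOLDER_SCOPE("Lookup argument prove eval time") at the top of prove_eval, as in the lookup argument below, is enough to report the total time spent in that call.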
diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp
index 68a8c9cc..a7555f86 100644
--- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp
+++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp
@@ -206,6 +206,8 @@ namespace nil {
                     [&variable_values, &extended_domain_sizes, &result, &expressions, i]
                     (std::size_t begin, std::size_t end) {
                         for (std::size_t j = begin; j < end; ++j) {
+                            // Don't use a cache here. In practice it's slower to maintain the cache
+                            // than to re-compute the subexpression value when the value type is a field element.
                             math::expression_evaluator evaluator(
                                 expressions[i],
                                 [&assignments=variable_values, j]
diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp
index dd97f891..b0544b92 100644
--- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp
+++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp
@@ -31,6 +31,7 @@
 #define CRYPTO3_ZK_PLONK_PLACEHOLDER_LOOKUP_ARGUMENT_HPP
 
 #include
+#include <thread>
 #include
 #include
@@ -126,7 +127,7 @@ namespace nil {
             public:
                 struct prover_lookup_result {
-                    std::array<math::polynomial_dfs<typename FieldType::value_type>, argument_size> F_dfs;
+                    std::array<polynomial_dfs_type, argument_size> F_dfs;
                     typename commitment_scheme_type::commitment_type lookup_commitment;
                 };
@@ -157,31 +158,31 @@ namespace nil {
                     PROFILE_PLACEHOLDER_SCOPE("Lookup argument prove eval time");
 
                     // Construct lookup gates
-                    math::polynomial_dfs<typename FieldType::value_type> one_polynomial(
+                    polynomial_dfs_type one_polynomial(
                         0, basic_domain->m, FieldType::value_type::one());
-                    math::polynomial_dfs<typename FieldType::value_type> zero_polynomial(
+                    polynomial_dfs_type zero_polynomial(
                         0, basic_domain->m, FieldType::value_type::zero());
-                    math::polynomial_dfs<typename FieldType::value_type> mask_assignment =
+                    polynomial_dfs_type mask_assignment =
                         one_polynomial - preprocessed_data.q_last - preprocessed_data.q_blind;
 
-                    std::unique_ptr<std::vector<math::polynomial_dfs<typename FieldType::value_type>>> lookup_value_ptr =
+                    std::unique_ptr<std::vector<polynomial_dfs_type>> lookup_value_ptr =
                         prepare_lookup_value(mask_assignment);
                     auto& lookup_value = *lookup_value_ptr;
 
-                    std::unique_ptr<std::vector<math::polynomial_dfs<typename FieldType::value_type>>> lookup_input_ptr =
+                    std::unique_ptr<std::vector<polynomial_dfs_type>> lookup_input_ptr =
                         prepare_lookup_input();
                     auto& lookup_input = *lookup_input_ptr;
 
                     // 3. Lookup_input and lookup_value are ready
                     // Now sort them!
                     // Reduce value and input:
-                    auto reduced_value_ptr = std::make_unique<std::vector<math::polynomial_dfs<typename FieldType::value_type>>>();
+                    auto reduced_value_ptr = std::make_unique<std::vector<polynomial_dfs_type>>();
                     auto& reduced_value = *reduced_value_ptr;
                     for( std::size_t i = 0; i < lookup_value.size(); i++ ){
                         reduced_value.push_back(reduce_dfs_polynomial_domain(lookup_value[i], basic_domain->m));
                     }
-                    auto reduced_input_ptr = std::make_unique<std::vector<math::polynomial_dfs<typename FieldType::value_type>>>();
+                    auto reduced_input_ptr = std::make_unique<std::vector<polynomial_dfs_type>>();
                     auto& reduced_input = *reduced_input_ptr;
                     for( std::size_t i = 0; i < lookup_input.size(); i++ ){
                         reduced_input.push_back(reduce_dfs_polynomial_domain(lookup_input[i], basic_domain->m));
@@ -207,7 +208,7 @@
                         lookup_alphas.push_back(transcript.template challenge<FieldType>());
                     }
 
-                    math::polynomial_dfs<typename FieldType::value_type> V_L = compute_V_L(
+                    polynomial_dfs_type V_L = compute_V_L(
                         sorted, reduced_input, reduced_value, beta, gamma);
 
                     // We don't use reduced_input and reduced_value after this line.
@@ -220,18 +221,18 @@
                     BOOST_ASSERT(std::accumulate(part_sizes.begin(), part_sizes.end(), 0) == sorted.size());
 
                     // Compute gs and hs products for each part
-                    std::vector<math::polynomial_dfs<typename FieldType::value_type>> gs = compute_gs(
+                    std::vector<polynomial_dfs_type> gs = compute_gs(
                         std::move(lookup_input_ptr), std::move(lookup_value_ptr),
                         beta, gamma, part_sizes
                     );
-                    std::vector<math::polynomial_dfs<typename FieldType::value_type>> hs = compute_hs(
+                    std::vector<polynomial_dfs_type> hs = compute_hs(
                         sorted, beta, gamma, part_sizes
                     );
 
-                    math::polynomial_dfs<typename FieldType::value_type> V_L_shifted =
+                    polynomial_dfs_type V_L_shifted =
                         math::polynomial_shift(V_L, 1, basic_domain->m);
 
-                    std::array<math::polynomial_dfs<typename FieldType::value_type>, argument_size> F_dfs;
+                    std::array<polynomial_dfs_type, argument_size> F_dfs;
 
                     F_dfs[0] = preprocessed_data.common_data.lagrange_0 * (one_polynomial - V_L);
                     F_dfs[1] = preprocessed_data.q_last * ( V_L * V_L - V_L );
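A recurring trick in the hunks below is freeing a large polynomial the moment it is consumed by assigning a default-constructed value over it, e.g. h = polynomial_dfs_type(); and later gs[i] = polynomial_dfs_type();. A minimal sketch of why that releases the heap storage immediately, assuming the polynomial keeps its evaluations in a std::vector (the poly type here is illustrative, not the real math::polynomial_dfs):

    #include <vector>

    // Illustrative stand-in for math::polynomial_dfs: the evaluations live on the heap.
    struct poly {
        std::vector<int> evals;
    };

    void consume_and_release(poly& h) {
        // ... last use of h happens above this point ...
        // Assigning a default-constructed temporary move-assigns empty storage
        // into h, so the old heap buffer is destroyed right now rather than
        // surviving until h goes out of scope.
        h = poly();
    }

For a prover holding dozens of domain-sized polynomials at once, releasing each buffer at its last use rather than at end of scope is where much of this patch's saving comes from.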
@@ -245,16 +246,16 @@
                         g *= V_L;
                         h *= V_L_shifted;
                         g -= h;
-                        h = math::polynomial_dfs<typename FieldType::value_type>(); // just clean the memory of h.
+                        h = polynomial_dfs_type(); // just clean the memory of h.
                         g *= (preprocessed_data.q_last + preprocessed_data.q_blind) - one_polynomial;
                         F_dfs[2] = std::move(g);
                     } else {
-                        std::vector<math::polynomial_dfs<typename FieldType::value_type>> parts;
+                        std::vector<polynomial_dfs_type> parts;
                         BOOST_ASSERT(part_sizes.size() == gs.size());
                         BOOST_ASSERT(part_sizes.size() == hs.size());
                         BOOST_ASSERT(part_sizes.size() == lookup_alphas.size() + 1);
-                        std::vector<math::polynomial_dfs<typename FieldType::value_type>> reduced_gs(lookup_alphas.size());
-                        std::vector<math::polynomial_dfs<typename FieldType::value_type>> reduced_hs(lookup_alphas.size());
+                        std::vector<polynomial_dfs_type> reduced_gs(lookup_alphas.size());
+                        std::vector<polynomial_dfs_type> reduced_hs(lookup_alphas.size());
                         parallel_for(0, lookup_alphas.size(), [this, &gs, &hs, &reduced_gs, &reduced_hs](std::size_t i) {
                             reduced_gs[i] = reduce_dfs_polynomial_domain(gs[i], basic_domain->m);
                             reduced_hs[i] = reduce_dfs_polynomial_domain(hs[i], basic_domain->m);
@@ -270,11 +271,11 @@
                         }, ThreadPool::PoolLevel::HIGH);
 
-                        math::polynomial_dfs<typename FieldType::value_type> current_poly = V_L;
-                        math::polynomial_dfs<typename FieldType::value_type> previous_poly = V_L;
+                        polynomial_dfs_type current_poly = V_L;
+                        polynomial_dfs_type previous_poly = V_L;
                         // We need to store all the values of current_poly. Sadly, this increases the RAM usage,
                         // but there's no other way to parallelize this loop.
-                        std::vector<math::polynomial_dfs<typename FieldType::value_type>> all_polys(1, V_L);
+                        std::vector<polynomial_dfs_type> all_polys(1, V_L);
 
                         for (std::size_t i = 0; i < lookup_alphas.size(); ++i) {
 
@@ -287,19 +288,24 @@
                             all_polys.push_back(current_poly);
                             previous_poly = current_poly;
                         }
-                        std::vector<math::polynomial_dfs<typename FieldType::value_type>> F_dfs_2_parts(lookup_alphas.size() + 1);
-                        parallel_for(0, lookup_alphas.size(),
-                            [&gs, &hs, &lookup_alphas, &all_polys, &F_dfs_2_parts](std::size_t i) {
-                                auto &g = gs[i];
-                                auto &h = hs[i];
-                                F_dfs_2_parts[i] = lookup_alphas[i] * (all_polys[i] * g - all_polys[i + 1] * h);
+                        std::vector<polynomial_dfs_type> F_dfs_2_parts(
+                            std::thread::hardware_concurrency() + 1,
+                            polynomial_dfs_type::zero());
+                        wait_for_all(parallel_run_in_chunks_with_thread_id<void>(
+                            lookup_alphas.size(),
+                            [&gs, &hs, &lookup_alphas, &all_polys, &F_dfs_2_parts]
+                            (std::size_t thread_id, std::size_t begin, std::size_t end) {
+                                for (std::size_t i = begin; i < end; ++i) {
+                                    F_dfs_2_parts[thread_id] += (all_polys[i] * gs[i] - all_polys[i + 1] * hs[i]) * lookup_alphas[i];
+                                    // Save a bit of RAM by deleting gs[i] and hs[i]; we don't need them anymore.
+                                    gs[i] = polynomial_dfs_type();
+                                    hs[i] = polynomial_dfs_type();
+                                }
                             },
-                            ThreadPool::PoolLevel::HIGH);
+                            ThreadPool::PoolLevel::HIGH));
 
                         std::size_t last = lookup_alphas.size();
-                        auto &g = gs[last];
-                        auto &h = hs[last];
-                        F_dfs_2_parts[lookup_alphas.size()] = previous_poly * g - V_L_shifted * h;
+                        F_dfs_2_parts.back() = previous_poly * gs[last] - V_L_shifted * hs[last];
                         F_dfs[2] += polynomial_sum(std::move(F_dfs_2_parts));
                         F_dfs[2] *= (preprocessed_data.q_last + preprocessed_data.q_blind) - one_polynomial;
                     }
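The hunk above is the heart of the memory saving: instead of materializing one partial polynomial per lookup term (lookup_alphas.size() + 1 entries in F_dfs_2_parts), each worker thread now accumulates into a single slot indexed by its thread_id, so the number of live buffers is bounded by the thread count, and gs[i]/hs[i] are released as soon as they are consumed. A self-contained sketch of the same pattern over plain integers; std::async stands in for the project's ThreadPool, and the chunk arithmetic mirrors parallel_run_in_chunks_with_thread_id:

    #include <algorithm>
    #include <future>
    #include <numeric>
    #include <thread>
    #include <vector>

    // Each worker owns exactly one accumulator slot, indexed by its chunk id,
    // so no locking is needed and memory stays proportional to the thread count.
    int parallel_accumulate(const std::vector<int>& terms) {
        std::size_t workers = std::max<std::size_t>(1, std::thread::hardware_concurrency());
        workers = std::min(workers, terms.size());
        std::vector<int> partial(workers, 0);

        std::vector<std::future<void>> futures;
        std::size_t begin = 0;
        for (std::size_t id = 0; id < workers; ++id) {
            // Same chunking as parallel_run_in_chunks_with_thread_id.
            std::size_t end = begin + (terms.size() - begin) / (workers - id);
            futures.push_back(std::async(std::launch::async, [&, id, begin, end] {
                for (std::size_t i = begin; i < end; ++i)
                    partial[id] += terms[i];  // the real code adds polynomials here
            }));
            begin = end;
        }
        for (auto& f : futures) f.get();  // wait_for_all in the real code

        // One cheap reduction at the end (polynomial_sum in the real code).
        return std::accumulate(partial.begin(), partial.end(), 0);
    }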
@@ -311,9 +317,11 @@
                         alpha_challenges[i] = transcript.template challenge<FieldType>();
                     }
 
-                    std::vector<math::polynomial_dfs<typename FieldType::value_type>> F_dfs_3_parts(std::next(sorted.begin(), 1), sorted.end());
+                    std::vector<polynomial_dfs_type> F_dfs_3_parts(std::next(sorted.begin(), 1), sorted.end());
                     parallel_for(0, F_dfs_3_parts.size(), [this, &F_dfs_3_parts, &alpha_challenges, &sorted](std::size_t i) {
-                        math::polynomial_dfs<typename FieldType::value_type> sorted_shifted = math::polynomial_shift(sorted[i], this->preprocessed_data.common_data.desc.usable_rows_amount, this->basic_domain->m);
+                        polynomial_dfs_type sorted_shifted = math::polynomial_shift(
+                            sorted[i], this->preprocessed_data.common_data.desc.usable_rows_amount,
+                            this->basic_domain->m);
                         F_dfs_3_parts[i] -= sorted_shifted;
                         F_dfs_3_parts[i] *= alpha_challenges[i] * preprocessed_data.common_data.lagrange_0;
                     }, ThreadPool::PoolLevel::HIGH);
@@ -324,9 +332,9 @@
                 };
             }
 
-            std::vector<math::polynomial_dfs<typename FieldType::value_type>> compute_gs(
-                std::unique_ptr<std::vector<math::polynomial_dfs<typename FieldType::value_type>>> lookup_input_ptr,
-                std::unique_ptr<std::vector<math::polynomial_dfs<typename FieldType::value_type>>> lookup_value_ptr,
+            std::vector<polynomial_dfs_type> compute_gs(
+                std::unique_ptr<std::vector<polynomial_dfs_type>> lookup_input_ptr,
+                std::unique_ptr<std::vector<polynomial_dfs_type>> lookup_value_ptr,
                 const typename FieldType::value_type& beta,
                 const typename FieldType::value_type& gamma,
                 const std::vector<std::size_t>& lookup_part_sizes
diff --git a/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp b/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
index 96fe37f5..32baf239 100644
--- a/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
+++ b/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
@@ -49,9 +49,9 @@ namespace nil {
 
         // Divides work into chunks and makes calls to 'func' in parallel.
         template<class ReturnType>
-        std::vector<std::future<ReturnType>> parallel_run_in_chunks(
+        std::vector<std::future<ReturnType>> parallel_run_in_chunks_with_thread_id(
             std::size_t elements_count,
-            std::function<ReturnType(std::size_t begin, std::size_t end)> func,
+            std::function<ReturnType(std::size_t thread_id, std::size_t begin, std::size_t end)> func,
             ThreadPool::PoolLevel pool_id = ThreadPool::PoolLevel::LOW) {
 
             auto& thread_pool = ThreadPool::get_instance(pool_id);
@@ -73,14 +73,25 @@ namespace nil {
             std::size_t begin = 0;
             for (std::size_t i = 0; i < workers_to_use; i++) {
                 auto end = begin + (elements_count - begin) / (workers_to_use - i);
-                fut.emplace_back(thread_pool.post([begin, end, func]() {
-                    return func(begin, end);
+                fut.emplace_back(thread_pool.post([i, begin, end, func]() {
+                    return func(i, begin, end);
                 }));
                 begin = end;
             }
             return fut;
         }
 
+        template<class ReturnType>
+        std::vector<std::future<ReturnType>> parallel_run_in_chunks(
+            std::size_t elements_count,
+            std::function<ReturnType(std::size_t begin, std::size_t end)> func,
+            ThreadPool::PoolLevel pool_id = ThreadPool::PoolLevel::LOW) {
+            return parallel_run_in_chunks_with_thread_id<ReturnType>(elements_count,
+                [func](std::size_t thread_id, std::size_t begin, std::size_t end) -> ReturnType {
+                    return func(begin, end);
+                }, pool_id);
+        }
+
         // Similar to std::transform, but in parallel. We return void here for better usability for our use cases.
         template
         void parallel_transform(InputIt1 first1, InputIt1 last1, InputIt2 first2,
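The renamed primitive now passes the worker's chunk index to the callback, while the old two-argument parallel_run_in_chunks survives as a thin wrapper, so existing call sites compile unchanged. A hypothetical call site for the new overload; sum_in_parallel and values are illustrative, wait_for_all and ThreadPool are this library's, and, as in the lookup-argument change above, it assumes the pool never runs more than hardware_concurrency() workers:

    // Hypothetical caller: per-thread partial sums, reduced once at the end.
    long sum_in_parallel(const std::vector<int>& values) {
        // One accumulator slot per possible worker, so no synchronization is needed.
        std::vector<long> partial_sums(std::thread::hardware_concurrency() + 1, 0);
        wait_for_all(parallel_run_in_chunks_with_thread_id<void>(
            values.size(),
            [&values, &partial_sums](std::size_t thread_id, std::size_t begin, std::size_t end) {
                for (std::size_t i = begin; i < end; ++i)
                    partial_sums[thread_id] += values[i];
            },
            ThreadPool::PoolLevel::LOW));
        return std::accumulate(partial_sums.begin(), partial_sums.end(), 0L);
    }

Note the explicit <void> template argument: ReturnType cannot be deduced when a lambda is passed where the std::function parameter is expected, which is also why the wrapper above forwards with an explicit <ReturnType>.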