From d855e346c86e31e750cd38f3cd247f0ce0cb616b Mon Sep 17 00:00:00 2001 From: Martun Karapetyan Date: Mon, 29 Jul 2024 11:00:52 +0000 Subject: [PATCH 1/2] Parallelize gate argument outside the expression tree. --- .../plonk/placeholder/gates_argument.hpp | 109 ++++++------------ 1 file changed, 38 insertions(+), 71 deletions(-) diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp index 37ee0ec1..68a8c9cc 100644 --- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp +++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp @@ -77,18 +77,18 @@ namespace nil { constexpr static const std::size_t argument_size = 1; static inline void build_variable_value_map( - const math::expression& expr, + const math::expression& expr, const plonk_polynomial_dfs_table &assignments, std::shared_ptr> domain, std::size_t extended_domain_size, - std::unordered_map& variable_values_out) { + std::unordered_map& variable_values_out) { - std::unordered_map variable_counts; + std::unordered_map variable_counts; - std::vector variables; + std::vector variables; - math::expression_for_each_variable_visitor visitor( - [&variable_counts, &variables, &variable_values_out](const polynomial_dfs_variable_type& var) { + math::expression_for_each_variable_visitor visitor( + [&variable_counts, &variables, &variable_values_out](const variable_type& var) { // Create the structure of the map so we can change the values later. if (variable_counts[var] == 0) { variables.push_back(var); @@ -107,12 +107,14 @@ namespace nil { parallel_for(0, variables.size(), [&variables, &variable_values_out, &assignments, &domain, &extended_domain, extended_domain_size](std::size_t i) { - const auto& var = variables[i]; - // We may have variable values in required sizes in some cases. - if (variable_values_out[var].size() == extended_domain_size) - return; + const variable_type& var = variables[i]; - polynomial_dfs_type assignment = assignments.get_variable_value(var, domain); + // Convert the variable to polynomial_dfs variable type. + polynomial_dfs_variable_type var_dfs(var.index, var.rotation, var.relative, + static_cast( + static_cast(var.type))); + + polynomial_dfs_type assignment = assignments.get_variable_value(var_dfs, domain); // In parallel version we always resize the assignment poly, it's better for parallelization. // if (count > 1) { @@ -152,11 +154,7 @@ namespace nil { degree_limits.push_back(max_degree / 2); extended_domain_sizes.push_back(max_domain_size / 2); - std::vector> expressions(extended_domain_sizes.size()); - - // Only in parallel version we store the subexpressions of each expression and ignore the cache. - std::vector>> subexpressions(extended_domain_sizes.size()); - + std::vector> expressions(extended_domain_sizes.size()); auto theta_acc = FieldType::value_type::one(); // Every constraint has variable type 'variable_type', but we want it to use @@ -170,28 +168,10 @@ namespace nil { const auto& gates = constraint_system.gates(); for (const auto& gate: gates) { - std::vector> gate_results(extended_domain_sizes.size()); - - // We will split gates into parts especially for zkEVM circuit, since there is only 1 large gate with - // 683 constraints. Will split it into 24 parts, ~32 constraints each. - // This will mean our code will multiply by selector 16 times, instead of just once. 
But this is - // much better that losing parallelization. We do not want to re-write the whole code to try parallelize - // each gate compatation separately. This will not harm circuits with smaller number of terms much. - std::vector> gate_parts(extended_domain_sizes.size()); - std::vector gate_parts_constaint_counts(extended_domain_sizes.size()); - - - // This parameter can be tuned based on the circuit and the number of cores of the server on which the proofs - // are generated. On the current zkEVM circuit this value is optimal based on experiments. - const std::size_t constraint_limit = 16; - - - auto selector = polynomial_dfs_variable_type( - gate.selector_index, 0, false, polynomial_dfs_variable_type::column_type::selector); - + std::vector> gate_results(extended_domain_sizes.size()); for (std::size_t constraint_idx = 0; constraint_idx < gate.constraints.size(); ++constraint_idx) { const auto& constraint = gate.constraints[constraint_idx]; - auto next_term = converter.convert(constraint) * value_type_to_polynomial_dfs(theta_acc); + auto next_term = constraint * theta_acc; theta_acc *= theta; // +1 stands for the selector multiplication. @@ -200,57 +180,44 @@ namespace nil { // Whatever the degree of term is, add it to the maximal degree expression. if (degree_limits[i] >= constraint_degree || i == 0) { gate_results[i] += next_term; - gate_parts[i] += next_term; - gate_parts_constaint_counts[i]++; - - // If we already have constraint_limit constaints in the gate_parts[i], add it to the 'subexpressions'. - if (gate_parts_constaint_counts[i] == constraint_limit) { - subexpressions[i].push_back(gate_parts[i] * selector); - gate_parts[i] = math::expression(); - gate_parts_constaint_counts[i] = 0; - } break; } - } } - + auto selector = variable_type( + gate.selector_index, 0, false, variable_type::column_type::selector); for (size_t i = 0; i < extended_domain_sizes.size(); ++i) { - // Only in parallel version we store the subexpressions of each expression and ignore the cache. expressions[i] += gate_results[i] * selector; - if (gate_parts_constaint_counts[i] != 0) - subexpressions[i].push_back(gate_parts[i] * selector); } } std::array F; - std::vector F_0_parts(extended_domain_sizes.size()); - parallel_for(0, extended_domain_sizes.size(), - [&subexpressions, &extended_domain_sizes, &F_0_parts, &original_domain, &column_polynomials, &expressions](std::size_t i) { - std::unordered_map variable_values; + F[0] = polynomial_dfs_type::zero(); + for (std::size_t i = 0; i < extended_domain_sizes.size(); ++i) { + std::unordered_map variable_values; build_variable_value_map(expressions[i], column_polynomials, original_domain, extended_domain_sizes[i], variable_values); - std::vector subvalues(subexpressions[i].size()); - parallel_for(0, subexpressions[i].size(), - [&subexpressions, &variable_values, &extended_domain_sizes, &subvalues, i](std::size_t subexpression_index) { - // Only in parallel version we store the subexpressions of each expression and ignore the cache, - // not using "cached_expression_evaluator". 
- math::expression_evaluator evaluator( - subexpressions[i][subexpression_index], - [&assignments=variable_values, domain_size=extended_domain_sizes[i]] - (const polynomial_dfs_variable_type &var) -> const polynomial_dfs_type& { - return assignments[var]; - }); - subvalues[subexpression_index] = evaluator.evaluate(); - }, ThreadPool::PoolLevel::HIGH); + polynomial_dfs_type result(extended_domain_sizes[i] - 1, extended_domain_sizes[i]); + wait_for_all(parallel_run_in_chunks( + extended_domain_sizes[i], + [&variable_values, &extended_domain_sizes, &result, &expressions, i] + (std::size_t begin, std::size_t end) { + for (std::size_t j = begin; j < end; ++j) { + math::expression_evaluator evaluator( + expressions[i], + [&assignments=variable_values, j] + (const variable_type &var) -> const typename FieldType::value_type& { + return assignments[var][j]; + }); + result[j] = evaluator.evaluate(); + } + }, ThreadPool::PoolLevel::HIGH)); - F_0_parts[i] = polynomial_sum(std::move(subvalues)); - }, ThreadPool::PoolLevel::LASTPOOL); - - F[0] += polynomial_sum(std::move(F_0_parts)); + F[0] += result; + }; F[0] *= mask_polynomial; return F; } From 45c07b8cd01847b8f80e616478e9cf9b9d21829c Mon Sep 17 00:00:00 2001 From: Martun Karapetyan Date: Tue, 30 Jul 2024 15:22:47 +0000 Subject: [PATCH 2/2] Optimize memory consumption a bit. --- .../plonk/placeholder/gates_argument.hpp | 2 + .../plonk/placeholder/lookup_argument.hpp | 78 ++++++++++--------- .../nil/actor/core/parallelization_utils.hpp | 19 ++++- 3 files changed, 60 insertions(+), 39 deletions(-) diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp index 68a8c9cc..a7555f86 100644 --- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp +++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp @@ -206,6 +206,8 @@ namespace nil { [&variable_values, &extended_domain_sizes, &result, &expressions, i] (std::size_t begin, std::size_t end) { for (std::size_t j = begin; j < end; ++j) { + // Don't use cache here. In practice it's slower to maintain the cache + // than to re-compute the subexpression value when value type is field element. 
math::expression_evaluator evaluator( expressions[i], [&assignments=variable_values, j] diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp index dd97f891..b0544b92 100644 --- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp +++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp @@ -31,6 +31,7 @@ #define CRYPTO3_ZK_PLONK_PLACEHOLDER_LOOKUP_ARGUMENT_HPP #include +#include #include #include @@ -126,7 +127,7 @@ namespace nil { public: struct prover_lookup_result { - std::array, argument_size> F_dfs; + std::array F_dfs; typename commitment_scheme_type::commitment_type lookup_commitment; }; @@ -157,31 +158,31 @@ namespace nil { PROFILE_PLACEHOLDER_SCOPE("Lookup argument prove eval time"); // Construct lookup gates - math::polynomial_dfs one_polynomial( + polynomial_dfs_type one_polynomial( 0, basic_domain->m, FieldType::value_type::one()); - math::polynomial_dfs zero_polynomial( + polynomial_dfs_type zero_polynomial( 0, basic_domain->m, FieldType::value_type::zero()); - math::polynomial_dfs mask_assignment = + polynomial_dfs_type mask_assignment = one_polynomial - preprocessed_data.q_last - preprocessed_data.q_blind; - std::unique_ptr>> lookup_value_ptr = + std::unique_ptr> lookup_value_ptr = prepare_lookup_value(mask_assignment); auto& lookup_value = *lookup_value_ptr; - std::unique_ptr>> lookup_input_ptr = + std::unique_ptr> lookup_input_ptr = prepare_lookup_input(); auto& lookup_input = *lookup_input_ptr; // 3. Lookup_input and lookup_value are ready // Now sort them! // Reduce value and input: - auto reduced_value_ptr = std::make_unique>>(); + auto reduced_value_ptr = std::make_unique>(); auto& reduced_value = *reduced_value_ptr; for( std::size_t i = 0; i < lookup_value.size(); i++ ){ reduced_value.push_back(reduce_dfs_polynomial_domain(lookup_value[i], basic_domain->m)); } - auto reduced_input_ptr = std::make_unique>>(); + auto reduced_input_ptr = std::make_unique>(); auto& reduced_input = *reduced_input_ptr; for( std::size_t i = 0; i < lookup_input.size(); i++ ){ reduced_input.push_back(reduce_dfs_polynomial_domain(lookup_input[i], basic_domain->m)); @@ -207,7 +208,7 @@ namespace nil { lookup_alphas.push_back(transcript.template challenge()); } - math::polynomial_dfs V_L = compute_V_L( + polynomial_dfs_type V_L = compute_V_L( sorted, reduced_input, reduced_value, beta, gamma); // We don't use reduced_input and reduced_value after this line. @@ -220,18 +221,18 @@ namespace nil { BOOST_ASSERT(std::accumulate(part_sizes.begin(), part_sizes.end(), 0) == sorted.size()); // Compute gs and hs products for each part - std::vector> gs = compute_gs( + std::vector gs = compute_gs( std::move(lookup_input_ptr), std::move(lookup_value_ptr), beta, gamma, part_sizes ); - std::vector> hs = compute_hs( + std::vector hs = compute_hs( sorted, beta, gamma, part_sizes ); - math::polynomial_dfs V_L_shifted = + polynomial_dfs_type V_L_shifted = math::polynomial_shift(V_L, 1, basic_domain->m); - std::array, argument_size> F_dfs; + std::array F_dfs; F_dfs[0] = preprocessed_data.common_data.lagrange_0 * (one_polynomial - V_L); F_dfs[1] = preprocessed_data.q_last * ( V_L * V_L - V_L ); @@ -245,16 +246,16 @@ namespace nil { g *= V_L; h *= V_L_shifted; g -= h; - h = math::polynomial_dfs(); // just clean the memory of h. + h = polynomial_dfs_type(); // just clean the memory of h. 
g *= (preprocessed_data.q_last + preprocessed_data.q_blind) - one_polynomial;
                            F_dfs[2] = std::move(g);
                        } else {
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> parts;
+                            std::vector<polynomial_dfs_type> parts;
                            BOOST_ASSERT(part_sizes.size() == gs.size());
                            BOOST_ASSERT(part_sizes.size() == hs.size());
                            BOOST_ASSERT(part_sizes.size() == lookup_alphas.size() + 1);
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> reduced_gs(lookup_alphas.size());
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> reduced_hs(lookup_alphas.size());
+                            std::vector<polynomial_dfs_type> reduced_gs(lookup_alphas.size());
+                            std::vector<polynomial_dfs_type> reduced_hs(lookup_alphas.size());
                            parallel_for(0, lookup_alphas.size(), [this, &gs, &hs, &reduced_gs, &reduced_hs](std::size_t i) {
                                reduced_gs[i] = reduce_dfs_polynomial_domain(gs[i], basic_domain->m);
                                reduced_hs[i] = reduce_dfs_polynomial_domain(hs[i], basic_domain->m);
                            }, ThreadPool::PoolLevel::HIGH);
 
@@ -270,11 +271,11 @@
 
-                            math::polynomial_dfs<typename FieldType::value_type> current_poly = V_L;
-                            math::polynomial_dfs<typename FieldType::value_type> previous_poly = V_L;
+                            polynomial_dfs_type current_poly = V_L;
+                            polynomial_dfs_type previous_poly = V_L;
                            // We need to store all the values of current_poly. Suddenly this increases the RAM usage, but
                            // there's no other way to parallelize this loop.
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> all_polys(1, V_L);
+                            std::vector<polynomial_dfs_type> all_polys(1, V_L);
 
                            for (std::size_t i = 0; i < lookup_alphas.size(); ++i) {
 
@@ -287,19 +288,24 @@
                                all_polys.push_back(current_poly);
                                previous_poly = current_poly;
                            }
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> F_dfs_2_parts(lookup_alphas.size() + 1);
-                            parallel_for(0, lookup_alphas.size(),
-                                [&gs, &hs, &lookup_alphas, &all_polys, &F_dfs_2_parts](std::size_t i) {
-                                    auto &g = gs[i];
-                                    auto &h = hs[i];
-                                    F_dfs_2_parts[i] = lookup_alphas[i] * (all_polys[i] * g - all_polys[i + 1] * h);
+                            std::vector<polynomial_dfs_type> F_dfs_2_parts(
+                                std::thread::hardware_concurrency() + 1,
+                                polynomial_dfs_type::zero());
+                            wait_for_all(parallel_run_in_chunks_with_thread_id(
+                                lookup_alphas.size(),
+                                [&gs, &hs, &lookup_alphas, &all_polys, &F_dfs_2_parts]
+                                (std::size_t thread_id, std::size_t begin, std::size_t end) {
+                                    for (std::size_t i = begin; i < end; ++i) {
+                                        F_dfs_2_parts[thread_id] += (all_polys[i] * gs[i] - all_polys[i + 1] * hs[i]) * lookup_alphas[i];
+                                        // Save a bit of RAM by deleting gs[i] and hs[i]; we don't need them any more.
+                                        gs[i] = polynomial_dfs_type();
+                                        hs[i] = polynomial_dfs_type();
+                                    }
                                },
-                                ThreadPool::PoolLevel::HIGH);
+                                ThreadPool::PoolLevel::HIGH));
 
                            std::size_t last = lookup_alphas.size();
-                            auto &g = gs[last];
-                            auto &h = hs[last];
-                            F_dfs_2_parts[lookup_alphas.size()] = previous_poly * g - V_L_shifted * h;
+                            F_dfs_2_parts.back() = previous_poly * gs[last] - V_L_shifted * hs[last];
                            F_dfs[2] += polynomial_sum(std::move(F_dfs_2_parts));
                            F_dfs[2] *= (preprocessed_data.q_last + preprocessed_data.q_blind) - one_polynomial;
                        }
@@ -311,9 +317,11 @@
                            alpha_challenges[i] = transcript.template challenge<FieldType>();
                        }
 
-                        std::vector<math::polynomial_dfs<typename FieldType::value_type>> F_dfs_3_parts(std::next(sorted.begin(), 1), sorted.end());
+                        std::vector<polynomial_dfs_type> F_dfs_3_parts(std::next(sorted.begin(), 1), sorted.end());
                        parallel_for(0, F_dfs_3_parts.size(), [this, &F_dfs_3_parts, &alpha_challenges, &sorted](std::size_t i) {
-                            math::polynomial_dfs<typename FieldType::value_type> sorted_shifted = math::polynomial_shift(sorted[i], this->preprocessed_data.common_data.desc.usable_rows_amount, this->basic_domain->m);
+                            polynomial_dfs_type sorted_shifted = math::polynomial_shift(
+                                sorted[i], this->preprocessed_data.common_data.desc.usable_rows_amount,
+                                this->basic_domain->m);
                            F_dfs_3_parts[i] -= sorted_shifted;
                            F_dfs_3_parts[i] *= alpha_challenges[i] * preprocessed_data.common_data.lagrange_0;
                        }, ThreadPool::PoolLevel::HIGH);
@@ -324,9 +332,9 @@
                    };
                }
 
-                std::vector<math::polynomial_dfs<typename FieldType::value_type>> compute_gs(
-                    std::unique_ptr<std::vector<math::polynomial_dfs<typename FieldType::value_type>>> lookup_input_ptr,
-                    std::unique_ptr<std::vector<math::polynomial_dfs<typename FieldType::value_type>>> lookup_value_ptr,
+                std::vector<polynomial_dfs_type> compute_gs(
+                    std::unique_ptr<std::vector<polynomial_dfs_type>> lookup_input_ptr,
+                    std::unique_ptr<std::vector<polynomial_dfs_type>> lookup_value_ptr,
                    const typename FieldType::value_type& beta,
                    const typename FieldType::value_type& gamma,
                    const std::vector<std::size_t>& lookup_part_sizes
diff --git a/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp b/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
index 96fe37f5..32baf239 100644
--- a/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
+++ b/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
@@ -49,9 +49,9 @@
            // Divides work into chunks and makes calls to 'func' in parallel.
            template<class ReturnType>
-            std::vector<std::future<ReturnType>> parallel_run_in_chunks(
+            std::vector<std::future<ReturnType>> parallel_run_in_chunks_with_thread_id(
                std::size_t elements_count,
-                std::function<ReturnType(std::size_t, std::size_t)> func,
+                std::function<ReturnType(std::size_t, std::size_t, std::size_t)> func,
                ThreadPool::PoolLevel pool_id = ThreadPool::PoolLevel::LOW) {
 
                auto& thread_pool = ThreadPool::get_instance(pool_id);
@@ -73,14 +73,25 @@
                std::size_t begin = 0;
                for (std::size_t i = 0; i < workers_to_use; i++) {
                    auto end = begin + (elements_count - begin) / (workers_to_use - i);
-                    fut.emplace_back(thread_pool.post([begin, end, func]() {
-                        return func(begin, end);
+                    fut.emplace_back(thread_pool.post([i, begin, end, func]() {
+                        return func(i, begin, end);
                    }));
                    begin = end;
                }
                return fut;
            }
 
+            template<class ReturnType>
+            std::vector<std::future<ReturnType>> parallel_run_in_chunks(
+                std::size_t elements_count,
+                std::function<ReturnType(std::size_t, std::size_t)> func,
+                ThreadPool::PoolLevel pool_id = ThreadPool::PoolLevel::LOW) {
+                return parallel_run_in_chunks_with_thread_id(elements_count,
+                    [func](std::size_t thread_id, std::size_t begin, std::size_t end) -> ReturnType {
+                        return func(begin, end);
+                    }, pool_id);
+            }
+
            // Similar to std::transform, but in parallel. We return void here for better usability for our use cases.
            template
            void parallel_transform(InputIt1 first1, InputIt1 last1, InputIt2 first2,
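Note on the chunking helper introduced above: parallel_run_in_chunks_with_thread_id hands each worker a contiguous [begin, end) range plus its thread_id, which is what lets the lookup argument accumulate into a single F_dfs_2_parts[thread_id] slot per thread instead of allocating one polynomial per constraint, with no locking. The sketch below is a self-contained illustration of that pattern using only the standard library; the function name run_in_chunks_with_thread_id, the use of std::async in place of the project's ThreadPool, and the integer payload are illustrative assumptions, not the library's actual API.

// Standalone sketch of the chunked-dispatch pattern (illustrative only; uses
// std::async instead of the project's ThreadPool, and plain integers instead
// of dfs polynomials).
#include <algorithm>
#include <cstddef>
#include <functional>
#include <future>
#include <numeric>
#include <thread>
#include <vector>

std::vector<std::future<void>> run_in_chunks_with_thread_id(
        std::size_t elements_count,
        std::function<void(std::size_t, std::size_t, std::size_t)> func,
        std::size_t workers_to_use) {
    std::vector<std::future<void>> futures;
    std::size_t begin = 0;
    for (std::size_t i = 0; i < workers_to_use; ++i) {
        // Same split rule as in the patch: divide the remaining elements evenly
        // over the remaining workers, so chunk sizes differ by at most one.
        std::size_t end = begin + (elements_count - begin) / (workers_to_use - i);
        futures.emplace_back(std::async(std::launch::async, func, i, begin, end));
        begin = end;
    }
    return futures;
}

int main() {
    const std::size_t n = 1000;
    const std::size_t workers = std::max<std::size_t>(1, std::thread::hardware_concurrency());
    std::vector<long> data(n, 1);

    // One accumulator slot per worker, analogous to F_dfs_2_parts[thread_id]:
    // each thread only ever writes its own slot, so no synchronization is needed.
    std::vector<long> partial(workers, 0);

    auto futures = run_in_chunks_with_thread_id(
        n,
        [&](std::size_t thread_id, std::size_t begin, std::size_t end) {
            for (std::size_t j = begin; j < end; ++j)
                partial[thread_id] += data[j];
        },
        workers);

    for (auto& f : futures)
        f.get();  // analogue of wait_for_all

    long total = std::accumulate(partial.begin(), partial.end(), 0L);
    return total == static_cast<long>(n) ? 0 : 1;
}

Keeping one partial result per thread rather than one per element is also what the second commit's message ("Optimize memory consumption a bit") refers to: the number of live temporaries is bounded by the worker count, not by the number of lookup constraints.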