NilFoundation · martun · Jul 30, 2024 · Jul 29, 2024 · Jul 30, 2024
diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp
@@ -77,18 +77,18 @@ namespace nil {
                     constexpr static const std::size_t argument_size = 1;
 
                     static inline void build_variable_value_map(
-                        const math::expression<polynomial_dfs_variable_type>& expr,
+                        const math::expression<variable_type>& expr,
                         const plonk_polynomial_dfs_table<FieldType> &assignments,
                         std::shared_ptr<math::evaluation_domain<FieldType>> domain,
                         std::size_t extended_domain_size,
-                        std::unordered_map<polynomial_dfs_variable_type, polynomial_dfs_type>& variable_values_out) {
+                        std::unordered_map<variable_type, polynomial_dfs_type>& variable_values_out) {
 
-                        std::unordered_map<polynomial_dfs_variable_type, size_t> variable_counts;
+                        std::unordered_map<variable_type, size_t> variable_counts;
 
-                        std::vector<polynomial_dfs_variable_type> variables;
+                        std::vector<variable_type> variables;
 
-                        math::expression_for_each_variable_visitor<polynomial_dfs_variable_type> visitor(
-                            [&variable_counts, &variables, &variable_values_out](const polynomial_dfs_variable_type& var) {
+                        math::expression_for_each_variable_visitor<variable_type> visitor(
+                            [&variable_counts, &variables, &variable_values_out](const variable_type& var) {
                                 // Create the structure of the map so we can change the values later.
                                 if (variable_counts[var] == 0) {
                                     variables.push_back(var);
@@ -107,12 +107,14 @@ namespace nil {
 
                         parallel_for(0, variables.size(),
                             [&variables, &variable_values_out, &assignments, &domain, &extended_domain, extended_domain_size](std::size_t i) {
-                                const auto& var = variables[i];
-                                // We may have variable values in required sizes in some cases.
-                                if (variable_values_out[var].size() == extended_domain_size)
-                                    return;
+                                const variable_type& var = variables[i];
 
-                                polynomial_dfs_type assignment = assignments.get_variable_value(var, domain);
+                                // Convert the variable to the polynomial_dfs variable type.
+                                polynomial_dfs_variable_type var_dfs(var.index, var.rotation, var.relative,
+                                    static_cast<typename polynomial_dfs_variable_type::column_type>(
+                                        static_cast<std::uint8_t>(var.type)));
+
+                                polynomial_dfs_type assignment = assignments.get_variable_value(var_dfs, domain);
 
                                 // In parallel version we always resize the assignment poly, it's better for parallelization.
                                 // if (count > 1) {
@@ -152,11 +154,7 @@ namespace nil {
                         degree_limits.push_back(max_degree / 2);
                         extended_domain_sizes.push_back(max_domain_size / 2);
 
-                        std::vector<math::expression<polynomial_dfs_variable_type>> expressions(extended_domain_sizes.size());
-
-                        // Only in parallel version we store the subexpressions of each expression and ignore the cache.
-                        std::vector<std::vector<math::expression<polynomial_dfs_variable_type>>> subexpressions(extended_domain_sizes.size());
-
+                        std::vector<math::expression<variable_type>> expressions(extended_domain_sizes.size());
                         auto theta_acc = FieldType::value_type::one();
 
                         // Every constraint has variable type 'variable_type', but we want it to use
@@ -170,28 +168,10 @@ namespace nil {
                         const auto& gates = constraint_system.gates();
 
                         for (const auto& gate: gates) {
-                            std::vector<math::expression<polynomial_dfs_variable_type>> gate_results(extended_domain_sizes.size());
-
-                            // We will split gates into parts especially for zkEVM circuit, since there is only 1 large gate with
-                            // 683 constraints. Will split it into 24 parts, ~32 constraints each.
-                            // This will mean our code will multiply by selector 16 times, instead of just once. But this is 
-                            // much better that losing parallelization. We do not want to re-write the whole code to try parallelize
-                            // each gate compatation separately. This will not harm circuits with smaller number of terms much.
-                            std::vector<math::expression<polynomial_dfs_variable_type>> gate_parts(extended_domain_sizes.size());
-                            std::vector<std::size_t> gate_parts_constaint_counts(extended_domain_sizes.size());
-
-
-                            // This parameter can be tuned based on the circuit and the number of cores of the server on which the proofs
-                            // are generated. On the current zkEVM circuit this value is optimal based on experiments.
-                            const std::size_t constraint_limit = 16;
-
-
-                            auto selector = polynomial_dfs_variable_type(
-                                gate.selector_index, 0, false, polynomial_dfs_variable_type::column_type::selector);
-
+                            std::vector<math::expression<variable_type>> gate_results(extended_domain_sizes.size());
                             for (std::size_t constraint_idx = 0; constraint_idx < gate.constraints.size(); ++constraint_idx) {
                                 const auto& constraint = gate.constraints[constraint_idx];
-                                auto next_term = converter.convert(constraint) * value_type_to_polynomial_dfs(theta_acc);
+                                auto next_term = constraint * theta_acc;
 
                                 theta_acc *= theta;
                                 // +1 stands for the selector multiplication.
@@ -200,57 +180,46 @@ namespace nil {
                                     // Whatever the degree of term is, add it to the maximal degree expression.
                                     if (degree_limits[i] >= constraint_degree || i == 0) {
                                         gate_results[i] += next_term;
-                                        gate_parts[i] += next_term;
-                                        gate_parts_constaint_counts[i]++;
-
-                                        // If we already have constraint_limit constaints in the gate_parts[i], add it to the 'subexpressions'.
-                                        if (gate_parts_constaint_counts[i] == constraint_limit) {
-                                            subexpressions[i].push_back(gate_parts[i] * selector);
-                                            gate_parts[i] = math::expression<polynomial_dfs_variable_type>();
-                                            gate_parts_constaint_counts[i] = 0;
-                                        }
                                         break;
                                     }
-
                                 }
                             }
-
+                            auto selector = variable_type(
+                                gate.selector_index, 0, false, variable_type::column_type::selector);
                             for (size_t i = 0; i < extended_domain_sizes.size(); ++i) {
-                                // Only in parallel version we store the subexpressions of each expression and ignore the cache.
                                 expressions[i] += gate_results[i] * selector;
-                                if (gate_parts_constaint_counts[i] != 0)
-                                    subexpressions[i].push_back(gate_parts[i] * selector);
                             }
                         }
 
                         std::array<polynomial_dfs_type, argument_size> F;
 
-                        std::vector<polynomial_dfs_type> F_0_parts(extended_domain_sizes.size());
-                        parallel_for(0, extended_domain_sizes.size(),
-                                [&subexpressions, &extended_domain_sizes, &F_0_parts, &original_domain, &column_polynomials, &expressions](std::size_t i) {
-                            std::unordered_map<polynomial_dfs_variable_type, polynomial_dfs_type> variable_values;
+                        F[0] = polynomial_dfs_type::zero();
+                        for (std::size_t i = 0; i < extended_domain_sizes.size(); ++i) {
+                            std::unordered_map<variable_type, polynomial_dfs_type> variable_values;
 
                             build_variable_value_map(expressions[i], column_polynomials, original_domain,
                                 extended_domain_sizes[i], variable_values);
 
-                            std::vector<polynomial_dfs_type> subvalues(subexpressions[i].size());
-                            parallel_for(0, subexpressions[i].size(),
-                                [&subexpressions, &variable_values, &extended_domain_sizes, &subvalues, i](std::size_t subexpression_index) {
-                                // Only in parallel version we store the subexpressions of each expression and ignore the cache,
-                                // not using "cached_expression_evaluator".
-                                math::expression_evaluator<polynomial_dfs_variable_type> evaluator(
-                                    subexpressions[i][subexpression_index], 
-                                    [&assignments=variable_values, domain_size=extended_domain_sizes[i]]
-                                        (const polynomial_dfs_variable_type &var) -> const polynomial_dfs_type& {
-                                            return assignments[var];
-                                    });
-                                subvalues[subexpression_index] = evaluator.evaluate(); 
-                            }, ThreadPool::PoolLevel::HIGH);
+                            polynomial_dfs_type result(extended_domain_sizes[i] - 1, extended_domain_sizes[i]);
+                            wait_for_all(parallel_run_in_chunks<void>(
+                                extended_domain_sizes[i],
+                                [&variable_values, &extended_domain_sizes, &result, &expressions, i]
+                                (std::size_t begin, std::size_t end) {
+                                    for (std::size_t j = begin; j < end; ++j) {
+                                        // Don't use the cache here. In practice it's slower to maintain the cache
+                                        // than to re-compute the subexpression value when the value type is a field element.
+                                        math::expression_evaluator<variable_type> evaluator(
+                                            expressions[i], 
+                                            [&assignments=variable_values, j]
+                                                (const variable_type &var) -> const typename FieldType::value_type& {
+                                                    return assignments[var][j];
+                                            });
+                                        result[j] = evaluator.evaluate();
+                                    }
+                            }, ThreadPool::PoolLevel::HIGH));
 
-                            F_0_parts[i] = polynomial_sum<FieldType>(std::move(subvalues));
-                        }, ThreadPool::PoolLevel::LASTPOOL);
-
-                        F[0] += polynomial_sum<FieldType>(std::move(F_0_parts));
+                            F[0] += result;
+                        };
                         F[0] *= mask_polynomial;
                         return F;
                     }