From d855e346c86e31e750cd38f3cd247f0ce0cb616b Mon Sep 17 00:00:00 2001 From: Martun Karapetyan Date: Mon, 29 Jul 2024 11:00:52 +0000 Subject: [PATCH 1/2] Parallelize gate argument outside the expression tree. --- .../plonk/placeholder/gates_argument.hpp | 109 ++++++------------ 1 file changed, 38 insertions(+), 71 deletions(-) diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp index 37ee0ec1..68a8c9cc 100644 --- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp +++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp @@ -77,18 +77,18 @@ namespace nil { constexpr static const std::size_t argument_size = 1; static inline void build_variable_value_map( - const math::expression& expr, + const math::expression& expr, const plonk_polynomial_dfs_table &assignments, std::shared_ptr> domain, std::size_t extended_domain_size, - std::unordered_map& variable_values_out) { + std::unordered_map& variable_values_out) { - std::unordered_map variable_counts; + std::unordered_map variable_counts; - std::vector variables; + std::vector variables; - math::expression_for_each_variable_visitor visitor( - [&variable_counts, &variables, &variable_values_out](const polynomial_dfs_variable_type& var) { + math::expression_for_each_variable_visitor visitor( + [&variable_counts, &variables, &variable_values_out](const variable_type& var) { // Create the structure of the map so we can change the values later. if (variable_counts[var] == 0) { variables.push_back(var); @@ -107,12 +107,14 @@ namespace nil { parallel_for(0, variables.size(), [&variables, &variable_values_out, &assignments, &domain, &extended_domain, extended_domain_size](std::size_t i) { - const auto& var = variables[i]; - // We may have variable values in required sizes in some cases. - if (variable_values_out[var].size() == extended_domain_size) - return; + const variable_type& var = variables[i]; - polynomial_dfs_type assignment = assignments.get_variable_value(var, domain); + // Convert the variable to polynomial_dfs variable type. + polynomial_dfs_variable_type var_dfs(var.index, var.rotation, var.relative, + static_cast( + static_cast(var.type))); + + polynomial_dfs_type assignment = assignments.get_variable_value(var_dfs, domain); // In parallel version we always resize the assignment poly, it's better for parallelization. // if (count > 1) { @@ -152,11 +154,7 @@ namespace nil { degree_limits.push_back(max_degree / 2); extended_domain_sizes.push_back(max_domain_size / 2); - std::vector> expressions(extended_domain_sizes.size()); - - // Only in parallel version we store the subexpressions of each expression and ignore the cache. - std::vector>> subexpressions(extended_domain_sizes.size()); - + std::vector> expressions(extended_domain_sizes.size()); auto theta_acc = FieldType::value_type::one(); // Every constraint has variable type 'variable_type', but we want it to use @@ -170,28 +168,10 @@ namespace nil { const auto& gates = constraint_system.gates(); for (const auto& gate: gates) { - std::vector> gate_results(extended_domain_sizes.size()); - - // We will split gates into parts especially for zkEVM circuit, since there is only 1 large gate with - // 683 constraints. Will split it into 24 parts, ~32 constraints each. - // This will mean our code will multiply by selector 16 times, instead of just once. 
But this is - // much better that losing parallelization. We do not want to re-write the whole code to try parallelize - // each gate compatation separately. This will not harm circuits with smaller number of terms much. - std::vector> gate_parts(extended_domain_sizes.size()); - std::vector gate_parts_constaint_counts(extended_domain_sizes.size()); - - - // This parameter can be tuned based on the circuit and the number of cores of the server on which the proofs - // are generated. On the current zkEVM circuit this value is optimal based on experiments. - const std::size_t constraint_limit = 16; - - - auto selector = polynomial_dfs_variable_type( - gate.selector_index, 0, false, polynomial_dfs_variable_type::column_type::selector); - + std::vector> gate_results(extended_domain_sizes.size()); for (std::size_t constraint_idx = 0; constraint_idx < gate.constraints.size(); ++constraint_idx) { const auto& constraint = gate.constraints[constraint_idx]; - auto next_term = converter.convert(constraint) * value_type_to_polynomial_dfs(theta_acc); + auto next_term = constraint * theta_acc; theta_acc *= theta; // +1 stands for the selector multiplication. @@ -200,57 +180,44 @@ namespace nil { // Whatever the degree of term is, add it to the maximal degree expression. if (degree_limits[i] >= constraint_degree || i == 0) { gate_results[i] += next_term; - gate_parts[i] += next_term; - gate_parts_constaint_counts[i]++; - - // If we already have constraint_limit constaints in the gate_parts[i], add it to the 'subexpressions'. - if (gate_parts_constaint_counts[i] == constraint_limit) { - subexpressions[i].push_back(gate_parts[i] * selector); - gate_parts[i] = math::expression(); - gate_parts_constaint_counts[i] = 0; - } break; } - } } - + auto selector = variable_type( + gate.selector_index, 0, false, variable_type::column_type::selector); for (size_t i = 0; i < extended_domain_sizes.size(); ++i) { - // Only in parallel version we store the subexpressions of each expression and ignore the cache. expressions[i] += gate_results[i] * selector; - if (gate_parts_constaint_counts[i] != 0) - subexpressions[i].push_back(gate_parts[i] * selector); } } std::array F; - std::vector F_0_parts(extended_domain_sizes.size()); - parallel_for(0, extended_domain_sizes.size(), - [&subexpressions, &extended_domain_sizes, &F_0_parts, &original_domain, &column_polynomials, &expressions](std::size_t i) { - std::unordered_map variable_values; + F[0] = polynomial_dfs_type::zero(); + for (std::size_t i = 0; i < extended_domain_sizes.size(); ++i) { + std::unordered_map variable_values; build_variable_value_map(expressions[i], column_polynomials, original_domain, extended_domain_sizes[i], variable_values); - std::vector subvalues(subexpressions[i].size()); - parallel_for(0, subexpressions[i].size(), - [&subexpressions, &variable_values, &extended_domain_sizes, &subvalues, i](std::size_t subexpression_index) { - // Only in parallel version we store the subexpressions of each expression and ignore the cache, - // not using "cached_expression_evaluator". 
- math::expression_evaluator evaluator( - subexpressions[i][subexpression_index], - [&assignments=variable_values, domain_size=extended_domain_sizes[i]] - (const polynomial_dfs_variable_type &var) -> const polynomial_dfs_type& { - return assignments[var]; - }); - subvalues[subexpression_index] = evaluator.evaluate(); - }, ThreadPool::PoolLevel::HIGH); + polynomial_dfs_type result(extended_domain_sizes[i] - 1, extended_domain_sizes[i]); + wait_for_all(parallel_run_in_chunks( + extended_domain_sizes[i], + [&variable_values, &extended_domain_sizes, &result, &expressions, i] + (std::size_t begin, std::size_t end) { + for (std::size_t j = begin; j < end; ++j) { + math::expression_evaluator evaluator( + expressions[i], + [&assignments=variable_values, j] + (const variable_type &var) -> const typename FieldType::value_type& { + return assignments[var][j]; + }); + result[j] = evaluator.evaluate(); + } + }, ThreadPool::PoolLevel::HIGH)); - F_0_parts[i] = polynomial_sum(std::move(subvalues)); - }, ThreadPool::PoolLevel::LASTPOOL); - - F[0] += polynomial_sum(std::move(F_0_parts)); + F[0] += result; + }; F[0] *= mask_polynomial; return F; } From 45c07b8cd01847b8f80e616478e9cf9b9d21829c Mon Sep 17 00:00:00 2001 From: Martun Karapetyan Date: Tue, 30 Jul 2024 15:22:47 +0000 Subject: [PATCH 2/2] Optimize memory consumption a bit. --- .../plonk/placeholder/gates_argument.hpp | 2 + .../plonk/placeholder/lookup_argument.hpp | 78 ++++++++++--------- .../nil/actor/core/parallelization_utils.hpp | 19 ++++- 3 files changed, 60 insertions(+), 39 deletions(-) diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp index 68a8c9cc..a7555f86 100644 --- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp +++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp @@ -206,6 +206,8 @@ namespace nil { [&variable_values, &extended_domain_sizes, &result, &expressions, i] (std::size_t begin, std::size_t end) { for (std::size_t j = begin; j < end; ++j) { + // Don't use cache here. In practice it's slower to maintain the cache + // than to re-compute the subexpression value when value type is field element. 
math::expression_evaluator evaluator( expressions[i], [&assignments=variable_values, j] diff --git a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp index dd97f891..b0544b92 100644 --- a/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp +++ b/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/lookup_argument.hpp @@ -31,6 +31,7 @@ #define CRYPTO3_ZK_PLONK_PLACEHOLDER_LOOKUP_ARGUMENT_HPP #include +#include #include #include @@ -126,7 +127,7 @@ namespace nil { public: struct prover_lookup_result { - std::array, argument_size> F_dfs; + std::array F_dfs; typename commitment_scheme_type::commitment_type lookup_commitment; }; @@ -157,31 +158,31 @@ namespace nil { PROFILE_PLACEHOLDER_SCOPE("Lookup argument prove eval time"); // Construct lookup gates - math::polynomial_dfs one_polynomial( + polynomial_dfs_type one_polynomial( 0, basic_domain->m, FieldType::value_type::one()); - math::polynomial_dfs zero_polynomial( + polynomial_dfs_type zero_polynomial( 0, basic_domain->m, FieldType::value_type::zero()); - math::polynomial_dfs mask_assignment = + polynomial_dfs_type mask_assignment = one_polynomial - preprocessed_data.q_last - preprocessed_data.q_blind; - std::unique_ptr>> lookup_value_ptr = + std::unique_ptr> lookup_value_ptr = prepare_lookup_value(mask_assignment); auto& lookup_value = *lookup_value_ptr; - std::unique_ptr>> lookup_input_ptr = + std::unique_ptr> lookup_input_ptr = prepare_lookup_input(); auto& lookup_input = *lookup_input_ptr; // 3. Lookup_input and lookup_value are ready // Now sort them! // Reduce value and input: - auto reduced_value_ptr = std::make_unique>>(); + auto reduced_value_ptr = std::make_unique>(); auto& reduced_value = *reduced_value_ptr; for( std::size_t i = 0; i < lookup_value.size(); i++ ){ reduced_value.push_back(reduce_dfs_polynomial_domain(lookup_value[i], basic_domain->m)); } - auto reduced_input_ptr = std::make_unique>>(); + auto reduced_input_ptr = std::make_unique>(); auto& reduced_input = *reduced_input_ptr; for( std::size_t i = 0; i < lookup_input.size(); i++ ){ reduced_input.push_back(reduce_dfs_polynomial_domain(lookup_input[i], basic_domain->m)); @@ -207,7 +208,7 @@ namespace nil { lookup_alphas.push_back(transcript.template challenge()); } - math::polynomial_dfs V_L = compute_V_L( + polynomial_dfs_type V_L = compute_V_L( sorted, reduced_input, reduced_value, beta, gamma); // We don't use reduced_input and reduced_value after this line. @@ -220,18 +221,18 @@ namespace nil { BOOST_ASSERT(std::accumulate(part_sizes.begin(), part_sizes.end(), 0) == sorted.size()); // Compute gs and hs products for each part - std::vector> gs = compute_gs( + std::vector gs = compute_gs( std::move(lookup_input_ptr), std::move(lookup_value_ptr), beta, gamma, part_sizes ); - std::vector> hs = compute_hs( + std::vector hs = compute_hs( sorted, beta, gamma, part_sizes ); - math::polynomial_dfs V_L_shifted = + polynomial_dfs_type V_L_shifted = math::polynomial_shift(V_L, 1, basic_domain->m); - std::array, argument_size> F_dfs; + std::array F_dfs; F_dfs[0] = preprocessed_data.common_data.lagrange_0 * (one_polynomial - V_L); F_dfs[1] = preprocessed_data.q_last * ( V_L * V_L - V_L ); @@ -245,16 +246,16 @@ namespace nil { g *= V_L; h *= V_L_shifted; g -= h; - h = math::polynomial_dfs(); // just clean the memory of h. + h = polynomial_dfs_type(); // just clean the memory of h. 
g *= (preprocessed_data.q_last + preprocessed_data.q_blind) - one_polynomial;
                            F_dfs[2] = std::move(g);
                        } else {
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> parts;
+                            std::vector<polynomial_dfs_type> parts;
                            BOOST_ASSERT(part_sizes.size() == gs.size());
                            BOOST_ASSERT(part_sizes.size() == hs.size());
                            BOOST_ASSERT(part_sizes.size() == lookup_alphas.size() + 1);
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> reduced_gs(lookup_alphas.size());
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> reduced_hs(lookup_alphas.size());
+                            std::vector<polynomial_dfs_type> reduced_gs(lookup_alphas.size());
+                            std::vector<polynomial_dfs_type> reduced_hs(lookup_alphas.size());
                            parallel_for(0, lookup_alphas.size(), [this, &gs, &hs, &reduced_gs, &reduced_hs](std::size_t i) {
                                reduced_gs[i] = reduce_dfs_polynomial_domain(gs[i], basic_domain->m);
                                reduced_hs[i] = reduce_dfs_polynomial_domain(hs[i], basic_domain->m);
                            }, ThreadPool::PoolLevel::HIGH);
 
@@ -270,11 +271,11 @@
 
-                            math::polynomial_dfs<typename FieldType::value_type> current_poly = V_L;
-                            math::polynomial_dfs<typename FieldType::value_type> previous_poly = V_L;
+                            polynomial_dfs_type current_poly = V_L;
+                            polynomial_dfs_type previous_poly = V_L;
                            // We need to store all the values of current_poly. Suddenly this increases the RAM usage, but
                            // there's no other way to parallelize this loop.
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> all_polys(1, V_L);
+                            std::vector<polynomial_dfs_type> all_polys(1, V_L);
 
                            for (std::size_t i = 0; i < lookup_alphas.size(); ++i) {
 
@@ -287,19 +288,24 @@
                                all_polys.push_back(current_poly);
                                previous_poly = current_poly;
                            }
-                            std::vector<math::polynomial_dfs<typename FieldType::value_type>> F_dfs_2_parts(lookup_alphas.size() + 1);
-                            parallel_for(0, lookup_alphas.size(),
-                                [&gs, &hs, &lookup_alphas, &all_polys, &F_dfs_2_parts](std::size_t i) {
-                                    auto &g = gs[i];
-                                    auto &h = hs[i];
-                                    F_dfs_2_parts[i] = lookup_alphas[i] * (all_polys[i] * g - all_polys[i + 1] * h);
+                            std::vector<polynomial_dfs_type> F_dfs_2_parts(
+                                std::thread::hardware_concurrency() + 1,
+                                polynomial_dfs_type::zero());
+                            wait_for_all(parallel_run_in_chunks_with_thread_id(
+                                lookup_alphas.size(),
+                                [&gs, &hs, &lookup_alphas, &all_polys, &F_dfs_2_parts]
+                                (std::size_t thread_id, std::size_t begin, std::size_t end) {
+                                    for (std::size_t i = begin; i < end; ++i) {
+                                        F_dfs_2_parts[thread_id] += (all_polys[i] * gs[i] - all_polys[i + 1] * hs[i]) * lookup_alphas[i];
+                                        // Save a bit of RAM by deleting gs[i] and hs[i]; we don't need them any more.
+                                        gs[i] = polynomial_dfs_type();
+                                        hs[i] = polynomial_dfs_type();
+                                    }
                                },
-                                ThreadPool::PoolLevel::HIGH);
+                                ThreadPool::PoolLevel::HIGH));
 
                            std::size_t last = lookup_alphas.size();
-                            auto &g = gs[last];
-                            auto &h = hs[last];
-                            F_dfs_2_parts[lookup_alphas.size()] = previous_poly * g - V_L_shifted * h;
+                            F_dfs_2_parts.back() = previous_poly * gs[last] - V_L_shifted * hs[last];
                            F_dfs[2] += polynomial_sum(std::move(F_dfs_2_parts));
                            F_dfs[2] *= (preprocessed_data.q_last + preprocessed_data.q_blind) - one_polynomial;
                        }
@@ -311,9 +317,11 @@
                            alpha_challenges[i] = transcript.template challenge<FieldType>();
                        }
 
-                        std::vector<math::polynomial_dfs<typename FieldType::value_type>> F_dfs_3_parts(std::next(sorted.begin(), 1), sorted.end());
+                        std::vector<polynomial_dfs_type> F_dfs_3_parts(std::next(sorted.begin(), 1), sorted.end());
                        parallel_for(0, F_dfs_3_parts.size(), [this, &F_dfs_3_parts, &alpha_challenges, &sorted](std::size_t i) {
-                            math::polynomial_dfs<typename FieldType::value_type> sorted_shifted = math::polynomial_shift(sorted[i], this->preprocessed_data.common_data.desc.usable_rows_amount, this->basic_domain->m);
+                            polynomial_dfs_type sorted_shifted = math::polynomial_shift(
+                                sorted[i], this->preprocessed_data.common_data.desc.usable_rows_amount,
+                                this->basic_domain->m);
                            F_dfs_3_parts[i] -= sorted_shifted;
                            F_dfs_3_parts[i] *= alpha_challenges[i] * preprocessed_data.common_data.lagrange_0;
                        }, ThreadPool::PoolLevel::HIGH);
@@ -324,9 +332,9 @@
                    };
                }
 
-                std::vector<math::polynomial_dfs<typename FieldType::value_type>> compute_gs(
-                    std::unique_ptr<std::vector<math::polynomial_dfs<typename FieldType::value_type>>> lookup_input_ptr,
-                    std::unique_ptr<std::vector<math::polynomial_dfs<typename FieldType::value_type>>> lookup_value_ptr,
+                std::vector<polynomial_dfs_type> compute_gs(
+                    std::unique_ptr<std::vector<polynomial_dfs_type>> lookup_input_ptr,
+                    std::unique_ptr<std::vector<polynomial_dfs_type>> lookup_value_ptr,
                    const typename FieldType::value_type& beta,
                    const typename FieldType::value_type& gamma,
                    const std::vector<std::size_t>& lookup_part_sizes
diff --git a/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp b/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
index 96fe37f5..32baf239 100644
--- a/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
+++ b/libs/parallelization-utils/include/nil/actor/core/parallelization_utils.hpp
@@ -49,9 +49,9 @@
            // Divides work into chunks and makes calls to 'func' in parallel.
            template<class ReturnType>
-            std::vector<std::future<ReturnType>> parallel_run_in_chunks(
+            std::vector<std::future<ReturnType>> parallel_run_in_chunks_with_thread_id(
                std::size_t elements_count,
-                std::function<ReturnType(std::size_t, std::size_t)> func,
+                std::function<ReturnType(std::size_t, std::size_t, std::size_t)> func,
                ThreadPool::PoolLevel pool_id = ThreadPool::PoolLevel::LOW) {
 
                auto& thread_pool = ThreadPool::get_instance(pool_id);
@@ -73,14 +73,25 @@
                std::size_t begin = 0;
                for (std::size_t i = 0; i < workers_to_use; i++) {
                    auto end = begin + (elements_count - begin) / (workers_to_use - i);
-                    fut.emplace_back(thread_pool.post([begin, end, func]() {
-                        return func(begin, end);
+                    fut.emplace_back(thread_pool.post([i, begin, end, func]() {
+                        return func(i, begin, end);
                    }));
                    begin = end;
                }
                return fut;
            }
 
+            template<class ReturnType>
+            std::vector<std::future<ReturnType>> parallel_run_in_chunks(
+                std::size_t elements_count,
+                std::function<ReturnType(std::size_t, std::size_t)> func,
+                ThreadPool::PoolLevel pool_id = ThreadPool::PoolLevel::LOW) {
+                return parallel_run_in_chunks_with_thread_id(elements_count,
+                    [func](std::size_t thread_id, std::size_t begin, std::size_t end) -> ReturnType {
+                        return func(begin, end);
+                    }, pool_id);
+            }
+
            // Similar to std::transform, but in parallel. We return void here for better usability for our use cases.
            template
            void parallel_transform(InputIt1 first1, InputIt1 last1, InputIt2 first2,
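Note on the chunking helper introduced above: parallel_run_in_chunks_with_thread_id hands each worker a contiguous [begin, end) range plus its thread_id, which is what lets the lookup argument accumulate into a single F_dfs_2_parts[thread_id] slot per thread instead of allocating one polynomial per constraint, with no locking. The sketch below is a self-contained illustration of that pattern using only the standard library; the function name run_in_chunks_with_thread_id, the use of std::async in place of the project's ThreadPool, and the integer payload are illustrative assumptions, not the library's actual API.

// Standalone sketch of the chunked-dispatch pattern (illustrative only; uses
// std::async instead of the project's ThreadPool, and plain integers instead
// of dfs polynomials).
#include <algorithm>
#include <cstddef>
#include <functional>
#include <future>
#include <numeric>
#include <thread>
#include <vector>

std::vector<std::future<void>> run_in_chunks_with_thread_id(
        std::size_t elements_count,
        std::function<void(std::size_t, std::size_t, std::size_t)> func,
        std::size_t workers_to_use) {
    std::vector<std::future<void>> futures;
    std::size_t begin = 0;
    for (std::size_t i = 0; i < workers_to_use; ++i) {
        // Same split rule as in the patch: divide the remaining elements evenly
        // over the remaining workers, so chunk sizes differ by at most one.
        std::size_t end = begin + (elements_count - begin) / (workers_to_use - i);
        futures.emplace_back(std::async(std::launch::async, func, i, begin, end));
        begin = end;
    }
    return futures;
}

int main() {
    const std::size_t n = 1000;
    const std::size_t workers = std::max<std::size_t>(1, std::thread::hardware_concurrency());
    std::vector<long> data(n, 1);

    // One accumulator slot per worker, analogous to F_dfs_2_parts[thread_id]:
    // each thread only ever writes its own slot, so no synchronization is needed.
    std::vector<long> partial(workers, 0);

    auto futures = run_in_chunks_with_thread_id(
        n,
        [&](std::size_t thread_id, std::size_t begin, std::size_t end) {
            for (std::size_t j = begin; j < end; ++j)
                partial[thread_id] += data[j];
        },
        workers);

    for (auto& f : futures)
        f.get();  // analogue of wait_for_all

    long total = std::accumulate(partial.begin(), partial.end(), 0L);
    return total == static_cast<long>(n) ? 0 : 1;
}

Keeping one partial result per thread rather than one per element is also what the second commit's message ("Optimize memory consumption a bit") refers to: the number of live temporaries is bounded by the worker count, not by the number of lookup constraints.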