From 3a667ebcf0d889f10719e4a4f8267cd59869d6bf Mon Sep 17 00:00:00 2001 From: Iluvmagick Date: Mon, 30 Sep 2024 17:15:07 +0400 Subject: [PATCH] WIP SYCL parallelization --- parallel-crypto3/CMakeLists.txt | 4 +- .../detail/basic_radix2_domain_aux.hpp | 13 ++-- .../parallel-math/test/polynomial_dfs.cpp | 2 +- .../actor/core/sycl_parallelization_utils.hpp | 75 +++++++++++++++++++ parallel-crypto3/parallel-crypto3.nix | 6 +- 5 files changed, 89 insertions(+), 11 deletions(-) create mode 100644 parallel-crypto3/libs/parallelization-utils/include/nil/actor/core/sycl_parallelization_utils.hpp diff --git a/parallel-crypto3/CMakeLists.txt b/parallel-crypto3/CMakeLists.txt index 3921bd0223..97844f345d 100644 --- a/parallel-crypto3/CMakeLists.txt +++ b/parallel-crypto3/CMakeLists.txt @@ -18,12 +18,14 @@ if (${FORCE_COLORED_OUTPUT}) endif () endif () +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + # The file compile_commands.json is generated in build directory, so LSP could # pick it up and guess all include paths, defines and other stuff. # If Nix is used, LSP could not guess the locations of implicit include # directories, so we need to include them explicitly. if(CMAKE_EXPORT_COMPILE_COMMANDS) - set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES + set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES ${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}) endif() diff --git a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/detail/basic_radix2_domain_aux.hpp b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/detail/basic_radix2_domain_aux.hpp index 7a6f9d6162..9244a37571 100644 --- a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/detail/basic_radix2_domain_aux.hpp +++ b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/detail/basic_radix2_domain_aux.hpp @@ -24,8 +24,7 @@ // SOFTWARE. //---------------------------------------------------------------------------// -#ifndef CRYPTO3_MATH_BASIC_RADIX2_DOMAIN_AUX_HPP -#define CRYPTO3_MATH_BASIC_RADIX2_DOMAIN_AUX_HPP +#pragma once #include #include @@ -38,6 +37,7 @@ #include #include +#include namespace nil { namespace crypto3 { @@ -83,7 +83,7 @@ namespace nil { // swapping in place (from Storer's book) // We can parallelize this look, since k and rk are pairs, they will never intersect. - nil::crypto3::parallel_for(0, n, + sycl_parallel_for(0, n, [logn, &a](std::size_t k) { const std::size_t rk = crypto3::math::detail::bitreverse(k, logn); if (k < rk) @@ -100,7 +100,7 @@ namespace nil { // Here we can parallelize on the both loops with 'k' and 'm', because for each value of k and m // the ranges of array 'a' used do not intersect. Think of these 2 loops as 1. - wait_for_all(parallel_run_in_chunks( + sycl_run_in_chunks( m * count_k, [&a, m, count_k, inc, &omega_cache](std::size_t begin, std::size_t end) { size_t current_index = begin; @@ -124,8 +124,7 @@ namespace nil { return; } } - }, ThreadPool::PoolLevel::LOW - )); + })); } } @@ -209,5 +208,3 @@ namespace nil { } // namespace fft } // namespace crypto3 } // namespace nil - -#endif // ALGEBRA_FFT_BASIC_RADIX2_DOMAIN_AUX_HPP diff --git a/parallel-crypto3/libs/parallel-math/test/polynomial_dfs.cpp b/parallel-crypto3/libs/parallel-math/test/polynomial_dfs.cpp index b3a1ab59b8..efc6fdbdaf 100644 --- a/parallel-crypto3/libs/parallel-math/test/polynomial_dfs.cpp +++ b/parallel-crypto3/libs/parallel-math/test/polynomial_dfs.cpp @@ -1397,7 +1397,7 @@ BOOST_AUTO_TEST_CASE(polynomial_dfs_multiplication_perf_test, *boost::unit_test: std::cout << "Multiplication time: " << duration.count() << " microseconds." << std::endl; } -BOOST_AUTO_TEST_CASE(polynomial_dfs_resize_perf_test, *boost::unit_test::disabled()) { +BOOST_AUTO_TEST_CASE(polynomial_dfs_resize_perf_test) { std::vector values; std::size_t size = 131072 * 16; for (std::size_t i = 0; i < size; i++) { diff --git a/parallel-crypto3/libs/parallelization-utils/include/nil/actor/core/sycl_parallelization_utils.hpp b/parallel-crypto3/libs/parallelization-utils/include/nil/actor/core/sycl_parallelization_utils.hpp new file mode 100644 index 0000000000..f1cdebf61f --- /dev/null +++ b/parallel-crypto3/libs/parallelization-utils/include/nil/actor/core/sycl_parallelization_utils.hpp @@ -0,0 +1,75 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2024 Dmitrii Tabalin +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +//---------------------------------------------------------------------------// + +#pragma once + +#include +#include + +namespace nil { + namespace crypto3 { + template + void sycl_run_in_chunks( + std::size_t elements_count, + Function func + ) { + hipsycl::queue q; + std::size_t max_compute_units = q.get_device().get_info(); + std::size_t workers_to_use = + std::max(static_cast(1), std::min(elements_count, max_compute_units)); + { + q.submit([&](hipsycl::handler& cgh) { + cgh.parallel_for( + hipsycl::range<1>(workers_to_use), [=](hipsycl::id<1> idx) { + const std::size_t i = idx[0]; + const std::size_t chunk_size = elements_count / workers_to_use; + const std::size_t remainder = elements_count % workers_to_use; + const std::size_t begin = i * chunk_size + hipsycl::min(i, remainder); + const std::size_t end = begin + chunk_size + (i < remainder ? 1 : 0); + func(begin, end); + }); + }); + // The buffer destructor ensures synchronization + } + } + + template + void sycl_parallel_for( + std::size_t start, + std::size_t end, + Function func + ) { + hipsycl::queue q; + { + q.submit([&](hipsycl::handler& cgh) { + cgh.parallel_for( + hipsycl::range<1>(end - start), [=](hipsycl::id<1> idx) { + func(start + idx[0]); + }); + }); + // The buffer destructor ensures synchronization + } + } + } // namespace crypto3 +} // namespace nil \ No newline at end of file diff --git a/parallel-crypto3/parallel-crypto3.nix b/parallel-crypto3/parallel-crypto3.nix index 189efbc710..415a2f4bd6 100644 --- a/parallel-crypto3/parallel-crypto3.nix +++ b/parallel-crypto3/parallel-crypto3.nix @@ -5,8 +5,10 @@ cmake, boost, gdb, + lldb, cmake_modules, crypto3, + opensycl, enableDebugging, enableDebug ? false, runTests ? false, @@ -18,7 +20,9 @@ in stdenv.mkDerivation { src = lib.sourceByRegex ./. [ ".*" ]; - nativeBuildInputs = [ cmake ninja pkg-config ] ++ (lib.optional (!stdenv.isDarwin) gdb); + nativeBuildInputs = [ cmake ninja pkg-config opensycl ] ++ + (lib.optional (!stdenv.isDarwin) gdb) ++ + (lib.optional (stdenv.isDarwin) lldb); # enableDebugging will keep debug symbols in boost propagatedBuildInputs = [ (if enableDebug then (enableDebugging boost) else boost) ];