diff --git a/benchmarks/gbench/mhp/CMakeLists.txt b/benchmarks/gbench/mhp/CMakeLists.txt index 2c03a6c79f..07d7e901fa 100644 --- a/benchmarks/gbench/mhp/CMakeLists.txt +++ b/benchmarks/gbench/mhp/CMakeLists.txt @@ -36,8 +36,7 @@ endif() # mhp-quick-bench is for development. By reducing the number of source files, it # builds much faster. Change the source files to match what you need to test. It # is OK to commit changes to the source file list. -add_executable(mhp-quick-bench mhp-bench.cpp - ../common/inclusive_exclusive_scan.cpp) +add_executable(mhp-quick-bench mhp-bench.cpp ../common/sort.cpp) foreach(mhp-bench-exec IN ITEMS mhp-bench mhp-quick-bench) target_compile_definitions(${mhp-bench-exec} PRIVATE BENCH_MHP) @@ -48,9 +47,7 @@ foreach(mhp-bench-exec IN ITEMS mhp-bench mhp-quick-bench) endif() endforeach() -if(ENABLE_SYCL) - target_sources(mhp-quick-bench PRIVATE fft3d.cpp) -endif() +# if(ENABLE_SYCL) target_sources(mhp-quick-bench PRIVATE fft3d.cpp) endif() cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) diff --git a/include/dr/mhp/algorithms/sort.hpp b/include/dr/mhp/algorithms/sort.hpp index 4c9b671908..0d1e91c3c3 100644 --- a/include/dr/mhp/algorithms/sort.hpp +++ b/include/dr/mhp/algorithms/sort.hpp @@ -26,14 +26,104 @@ namespace dr::mhp { namespace __detail { -template -void local_sort(R &r, Compare &&comp) { +template class buffer { +public: + using value_type = T; + std::size_t size() { return size_; } + T *data() { return data_; } + T *cdata() const { return data_; } + T *begin() { return data_; } + T *end() { return data_ + size_; } + + T *resize(std::size_t cnt) { + assert(cnt >= size_); + if (cnt == size_) + return data_; + + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + T *newdata = sycl::malloc(cnt, sycl_queue(), sycl_mem_kind()); + assert(newdata != nullptr); + sycl_queue().copy(data_, newdata, size_); + assert(sycl_get(*data_) == + sycl_get(*newdata)); /* surprisingly is helpful in case of mem mgmt + issues */ + sycl::free(data_, sycl_queue()); + data_ = newdata; +#else + assert(false); +#endif + } else { + T *newdata = static_cast(malloc(cnt * sizeof(T))); + memcpy(newdata, data_, size_ * sizeof(T)); + free(data_); + data_ = newdata; + } + size_ = cnt; + return data_; + } + + void replace(buffer &other) { + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + if (data_ != nullptr) + sycl::free(data_, sycl_queue()); +#else + assert(false); +#endif + } else { + if (data_ != nullptr) + free(data_); + } + data_ = rng::data(other); + size_ = rng::size(other); + other.data_ = nullptr; + other.size_ = 0; + } + + ~buffer() { + if (data_ != nullptr) { + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + sycl::free(this->data_, sycl_queue()); +#else + assert(false); +#endif + } else { + free(data_); + } + } + data_ = nullptr; + size_ = 0; + } + + buffer(std::size_t cnt) : size_(cnt) { + if (cnt > 0) { + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + data_ = sycl::malloc(cnt, sycl_queue(), sycl_mem_kind()); +#else + assert(false); +#endif + } else { + data_ = static_cast(malloc(cnt * sizeof(T))); + } + assert(data_ != nullptr); + } + } + +private: + T *data_ = nullptr; + std::size_t size_ = 0; +}; // class buffer + +template void local_sort(R &r, Compare &&comp) { if (rng::size(r) >= 2) { if (mhp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION auto policy = dpl_policy(); auto &&local_segment = dr::ranges::__detail::local(r); - drlog.debug("GPU dpl::sort(), size {}\n", rng::size(r)); + DRLOG("GPU dpl::sort(), size {}\n", rng::size(r)); oneapi::dpl::sort( policy, dr::__detail::direct_iterator(rng::begin(local_segment)), dr::__detail::direct_iterator(rng::end(local_segment)), comp); @@ -41,7 +131,7 @@ void local_sort(R &r, Compare &&comp) { assert(false); #endif } else { - drlog.debug("cpu rng::sort, size {}\n", rng::size(r)); + DRLOG("cpu rng::sort, size {}\n", rng::size(r)); rng::sort(rng::begin(r), rng::end(r), comp); } } @@ -52,7 +142,6 @@ template void splitters(Seg &lsegment, Compare &&comp, std::vector &vec_split_i, std::vector &vec_split_s) { - const std::size_t _comm_size = default_comm().size(); // dr-style ignore assert(rng::size(vec_split_i) == _comm_size); @@ -64,87 +153,203 @@ void splitters(Seg &lsegment, Compare &&comp, const double _step_m = static_cast(rng::size(lsegment)) / static_cast(_comm_size); + fmt::print("{}:{} splitters start\n", default_comm().rank(), __LINE__); + /* calculate splitting values and indices - find n-1 dividers splitting * each segment into equal parts */ - - for (std::size_t _i = 0; _i < rng::size(vec_lmedians); _i++) { - vec_lmedians[_i] = lsegment[_i * _step_m]; + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + for (std::size_t _i = 0; _i < rng::size(vec_lmedians) - 1; _i++) { + assert(_i * _step_m < rng::size(lsegment)); + sycl_copy(&lsegment[_i * _step_m], &vec_lmedians[_i]); + } + sycl_copy(&lsegment[rng::size(lsegment) - 1], + &vec_lmedians[rng::size(vec_lmedians) - 1]); +#else + assert(false); +#endif + } else { + for (std::size_t _i = 0; _i < rng::size(vec_lmedians) - 1; _i++) { + assert(_i * _step_m < rng::size(lsegment)); + vec_lmedians[_i] = lsegment[_i * _step_m]; + } + vec_lmedians.back() = lsegment.back(); } - vec_lmedians.back() = lsegment.back(); default_comm().all_gather(vec_lmedians, vec_gmedians); - rng::sort(rng::begin(vec_gmedians), rng::end(vec_gmedians), comp); - std::vector vec_split_v(_comm_size - 1); + // auto begin = std::chrono::high_resolution_clock::now(); + // auto end = std::chrono::high_resolution_clock::now(); - for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { - vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; - } + /* TODO: the loop below takes most of time of the whole sort + * procedure; is optimisation possible? */ + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + __detail::buffer dbuf_v(_comm_size - 1); + __detail::buffer dbuf_i(rng::size(vec_split_i)); + __detail::buffer dbuf_s(rng::size(vec_split_s)); + + for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { + assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); + sycl_copy(&vec_gmedians[(_i + 1) * (_comm_size + 1) - 1], + dbuf_v.data() + _i); + } - std::size_t segidx = 0, vidx = 1; + sycl_copy(vec_split_i.data(), dbuf_i.data(), + vec_split_i.size()); + sycl_copy(vec_split_s.data(), dbuf_s.data(), + vec_split_s.size()); + + sycl::buffer buf_i(dbuf_i.data(), sycl::range{dbuf_i.size()}); + sycl::buffer buf_s(dbuf_s.data(), sycl::range{dbuf_s.size()}); + sycl::buffer buf_v(dbuf_v.data(), sycl::range(rng::size(dbuf_v))); + sycl::buffer buf_lsegment(lsegment); + + sycl_queue() + .submit([&](sycl::handler &h) { + sycl::accessor acc_i{buf_i, h, sycl::read_write}; + sycl::accessor acc_s{buf_s, h, sycl::write_only}; + sycl::accessor acc_v{buf_v, h, sycl::read_write}; + sycl::accessor acc_ls{buf_lsegment, h, sycl::read_only}; + + auto ls_size = rng::size(lsegment); + h.single_task([=]() { + sycl::opencl::cl_ulong vidx0 = 0, vidx1 = 1, segidx = 0; + + while (vidx1 < _comm_size && segidx < ls_size) { + if (comp(acc_v[vidx1 - 1], acc_ls[segidx])) { + acc_i[vidx1] = segidx; + acc_s[vidx0] = acc_i[vidx1] - acc_i[vidx0]; + vidx1++; + vidx0++; + } else { + segidx++; + } + } + acc_s[vidx0] = ls_size - acc_i[vidx0]; + }); + }) + .wait(); + + // end = std::chrono::high_resolution_clock::now(); + // fmt::print("{}: splitters loop duration {} ms\n", default_comm().rank(), + // std::chrono::duration(end - begin).count() * 1000); + + sycl::host_accessor res_i{buf_i}, res_s{buf_s}; + + for (int _i = 0; _i < rng::size(vec_split_i); _i++) { + vec_split_i[_i] = sycl_get(res_i[_i]); + vec_split_s[_i] = sycl_get(res_s[_i]); + } +#else + assert(false); +#endif + } else { + fmt::print("{}:{} NO SYCL splitters\n", default_comm().rank(), __LINE__); - while (vidx < _comm_size && segidx < rng::size(lsegment)) { - if (comp(vec_split_v[vidx - 1], *(lsegment.begin() + segidx))) { - vec_split_i[vidx] = segidx; - vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; - vidx++; - } else { - segidx++; + std::vector vec_split_v(_comm_size - 1); + for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { + assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); + vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; } + + /* + std::size_t segidx = 0, vidx = 1; + while (vidx < _comm_size && segidx < rng::size(lsegment)) { + if (comp(vec_split_v[vidx - 1], lsegment[segidx])) { + vec_split_i[vidx] = segidx; + vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; + vidx++; + } else { + segidx++; + } + } + */ + std::size_t vidx = 1; + for (std::size_t sidx = 0; sidx < rng::size(lsegment); sidx++) { + while (comp(vec_split_v[vidx - 1], lsegment[sidx])) { + vec_split_i[vidx] = sidx; + vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; + vidx++; + if (vidx == _comm_size) + break; + } + if (vidx == _comm_size) + break; + } + assert(rng::size(lsegment) > vec_split_i[vidx - 1]); + vec_split_s[vidx - 1] = rng::size(lsegment) - vec_split_i[vidx - 1]; } - assert(rng::size(lsegment) > vec_split_i[vidx - 1]); - vec_split_s[vidx - 1] = rng::size(lsegment) - vec_split_i[vidx - 1]; -} +} // splitters() -template +template void shift_data(const int shift_left, const int shift_right, - std::vector &vec_recvdata, - std::vector &vec_left, - std::vector &vec_right) { + buffer &vec_recvdata, buffer &vec_left, + buffer &vec_right) { const std::size_t _comm_rank = default_comm().rank(); MPI_Request req_l, req_r; MPI_Status stat_l, stat_r; + assert(static_cast(rng::size(vec_left)) == std::max(0, shift_left)); + assert(static_cast(rng::size(vec_right)) == std::max(0, shift_right)); + if (static_cast(rng::size(vec_recvdata)) < -shift_left) { - // Too little data in recv buffer to shift left - first get from right, then - // send left - drlog.debug("Get from right first, recvdata size {} shl {}\n", - rng::size(vec_recvdata), -shift_left); + // Too little data in recv buffer to shift left - first get from right, + // then send left + DRLOG("Get from right first, recvdata size {} shift left {} \n", + rng::size(vec_recvdata), shift_left); // ** This will never happen, because values eq to split go left ** assert(false); } else if (static_cast(rng::size(vec_recvdata)) < -shift_right) { - // Too little data in buffer to shift right - first get from left, then send - // right + // Too little data in buffer to shift right - first get from left, then + // send right assert(shift_left > 0); - default_comm().irecv(vec_left, _comm_rank - 1, &req_l); + + default_comm().irecv(rng::data(vec_left), rng::size(vec_left), + _comm_rank - 1, &req_l); MPI_Wait(&req_l, &stat_l); - vec_left.insert(vec_left.end(), vec_recvdata.begin(), vec_recvdata.end()); - std::swap(vec_left, vec_recvdata); - vec_left.clear(); + vec_left.resize(shift_left + rng::size(vec_recvdata)); + + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + sycl_queue().copy(rng::data(vec_recvdata), + rng::data(vec_left) + shift_left, + rng::size(vec_recvdata)); +#else + assert(false); +#endif + } else { + memcpy(rng::data(vec_left) + shift_left, rng::data(vec_recvdata), + rng::size(vec_recvdata)); + } + vec_recvdata.replace(vec_left); - default_comm().isend((valT *)(vec_recvdata.data()) + - rng::size(vec_recvdata) + shift_right, - -shift_right, _comm_rank + 1, &req_r); + default_comm().isend(vec_recvdata.end() + shift_right, -shift_right, + _comm_rank + 1, &req_r); MPI_Wait(&req_r, &stat_r); } else { // enough data in recv buffer if (shift_left < 0) { - default_comm().isend(vec_recvdata.data(), -shift_left, _comm_rank - 1, + default_comm().isend(rng::data(vec_recvdata), -shift_left, _comm_rank - 1, &req_l); } else if (shift_left > 0) { - default_comm().irecv(vec_left, _comm_rank - 1, &req_l); + assert(shift_left == static_cast(rng::size(vec_left))); + default_comm().irecv(rng::data(vec_left), rng::size(vec_left), + _comm_rank - 1, &req_l); } if (shift_right > 0) { - default_comm().irecv(vec_right, _comm_rank + 1, &req_r); + assert(shift_right == static_cast(rng::size(vec_right))); + default_comm().irecv(rng::data(vec_right), rng::size(vec_right), + _comm_rank + 1, &req_r); } else if (shift_right < 0) { - default_comm().isend((valT *)(vec_recvdata.data()) + - rng::size(vec_recvdata) + shift_right, + default_comm().isend(rng::data(vec_recvdata) + rng::size(vec_recvdata) + + shift_right, -shift_right, _comm_rank + 1, &req_r); } @@ -153,13 +358,12 @@ void shift_data(const int shift_left, const int shift_right, if (shift_right != 0) MPI_Wait(&req_r, &stat_r); } -} +} // shift_data() -template +template void copy_results(auto &lsegment, const int shift_left, const int shift_right, - std::vector &vec_recvdata, - std::vector &vec_left, - std::vector &vec_right) { + buffer &vec_recvdata, buffer &vec_left, + buffer &vec_right) { const std::size_t invalidate_left = std::max(-shift_left, 0); const std::size_t invalidate_right = std::max(-shift_right, 0); @@ -173,12 +377,12 @@ void copy_results(auto &lsegment, const int shift_left, const int shift_right, sycl::event e_l, e_d, e_r; if (size_l > 0) - e_l = sycl_queue().copy(vec_left.data(), lsegment.data(), size_l); + e_l = sycl_queue().copy(rng::data(vec_left), rng::data(lsegment), size_l); if (size_r > 0) - e_r = sycl_queue().copy(vec_right.data(), - lsegment.data() + size_l + size_d, size_r); - e_d = sycl_queue().copy(vec_recvdata.data() + invalidate_left, - lsegment.data() + size_l, size_d); + e_r = sycl_queue().copy(rng::data(vec_right), + rng::data(lsegment) + size_l + size_d, size_r); + e_d = sycl_queue().copy(rng::data(vec_recvdata) + invalidate_left, + rng::data(lsegment) + size_l, size_d); if (size_l > 0) e_l.wait(); if (size_r > 0) @@ -190,15 +394,17 @@ void copy_results(auto &lsegment, const int shift_left, const int shift_right, #endif } else { if (size_l > 0) - std::memcpy(lsegment.data(), vec_left.data(), size_l * sizeof(valT)); + std::memcpy(rng::data(lsegment), rng::data(vec_left), + size_l * sizeof(valT)); if (size_r > 0) - std::memcpy(lsegment.data() + size_l + size_d, vec_right.data(), + std::memcpy(rng::data(lsegment) + size_l + size_d, rng::data(vec_right), size_r * sizeof(valT)); - std::memcpy(lsegment.data() + size_l, vec_recvdata.data() + invalidate_left, + std::memcpy(rng::data(lsegment) + size_l, + rng::data(vec_recvdata) + invalidate_left, size_d * sizeof(valT)); } -} +} // copy_results template void dist_sort(R &r, Compare &&comp) { @@ -208,11 +414,6 @@ void dist_sort(R &r, Compare &&comp) { const std::size_t _comm_rank = default_comm().rank(); const std::size_t _comm_size = default_comm().size(); // dr-style ignore -#ifdef SYCL_LANGUAGE_VERSION - auto policy = dpl_policy(); - sycl::usm_allocator alloc(policy.queue()); -#endif - auto &&lsegment = local_segment(r); std::vector vec_split_i(_comm_size, 0); @@ -226,38 +427,34 @@ void dist_sort(R &r, Compare &&comp) { /* find splitting values - limits of areas to send to other processes */ __detail::splitters(lsegment, comp, vec_split_i, vec_split_s); - default_comm().alltoall(vec_split_s, vec_rsizes, 1); /* prepare data to send and receive */ std::exclusive_scan(vec_rsizes.begin(), vec_rsizes.end(), vec_rindices.begin(), 0); - const std::size_t _recv_elems = vec_rindices.back() + vec_rsizes.back(); /* send and receive data belonging to each node, then redistribute * data to achieve size of data equal to size of local segment */ - MPI_Request req_recvelems; - default_comm().i_all_gather(_recv_elems, vec_recv_elems, &req_recvelems); + /* async version to consider and test (have issues with MPI_Wait() in the + * current CI)*/ + // MPI_Request req_recvelems; + // default_comm().i_all_gather(_recv_elems, vec_recv_elems, &req_recvelems); + default_comm().all_gather(_recv_elems, vec_recv_elems); /* buffer for received data */ -#ifdef SYCL_LANGUAGE_VERSION - std::vector vec_recvdata(_recv_elems, alloc); -#else - std::vector vec_recvdata(_recv_elems); -#endif + buffer vec_recvdata(_recv_elems); /* send data not belonging and receive data belonging to local processes */ default_comm().alltoallv(lsegment, vec_split_s, vec_split_i, vec_recvdata, vec_rsizes, vec_rindices); - /* TODO: vec recvdata is partially sorted, implementation of merge on GPU is * desirable */ __detail::local_sort(vec_recvdata, comp); - MPI_Wait(&req_recvelems, MPI_STATUS_IGNORE); + // MPI_Wait(&req_recvelems, MPI_STATUS_IGNORE); _total_elems = std::reduce(vec_recv_elems.begin(), vec_recv_elems.end()); @@ -265,7 +462,6 @@ void dist_sort(R &r, Compare &&comp) { std::vector vec_shift(_comm_size - 1); const auto desired_elems_num = (_total_elems + _comm_size - 1) / _comm_size; - vec_shift[0] = desired_elems_num - vec_recv_elems[0]; for (std::size_t _i = 1; _i < _comm_size - 1; _i++) { vec_shift[_i] = vec_shift[_i - 1] + desired_elems_num - vec_recv_elems[_i]; @@ -275,13 +471,8 @@ void dist_sort(R &r, Compare &&comp) { const int shift_right = _comm_rank == _comm_size - 1 ? 0 : vec_shift[_comm_rank]; -#ifdef SYCL_LANGUAGE_VERSION - std::vector vec_left(std::max(shift_left, 0), alloc); - std::vector vec_right(std::max(shift_right, 0), alloc); -#else - std::vector vec_left(std::max(shift_left, 0)); - std::vector vec_right(std::max(shift_right, 0)); -#endif + buffer vec_left(std::max(shift_left, 0)); + buffer vec_right(std::max(shift_right, 0)); /* shift data if necessary, to have exactly the number of elements equal to * lsegment size */ @@ -304,8 +495,6 @@ void sort(R &r, Compare &&comp = Compare()) { std::size_t _comm_size = default_comm().size(); // dr-style ignore if (_comm_size == 1) { - drlog.debug("mhp::sort() - single node\n"); - auto &&lsegment = local_segment(r); __detail::local_sort(lsegment, comp); @@ -313,7 +502,7 @@ void sort(R &r, Compare &&comp = Compare()) { /* Distributed vector of size <= (comm_size-1) * (comm_size-1) may have * 0-size local segments. It is also small enough to prefer sequential sort */ - drlog.debug("mhp::sort() - local sort\n"); + DRLOG("mhp::sort() - local sort\n"); std::vector vec_recvdata(rng::size(r)); dr::mhp::copy(0, r, rng::begin(vec_recvdata)); @@ -325,7 +514,7 @@ void sort(R &r, Compare &&comp = Compare()) { dr::mhp::copy(0, vec_recvdata, rng::begin(r)); } else { - drlog.debug("mhp::sort() - dist sort\n"); + DRLOG("mhp::sort() - distributed sort\n"); __detail::dist_sort(r, comp); dr::mhp::barrier(); } diff --git a/include/dr/mhp/sycl_support.hpp b/include/dr/mhp/sycl_support.hpp index 528f64ccb8..0eef117a68 100644 --- a/include/dr/mhp/sycl_support.hpp +++ b/include/dr/mhp/sycl_support.hpp @@ -94,4 +94,9 @@ template void sycl_copy(T *src, T *dst, std::size_t size = 1) { sycl_copy(src, src + size, dst); } +template +void sycl_copy(const T *src, const T *dst, std::size_t size = 1) { + sycl_copy(src, src + size, dst); +} + } // namespace dr::mhp::__detail diff --git a/test/gtest/mhp/CMakeLists.txt b/test/gtest/mhp/CMakeLists.txt index 181dba0755..bd270a6973 100644 --- a/test/gtest/mhp/CMakeLists.txt +++ b/test/gtest/mhp/CMakeLists.txt @@ -56,7 +56,7 @@ add_executable( add_executable(mhp-quick-test mhp-tests.cpp - ../common/inclusive_scan.cpp + ../common/sort.cpp ) # cmake-format: on @@ -99,8 +99,7 @@ if(ENABLE_SYCL) add_mhp_ctest(NAME mhp-quick-test NPROC 1 SYCL) add_mhp_ctest(NAME mhp-quick-test NPROC 2 SYCL) - # DRA-17 for fixing sort, DRA-18 for fixing ExclScan - set(sycl-offload-exclusions ${sycl-exclusions}:Sort*) + set(sycl-offload-exclusions ${sycl-exclusions}) add_mhp_ctest( NAME mhp-tests NPROC 2 OFFLOAD SYCL TARGS --device-memory