From 5ab164b28f28dada63d3c3d805de49c7a3a4e5f3 Mon Sep 17 00:00:00 2001 From: Denis Huenich Date: Wed, 9 Jun 2021 10:43:29 +0200 Subject: [PATCH 1/6] optimized operator+= and added sub_view iterator for inner iterator --- dash/include/dash/halo/StencilOperator.h | 24 ++++++++ .../dash/halo/iterator/StencilIterator.h | 58 ++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/dash/include/dash/halo/StencilOperator.h b/dash/include/dash/halo/StencilOperator.h index 31d8c0818..0547c844e 100644 --- a/dash/include/dash/halo/StencilOperator.h +++ b/dash/include/dash/halo/StencilOperator.h @@ -33,6 +33,8 @@ class StencilOperatorInner { static constexpr auto NumStencilPoints = StencilOperatorT::num_stencil_points(); static constexpr auto NumDimensions = StencilOperatorT::ndim(); + using CoordsIdxManagerInner_t = typename StencilOperatorT::CoordsIdxManagerInner_t; + public: using Element_t = typename StencilOperatorT::Element_t; using ViewSpec_t = typename StencilOperatorT::ViewSpec_t; @@ -71,6 +73,26 @@ class StencilOperatorInner { */ const ViewSpec_t& view() const { return _stencil_op->_spec_views.inner(); } + std::pair sub_iterator(const ViewSpec_t* sub_view) { + auto& inner_view = _stencil_op->_spec_views.inner(); + auto& inner_offsets = inner_view.offsets(); + auto& inner_extents = inner_view.extents(); + auto& sub_offsets = sub_view->offsets(); + auto& sub_extents = sub_view->extents(); + for(dim_t d = 0; d < NumDimensions; ++d) { + auto inner_last_elem = inner_offsets[d] + inner_extents[d]; + auto sub_last_elem = sub_offsets[d] + sub_extents[d]; + if(sub_offsets[d] < inner_offsets[d] || sub_last_elem > inner_last_elem) { + DASH_LOG_ERROR("Sub view doesn't fit into inner view."); + + return std::make_pair(end(), end()); + } + } + + return std::make_pair(iterator(CoordsIdxManagerInner_t(*_stencil_op, 0, sub_view)), + iterator(CoordsIdxManagerInner_t(*_stencil_op, sub_view->size(), sub_view))); + } + /** * Modifies all stencil point elements and the center within the inner view. * The stencil points are multiplied with their coefficent (\ref StencilPoint) @@ -812,6 +834,8 @@ class StencilOperator { return _stencil_offsets[pos]; } + + /** * Returns the local memory offset for a given coordinate */ diff --git a/dash/include/dash/halo/iterator/StencilIterator.h b/dash/include/dash/halo/iterator/StencilIterator.h index 2687b2bc1..b1d08aa01 100644 --- a/dash/include/dash/halo/iterator/StencilIterator.h +++ b/dash/include/dash/halo/iterator/StencilIterator.h @@ -152,6 +152,58 @@ class CoordsIdxManagerInner { } } + void next_nelement(uindex_t n) { + /* + ++_idx; + ++_coords[FastestDimension]; + if(static_cast(_coords[FastestDimension]) < _ranges[FastestDimension].second) { + for(auto i = 0u; i < NumStencilPoints; ++i) + ++_stencil_mem_ptr[i]; + + ++_current_lmemory_addr; + ++_offset; + + return; + } + */ + _idx += n; + if(_idx >=_size) { + _idx = _size; + + return; + } + + /*_coords[FastestDimension] += n; + + if(static_cast(_coords[FastestDimension]) < _ranges[FastestDimension].second) { + for(auto i = 0u; i < NumStencilPoints; ++i) + _stencil_mem_ptr[i] += n; + + _current_lmemory_addr += n; + _offset += n; + + return; + }*/ + + init_coords(); + + if(MemoryArrange == ROW_MAJOR) { + _offset = _coords[0]; + for(dim_t d = 1; d < NumDimensions; ++d) + _offset = _offset * _local_layout.extent(d) + _coords[d]; + } else { + _offset = _coords[NumDimensions - 1]; + for(dim_t d = NumDimensions - 1; d > 0;) { + --d; + _offset = _offset * _local_layout.extent(d) + _coords[d]; + } + } + + _current_lmemory_addr = _stencil_op->local_memory() + _offset; + for(auto i = 0u; i < NumStencilPoints; ++i) { + _stencil_mem_ptr[i] = _current_lmemory_addr + _stencil_op->stencil_offsets()[i]; + } + } private: void init_ranges() { @@ -790,9 +842,10 @@ class StencilIteratorTest { } Self_t& operator+=(index_t n) { - auto index = _coords_mng.index() + n; + _coords_mng.next_nelement(n); + //auto index = _coords_mng.index() + n; //if(index < _coords_mng.size()) - _coords_mng.set(index); + // _coords_mng.set(index); return *this; } @@ -1389,4 +1442,3 @@ class StencilIterator { } // namespace dash #endif // DASH__HALO__ITERATOR__STENCILITERATOR_H - From ca092c6096d6087b56bf8805a408531f29b7656b Mon Sep 17 00:00:00 2001 From: Denis Huenich Date: Wed, 9 Jun 2021 12:02:35 +0200 Subject: [PATCH 2/6] replaced std::pair for iterators with structure, containing begin and end --- dash/include/dash/halo/StencilOperator.h | 24 +++++--- .../dash/halo/iterator/StencilIterator.h | 58 +------------------ dash/test/halo/HaloTest.cc | 4 +- 3 files changed, 18 insertions(+), 68 deletions(-) diff --git a/dash/include/dash/halo/StencilOperator.h b/dash/include/dash/halo/StencilOperator.h index 0547c844e..cba54a517 100644 --- a/dash/include/dash/halo/StencilOperator.h +++ b/dash/include/dash/halo/StencilOperator.h @@ -19,6 +19,12 @@ struct replace { using namespace internal; + template + struct Iterator_Range { + IteratorT begin; + IteratorT end; + }; + // Forward declaration template class StencilOperator; @@ -43,6 +49,7 @@ class StencilOperatorInner { using const_iterator = const iterator; using StencilOffsets_t = typename StencilOperatorT::StencilOffsets_t; + using Iterator_Range_t = Iterator_Range; public: StencilOperatorInner(StencilOperatorT* stencil_op) @@ -73,7 +80,7 @@ class StencilOperatorInner { */ const ViewSpec_t& view() const { return _stencil_op->_spec_views.inner(); } - std::pair sub_iterator(const ViewSpec_t* sub_view) { + Iterator_Range_t sub_iterator(const ViewSpec_t* sub_view) { auto& inner_view = _stencil_op->_spec_views.inner(); auto& inner_offsets = inner_view.offsets(); auto& inner_extents = inner_view.extents(); @@ -85,12 +92,12 @@ class StencilOperatorInner { if(sub_offsets[d] < inner_offsets[d] || sub_last_elem > inner_last_elem) { DASH_LOG_ERROR("Sub view doesn't fit into inner view."); - return std::make_pair(end(), end()); + return {end(), end()}; } } - return std::make_pair(iterator(CoordsIdxManagerInner_t(*_stencil_op, 0, sub_view)), - iterator(CoordsIdxManagerInner_t(*_stencil_op, sub_view->size(), sub_view))); + return { iterator(CoordsIdxManagerInner_t(*_stencil_op, 0, sub_view)), + iterator(CoordsIdxManagerInner_t(*_stencil_op, sub_view->size(), sub_view))}; } /** @@ -429,6 +436,7 @@ class StencilOperatorBoundary { using const_iterator = const iterator; using BoundaryViews_t = typename StencilSpecViews_t::BoundaryViews_t; using RegionCoords_t = RegionCoords; + using Iterator_Range_t = Iterator_Range; public: StencilOperatorBoundary(const StencilOperatorT* stencil_op) @@ -520,7 +528,7 @@ class StencilOperatorBoundary { * Using all iterators for all dimensions and \ref RegionPos has the same * effect as using bbegin and bend. */ - std::pair iterator_at(dim_t dim, RegionPos pos) { + Iterator_Range_t iterator_at(dim_t dim, RegionPos pos) { DASH_ASSERT_LT(dim, NumDimensions, "Given dimension to great"); const auto& bnd_views = _stencil_op->_spec_views.boundary_views(); uindex_t offset = 0; @@ -535,10 +543,10 @@ class StencilOperatorBoundary { auto it_begin = _stencil_op->_bbegin + offset; - return std::make_pair(it_begin, it_begin + it_views->size()); + return {it_begin, it_begin + it_views->size()}; } - std::pair iterator_at(region_index_t index) { + Iterator_Range_t iterator_at(region_index_t index) { DASH_ASSERT_LT(index, NumRegionsMax, "Given index out of range"); const auto& bnd_views = _stencil_op->_spec_views.boundary_views(); uindex_t offset = 0; @@ -548,7 +556,7 @@ class StencilOperatorBoundary { auto it_begin = _stencil_op->_bbegin + offset; - return std::make_pair(it_begin, it_begin + bnd_views[index].size()); + return {it_begin, it_begin + bnd_views[index].size()}; } diff --git a/dash/include/dash/halo/iterator/StencilIterator.h b/dash/include/dash/halo/iterator/StencilIterator.h index b1d08aa01..add76b1c1 100644 --- a/dash/include/dash/halo/iterator/StencilIterator.h +++ b/dash/include/dash/halo/iterator/StencilIterator.h @@ -152,59 +152,6 @@ class CoordsIdxManagerInner { } } - void next_nelement(uindex_t n) { - /* - ++_idx; - ++_coords[FastestDimension]; - if(static_cast(_coords[FastestDimension]) < _ranges[FastestDimension].second) { - for(auto i = 0u; i < NumStencilPoints; ++i) - ++_stencil_mem_ptr[i]; - - ++_current_lmemory_addr; - ++_offset; - - return; - } - */ - _idx += n; - if(_idx >=_size) { - _idx = _size; - - return; - } - - /*_coords[FastestDimension] += n; - - if(static_cast(_coords[FastestDimension]) < _ranges[FastestDimension].second) { - for(auto i = 0u; i < NumStencilPoints; ++i) - _stencil_mem_ptr[i] += n; - - _current_lmemory_addr += n; - _offset += n; - - return; - }*/ - - init_coords(); - - if(MemoryArrange == ROW_MAJOR) { - _offset = _coords[0]; - for(dim_t d = 1; d < NumDimensions; ++d) - _offset = _offset * _local_layout.extent(d) + _coords[d]; - } else { - _offset = _coords[NumDimensions - 1]; - for(dim_t d = NumDimensions - 1; d > 0;) { - --d; - _offset = _offset * _local_layout.extent(d) + _coords[d]; - } - } - - _current_lmemory_addr = _stencil_op->local_memory() + _offset; - for(auto i = 0u; i < NumStencilPoints; ++i) { - _stencil_mem_ptr[i] = _current_lmemory_addr + _stencil_op->stencil_offsets()[i]; - } - } - private: void init_ranges() { for(dim_t d = 0; d < NumDimensions; ++d) { @@ -842,10 +789,7 @@ class StencilIteratorTest { } Self_t& operator+=(index_t n) { - _coords_mng.next_nelement(n); - //auto index = _coords_mng.index() + n; - //if(index < _coords_mng.size()) - // _coords_mng.set(index); + _coords_mng.set(_coords_mng.index() + n); return *this; } diff --git a/dash/test/halo/HaloTest.cc b/dash/test/halo/HaloTest.cc index 3b922e635..059360014 100644 --- a/dash/test/halo/HaloTest.cc +++ b/dash/test/halo/HaloTest.cc @@ -450,9 +450,7 @@ unsigned long calc_sum_halo(HaloWrapperT& halo_wrapper, StencilOpT stencil_op, b if(region_wise) { for( auto r = 0; r < NumRegionsMax<3>; ++r) { auto it_bnd = stencil_op.boundary.iterator_at(r); - if(it_bnd.first == it_bnd.second) - continue; - for(auto it = it_bnd.first; it != it_bnd.second; ++it) { + for(auto it = it_bnd.begin; it != it_bnd.end; ++it) { for(auto i = 0; i < num_stencil_points; ++i) *sum_local += it.value_at(i); From 0a13a1c4146ba2f4a6fbe8a01c00286ba829f758 Mon Sep 17 00:00:00 2001 From: Denis Huenich Date: Thu, 24 Jun 2021 17:09:03 +0200 Subject: [PATCH 3/6] added const to inner sub iterator method --- dash/include/dash/halo/StencilOperator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dash/include/dash/halo/StencilOperator.h b/dash/include/dash/halo/StencilOperator.h index cba54a517..4dbd6048f 100644 --- a/dash/include/dash/halo/StencilOperator.h +++ b/dash/include/dash/halo/StencilOperator.h @@ -80,7 +80,7 @@ class StencilOperatorInner { */ const ViewSpec_t& view() const { return _stencil_op->_spec_views.inner(); } - Iterator_Range_t sub_iterator(const ViewSpec_t* sub_view) { + Iterator_Range_t sub_iterator(const ViewSpec_t* sub_view) const { auto& inner_view = _stencil_op->_spec_views.inner(); auto& inner_offsets = inner_view.offsets(); auto& inner_extents = inner_view.extents(); From c975b397b3a38732dd711da29dec416b95ca7ab2 Mon Sep 17 00:00:00 2001 From: Denis Huenich Date: Mon, 28 Jun 2021 12:44:26 +0200 Subject: [PATCH 4/6] fixed boundary iterator_at for dimensions --- dash/include/dash/halo/StencilOperator.h | 37 ++++++++++++++++-------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/dash/include/dash/halo/StencilOperator.h b/dash/include/dash/halo/StencilOperator.h index 4dbd6048f..0c9262315 100644 --- a/dash/include/dash/halo/StencilOperator.h +++ b/dash/include/dash/halo/StencilOperator.h @@ -528,27 +528,40 @@ class StencilOperatorBoundary { * Using all iterators for all dimensions and \ref RegionPos has the same * effect as using bbegin and bend. */ - Iterator_Range_t iterator_at(dim_t dim, RegionPos pos) { - DASH_ASSERT_LT(dim, NumDimensions, "Given dimension to great"); - const auto& bnd_views = _stencil_op->_spec_views.boundary_views(); - uindex_t offset = 0; - auto it_views = std::begin(bnd_views); - for(dim_t d = 0; d < dim; ++d, ++it_views) - offset += it_views->size() + (++it_views)->size(); + Iterator_Range_t iterator_at(dim_t dim, RegionPos pos) const { + DASH_ASSERT_LT(dim, NumDimensions, "Given dimension to high"); + + const auto& bnd_views = _stencil_op->_spec_views.boundary_views(); + auto num_regions_dim = ce::pow(REGION_INDEX_BASE, static_cast::type>(NumDimensions-1)); + region_index_t start_region = 0; + for(dim_t d = 1; d <= dim; ++d) { + start_region += num_regions_dim; + num_regions_dim = ce::pow(REGION_INDEX_BASE, static_cast::type>(NumDimensions-d-1)); + } if(pos == RegionPos::POST) { - offset += it_views->size(); - ++it_views; + start_region = NumRegionsMax - start_region - num_regions_dim; } + uindex_t offset = 0; + for(region_index_t r = 0; r < start_region; ++r) { + offset += bnd_views[r].size(); + } + + uindex_t size = 0; + for(region_index_t r = start_region; r < start_region + num_regions_dim; ++r) { + size += bnd_views[r].size(); + } + + auto it_begin = _stencil_op->_bbegin + offset; - return {it_begin, it_begin + it_views->size()}; + return {it_begin, it_begin + size}; } - Iterator_Range_t iterator_at(region_index_t index) { + Iterator_Range_t iterator_at(region_index_t index) const { DASH_ASSERT_LT(index, NumRegionsMax, "Given index out of range"); - const auto& bnd_views = _stencil_op->_spec_views.boundary_views(); + const auto& bnd_views = _stencil_op->_spec_views.boundary_views(); uindex_t offset = 0; for(region_index_t r = 0; r < index; ++r) { offset += bnd_views[r].size(); From 1e495bd0eadffc7fb15490ce7ca0c60cf7300518 Mon Sep 17 00:00:00 2001 From: Denis Huenich Date: Fri, 23 Jul 2021 11:32:35 +0200 Subject: [PATCH 5/6] introduced parallel packing in pack_env --- dash/include/dash/halo/HaloMatrixWrapper.h | 6 ++-- dash/include/dash/halo/HaloMemory.h | 37 +++++++++++++++------- dash/include/dash/halo/Types.h | 4 +++ 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/dash/include/dash/halo/HaloMatrixWrapper.h b/dash/include/dash/halo/HaloMatrixWrapper.h index 1b88b9822..ac00b4131 100644 --- a/dash/include/dash/halo/HaloMatrixWrapper.h +++ b/dash/include/dash/halo/HaloMatrixWrapper.h @@ -49,7 +49,9 @@ namespace halo { * halo region 3 '- halo region 7 */ -template + + +template class HaloMatrixWrapper { private: using Pattern_t = typename MatrixT::pattern_type; @@ -65,7 +67,7 @@ class HaloMatrixWrapper { using GlobBoundSpec_t = GlobalBoundarySpec; using HaloBlock_t = HaloBlock; using HaloMemory_t = HaloMemory; - using HaloUpdateEnv_t = HaloUpdateEnv; + using HaloUpdateEnv_t = HaloUpdateEnv; using ElementCoords_t = std::array; using region_index_t = internal::region_index_t; using stencil_dist_t = internal::spoint_value_t; diff --git a/dash/include/dash/halo/HaloMemory.h b/dash/include/dash/halo/HaloMemory.h index f718c78bf..3868168ec 100644 --- a/dash/include/dash/halo/HaloMemory.h +++ b/dash/include/dash/halo/HaloMemory.h @@ -6,6 +6,8 @@ #include #include +#include + namespace dash { namespace halo { @@ -355,10 +357,16 @@ class SignalEnv { SignalHandles_t _signal_ready_handles; }; +template +struct BufferOffset { + ElementT* block_pos; + ElementT* buffer_pos; +}; + template struct PackMetaData { bool needs_packing{false}; - std::vector block_pos{}; + std::vector> block_data{}; LengthSizeT block_len{0}; ElementT* buffer_pos{nullptr}; std::function pack_func = [](){}; @@ -373,7 +381,7 @@ std::ostream& operator<<(std::ostream& os, const PackMetaData +template class PackEnv { static constexpr auto NumDimensions = HaloBlockT::ndim(); static constexpr auto RegionsMax = NumRegionsMax; @@ -510,20 +518,27 @@ class PackEnv { pattern_size_t num_blocks = view_pack.size() / num_elems_block; pack_md.block_len = num_elems_block; - pack_md.block_pos.resize(num_blocks); + pack_md.block_data.resize(num_blocks); auto it_region = region->begin(); decltype(it_region) it_pack_data(&(it_region.globmem()), it_region.pattern(), view_pack); - for(auto& pos : pack_md.block_pos) { - pos = _local_memory + it_pack_data.lpos().index; + auto buffer_offset = pack_md.buffer_pos; + for(auto& pos : pack_md.block_data) { + pos.block_pos = _local_memory + it_pack_data.lpos().index; + pos.buffer_pos = buffer_offset; it_pack_data += num_elems_block; + buffer_offset += pack_md.block_len; } auto pack = &pack_md; pack_md.pack_func = [pack](){ - auto buffer_offset = pack->buffer_pos; - for(auto& pos : pack->block_pos) { - std::copy(pos, pos + pack->block_len, buffer_offset); - buffer_offset += pack->block_len; + if(SHARED_TYPE == SharedType::STL) { + std::for_each(std::execution::par, pack->block_data.begin(), pack->block_data.end(), [pack](const auto& block) { + std::copy(block.block_pos, block.block_pos + pack->block_len, block.buffer_pos); + }); + } else { + for(auto& block : pack->block_data) { + std::copy(block.block_pos, block.block_pos + pack->block_len, block.buffer_pos); + } } }; } @@ -536,7 +551,7 @@ class PackEnv { PackMDataAll_t _pack_md_all; }; -template +template class HaloUpdateEnv { struct UpdateData { std::function get_halos; @@ -552,7 +567,7 @@ class HaloUpdateEnv { using Pattern_t = typename HaloBlockT::Pattern_t; using BlockEnv_t = BlockEnvironment; using SignalEnv_t = SignalEnv; - using PackEnv_t = PackEnv; + using PackEnv_t = PackEnv; diff --git a/dash/include/dash/halo/Types.h b/dash/include/dash/halo/Types.h index 9b3ed84e2..48771f878 100644 --- a/dash/include/dash/halo/Types.h +++ b/dash/include/dash/halo/Types.h @@ -68,6 +68,10 @@ enum class BoundaryProp : uint8_t { CUSTOM }; +enum class SharedType : uint8_t { + STL, OMP, NONE +}; + inline std::ostream& operator<<(std::ostream& os, const BoundaryProp& prop) { if(prop == BoundaryProp::NONE) os << "NONE"; From dd9d45c457729bb5cc5727a0e4c729ef14ccaff4 Mon Sep 17 00:00:00 2001 From: Denis Huenich Date: Fri, 23 Jul 2021 14:36:13 +0200 Subject: [PATCH 6/6] added OMP for async packing --- dash/include/dash/halo/HaloMemory.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/dash/include/dash/halo/HaloMemory.h b/dash/include/dash/halo/HaloMemory.h index 3868168ec..8b64d7360 100644 --- a/dash/include/dash/halo/HaloMemory.h +++ b/dash/include/dash/halo/HaloMemory.h @@ -530,17 +530,27 @@ class PackEnv { buffer_offset += pack_md.block_len; } auto pack = &pack_md; - pack_md.pack_func = [pack](){ - if(SHARED_TYPE == SharedType::STL) { + if(SHARED_TYPE == SharedType::STL) { + pack_md.pack_func = [pack](){ std::for_each(std::execution::par, pack->block_data.begin(), pack->block_data.end(), [pack](const auto& block) { std::copy(block.block_pos, block.block_pos + pack->block_len, block.buffer_pos); }); - } else { + }; + } else if(SHARED_TYPE == SharedType::OMP) { + pack_md.pack_func = [pack](){ + #pragma omp parallel for + for(auto i = 0; i < pack->block_data.size(); ++i) { + const auto& block = pack->block_data[i]; + std::copy(block.block_pos, block.block_pos + pack->block_len, block.buffer_pos); + } + }; + } else { + pack_md.pack_func = [pack](){ for(auto& block : pack->block_data) { std::copy(block.block_pos, block.block_pos + pack->block_len, block.buffer_pos); } - } - }; + }; + } } }