diff --git a/.gitignore b/.gitignore index 135d76be3f15..ddb0f9dac625 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,4 @@ cmake_hdf5_test.o # Python artifacts *.pyc *.egg-info +*_venv \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7f66585a8e8c..050cc608bade 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -281,6 +281,8 @@ add_library(parthenon globals.cpp globals.hpp kokkos_abstraction.hpp + kokkos_types.hpp + loop_bound_translator.hpp parameter_input.cpp parameter_input.hpp parthenon_array_generic.hpp diff --git a/src/bvals/comms/build_boundary_buffers.cpp b/src/bvals/comms/build_boundary_buffers.cpp index aac532d037e6..01af7db4e744 100644 --- a/src/bvals/comms/build_boundary_buffers.cpp +++ b/src/bvals/comms/build_boundary_buffers.cpp @@ -29,7 +29,6 @@ #include "config.hpp" #include "globals.hpp" #include "interface/variable.hpp" -#include "kokkos_abstraction.hpp" #include "mesh/mesh.hpp" #include "mesh/mesh_refinement.hpp" #include "mesh/meshblock.hpp" diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index ca8c59ffe12e..f406e87872bb 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -29,144 +30,63 @@ #include "basic_types.hpp" #include "config.hpp" +#include "kokkos_types.hpp" +#include "loop_bound_translator.hpp" #include "parthenon_array_generic.hpp" #include "utils/error_checking.hpp" +#include "utils/indexer.hpp" #include "utils/instrument.hpp" #include "utils/multi_pointer.hpp" #include "utils/object_pool.hpp" +#include "utils/type_list.hpp" namespace parthenon { - -#ifdef KOKKOS_ENABLE_CUDA_UVM -using DevMemSpace = Kokkos::CudaUVMSpace; -using HostMemSpace = Kokkos::CudaUVMSpace; -using DevExecSpace = Kokkos::Cuda; -#else -using DevMemSpace = Kokkos::DefaultExecutionSpace::memory_space; -using HostMemSpace = Kokkos::HostSpace; -using DevExecSpace = Kokkos::DefaultExecutionSpace; -#endif -using ScratchMemSpace = DevExecSpace::scratch_memory_space; - -using HostExecSpace = Kokkos::DefaultHostExecutionSpace; -using LayoutWrapper = Kokkos::LayoutRight; -using MemUnmanaged = Kokkos::MemoryTraits; - -#if defined(PARTHENON_ENABLE_HOST_COMM_BUFFERS) -#if defined(KOKKOS_ENABLE_CUDA) -using BufMemSpace = Kokkos::CudaHostPinnedSpace::memory_space; -#elif defined(KOKKOS_ENABLE_HIP) -using BufMemSpace = Kokkos::Experimental::HipHostPinnedSpace::memory_space; -#else -#error "Unknow comm buffer space for chose execution space." 
-#endif -#else -using BufMemSpace = Kokkos::DefaultExecutionSpace::memory_space; -#endif - -// MPI communication buffers -template -using BufArray1D = Kokkos::View; - -// Structures for reusable memory pools and communication -template -using buf_pool_t = ObjectPool>; - -template -using ParArray0D = ParArrayGeneric, State>; -template -using ParArray1D = ParArrayGeneric, State>; -template -using ParArray2D = ParArrayGeneric, State>; -template -using ParArray3D = - ParArrayGeneric, State>; -template -using ParArray4D = - ParArrayGeneric, State>; -template -using ParArray5D = - ParArrayGeneric, State>; -template -using ParArray6D = - ParArrayGeneric, State>; -template -using ParArray7D = - ParArrayGeneric, State>; -template -using ParArray8D = - ParArrayGeneric, State>; - -// Host mirrors -template -using HostArray0D = typename ParArray0D::HostMirror; -template -using HostArray1D = typename ParArray1D::HostMirror; -template -using HostArray2D = typename ParArray2D::HostMirror; -template -using HostArray3D = typename ParArray3D::HostMirror; -template -using HostArray4D = typename ParArray4D::HostMirror; -template -using HostArray5D = typename ParArray5D::HostMirror; -template -using HostArray6D = typename ParArray6D::HostMirror; -template -using HostArray7D = typename ParArray7D::HostMirror; - -using team_policy = Kokkos::TeamPolicy<>; -using team_mbr_t = Kokkos::TeamPolicy<>::member_type; - -template -using ScratchPad1D = Kokkos::View; -template -using ScratchPad2D = Kokkos::View; -template -using ScratchPad3D = Kokkos::View; -template -using ScratchPad4D = Kokkos::View; -template -using ScratchPad5D = Kokkos::View; -template -using ScratchPad6D = Kokkos::View; - -// Used for ParArrayND -// TODO(JMM): Should all of parthenon_arrays.hpp -// be moved here? Or should all of the above stuff be moved to -// parthenon_arrays.hpp? 
-inline constexpr std::size_t MAX_VARIABLE_DIMENSION = 7; -template -using device_view_t = - Kokkos::View, Layout, DevMemSpace>; -template -using host_view_t = typename device_view_t::HostMirror; - // Defining tags to determine loop_patterns using a tag dispatch design pattern +template +struct PatternBase { + static constexpr int nmiddle = nm; + static constexpr int ninner = ni; +}; // Translates a non-Kokkos standard C++ nested `for` loop where the innermost // `for` is decorated with a #pragma omp simd IMPORTANT: This only works on CPUs -static struct LoopPatternSimdFor { +static struct LoopPatternSimdFor : public PatternBase<0, 1> { } loop_pattern_simdfor_tag; // Translates to a Kokkos 1D range (Kokkos::RangePolicy) where the wrapper takes // care of the (hidden) 1D index to `n`, `k`, `j`, `i indices conversion -static struct LoopPatternFlatRange { +static struct LoopPatternFlatRange : public PatternBase<0, 0> { } loop_pattern_flatrange_tag; // Translates to a Kokkos multi dimensional range (Kokkos::MDRangePolicy) with // a 1:1 indices matching -static struct LoopPatternMDRange { +static struct LoopPatternMDRange : public PatternBase<0, 0> { } loop_pattern_mdrange_tag; // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::TeamThreadRange -static struct LoopPatternTPTTR { +static struct LoopPatternTPTTR : public PatternBase<0, 1> { + KOKKOS_FORCEINLINE_FUNCTION + static auto InnerRange(team_mbr_t &team_member, std::size_t size) { + return Kokkos::TeamThreadRange<>(team_member, 0, size); + } } loop_pattern_tpttr_tag; // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::ThreadVectorRange -static struct LoopPatternTPTVR { +static struct LoopPatternTPTVR : public PatternBase<0, 1> { + KOKKOS_FORCEINLINE_FUNCTION + static auto InnerRange(team_mbr_t &team_member, std::size_t size) { + return Kokkos::TeamVectorRange<>(team_member, 0, size); + } } loop_pattern_tptvr_tag; // Translates to a Kokkos::TeamPolicy with a middle Kokkos::TeamThreadRange and // inner Kokkos::ThreadVectorRange -static struct LoopPatternTPTTRTVR { +static struct LoopPatternTPTTRTVR : public PatternBase<1, 1> { + KOKKOS_FORCEINLINE_FUNCTION + static auto MiddleRange(team_mbr_t &team_member, std::size_t size) { + return Kokkos::TeamThreadRange<>(team_member, 0, size); + } + KOKKOS_FORCEINLINE_FUNCTION + static auto InnerRange(team_mbr_t &team_member, std::size_t size) { + return Kokkos::ThreadVectorRange<>(team_member, 0, size); + } } loop_pattern_tpttrtvr_tag; // Used to catch undefined behavior as it results in throwing an error static struct LoopPatternUndefined { @@ -181,11 +101,29 @@ static struct OuterLoopPatternTeams { } outer_loop_pattern_teams_tag; // Inner loop pattern tags must be constexpr so they're available on device // Translate to a Kokkos::TeamVectorRange as innermost loop (single index) -struct InnerLoopPatternTVR {}; +struct InnerLoopPatternTVR { + KOKKOS_FORCEINLINE_FUNCTION + static auto Range(team_mbr_t &team_member, std::size_t size) { + return Kokkos::TeamVectorRange<>(team_member, 0, size); + } +}; constexpr InnerLoopPatternTVR inner_loop_pattern_tvr_tag; // Translates to a Kokkos::TeamThreadRange as innermost loop -struct InnerLoopPatternTTR {}; +struct InnerLoopPatternTTR { + KOKKOS_FORCEINLINE_FUNCTION + static auto Range(team_mbr_t &team_member, std::size_t size) { + return Kokkos::TeamThreadRange<>(team_member, 0, size); + } +}; constexpr InnerLoopPatternTTR inner_loop_pattern_ttr_tag; +// Translates to a Kokkos::ThreadVectorRange as innermost loop +struct 
InnerLoopPatternThreadVR { + KOKKOS_FORCEINLINE_FUNCTION + static auto Range(team_mbr_t &team_member, std::size_t size) { + return Kokkos::ThreadVectorRange<>(team_member, 0, size); + } +}; +constexpr InnerLoopPatternThreadVR inner_loop_pattern_threadvr_tag; // Translate to a non-Kokkos plain C++ innermost loop (single index) // decorated with #pragma omp simd // IMPORTANT: currently only supported on CPUs @@ -212,472 +150,200 @@ template inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { Kokkos::parallel_scan(std::forward(args)...); } -} // namespace dispatch_impl - -// 1D loop using RangePolicy loops -template -inline typename std::enable_if::type -par_dispatch(Pattern, const std::string &name, DevExecSpace exec_space, const int &il, - const int &iu, const Function &function, Args &&...args) { - PARTHENON_INSTRUMENT_REGION(name) - if constexpr (std::is_same::value && - std::is_same::value) { -#pragma omp simd - for (auto i = il; i <= iu; i++) { - function(i); - } - } else { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::RangePolicy<>(exec_space, il, iu + 1), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); - } -} - -template -inline typename std::enable_if::type -par_dispatch(Pattern p, const std::string &name, DevExecSpace exec_space, - const IndexRange &r, const Function &function, Args &&...args) { - par_dispatch(p, name, exec_space, r.s, r.e, function, std::forward(args)...); -} - -// 2D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int jl, const int ju, const int il, const int iu, - const Function &function, Args &&...args) { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>( - exec_space, {jl, il}, {ju + 1, iu + 1}, {1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); -} - -template -class FlatFunctor; - -template -auto MakeFlatFunctor(F &function, Args... 
args) { - return FlatFunctor(function, std::forward(args)...); -} - -template -class FlatFunctor { - int NjNi, Ni, kl, jl, il; - Function function; - - public: - FlatFunctor(const Function _function, const int _NjNi, const int _Ni, const int _kl, - const int _jl, const int _il) - : function(_function), NjNi(_NjNi), Ni(_Ni), kl(_kl), jl(_jl), il(_il) {} - KOKKOS_INLINE_FUNCTION - void operator()(const int &idx, FArgs &&...fargs) const { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i, std::forward(fargs)...); - } -}; - -// 3D loop using Kokkos 1D Range -template -inline typename std::enable_if::type -par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space, - const int kl, const int ku, const int jl, const int ju, const int il, - const int iu, const Function &function, Args &&...args) { - Tag tag; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - MakeFlatFunctor(function, NjNi, Ni, kl, jl, il), - std::forward(args)...); -} -// 3D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int &kl, const int &ku, const int &jl, const int &ju, const int &il, - const int &iu, const Function &function, Args &&...args) { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(exec_space, {kl, jl, il}, - {ku + 1, ju + 1, iu + 1}, - {1, 1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); -} - -// 3D loop using TeamPolicy with single inner TeamThreadRange -template -inline void par_dispatch(LoopPatternTPTTR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, - const int &jl, const int &ju, const int &il, const int &iu, - const Function &function) { - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; +template +void KokkosTwoLevelFor(DevExecSpace exec_space, const std::string &name, + const BT_t &bound_trans, const Func &function, + std::index_sequence) { + constexpr int nouter = sizeof...(Is) - Pattern::ninner; + const auto outer_idxer = bound_trans.template GetIndexer<0, nouter>(); + const auto inner_idxer = + bound_trans.template GetIndexer(); Kokkos::parallel_for( - name, team_policy(exec_space, NkNj, Kokkos::AUTO), + name, team_policy(exec_space, outer_idxer.size(), Kokkos::AUTO), KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() / Nj + kl; - const int j = team_member.league_rank() % Nj + jl; - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, il, iu + 1), - [&](const int i) { function(k, j, i); }); + // WARNING/QUESTION(LFR): Is this array defined per thread and is it safe + // update it at different levels of the parallel hierarchy? Could call + // all of the indexers at the innermost level, but that results in more + // index calculations (which are unlikely to matter though). 
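+          // The first `nouter` entries of `indices` are decoded from the league
+          // rank via `outer_idxer`; the remaining `Pattern::ninner` entries are
+          // filled from the team-level range index via `inner_idxer` before the
+          // user functor is called.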
+ int indices[sizeof...(Is)]; + outer_idxer.GetIdxCArray(team_member.league_rank(), indices); + Kokkos::parallel_for(Pattern::InnerRange(team_member, inner_idxer.size()), + [&](const int idx) { + inner_idxer.GetIdxCArray(idx, &indices[nouter]); + function(indices[Is]...); + }); }); } -// 3D loop using TeamPolicy with single inner ThreadVectorRange -template -inline void par_dispatch(LoopPatternTPTVR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, - const int &jl, const int &ju, const int &il, const int &iu, - const Function &function) { - // TODO(pgrete) if exec space is Cuda,throw error - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; +template +void KokkosThreeLevelFor(DevExecSpace exec_space, const std::string &name, + const BT_t &bound_trans, const Func &function, + std::index_sequence) { + constexpr int nouter = sizeof...(Is) - Pattern::ninner - Pattern::nmiddle; + const auto outer_idxer = bound_trans.template GetIndexer<0, nouter>(); + const auto middle_idxer = + bound_trans.template GetIndexer(); + const auto inner_idxer = + bound_trans.template GetIndexer(); Kokkos::parallel_for( - name, team_policy(exec_space, NkNj, Kokkos::AUTO), + name, team_policy(exec_space, outer_idxer.size(), Kokkos::AUTO), KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() / Nj + kl; - const int j = team_member.league_rank() % Nj + jl; - Kokkos::parallel_for(Kokkos::TeamVectorRange<>(team_member, il, iu + 1), - [&](const int i) { function(k, j, i); }); - }); -} - -// 3D loop using TeamPolicy with nested TeamThreadRange and ThreadVectorRange -template -inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, - const int &jl, const int &ju, const int &il, const int &iu, - const Function &function) { - const int Nk = ku - kl + 1; - Kokkos::parallel_for( - name, team_policy(exec_space, Nk, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() + kl; + // WARNING/QUESTION(LFR): Is this array defined per thread and is it safe + // update it at different levels of the parallel hierarchy? Could call + // all of the indexers at the innermost level, but that results in more + // index calculations (which are unlikely to matter though). 
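+          // Index layout for the three-level hierarchy: indices[0, nouter) are
+          // decoded from the league rank, indices[nouter, nouter + Pattern::nmiddle)
+          // from the Pattern::MiddleRange index, and the final Pattern::ninner
+          // entries from the Pattern::InnerRange index before the user functor is
+          // called.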
+ int indices[sizeof...(Is)]; + outer_idxer.GetIdxCArray(team_member.league_rank(), indices); Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, jl, ju + 1), [&](const int j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, il, iu + 1), - [&](const int i) { function(k, j, i); }); + Pattern::MiddleRange(team_member, inner_idxer.size()), [&](const int idx) { + middle_idxer.GetIdxCArray(idx, &indices[nouter]); + Kokkos::parallel_for( + Pattern::InnerRange(team_member, inner_idxer.size()), [&](const int i) { + inner_idxer.GetIdxCArray(i, &indices[nouter + Pattern::nmiddle]); + function(indices[Is]...); + }); }); }); } -// 3D loop using SIMD FOR loops -template -inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, - const int &jl, const int &ju, const int &il, const int &iu, - const Function &function) { - PARTHENON_INSTRUMENT_REGION(name) - for (auto k = kl; k <= ku; k++) - for (auto j = jl; j <= ju; j++) -#pragma omp simd - for (auto i = il; i <= iu; i++) - function(k, j, i); -} - -template -class FlatFunctor { - int NkNjNi, NjNi, Ni, nl, kl, jl, il; - Function function; - - public: - FlatFunctor(const Function _function, const int _NkNjNi, const int _NjNi, const int _Ni, - const int _nl, const int _kl, const int _jl, const int _il) - : function(_function), NkNjNi(_NkNjNi), NjNi(_NjNi), Ni(_Ni), nl(_nl), kl(_kl), - jl(_jl), il(_il) {} - KOKKOS_INLINE_FUNCTION - void operator()(const int &idx, FArgs &&...fargs) const { - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i, std::forward(fargs)...); +template +KOKKOS_INLINE_FUNCTION void RawFor(const LoopBoundTranslator_t &bound_trans, + const Function &function) { + auto idxer = bound_trans.template GetIndexer<0, LoopBoundTranslator_t::rank>(); + for (int idx = 0; idx < idxer.size(); ++idx) { + std::apply(function, idxer(idx)); } -}; - -// 4D loop using Kokkos 1D Range -template -inline typename std::enable_if::type -par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space, - const int nl, const int nu, const int kl, const int ku, const int jl, - const int ju, const int il, const int iu, const Function &function, - Args &&...args) { - Tag tag; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NnNkNjNi = Nn * Nk * Nj * Ni; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - MakeFlatFunctor(function, NkNjNi, NjNi, Ni, nl, kl, jl, il), - std::forward(args)...); -} - -// 4D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int nl, const int nu, const int kl, const int ku, const int jl, - const int ju, const int il, const int iu, const Function &function, - Args &&...args) { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>( - exec_space, {nl, kl, jl, il}, {nu + 1, ku + 1, ju + 1, iu + 1}, - {1, 1, 1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); } -// 4D loop using TeamPolicy loop with inner TeamThreadRange -template -inline void par_dispatch(LoopPatternTPTTR, 
const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; - const int NnNkNj = Nn * Nk * Nj; - Kokkos::parallel_for( - name, team_policy(exec_space, NnNkNj, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - int n = team_member.league_rank() / NkNj; - int k = (team_member.league_rank() - n * NkNj) / Nj; - int j = team_member.league_rank() - n * NkNj - k * Nj + jl; - n += nl; - k += kl; - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, il, iu + 1), - [&](const int i) { function(n, k, j, i); }); - }); -} - -// 4D loop using TeamPolicy loop with inner ThreadVectorRange -template -inline void par_dispatch(LoopPatternTPTVR, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - // TODO(pgrete) if exec space is Cuda,throw error - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; - const int NnNkNj = Nn * Nk * Nj; - Kokkos::parallel_for( - name, team_policy(exec_space, NnNkNj, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - int n = team_member.league_rank() / NkNj; - int k = (team_member.league_rank() - n * NkNj) / Nj; - int j = team_member.league_rank() - n * NkNj - k * Nj + jl; - n += nl; - k += kl; - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, il, iu + 1), - [&](const int i) { function(n, k, j, i); }); - }); -} - -// 4D loop using TeamPolicy with nested TeamThreadRange and ThreadVectorRange -template -inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int NnNk = Nn * Nk; - Kokkos::parallel_for( - name, team_policy(exec_space, NnNk, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - int n = team_member.league_rank() / Nk + nl; - int k = team_member.league_rank() % Nk + kl; - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, jl, ju + 1), [&](const int j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, il, iu + 1), - [&](const int i) { function(n, k, j, i); }); - }); - }); -} - -// 4D loop using SIMD FOR loops -template -inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - PARTHENON_INSTRUMENT_REGION(name) - for (auto n = nl; n <= nu; n++) - for (auto k = kl; k <= ku; k++) - for (auto j = jl; j <= ju; j++) +template +KOKKOS_INLINE_FUNCTION void SimdFor(const LoopBoundTranslator_t &bound_trans, + const Function &function, + std::index_sequence) { + if constexpr (LoopBoundTranslator_t::rank > 1) { + constexpr int inner_start_rank = LoopBoundTranslator_t::rank - 1; + auto idxer = bound_trans.template GetIndexer<0, inner_start_rank>(); + const int istart = bound_trans[inner_start_rank].s; + const int iend = bound_trans[inner_start_rank].e; + // Loop over all outer indices using a flat indexer + for (int idx = 0; idx < 
idxer.size(); ++idx) { + const auto indices = idxer.GetIdxArray(idx); #pragma omp simd - for (auto i = il; i <= iu; i++) - function(n, k, j, i); -} - -// 5D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int ml, const int mu, const int nl, const int nu, const int kl, - const int ku, const int jl, const int ju, const int il, const int iu, - const Function &function, Args &&...args) { - Tag tag; - kokkos_dispatch( - tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(exec_space, {ml, nl, kl, jl, il}, - {mu + 1, nu + 1, ku + 1, ju + 1, iu + 1}, - {1, 1, 1, 1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); -} - -// 5D loop using Kokkos 1D Range -template -inline void par_dispatch(LoopPatternFlatRange, const std::string &name, - DevExecSpace exec_space, const int bl, const int bu, - const int nl, const int nu, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const Function &function) { - const int Nb = bu - bl + 1; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NbNnNkNjNi = Nb * Nn * Nk * Nj * Ni; - const int NnNkNjNi = Nn * Nk * Nj * Ni; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - Kokkos::parallel_for( - name, Kokkos::RangePolicy<>(exec_space, 0, NbNnNkNjNi), - KOKKOS_LAMBDA(const int &idx) { - int b = idx / NnNkNjNi; - int n = (idx - b * NnNkNjNi) / NkNjNi; - int k = (idx - b * NnNkNjNi - n * NkNjNi) / NjNi; - int j = (idx - b * NnNkNjNi - n * NkNjNi - k * NjNi) / Ni; - int i = idx - b * NnNkNjNi - n * NkNjNi - k * NjNi - j * Ni; - b += bl; - n += nl; - k += kl; - j += jl; - i += il; - function(b, n, k, j, i); - }); -} - -// 5D loop using SIMD FOR loops -template -inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int bl, const int bu, - const int nl, const int nu, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const Function &function) { - PARTHENON_INSTRUMENT_REGION(name) - for (auto b = bl; b <= bu; b++) - for (auto n = nl; n <= nu; n++) - for (auto k = kl; k <= ku; k++) - for (auto j = jl; j <= ju; j++) + for (int i = istart; i <= iend; ++i) { + function(indices[Is]..., i); + } + } + } else { // Easier to just explicitly specialize for 1D Simd loop #pragma omp simd - for (auto i = il; i <= iu; i++) - function(b, n, k, j, i); + for (int i = bound_trans[0].s; i <= bound_trans[0].e; ++i) + function(i); + } } - -// 6D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int ll, const int lu, const int ml, const int mu, const int nl, - const int nu, const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function, Args &&...args) { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>( - exec_space, {ll, ml, nl, kl, jl, il}, - {lu + 1, mu + 1, nu + 1, ku + 1, ju + 1, iu + 1}, - {1, 1, 1, 1, 1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); +template +KOKKOS_FORCEINLINE_FUNCTION void SimdFor(const LoopBoundTranslator_t &bound_trans, + const Function &function) { + SimdFor(bound_trans, function, + 
std::make_index_sequence()); } -// 6D loop using Kokkos 1D Range -template -inline void par_dispatch(LoopPatternFlatRange, const std::string &name, - DevExecSpace exec_space, const int ll, const int lu, - const int ml, const int mu, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - const int Nl = lu - ll + 1; - const int Nm = mu - ml + 1; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - const int NkNjNi = Nk * NjNi; - const int NnNkNjNi = Nn * NkNjNi; - const int NmNnNkNjNi = Nm * NnNkNjNi; - const int NlNmNnNkNjNi = Nl * NmNnNkNjNi; - Kokkos::parallel_for( - name, Kokkos::RangePolicy<>(exec_space, 0, NlNmNnNkNjNi), - KOKKOS_LAMBDA(const int &idx) { - int l = idx / NmNnNkNjNi; - int m = (idx - l * NmNnNkNjNi) / NnNkNjNi; - int n = (idx - l * NmNnNkNjNi - m * NnNkNjNi) / NkNjNi; - int k = (idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi) / NjNi; - int j = (idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi - k * NjNi) / Ni; - int i = idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi - k * NjNi - j * Ni; - l += ll; - m += ml; - n += nl; - k += kl; - j += jl; - i += il; - function(l, m, n, k, j, i); - }); -} +} // namespace dispatch_impl -// 6D loop using SIMD FOR loops -template -inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int ll, const int lu, - const int ml, const int mu, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - PARTHENON_INSTRUMENT_REGION(name) - for (auto l = ll; l <= lu; l++) - for (auto m = ml; m <= mu; m++) - for (auto n = nl; n <= nu; n++) - for (auto k = kl; k <= ku; k++) - for (auto j = jl; j <= ju; j++) -#pragma omp simd - for (auto i = il; i <= iu; i++) - function(l, m, n, k, j, i); +template +struct par_dispatch_funct {}; + +template +struct par_dispatch_funct, Function, + TypeList, TypeList> { + template + void operator()(std::index_sequence, const std::string &name, + DevExecSpace exec_space, Bound_ts &&...bounds, const Function &function, + ExtraArgs_ts &&...args) { + using bt_t = LoopBoundTranslator; + auto bound_trans = bt_t(std::forward(bounds)...); + + constexpr int total_rank = bt_t::rank; + [[maybe_unused]] constexpr int kTotalRank = bt_t::rank; + [[maybe_unused]] constexpr int inner_start_rank = total_rank - Pattern::ninner; + [[maybe_unused]] constexpr int middle_start_rank = + total_rank - Pattern::ninner - Pattern::nmiddle; + + constexpr bool SimdRequested = std::is_same_v; + constexpr bool MDRangeRequested = std::is_same_v; + constexpr bool FlatRangeRequested = std::is_same_v; + constexpr bool TPTTRRequested = std::is_same_v; + constexpr bool TPTVRRequested = std::is_same_v; + constexpr bool TPTTRTVRRequested = std::is_same_v; + + constexpr bool isParFor = std::is_same_v; + constexpr bool doSimdFor = SimdRequested && isParFor; + constexpr bool doMDRange = MDRangeRequested && (total_rank > 1); + constexpr bool doTPTTRTV = TPTTRTVRRequested && + (total_rank > Pattern::ninner + Pattern::nmiddle) && + isParFor; + constexpr bool doTPTTR = TPTTRRequested && (total_rank > Pattern::ninner) && isParFor; + constexpr bool doTPTVR = TPTVRRequested && (total_rank > Pattern::ninner) && isParFor; + + [[maybe_unused]] constexpr bool doFlatRange = + FlatRangeRequested || (SimdRequested && !doSimdFor) || + (MDRangeRequested && 
!doMDRange) || (TPTTRTVRRequested && !doTPTTRTV) || + (TPTTRRequested && !doTPTTR) || (TPTVRRequested && !doTPTVR); + + if constexpr (doSimdFor) { + dispatch_impl::SimdFor(bound_trans, function); + } else if constexpr (doMDRange) { + static_assert(total_rank > 1, + "MDRange pattern only works for multi-dimensional loops."); + auto policy = + bound_trans.template GetKokkosMDRangePolicy<0, total_rank>(exec_space); + dispatch_impl::kokkos_dispatch(Tag(), name, policy, function, + std::forward(args)...); + } else if constexpr (doFlatRange) { + auto idxer = bound_trans.template GetIndexer<0, total_rank>(); + auto policy = + bound_trans.template GetKokkosFlatRangePolicy<0, total_rank>(exec_space); + auto lam = KOKKOS_LAMBDA(const int &idx, FuncExtArgs_ts... fargs) { + const auto indices = idxer.GetIdxArray(idx); + function(indices[Is]..., fargs...); + }; + dispatch_impl::kokkos_dispatch(Tag(), name, policy, lam, + std::forward(args)...); + } else if constexpr (doTPTTR || doTPTVR) { + static_assert(Pattern::nmiddle == 0, "Two-level paralellism chosen."); + dispatch_impl::KokkosTwoLevelFor(exec_space, name, bound_trans, function, + std::make_index_sequence()); + } else if constexpr (doTPTTRTV) { + dispatch_impl::KokkosThreeLevelFor(exec_space, name, bound_trans, function, + std::make_index_sequence()); + } else { + printf("Loop pattern unsupported."); + } + } +}; + +template +void par_dispatch(Pattern pattern, const std::string &name, DevExecSpace exec_space, + Args &&...args) { + using arg_tl = TypeList; + constexpr std::size_t func_idx = FirstFuncIdx(); + static_assert(func_idx < arg_tl::n_types, + "Apparently we didn't successfully find a function."); + using func_t = typename arg_tl::template type; + constexpr int loop_rank = LoopBoundTranslator< + typename arg_tl::template continuous_sublist<0, func_idx - 1>>::rank; + + using func_sig_tl = typename FuncSignature::arg_types_tl; + // The first loop_rank arguments of the function are just indices, the rest are extra + // arguments for reductions, etc. 
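+  // As a hypothetical example, a call like par_dispatch(pattern, name, exec_space,
+  // kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, lambda, reducer) is split so that
+  // bounds_tl holds the six integer bounds (loop_rank == 3), func_t is the lambda
+  // type, and extra_arg_tl holds the reducer, which is forwarded unchanged to the
+  // underlying Kokkos dispatch.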
+ using bounds_tl = typename arg_tl::template continuous_sublist<0, func_idx - 1>; + using extra_arg_tl = typename arg_tl::template continuous_sublist; + using func_sig_extra_tl = typename func_sig_tl::template continuous_sublist; + par_dispatch_funct + loop_funct; + loop_funct(std::make_index_sequence(), name, exec_space, + std::forward(args)...); } template @@ -701,73 +367,50 @@ inline void par_scan(Args &&...args) { par_dispatch(std::forward(args)...); } -// 1D outer parallel loop using Kokkos Teams -template -inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, - DevExecSpace exec_space, size_t scratch_size_in_bytes, - const int scratch_level, const int kl, const int ku, - const Function &function) { - const int Nk = ku + 1 - kl; - - team_policy policy(exec_space, Nk, Kokkos::AUTO); - - Kokkos::parallel_for( - name, - policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), - KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() + kl; - function(team_member, k); - }); -} - -// 2D outer parallel loop using Kokkos Teams -template -inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, - DevExecSpace exec_space, size_t scratch_size_in_bytes, - const int scratch_level, const int kl, const int ku, - const int jl, const int ju, const Function &function) { - const int Nk = ku + 1 - kl; - const int Nj = ju + 1 - jl; - const int NkNj = Nk * Nj; - - team_policy policy(exec_space, NkNj, Kokkos::AUTO); - - Kokkos::parallel_for( - name, - policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), - KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() / Nj + kl; - const int j = team_member.league_rank() % Nj + jl; - function(team_member, k, j); - }); -} +template +struct par_for_outer_funct_t; + +template +struct par_for_outer_funct_t, TypeList> { + template + void operator()(std::index_sequence, Pattern, const std::string &name, + DevExecSpace exec_space, size_t scratch_size_in_bytes, + const int scratch_level, Bound_ts &&...bounds, + const Function &function) { + using bt_t = LoopBoundTranslator; + auto bound_trans = bt_t(std::forward(bounds)...); + auto idxer = bound_trans.template GetIndexer<0, bt_t::rank>(); + constexpr int kTotalRank = bt_t::rank; + + if constexpr (std::is_same_v) { + team_policy policy(exec_space, idxer.size(), Kokkos::AUTO); + Kokkos::parallel_for( + name, + policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), + KOKKOS_LAMBDA(team_mbr_t team_member) { + const auto indices = idxer.GetIdxArray(team_member.league_rank()); + function(team_member, indices[Is]...); + }); + } else { + PARTHENON_FAIL("Unsupported par_for_outer loop pattern."); + } + } +}; -// 3D outer parallel loop using Kokkos Teams -template +template inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, DevExecSpace exec_space, size_t scratch_size_in_bytes, - const int scratch_level, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const Function &function) { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; - const int NnNkNj = Nn * Nk * Nj; - - team_policy policy(exec_space, NnNkNj, Kokkos::AUTO); - - Kokkos::parallel_for( - name, - policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), - KOKKOS_LAMBDA(team_mbr_t team_member) { - int n = team_member.league_rank() / NkNj; - int k = 
(team_member.league_rank() - n * NkNj) / Nj; - const int j = team_member.league_rank() - n * NkNj - k * Nj + jl; - n += nl; - k += kl; - function(team_member, n, k, j); - }); + const int scratch_level, Args &&...args) { + using arg_tl = TypeList; + constexpr int nm2 = arg_tl::n_types - 2; + constexpr int nm1 = arg_tl::n_types - 1; + using bound_tl = typename arg_tl::template continuous_sublist<0, nm2>; + using function_tl = typename arg_tl::template continuous_sublist; + par_for_outer_funct_t par_for_outer_funct; + using bt_t = LoopBoundTranslator; + par_for_outer_funct(std::make_index_sequence(), OuterLoopPatternTeams(), + name, exec_space, scratch_size_in_bytes, scratch_level, + std::forward(args)...); } template @@ -776,199 +419,52 @@ inline void par_for_outer(const std::string &name, Args &&...args) { std::forward(args)...); } -// Inner parallel loop using TeamThreadRange -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int ll, const int lu, - const int ml, const int mu, const int nl, const int nu, const int kl, - const int ku, const int jl, const int ju, const int il, const int iu, - const Function &function) { - const int Nl = lu - ll + 1; - const int Nm = mu - ml + 1; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - const int NkNjNi = Nk * NjNi; - const int NnNkNjNi = Nn * NkNjNi; - const int NmNnNkNjNi = Nm * NnNkNjNi; - const int NlNmNnNkNjNi = Nl * NmNnNkNjNi; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team_member, NlNmNnNkNjNi), [&](const int &idx) { - int l = idx / NmNnNkNjNi; - int m = (idx - l * NmNnNkNjNi) / NnNkNjNi; - int n = (idx - l * NmNnNkNjNi - m * NnNkNjNi) / NkNjNi; - int k = (idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi) / NjNi; - int j = (idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi - k * NjNi) / Ni; - int i = idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi - k * NjNi - j * Ni; - l += nl; - m += ml; - n += nl; - k += kl; - j += jl; - i += il; - function(l, m, n, k, j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int ml, const int mu, - const int nl, const int nu, const int kl, const int ku, const int jl, - const int ju, const int il, const int iu, const Function &function) { - const int Nm = mu - ml + 1; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - const int NkNjNi = Nk * NjNi; - const int NnNkNjNi = Nn * NkNjNi; - const int NmNnNkNjNi = Nm * NnNkNjNi; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, NmNnNkNjNi), - [&](const int &idx) { - int m = idx / NnNkNjNi; - int n = (idx - m * NnNkNjNi) / NkNjNi; - int k = (idx - m * NnNkNjNi - n * NkNjNi) / NjNi; - int j = (idx - m * NnNkNjNi - n * NkNjNi - k * NjNi) / Ni; - int i = idx - m * NnNkNjNi - n * NkNjNi - k * NjNi - j * Ni; - m += ml; - n += nl; - k += kl; - j += jl; - i += il; - function(m, n, k, j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, const int il, - const int iu, const Function &function) { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - const int NkNjNi = Nk * NjNi; - const int 
NnNkNjNi = Nn * NkNjNi; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, NnNkNjNi), - [&](const int &idx) { - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const Function &function) { - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, NkNjNi), [&](const int &idx) { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int jl, const int ju, - const int il, const int iu, const Function &function) { - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, NjNi), [&](const int &idx) { - int j = idx / Ni + jl; - int i = idx % Ni + il; - function(j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(InnerLoopPatternTTR, - team_mbr_t team_member, const int il, - const int iu, const Function &function) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, il, iu + 1), function); -} -// Inner parallel loop using TeamVectorRange -template -KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(InnerLoopPatternTVR, - team_mbr_t team_member, const int il, - const int iu, const Function &function) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, il, iu + 1), function); -} - -// Inner parallel loop using FOR SIMD -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternSimdFor, team_mbr_t team_member, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, const int il, - const int iu, const Function &function) { - for (int n = nl; n <= nu; ++n) { - for (int k = kl; k <= ku; ++k) { - for (int j = jl; j <= ju; ++j) { -#pragma omp simd - for (int i = il; i <= iu; i++) { - function(k, j, i); - } - } +template +struct par_for_inner_funct_t; + +template +struct par_for_inner_funct_t, TypeList> { + KOKKOS_DEFAULTED_FUNCTION + par_for_inner_funct_t() = default; + + template + KOKKOS_FORCEINLINE_FUNCTION void + operator()(std::index_sequence, Pattern, team_mbr_t team_member, + Bound_ts &&...bounds, const Function &function) { + using bt_t = LoopBoundTranslator; + auto bound_trans = bt_t(std::forward(bounds)...); + [[maybe_unused]] constexpr int kTotalRank = bt_t::rank; + constexpr bool doTTR = std::is_same_v; + constexpr bool doTVR = std::is_same_v; + constexpr bool doThreadVR = std::is_same_v; + constexpr bool doSimd = std::is_same_v; + if constexpr (doSimd) { + dispatch_impl::SimdFor(bound_trans, function); + } else if constexpr (doTTR || doTVR || doThreadVR) { + const auto idxer = bound_trans.template GetIndexer<0, bt_t::rank>(); + Kokkos::parallel_for(Pattern::Range(team_member, idxer.size()), + [&](const int &idx) { + const auto indices = idxer.GetIdxArray(idx); + function(indices[Is]...); + }); + } else { + PARTHENON_FAIL("Unsupported par_for_inner loop pattern."); } } -} -template 
-KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternSimdFor, team_mbr_t team_member, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const Function &function) { - for (int k = kl; k <= ku; ++k) { - for (int j = jl; j <= ju; ++j) { -#pragma omp simd - for (int i = il; i <= iu; i++) { - function(k, j, i); - } - } - } -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternSimdFor, team_mbr_t team_member, const int jl, const int ju, - const int il, const int iu, const Function &function) { - for (int j = jl; j <= ju; ++j) { -#pragma omp simd - for (int i = il; i <= iu; i++) { - function(j, i); - } - } -} -template -KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(InnerLoopPatternSimdFor, - team_mbr_t team_member, const int il, - const int iu, const Function &function) { -#pragma omp simd - for (int i = il; i <= iu; i++) { - function(i); - } -} +}; -template -KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(const Tag &t, team_mbr_t member, - const IndexRange r, - const Function &function) { - par_for_inner(t, member, r.s, r.e, function); +template +KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(Pattern pattern, Args &&...args) { + using arg_tl = TypeList; + constexpr int nm2 = arg_tl::n_types - 2; + constexpr int nm1 = arg_tl::n_types - 1; + // First Arg of Args is team_member + using bound_tl = typename arg_tl::template continuous_sublist<1, nm2>; + using function_tl = typename arg_tl::template continuous_sublist; + par_for_inner_funct_t par_for_inner_funct; + using bt_t = LoopBoundTranslator; + par_for_inner_funct(std::make_index_sequence(), pattern, + std::forward(args)...); } template diff --git a/src/kokkos_types.hpp b/src/kokkos_types.hpp new file mode 100644 index 000000000000..731fbc695723 --- /dev/null +++ b/src/kokkos_types.hpp @@ -0,0 +1,146 @@ +//======================================================================================== +// Parthenon performance portable AMR framework +// Copyright(C) 2020-2023 The Parthenon collaboration +// Licensed under the 3-clause BSD License, see LICENSE file for details +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 +// for Los Alamos National Laboratory (LANL), which is operated by Triad +// National Security, LLC for the U.S. Department of Energy/National Nuclear +// Security Administration. All rights in the program are reserved by Triad +// National Security, LLC, and the U.S. Department of Energy/National Nuclear +// Security Administration. The Government is granted for itself and others +// acting on its behalf a nonexclusive, paid-up, irrevocable worldwide license +// in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do +// so. 
+//======================================================================================== + +#ifndef KOKKOS_TYPES_HPP_ +#define KOKKOS_TYPES_HPP_ + +#include +#include +#include +#include +#include + +#include + +#include "basic_types.hpp" +#include "config.hpp" +#include "parthenon_array_generic.hpp" +#include "utils/multi_pointer.hpp" +#include "utils/object_pool.hpp" + +namespace parthenon { + +#ifdef KOKKOS_ENABLE_CUDA_UVM +using DevMemSpace = Kokkos::CudaUVMSpace; +using HostMemSpace = Kokkos::CudaUVMSpace; +using DevExecSpace = Kokkos::Cuda; +#else +using DevMemSpace = Kokkos::DefaultExecutionSpace::memory_space; +using HostMemSpace = Kokkos::HostSpace; +using DevExecSpace = Kokkos::DefaultExecutionSpace; +#endif +using ScratchMemSpace = DevExecSpace::scratch_memory_space; + +using HostExecSpace = Kokkos::DefaultHostExecutionSpace; +using LayoutWrapper = Kokkos::LayoutRight; +using MemUnmanaged = Kokkos::MemoryTraits; + +#if defined(PARTHENON_ENABLE_HOST_COMM_BUFFERS) +#if defined(KOKKOS_ENABLE_CUDA) +using BufMemSpace = Kokkos::CudaHostPinnedSpace::memory_space; +#elif defined(KOKKOS_ENABLE_HIP) +using BufMemSpace = Kokkos::Experimental::HipHostPinnedSpace::memory_space; +#else +#error "Unknow comm buffer space for chose execution space." +#endif +#else +using BufMemSpace = Kokkos::DefaultExecutionSpace::memory_space; +#endif + +// MPI communication buffers +template +using BufArray1D = Kokkos::View; + +// Structures for reusable memory pools and communication +template +using buf_pool_t = ObjectPool>; + +template +using ParArray0D = ParArrayGeneric, State>; +template +using ParArray1D = ParArrayGeneric, State>; +template +using ParArray2D = ParArrayGeneric, State>; +template +using ParArray3D = + ParArrayGeneric, State>; +template +using ParArray4D = + ParArrayGeneric, State>; +template +using ParArray5D = + ParArrayGeneric, State>; +template +using ParArray6D = + ParArrayGeneric, State>; +template +using ParArray7D = + ParArrayGeneric, State>; +template +using ParArray8D = + ParArrayGeneric, State>; + +// Host mirrors +template +using HostArray0D = typename ParArray0D::HostMirror; +template +using HostArray1D = typename ParArray1D::HostMirror; +template +using HostArray2D = typename ParArray2D::HostMirror; +template +using HostArray3D = typename ParArray3D::HostMirror; +template +using HostArray4D = typename ParArray4D::HostMirror; +template +using HostArray5D = typename ParArray5D::HostMirror; +template +using HostArray6D = typename ParArray6D::HostMirror; +template +using HostArray7D = typename ParArray7D::HostMirror; + +using team_policy = Kokkos::TeamPolicy<>; +using team_mbr_t = Kokkos::TeamPolicy<>::member_type; + +template +using ScratchPad1D = Kokkos::View; +template +using ScratchPad2D = Kokkos::View; +template +using ScratchPad3D = Kokkos::View; +template +using ScratchPad4D = Kokkos::View; +template +using ScratchPad5D = Kokkos::View; +template +using ScratchPad6D = Kokkos::View; + +// Used for ParArrayND +// TODO(JMM): Should all of parthenon_arrays.hpp +// be moved here? Or should all of the above stuff be moved to +// parthenon_arrays.hpp? 
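+// For illustration only: device_view_t<Real> is a rank-MAX_VARIABLE_DIMENSION
+// Kokkos::View in DevMemSpace (the storage type behind ParArrayND), and, e.g.,
+// ParArray3D<Real> above wraps a Kokkos::View<Real ***, LayoutWrapper, DevMemSpace>.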
+inline constexpr std::size_t MAX_VARIABLE_DIMENSION = 7; +template +using device_view_t = + Kokkos::View, Layout, DevMemSpace>; +template +using host_view_t = typename device_view_t::HostMirror; + +} // namespace parthenon + +#endif // KOKKOS_TYPES_HPP_ diff --git a/src/loop_bound_translator.hpp b/src/loop_bound_translator.hpp new file mode 100644 index 000000000000..c31ed8a15ef6 --- /dev/null +++ b/src/loop_bound_translator.hpp @@ -0,0 +1,121 @@ +//======================================================================================== +// Parthenon performance portable AMR framework +// Copyright(C) 2020-2024 The Parthenon collaboration +// Licensed under the 3-clause BSD License, see LICENSE file for details +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 +// for Los Alamos National Laboratory (LANL), which is operated by Triad +// National Security, LLC for the U.S. Department of Energy/National Nuclear +// Security Administration. All rights in the program are reserved by Triad +// National Security, LLC, and the U.S. Department of Energy/National Nuclear +// Security Administration. The Government is granted for itself and others +// acting on its behalf a nonexclusive, paid-up, irrevocable worldwide license +// in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do +// so. +//======================================================================================== + +#ifndef LOOP_BOUND_TRANSLATOR_HPP_ +#define LOOP_BOUND_TRANSLATOR_HPP_ + +#include +#include +#include +#include +#include + +#include + +#include "basic_types.hpp" +#include "kokkos_types.hpp" +#include "utils/indexer.hpp" +#include "utils/type_list.hpp" + +namespace parthenon { +// Struct for translating between loop bounds given in terms of IndexRanges and loop +// bounds given in terms of raw integers +template +struct LoopBoundTranslator { + using Bound_tl = TypeList; + static constexpr bool are_integers = std::is_integral_v< + typename std::remove_reference>::type>; + static constexpr uint rank = sizeof...(Bound_ts) / (1 + are_integers); + + std::array bounds; + + KOKKOS_INLINE_FUNCTION + IndexRange &operator[](int i) { return bounds[i]; } + + KOKKOS_INLINE_FUNCTION + const IndexRange &operator[](int i) const { return bounds[i]; } + + KOKKOS_INLINE_FUNCTION + explicit LoopBoundTranslator(Bound_ts... 
bounds_in) { + if constexpr (are_integers) { + std::array bounds_arr{static_cast(bounds_in)...}; + for (int r = 0; r < rank; ++r) { + bounds[r].s = static_cast(bounds_arr[2 * r]); + bounds[r].e = static_cast(bounds_arr[2 * r + 1]); + } + } else { + bounds = std::array{bounds_in...}; + } + } + + template + auto GetKokkosFlatRangePolicy(DevExecSpace exec_space) const { + constexpr int ndim = RankStop - RankStart; + static_assert(ndim > 0, "Need a valid range of ranks"); + static_assert(RankStart >= 0, "Need a valid range of ranks"); + static_assert(RankStop <= rank, "Need a valid range of ranks"); + int64_t npoints = 1; + for (int d = RankStart; d < RankStop; ++d) + npoints *= (bounds[d].e + 1 - bounds[d].s); + return Kokkos::Experimental::require( + Kokkos::RangePolicy<>(exec_space, 0, npoints), + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + } + + template + auto GetKokkosMDRangePolicy(DevExecSpace exec_space) const { + constexpr int ndim = RankStop - RankStart; + static_assert(ndim > 1, "Need a valid range of ranks"); + static_assert(RankStart >= 0, "Need a valid range of ranks"); + static_assert(RankStop <= rank, "Need a valid range of ranks"); + Kokkos::Array start, end, tile; + for (int d = 0; d < ndim; ++d) { + start[d] = bounds[d + RankStart].s; + end[d] = bounds[d + RankStart].e + 1; + tile[d] = 1; + } + tile[ndim - 1] = end[ndim - 1] - start[ndim - 1]; + return Kokkos::Experimental::require( + Kokkos::MDRangePolicy>(exec_space, start, end, tile), + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + } + + template + KOKKOS_INLINE_FUNCTION auto GetIndexer(std::index_sequence) const { + return MakeIndexer( + std::pair(bounds[Is + RankStart].s, bounds[Is + RankStart].e)...); + } + + template + KOKKOS_INLINE_FUNCTION auto GetIndexer() const { + constexpr int ndim = RankStop - RankStart; + static_assert(ndim > 0, "Need a valid range of ranks"); + static_assert(RankStart >= 0, "Need a valid range of ranks"); + static_assert(RankStop <= rank, "Need a valid range of ranks"); + return GetIndexer(std::make_index_sequence()); + } +}; + +template +struct LoopBoundTranslator> + : public LoopBoundTranslator {}; + +} // namespace parthenon + +#endif // LOOP_BOUND_TRANSLATOR_HPP_ diff --git a/src/mesh/meshblock.hpp b/src/mesh/meshblock.hpp index 41ae52ea5930..e300a44370d7 100644 --- a/src/mesh/meshblock.hpp +++ b/src/mesh/meshblock.hpp @@ -237,59 +237,34 @@ class MeshBlock : public std::enable_shared_from_this { template inline void par_for(Args &&...args) { - par_dispatch_(std::forward(args)...); + parthenon::par_for(std::forward(args)...); } template inline void par_reduce(Args &&...args) { - par_dispatch_(std::forward(args)...); + parthenon::par_reduce(std::forward(args)...); } template inline void par_scan(Args &&...args) { - par_dispatch_(std::forward(args)...); + parthenon::par_scan(std::forward(args)...); + } + + template + inline void par_for_outer(Args &&...args) { + parthenon::par_for_outer(std::forward(args)...); } template inline void par_for_bndry(const std::string &name, const IndexRange &nb, const IndexDomain &domain, TopologicalElement el, - const bool coarse, const bool fine, - const Function &function) { + const bool coarse, const bool fine, Function &&function) { auto &bounds = fine ? (coarse ? cellbounds : f_cellbounds) : (coarse ? 
c_cellbounds : cellbounds); auto ib = bounds.GetBoundsI(domain, el); auto jb = bounds.GetBoundsJ(domain, el); auto kb = bounds.GetBoundsK(domain, el); - par_for(name, nb, kb, jb, ib, function); - } - - // 1D Outer default loop pattern - template - inline void par_for_outer(const std::string &name, const size_t &scratch_size_in_bytes, - const int &scratch_level, const int &kl, const int &ku, - const Function &function) { - parthenon::par_for_outer(DEFAULT_OUTER_LOOP_PATTERN, name, exec_space, - scratch_size_in_bytes, scratch_level, kl, ku, function); - } - // 2D Outer default loop pattern - template - inline void par_for_outer(const std::string &name, const size_t &scratch_size_in_bytes, - const int &scratch_level, const int &kl, const int &ku, - const int &jl, const int &ju, const Function &function) { - parthenon::par_for_outer(DEFAULT_OUTER_LOOP_PATTERN, name, exec_space, - scratch_size_in_bytes, scratch_level, kl, ku, jl, ju, - function); - } - - // 3D Outer default loop pattern - template - inline void par_for(const std::string &name, size_t &scratch_size_in_bytes, - const int &scratch_level, const int &nl, const int &nu, - const int &kl, const int &ku, const int &jl, const int &ju, - const Function &function) { - parthenon::par_for_outer(DEFAULT_OUTER_LOOP_PATTERN, name, exec_space, - scratch_size_in_bytes, scratch_level, nl, nu, kl, ku, jl, ju, - function); + parthenon::par_for(name, nb, kb, jb, ib, std::forward(function)); } int GetNumberOfMeshBlockCells() const { @@ -300,124 +275,6 @@ class MeshBlock : public std::enable_shared_from_this { void SetAllowedDt(const Real dt) { new_block_dt_ = dt; } Real NewDt() const { return new_block_dt_; } - // It would be nice for these par_dispatch_ functions to be private, but they can't be - // 1D default loop pattern - template - inline typename std::enable_if::type - par_dispatch_(const std::string &name, const int &il, const int &iu, - const Function &function, Args &&...args) { - // using loop_pattern_flatrange_tag instead of DEFAULT_LOOP_PATTERN for now - // as the other wrappers are not implemented yet for 1D loops - parthenon::par_dispatch(loop_pattern_flatrange_tag, name, exec_space, il, iu, - function, std::forward(args)...); - } - - // index domain version - template - inline typename std::enable_if::type - par_dispatch_(const std::string &name, const IndexRange &ib, const Function &function, - Args &&...args) { - typename std::conditional::type loop_type; - parthenon::par_dispatch(loop_type, name, exec_space, ib.s, ib.e, function, - std::forward(args)...); - } - - // 2D default loop pattern - template - inline typename std::enable_if::type - par_dispatch_(const std::string &name, const int &jl, const int &ju, const int &il, - const int &iu, const Function &function, Args &&...args) { - // using loop_pattern_mdrange_tag instead of DEFAULT_LOOP_PATTERN for now - // as the other wrappers are not implemented yet for 1D loops - parthenon::par_dispatch(loop_pattern_mdrange_tag, name, exec_space, jl, ju, il, - iu, function, std::forward(args)...); - } - - // index domain version - template - inline typename std::enable_if::type - par_dispatch_(const std::string &name, const IndexRange &jb, const IndexRange &ib, - const Function &function, Args &&...args) { - typename std::conditional::type loop_type; - parthenon::par_dispatch(loop_type, name, exec_space, jb.s, jb.e, ib.s, ib.e, - function, std::forward(args)...); - } - - // 3D default loop pattern - template - inline typename std::enable_if::type - par_dispatch_(const std::string &name, 
diff --git a/src/prolong_restrict/pr_ops.hpp b/src/prolong_restrict/pr_ops.hpp
index f074f6a5d02b..585ce1e48816 100644
--- a/src/prolong_restrict/pr_ops.hpp
+++ b/src/prolong_restrict/pr_ops.hpp
@@ -427,9 +427,8 @@ struct ProlongateInternalTothAndRoe {
       return (*pfine)(eidx, l, m, n, fk + ok * g3, fj + oj * g2, fi + oi);
     } else if constexpr (fel == TE::F2) {
       return (*pfine)(eidx, l, m, n, fk + oj * g3, fj + oi * g2, fi + ok);
-    } else {
-      return (*pfine)(eidx, l, m, n, fk + oi * g3, fj + ok * g2, fi + oj);
     }
+    return (*pfine)(eidx, l, m, n, fk + oi * g3, fj + ok * g2, fi + oj);
   };
 
   using iarr2 = std::array<int, 2>;
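The pr_ops.hpp hunk drops the trailing else and leaves the last expression as an unconditional fallback return after the if constexpr chain; the earlier branches still return first in their instantiations, so the behavior is unchanged. A toy analogue of the same control-flow shape follows (TE values and offset_code are illustrative, not the Parthenon code).

    // Illustrative only: same shape as the ProlongateInternalTothAndRoe change.
    #include <iostream>

    enum class TE { F1, F2, F3 };

    template <TE fel>
    constexpr int offset_code(int a, int b, int c) {
      if constexpr (fel == TE::F1) {
        return a;
      } else if constexpr (fel == TE::F2) {
        return b;
      }
      // Fallback (previously an else block): reached when neither branch matched.
      return c;
    }

    int main() {
      std::cout << offset_code<TE::F1>(1, 2, 3) << "\n"; // prints 1
      std::cout << offset_code<TE::F3>(1, 2, 3) << "\n"; // prints 3
      return 0;
    }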
diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp
index 43e4b613087c..b13b97115ada 100644
--- a/src/utils/indexer.hpp
+++ b/src/utils/indexer.hpp
@@ -13,12 +13,13 @@
 #ifndef UTILS_INDEXER_HPP_
 #define UTILS_INDEXER_HPP_
 
-#include
 #include
 #include
 #include
 #include
+#include
+
 #include "utils/concepts_lite.hpp"
 #include "utils/utils.hpp"
 
@@ -97,8 +98,12 @@ struct Indexer {
 
   KOKKOS_FORCEINLINE_FUNCTION
   auto GetIdxArray(int idx) const {
-    return get_array_from_tuple(
-        GetIndicesImpl(idx, std::make_index_sequence<sizeof...(Ts)>()));
+    return GetIndicesArrayImpl(idx, std::make_index_sequence<sizeof...(Ts)>());
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void GetIdxCArray(int idx, int *indices) const {
+    GetIndicesCArrayImpl(idx, indices, std::make_index_sequence<sizeof...(Ts)>());
   }
 
   template
@@ -120,14 +125,40 @@ struct Indexer {
     std::tuple idxs;
     (
         [&] {
-          std::get<Is>(idxs) = idx / std::get<Is>(N);
-          idx -= std::get<Is>(idxs) * std::get<Is>(N);
-          std::get<Is>(idxs) += std::get<Is>(start);
+          std::get<Is>(idxs) = idx / N[Is];
+          idx -= std::get<Is>(idxs) * N[Is];
+          std::get<Is>(idxs) += start[Is];
        }(),
        ...);
     return idxs;
   }
 
+  template <std::size_t... Is>
+  KOKKOS_FORCEINLINE_FUNCTION void
+  GetIndicesCArrayImpl(int idx, int *indices, std::index_sequence<Is...>) const {
+    (
+        [&] {
+          indices[Is] = idx / N[Is];
+          idx -= indices[Is] * N[Is];
+          indices[Is] += start[Is];
+        }(),
+        ...);
+  }
+
+  template <std::size_t... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::array<int, sizeof...(Is)>
+  GetIndicesArrayImpl(int idx, std::index_sequence<Is...>) const {
+    std::array<int, sizeof...(Is)> indices;
+    (
+        [&] {
+          indices[Is] = idx / N[Is];
+          idx -= indices[Is] * N[Is];
+          indices[Is] += start[Is];
+        }(),
+        ...);
+    return indices;
+  }
+
   template
   KOKKOS_FORCEINLINE_FUNCTION static std::array
   GetFactors(std::tuple Nt, std::index_sequence) {
@@ -136,7 +167,7 @@ struct Indexer {
     (
         [&] {
           constexpr std::size_t idx = sizeof...(Ts) - (Is + 1);
-          std::get<idx>(N) = cur;
+          N[idx] = cur;
          cur *= std::get<idx>(Nt);
        }(),
        ...);
@@ -189,5 +220,10 @@
 using Indexer8D = Indexer;
 using SpatiallyMaskedIndexer6D = SpatiallyMaskedIndexer;
 
+template <class... Ts>
+KOKKOS_INLINE_FUNCTION auto MakeIndexer(const std::pair<Ts, Ts> &...ranges) {
+  return Indexer<Ts...>(ranges...);
+}
+
 } // namespace parthenon
 #endif  // UTILS_INDEXER_HPP_
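GetIdxCArray and GetIndicesArrayImpl decode a flat index using the precomputed stride factors N: one divide, one multiply-subtract, then the start offset per dimension. Here is a self-contained sketch of that decode; ToyIndexer is illustrative and not the Parthenon Indexer.

    // Sketch of the factor-based decode used by Indexer (illustrative names).
    #include <array>
    #include <cstddef>
    #include <iostream>
    #include <utility>

    template <std::size_t Rank>
    class ToyIndexer {
     public:
      explicit ToyIndexer(const std::array<std::pair<int, int>, Rank> &ranges) {
        // N[d] holds the product of the extents of all faster-varying dimensions.
        int cur = 1;
        for (std::size_t r = Rank; r-- > 0;) {
          start_[r] = ranges[r].first;
          N_[r] = cur;
          cur *= ranges[r].second - ranges[r].first + 1;
        }
        size_ = cur;
      }

      int size() const { return size_; }

      // Decode a flat index into a C array of per-dimension indices.
      void GetIdxCArray(int idx, int *indices) const {
        for (std::size_t d = 0; d < Rank; ++d) {
          indices[d] = idx / N_[d];
          idx -= indices[d] * N_[d];
          indices[d] += start_[d];
        }
      }

     private:
      std::array<int, Rank> N_{}, start_{};
      int size_ = 0;
    };

    int main() {
      ToyIndexer<3> idxer({{{2, 3}, {0, 1}, {5, 7}}});
      int kji[3];
      for (int flat = 0; flat < idxer.size(); ++flat) {
        idxer.GetIdxCArray(flat, kji);
        std::cout << kji[0] << " " << kji[1] << " " << kji[2] << "\n";
      }
      return 0;
    }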
diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp
index a17b7b8f7063..097d6d97f14e 100644
--- a/src/utils/type_list.hpp
+++ b/src/utils/type_list.hpp
@@ -54,9 +54,8 @@ struct TypeList {
     (func(Args()), ...);
   }
 
-  private:
   template
-  static auto ContinuousSublist() {
+  static auto ContinuousSublistImpl() {
     return ContinuousSublistImpl(std::make_index_sequence());
   }
   template
@@ -64,9 +63,8 @@ struct TypeList {
     return sublist<(Start + Is)...>();
   }
 
-  public:
-  template
-  using continuous_sublist = decltype(ContinuousSublist());
+  template
+  using continuous_sublist = decltype(ContinuousSublistImpl());
 };
 
 namespace impl {
@@ -92,6 +90,65 @@ auto GetNames() {
   TL::IterateTypes([&names](auto t) { names.push_back(decltype(t)::name()); });
   return names;
 }
+
+namespace impl {
+template <class TL, int cidx>
+static constexpr int FirstNonIntegralImpl() {
+  if constexpr (cidx == TL::n_types) {
+    return TL::n_types;
+  } else {
+    if constexpr (std::is_integral_v<typename std::remove_reference<
+                      typename TL::template type<cidx>>::type>)
+      return FirstNonIntegralImpl<TL, cidx + 1>();
+    return cidx;
+  }
+}
+} // namespace impl
+
+template <class TL>
+static constexpr int FirstNonIntegralIdx() {
+  return impl::FirstNonIntegralImpl<TL, 0>();
+}
+
+template
+struct is_functor : std::false_type {};
+
+template
+struct is_functor> : std::true_type {};
+
+template <class TL, int idx = 0>
+constexpr int FirstFuncIdx() {
+  if constexpr (idx == TL::n_types) {
+    return TL::n_types;
+  } else {
+    using cur_type = typename TL::template type<idx>;
+    if constexpr (is_functor<cur_type>::value) return idx;
+    if constexpr (std::is_function<std::remove_pointer_t<cur_type>>::value) return idx;
+    return FirstFuncIdx<TL, idx + 1>();
+  }
+  return -1;
+}
+
+template
+struct FuncSignature;
+
+template
+struct FuncSignature : public FuncSignature {};
+
+template <class R, class... Args>
+struct FuncSignature<R(Args...)> {
+  using type = R(Args...);
+  using arg_types_tl = TypeList<Args...>;
+  using ret_type = R;
+};
+
+template <class R, class T, class... Args>
+struct FuncSignature<R (T::*)(Args...) const> {
+  using type = R (T::*)(Args...);
+  using arg_types_tl = TypeList<Args...>;
+  using ret_type = R;
+};
+
 } // namespace parthenon
 
 #endif // UTILS_TYPE_LIST_HPP_
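The FuncSignature and FirstFuncIdx additions let the loop wrappers locate the functor in an argument pack and inspect its parameter list at compile time. Below is a hedged, simplified sketch of the same technique; it handles only callables with a const operator(), and TL and Signature are illustrative names, not the Parthenon types.

    // Sketch: recover a callable's argument types as a type list (illustrative).
    #include <cstddef>
    #include <iostream>
    #include <type_traits>

    template <class... Ts>
    struct TL {
      static constexpr std::size_t n_types = sizeof...(Ts);
    };

    // Primary: defer to the signature of operator() for lambdas and functors.
    template <class F>
    struct Signature : Signature<decltype(&F::operator())> {};

    // Specialization for const member call operators (what lambdas provide).
    template <class R, class T, class... Args>
    struct Signature<R (T::*)(Args...) const> {
      using ret_type = R;
      using arg_types = TL<Args...>;
    };

    int main() {
      auto f = [](int, double, const char *) { return 42; };
      using Sig = Signature<decltype(f)>;
      static_assert(std::is_same<Sig::ret_type, int>::value, "return type is int");
      std::cout << Sig::arg_types::n_types << "\n"; // prints 3
      return 0;
    }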