diff --git a/CHANGELOG.md b/CHANGELOG.md index ab321d4c2a8a..aaa484b3fc7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ - [[PR 885]](https://github.com/parthenon-hpc-lab/parthenon/pull/885) Expose PackDescriptor and use uids in SparsePacks ### Fixed (not changing behavior/API/variables/...) +- [[PR 952]](https://github.com/parthenon-hpc-lab/parthenon/pull/954) Fix format string in sparse advection example - [[PR 947]](https://github.com/parthenon-hpc-lab/parthenon/pull/947) Add missing ForceRemeshComm dependencies - [[PR 928]](https://github.com/parthenon-hpc-lab/parthenon/pull/928) Fix boundary comms during refinement next to refined blocks - [[PR 937]](https://github.com/parthenon-hpc-lab/parthenon/pull/937) Fix multiple line continuations @@ -34,6 +35,7 @@ - [[PR 890]](https://github.com/parthenon-hpc-lab/parthenon/pull/890) Fix bugs in sparse communication and prolongation ### Infrastructure (changes irrelevant to downstream codes) +- [[PR 938]](https://github.com/parthenon-hpc-lab/parthenon/pull/938) Restructure buffer packing/unpacking kernel hierarchical parallelism - [[PR 944]](https://github.com/parthenon-hpc-lab/parthenon/pull/944) Move sparse pack identifier creation to descriptor - [[PR 904]](https://github.com/parthenon-hpc-lab/parthenon/pull/904) Move to prolongation/restriction in one for AMR and communicate non-cell centered fields - [[PR 918]](https://github.com/parthenon-hpc-lab/parthenon/pull/918) Refactor RegionSize diff --git a/example/sparse_advection/parthenon_app_inputs.cpp b/example/sparse_advection/parthenon_app_inputs.cpp index 8203ade9b532..1cd806accad6 100644 --- a/example/sparse_advection/parthenon_app_inputs.cpp +++ b/example/sparse_advection/parthenon_app_inputs.cpp @@ -1,4 +1,4 @@ -// (C) (or copyright) 2021. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -176,7 +176,7 @@ void PostStepDiagnosticsInLoop(Mesh *mesh, ParameterInput *pin, const SimTime &t } std::printf("\n"); Real mem_avg = static_cast(mem_tot) / static_cast(blocks_tot); - std::printf("\tMem used/block in bytes [min, max, avg] = [%ld, %ld, %.14e]\n", + std::printf("\tMem used/block in bytes [min, max, avg] = [%lu, %lu, %.14e]\n", mem_min, mem_max, mem_avg); } } diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 9478de446d4a..9f3b47d0aba4 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -90,6 +90,7 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { PARTHENON_DEBUG_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info"); auto &sending_nonzero_flags = cache.sending_non_zero_flags; auto &sending_nonzero_flags_h = cache.sending_non_zero_flags_h; + Kokkos::parallel_for( "SendBoundBufs", Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), @@ -106,13 +107,26 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { int idx_offset = 0; for (int iel = 0; iel < bnd_info(b).ntopological_elements; ++iel) { auto &idxer = bnd_info(b).idxer[iel]; + const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; Kokkos::parallel_reduce( - Kokkos::TeamThreadRange<>(team_member, idxer.size()), + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), [&](const int idx, bool &lnon_zero) { - const auto [t, u, v, k, j, i] = idxer(idx); - const Real &val = bnd_info(b).var(iel, t, u, v, k, j, i); - bnd_info(b).buf(idx + idx_offset) = val; - lnon_zero = lnon_zero || (std::abs(val) >= threshold); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { buf[m] = var[m]; }); + + bool mnon_zero = false; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m, bool &llnon_zero) { + llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold); + }, + Kokkos::LOr(mnon_zero)); + + lnon_zero = lnon_zero || mnon_zero; }, Kokkos::LOr(non_zero[iel])); idx_offset += idxer.size(); @@ -258,21 +272,41 @@ TaskStatus SetBounds(std::shared_ptr> &md) { int idx_offset = 0; for (int iel = 0; iel < bnd_info(b).ntopological_elements; ++iel) { auto &idxer = bnd_info(b).idxer[iel]; + const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; if (bnd_info(b).buf_allocated && bnd_info(b).allocated) { - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, idxer.size()), - [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx); - if (idxer.IsActive(k, j, i)) - bnd_info(b).var(iel, t, u, v, k, j, i) = - bnd_info(b).buf(idx + idx_offset); - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + // Have to do this because of some weird issue about structure bindings + // being captured + const int kk = k; + const int jj = j; + const int ii = i; + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var[m] = buf[m]; + }); + }); } else if (bnd_info(b).allocated) { const Real default_val = bnd_info(b).var.sparse_default_val; - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, idxer.size()), - [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx); - bnd_info(b).var(iel, t, u, v, k, j, i) = default_val; - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + const int kk = k; + const int jj = j; + const int ii = i; + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var[m] = default_val; + }); + }); } idx_offset += idxer.size(); }