Skip to content

Commit

Permalink
add the ability to time things on host outside of kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
jdolence committed Nov 2, 2023
1 parent b076220 commit 908d666
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 14 deletions.
8 changes: 7 additions & 1 deletion src/bvals/boundary_conditions_generic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "mesh/domain.hpp"
#include "mesh/mesh.hpp"
#include "mesh/meshblock.hpp"
#include "utils/block_timer.hpp"

namespace parthenon {
namespace BoundaryFunction {
Expand All @@ -40,6 +41,10 @@ void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
// make sure DIR is X[123]DIR so we don't have to check again
static_assert(DIR == X1DIR || DIR == X2DIR || DIR == X3DIR, "DIR must be X[123]DIR");

std::shared_ptr<MeshBlock> pmb = rc->GetBlockPointer();
auto &block_cost_host = pmb->pmy_mesh->block_cost_host;
BlockTimerHost host_timer(block_cost_host, pmb->lid, pmb->lid);

// convenient shorthands
constexpr bool X1 = (DIR == X1DIR);
constexpr bool X2 = (DIR == X2DIR);
Expand All @@ -63,7 +68,6 @@ void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
if (lend < lstart) return;
auto nb = IndexRange{lstart, lend};

std::shared_ptr<MeshBlock> pmb = rc->GetBlockPointer();
const auto &bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;

const auto &range = X1 ? bounds.GetBoundsI(IndexDomain::interior, el)
Expand All @@ -87,6 +91,8 @@ void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
// used for derivatives
const int offsetin = INNER;
const int offsetout = !INNER;
// stop timing on the host
host_timer.Stop();
pmb->par_for_bndry(
PARTHENON_AUTO_LABEL, nb, domain, el, coarse,
KOKKOS_LAMBDA(const int &l, const int &k, const int &j, const int &i) {
Expand Down
4 changes: 2 additions & 2 deletions src/bvals/comms/boundary_communication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
PARTHENON_INSTRUMENT

Mesh *pmesh = md->GetMeshPointer();
auto &block_cost = pmesh->block_cost;
auto &block_cost = pmesh->GetBlockCost();
auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true);

if (cache.buf_vec.size() == 0)
Expand Down Expand Up @@ -226,7 +226,7 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
PARTHENON_INSTRUMENT

Mesh *pmesh = md->GetMeshPointer();
auto &block_cost = pmesh->block_cost;
auto &block_cost = pmesh->GetBlockCost();
auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);

auto [rebuild, nbound] = CheckReceiveBufferCacheForRebuild<bound_type, false>(md);
Expand Down
14 changes: 8 additions & 6 deletions src/mesh/amr_loadbalance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -486,12 +486,13 @@ void Mesh::ResetLoadBalanceVariables() {
auto bcost = block_cost;
parthenon::par_for(
loop_pattern_flatrange_tag, "reset cost_d", DevExecSpace(), 0,
block_list.size() - 1,
KOKKOS_LAMBDA(const int b) { bcost(b) = TINY_NUMBER; });
block_list.size() - 1, KOKKOS_LAMBDA(const int b) { bcost(b) = TINY_NUMBER; });
for (int b = 0; b < block_list.size(); b++)
block_cost_host[b] = TINY_NUMBER;
#endif
} else if (lb_manual_) {
for (int b = 0; b < block_list.size(); b++) {
block_cost[b] = TINY_NUMBER;
block_cost_host[b] = TINY_NUMBER;
}
}
step_since_lb = 0;
Expand Down Expand Up @@ -642,6 +643,8 @@ void Mesh::GatherCostList() {
if (lb_automatic_) {
#ifdef ENABLE_LB_TIMERS
auto cost_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), block_cost);
for (int b = 0; b < block_cost_host.size(); b++)
cost_h(b) += block_cost_host[b];
#ifdef MPI_PARALLEL
PARTHENON_MPI_CHECK(MPI_Allgatherv(cost_h.data(), nblist[Globals::my_rank],
MPI_DOUBLE, costlist.data(), nblist.data(),
Expand All @@ -651,7 +654,7 @@ void Mesh::GatherCostList() {
}
if (lb_manual_) {
#ifdef MPI_PARALLEL
PARTHENON_MPI_CHECK(MPI_Allgatherv(block_cost.data(), nblist[Globals::my_rank],
PARTHENON_MPI_CHECK(MPI_Allgatherv(block_cost_host.data(), nblist[Globals::my_rank],
MPI_DOUBLE, costlist.data(), nblist.data(),
nslist.data(), MPI_DOUBLE, MPI_COMM_WORLD));
#endif
Expand Down Expand Up @@ -767,9 +770,8 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput

#ifdef ENABLE_LB_TIMERS
block_cost.Realloc(nbe - nbs + 1);
#else
block_cost.resize(nbe - nbs + 1);
#endif
block_cost_host.resize(nbe - nbs + 1);

// Restrict fine to coarse buffers
ProResCache_t restriction_cache;
Expand Down
7 changes: 3 additions & 4 deletions src/mesh/mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,9 +458,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
}
#ifdef ENABLE_LB_TIMERS
block_cost.Realloc(block_list.size());
#else
block_cost.resize(block_list.size());
#endif
block_cost_host.resize(block_list.size());

ResetLoadBalanceVariables();
}
Expand Down Expand Up @@ -721,9 +720,9 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,

#ifdef ENABLE_LB_TIMERS
block_cost.Realloc(block_list.size());
#else
block_cost.resize(block_list.size());
#endif
block_cost_host.resize(block_list.size());

ResetLoadBalanceVariables();
}

Expand Down
4 changes: 3 additions & 1 deletion src/mesh/mesh.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,12 @@ class Mesh {
BlockList_t block_list;
Packages_t packages;
std::shared_ptr<StateDescriptor> resolved_packages;
std::vector<double> block_cost_host;
#ifdef ENABLE_LB_TIMERS
ParArray1D<double> block_cost;
auto &GetBlockCost() const { return block_cost; }
#else
std::vector<double> block_cost;
auto &GetBlockCost() const { return block_cost_host; }
#endif

DataCollection<MeshData<Real>> mesh_data;
Expand Down
30 changes: 30 additions & 0 deletions src/utils/block_timer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,36 @@ class BlockTimer {
#endif
};

/// Times a region of host code (outside of any kernel) and charges the
/// elapsed clock ticks to a contiguous, inclusive range of blocks.
///
/// The clock is sampled when the object is constructed and again when Stop()
/// is called; the difference is split evenly across blocks [bs, be] and added
/// to the corresponding entries of the cost vector handed to the constructor.
/// Nothing is accumulated unless Stop() is called explicitly.
///
/// When ENABLE_LB_TIMERS is not defined the class is an empty stub that
/// accepts any constructor arguments and does nothing.
class BlockTimerHost {
#ifdef ENABLE_LB_TIMERS
 public:
  /// @param cost per-block host cost accumulator; must outlive this timer and
  ///             be indexable over [bs, be]. Taken by non-const reference
  ///             because Stop() adds to it.
  /// @param bs   index of the first block charged
  /// @param be   index of the last block charged (inclusive)
  BlockTimerHost(std::vector<double> &cost, const int bs, const int be)
      : cost_(cost), bs_(bs), be_(be), start_(Kokkos::Impl::clock_tic()) {}

  /// Adds the ticks elapsed since construction, divided evenly, to every
  /// block in [bs, be]. Each call adds the time since construction again.
  void Stop() const {
    const uint64_t stop = Kokkos::Impl::clock_tic();
    const int nblocks = be_ - bs_ + 1;
    // An empty range has nothing to charge; also avoids dividing by zero.
    if (nblocks <= 0) return;
    // Unsigned subtraction is performed modulo 2^64, so the difference is the
    // true elapsed tick count even if the counter wrapped between samples.
    const double elapsed = static_cast<double>(stop - start_);
    const double cost_per_block = elapsed / nblocks;
    for (int b = bs_; b <= be_; b++) {
      cost_[b] += cost_per_block;
    }
  }

 private:
  std::vector<double> &cost_;  // written through in Stop(), hence non-const
  const int bs_, be_;
  const uint64_t start_;
#else  // stub out
 public:
  template <typename... Args>
  BlockTimerHost(Args &&...) {}
  void Stop() const {}
#endif
};

} // namespace parthenon

#endif

0 comments on commit 908d666

Please sign in to comment.