Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add boundary comm inner loop specializations for performance and allow setting vector length #1175

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


### Infrastructure (changes irrelevant to downstream codes)

- [[PR 1175]](https://github.com/parthenon-hpc-lab/parthenon/pull/1175) Add inner comm loop specializations to maximize comms performance

### Removed (removing behavior/API/varaibles/...)

Expand Down
200 changes: 137 additions & 63 deletions src/bvals/comms/boundary_communication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {

Kokkos::parallel_for(
PARTHENON_AUTO_LABEL,
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO,
pmesh->GetCommVectorLength()),
KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
const int b = team_member.league_rank();

Expand All @@ -110,28 +111,42 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
auto &idxer = bnd_info(b).idxer[it];
const int iel = static_cast<int>(bnd_info(b).topo_idx[it]) % 3;
const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1;
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx, bool &lnon_zero) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });

bool mnon_zero = false;
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m, bool &llnon_zero) {
llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold);
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(mnon_zero));

lnon_zero = lnon_zero || mnon_zero;
if (bound_type == BoundaryType::flxcor_send) lnon_zero = true;
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(non_zero[iel]));
if (threshold > 0.0) {
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx, bool &lnon_zero) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });

bool mnon_zero = false;
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m, bool &llnon_zero) {
llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold);
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(mnon_zero));

lnon_zero = lnon_zero || mnon_zero;
if (bound_type == BoundaryType::flxcor_send) lnon_zero = true;
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(non_zero[iel]));
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });
});
non_zero[iel] = true;
}
idx_offset += idxer.size();
}
Kokkos::single(Kokkos::PerTeam(team_member), [&]() {
Expand Down Expand Up @@ -272,62 +287,121 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
auto &bnd_info = cache.bnd_info;
Kokkos::parallel_for(
PARTHENON_AUTO_LABEL,
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO,
pmesh->GetCommVectorLength()),
KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
const int b = team_member.league_rank();
if (bnd_info(b).same_to_same) return;
int idx_offset = 0;
for (int it = 0; it < bnd_info(b).ntopological_elements; ++it) {
auto &idxer = bnd_info(b).idxer[it];
auto &lcoord_trans = bnd_info(b).lcoord_trans;
const bool isTrivial = lcoord_trans.IsTrivial();
auto &var = bnd_info(b).var;
const auto [tel, ftemp] =
lcoord_trans.InverseTransform(bnd_info(b).topo_idx[it]);
const bool isCell = (tel == TopologicalElement::CC);
Real fac = ftemp; // Can't capture structured bindings
const int iel = static_cast<int>(tel) % 3;
const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1;
if (bnd_info(b).buf_allocated && bnd_info(b).allocated) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure bindings
// being captured
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m];
});
});
if (isTrivial && isCell) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { var_ptr[m] = buf[m]; });
});
} else if (isTrivial) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure
// bindings being captured
const int kk = k;
const int jj = j;
const int ii = i;
Real *var_ptr = &var(iel, t, u, v, kk, jj, ii);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) {
if (idxer.IsActive(kk, jj, ii + m))
var_ptr[m] = buf[m];
});
});
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure
// bindings being captured
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m];
});
});
}
} else if (bnd_info(b).allocated && bound_type != BoundaryType::flxcor_recv) {
const Real default_val = bnd_info(b).var.sparse_default_val;
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = default_val;
});
});
if (isTrivial && isCell) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { var_ptr[m] = default_val; });
});
} else if (isTrivial) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int kk = k;
const int jj = j;
const int ii = i;
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) {
if (idxer.IsActive(kk, jj, ii + m))
var_ptr[m] = default_val;
});
});
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = default_val;
});
});
}
}
idx_offset += idxer.size();
}
Expand Down
7 changes: 7 additions & 0 deletions src/mesh/forest/logical_coordinate_transformation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ struct LogicalCoordinateTransformation {
std::int64_t origin) const;
CellCentOffsets Transform(CellCentOffsets in) const;

// Check if this transformation includes at most a translation
KOKKOS_INLINE_FUNCTION
bool IsTrivial() const {
return (dir_connection[0] == 0) && (dir_connection[1] == 1) &&
(dir_connection[2] == 2) && !dir_flip[0] && !dir_flip[1] && !dir_flip[2];
}

KOKKOS_INLINE_FUNCTION
std::tuple<TopologicalElement, Real> Transform(TopologicalElement el) const {
int iel = static_cast<int>(el);
Expand Down
2 changes: 2 additions & 0 deletions src/mesh/mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
default_pack_size_(pin->GetOrAddInteger("parthenon/mesh", "pack_size", -1)),
// private members:
num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)),
comm_vector_length_(
pin->GetOrAddInteger("parthenon/mesh", "comm_vector_length", 1)),
Yurlungur marked this conversation as resolved.
Show resolved Hide resolved
use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(),
lb_manual_(), nslist(Globals::nranks), nblist(Globals::nranks),
nref(Globals::nranks), nderef(Globals::nranks), rdisp(Globals::nranks),
Expand Down
3 changes: 3 additions & 0 deletions src/mesh/mesh.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ class Mesh {
return nblist[my_rank];
}
int GetNumMeshThreads() const { return num_mesh_threads_; }
int GetCommVectorLength() const { return comm_vector_length_; }

std::int64_t GetTotalCells();
// TODO(JMM): Move block_size into mesh.
int GetNumberOfMeshBlockCells() const;
Expand Down Expand Up @@ -258,6 +260,7 @@ class Mesh {
// data
int root_level, max_level, current_level;
int num_mesh_threads_;
int comm_vector_length_;
/// Maps Global Block IDs to which rank the block is mapped to.
std::vector<int> ranklist;
/// Maps rank to start of local block IDs.
Expand Down
Loading