Skip to content

Commit

Permalink
Merge branch 'develop' into lroberts36/add-combined-buffer-communication
Browse files Browse the repository at this point in the history
  • Loading branch information
lroberts36 authored Nov 25, 2024
2 parents 867af0a + 2fc423a commit b573337
Show file tree
Hide file tree
Showing 23 changed files with 202 additions and 97 deletions.
24 changes: 18 additions & 6 deletions .github/workflows/check-compilers.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
name: Check compilers

on: [push, pull_request]
on:
# run every day at 06:00 UTC
schedule:
- cron: '0 6 * * *'
# when triggered manually
workflow_dispatch:
# when auto merge is enabled (hack to make sure it's run before merging)
pull_request:
types: [auto_merge_enabled]

# Cancel "duplicated" workflows triggered by pushes to internal
# branches with associated PRs.
Expand All @@ -14,7 +22,7 @@ jobs:
strategy:
matrix:
cxx: ['g++', 'clang++-15']
cmake_build_type: ['Release', 'Debug']
cmake_build_type: ['Release', 'DbgNoSym']
device: ['cuda', 'host']
parallel: ['serial', 'mpi']
exclude:
Expand All @@ -23,7 +31,7 @@ jobs:
# https://github.com/lanl/parthenon/issues/630
- cxx: clang++-15
device: cuda
cmake_build_type: Debug
cmake_build_type: DbgNoSym
runs-on: ubuntu-latest
container:
image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
Expand All @@ -48,25 +56,29 @@ jobs:
strategy:
matrix:
cxx: ['hipcc']
cmake_build_type: ['Release', 'Debug']
cmake_build_type: ['Release', 'DbgNoSym']
device: ['hip']
parallel: ['serial', 'mpi']
runs-on: ubuntu-latest
container:
image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
image: ghcr.io/parthenon-hpc-lab/rocm6.2-mpi-hdf5
env:
CMAKE_GENERATOR: Ninja
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: CMake
# Manually changing the arch for this (debug) build as the
# -O0 option causes compiler issue for the navi 1030 GPU at
# compile time, see https://github.com/parthenon-hpc-lab/parthenon/pull/1191#issuecomment-2492035364
run: |
cmake -B builddir \
-DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
-DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \
-DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
-DMACHINE_VARIANT=${{ matrix.device }}_${{ matrix.parallel }}
-DMACHINE_VARIANT=${{ matrix.device }}_${{ matrix.parallel }} \
-DKokkos_ARCH_AMD_GFX90A=ON -DKokkos_ARCH_NAVI1030=OFF
- name: Build
run: |
cmake --build builddir --parallel 2
4 changes: 3 additions & 1 deletion .github/workflows/ci-extended.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ env:
CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
MACHINE_CFG: cmake/machinecfg/CI.cmake
OMPI_MCA_mpi_common_cuda_event_max: 1000
# CUDA IPC within docker repeatedly seems to cause issues on the CI machine
OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
# https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
OMPI_MCA_btl_vader_single_copy_mechanism: none

Expand All @@ -34,7 +36,7 @@ jobs:
container:
image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
# map to local user id on CI machine to allow writing to build cache
options: --user 1001
options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
steps:
- uses: actions/checkout@v3
with:
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/ci-short.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ env:
CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
MACHINE_CFG: cmake/machinecfg/CI.cmake
OMPI_MCA_mpi_common_cuda_event_max: 1000
# CUDA IPC within docker repeatedly seems to cause issues on the CI machine
OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
# https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
OMPI_MCA_btl_vader_single_copy_mechanism: none

Expand All @@ -22,7 +24,7 @@ jobs:
container:
image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
# map to local user id on CI machine to allow writing to build cache
options: --user 1001
options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
steps:
- uses: actions/checkout@v3
with:
Expand All @@ -47,7 +49,7 @@ jobs:
container:
image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
# map to local user id on CI machine to allow writing to build cache
options: --user 1001
options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
steps:
- uses: actions/checkout@v3
with:
Expand Down Expand Up @@ -79,7 +81,7 @@ jobs:
container:
image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
# map to local user id on CI machine to allow writing to build cache
options: --user 1001
options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
steps:
- uses: actions/checkout@v3
with:
Expand Down
25 changes: 13 additions & 12 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,25 @@
### Added (new features/APIs/variables/...)
- [[PR 1192]](https://github.com/parthenon-hpc-lab/parthenon/pull/1192) Coalesced buffer communication
- [[PR 1103]](https://github.com/parthenon-hpc-lab/parthenon/pull/1103) Add sparsity to vector wave equation test
- [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185/files) Bugfix to particle defragmentation
- [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185) Bugfix to particle defragmentation
- [[PR 1184]](https://github.com/parthenon-hpc-lab/parthenon/pull/1184) Fix swarm block neighbor indexing in 1D, 2D
- [[PR 1183]](https://github.com/parthenon-hpc-lab/parthenon/pull/1183) Fix particle leapfrog example initialization data
- [[PR 1179]](https://github.com/parthenon-hpc-lab/parthenon/pull/1179) Make a global variable for whether simulation is a restart
- [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option
- [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small performance upgrades

### Changed (changing behavior/API/variables/...)
- [[PR 1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.4.1
- [[PR 1209]](https://github.com/parthenon-hpc-lab/parthenon/pull/1209) Ordered history output
- [[PR 1206]](https://github.com/parthenon-hpc-lab/parthenon/pull/1206) Leapfrog fix
- [[PR1203]](https://github.com/parthenon-hpc-lab/parthenon/pull/1203) Pin Ubuntu CI image
- [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag
- [[PR 1203]](https://github.com/parthenon-hpc-lab/parthenon/pull/1203) Pin Ubuntu CI image
- [[PR 1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag
- [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize
- [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1
- [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option
- [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls

### Fixed (not changing behavior/API/variables/...)
- [[PR 1211]](https://github.com/parthenon-hpc-lab/parthenon/pull/1211) remove inline from WriteTaskGraph
- [[PR 1188]](https://github.com/parthenon-hpc-lab/parthenon/pull/1188) Fix hdf5 output issue for metadata none variables, update test.
- [[PR 1170]](https://github.com/parthenon-hpc-lab/parthenon/pull/1170) Fixed incorrect initialization of array by a const not constexpr
- [[PR 1189]](https://github.com/parthenon-hpc-lab/parthenon/pull/1189) Address CUDA MPI/IPC issue with Kokkos <=4.4.1
Expand All @@ -36,7 +37,7 @@


### Incompatibilities (i.e. breaking changes)
- [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag
- [[PR 1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag

## Release 24.08
Date: 2024-08-30
Expand Down Expand Up @@ -160,12 +161,12 @@ Date: 2024-03-21
- [[PR 973]](https://github.com/parthenon-hpc-lab/parthenon/pull/973) Multigrid performance upgrades

### Fixed (not changing behavior/API/variables/...)
- [[PR1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool
- [[PR1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code
- [[PR992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools
- [[PR988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes
- [[PR986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo cacheing
- [[PR978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check
- [[PR 1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool
- [[PR 1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code
- [[PR 992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools
- [[PR 988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes
- [[PR 986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo caching
- [[PR 978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check

### Infrastructure (changes irrelevant to downstream codes)
- [[PR 1027]](https://github.com/parthenon-hpc-lab/parthenon/pull/1027) Refactor RestartReader as abstract class
Expand Down Expand Up @@ -232,7 +233,7 @@ Date: 2023-11-16
- [[PR 901]](https://github.com/parthenon-hpc-lab/parthenon/pull/901) Implement shared element ownership model

### Removed (removing behavior/API/variables/...)
- [[PR 930](https://github.com/parthenon-hpc-lab/parthenon/pull/930) Remove ParthenonManager::ParthenonInit as it is error-prone and the split functions are the recommended usage.
- [[PR 930]](https://github.com/parthenon-hpc-lab/parthenon/pull/930) Remove ParthenonManager::ParthenonInit as it is error-prone and the split functions are the recommended usage.


## Release 0.8.0
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr

* CMake 3.16 or greater
* C++17 compatible compiler
* Kokkos 4.0.1 or greater
* Kokkos 4.4.1 or greater

## Optional (enabling features)

Expand Down
3 changes: 3 additions & 0 deletions cmake/machinecfg/GitHubActions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ message(STATUS "Loading machine configuration for GitHub Actions CI. ")

# common options
set(NUM_MPI_PROC_TESTING "2" CACHE STRING "CI runs tests with 2 MPI ranks")
set(Kokkos_ENABLE_ROCTHRUST OFF CACHE BOOL "Temporarily disabled as the container needs to be updated to the `-complete` base image.")

set(CMAKE_CXX_FLAGS_DBGNOSYM "-O0" CACHE STRING "Debug build without symbols")

set(MACHINE_CXX_FLAGS "")
if (${MACHINE_VARIANT} MATCHES "cuda")
Expand Down
41 changes: 40 additions & 1 deletion doc/sphinx/src/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ Kokkos wrappers/abstractions
- ``par_for`` wrappers use inclusive bounds, i.e., the loop will
include the last index given
- ``ParArrayND`` arrays by default allocate on the *device* using
default precision configured
default precision configured and come with a ``State`` that can
be used to store additional metadata.
- ``ParArray#DRaw`` directly map to Kokkos ``Views`` that are allocated
on *device* using default precision.
- To create an array on the host with identical layout to the device
array either use

Expand Down Expand Up @@ -62,6 +65,42 @@ parallelism interface that is needed for managing memory cached in
tightly nested loops. The wrappers are documented
:ref:`here <nested par for>`.

View of Views
-------------

Special care needs to be taken when working with a ``View`` of ``Views``.

To repeat the Kokkos documentation: `Don't use them <https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/View.html#can-i-make-a-view-of-views>`__

But if you have to (which is the case in some places inside Parthenon)
then follow this pattern:

.. code:: c++

ParArray1DRaw<ParArray1D<Real>> view_of_pararrays(parthenon::ViewOfViewAlloc("myname"), 10);

The ``ViewOfViewAlloc`` ensures that the ``Kokkos::SequentialHostInit`` property is added,
which results in the (inner ``View`` ) deallocators being called on the host (rather than on
the device by default).
Also note the use of the "raw" ``ParArray1DRaw``, which directly maps to a Kokkos ``View``
(that is required to process the allocation property as this interface is not exposed
in the more generic ``ParArrayND``).

Similarly, when you create a host mirror of said ``View`` of ``Views``, add the additional
property for the same reason.

.. code:: c++

// explicit theoretical example -- don't use this
auto view_of_pararrays_h =
Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view_of_pararrays);

// but instead use this interface provided by Parthenon:
auto view_of_pararrays_h = create_view_of_view_mirror(view_of_pararrays);


Note that the ``SequentialHostInit`` was only added in Kokkos 4.4.1 (which is now the default in Parthenon).

The need for reductions within function handling ``MeshBlock`` data
-------------------------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion external/Kokkos
Submodule Kokkos updated 1031 files
14 changes: 6 additions & 8 deletions scripts/docker/Dockerfile.hip-rocm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rocm/dev-ubuntu-20.04:5.4.3
FROM rocm/dev-ubuntu-24.04:6.2

RUN apt-get clean && apt-get update -y && \
DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends git python3-minimal libpython3-stdlib bc hwloc wget openssh-client python3-numpy python3-h5py python3-matplotlib lcov curl cmake ninja-build openmpi-bin libopenmpi-dev && \
Expand All @@ -14,12 +14,10 @@ RUN cd /tmp && \
cd / && \
rm -rf /tmp/hdf5-1.10.8*

# "mpic++ --showme" forgets open-pal in Ubuntu 20.04 + OpenMPI 4.0.3
# https://bugs.launchpad.net/ubuntu/+source/openmpi/+bug/1941786
# https://github.com/open-mpi/ompi/issues/9317
ENV LDFLAGS="-lopen-pal"

RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10

# uid 1000 maps to the one running the container on the CI host
RUN useradd --create-home --shell /bin/bash -u 1000 -G render ci
# Latest image has default user with uid 1000 (which maps to the one running the container on the CI host)
# Need to add user to the group that can access the GPU
RUN usermod -a -G render ubuntu

WORKDIR /home/ubuntu
3 changes: 0 additions & 3 deletions src/bvals/bvals.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,6 @@ class BoundarySwarm : public BoundaryCommunication {
explicit BoundarySwarm(std::weak_ptr<MeshBlock> pmb, const std::string &label);
~BoundarySwarm() = default;

std::vector<ParArrayND<int>> vars_int;
std::vector<ParArrayND<Real>> vars_real;

// (usually the std::size_t unsigned integer type)
std::vector<BoundaryCommunication *>::size_type bswarm_index;

Expand Down
4 changes: 2 additions & 2 deletions src/bvals/comms/bnd_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
namespace parthenon {

void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) {
prores_info = ParArray1D<ProResInfo>("prores_info", n_regions);
prores_info_h = Kokkos::create_mirror_view(prores_info);
prores_info = ProResInfoArr_t(ViewOfViewAlloc("prores_info"), n_regions);
prores_info_h = create_view_of_view_mirror(prores_info);
int nref_funcs = pkg->NumRefinementFuncs();
// Note that assignment of Kokkos views resets them, but
// buffer_subset_sizes is a std::vector. It must be cleared, then
Expand Down
7 changes: 4 additions & 3 deletions src/bvals/comms/bnd_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "bvals/neighbor_block.hpp"
#include "coordinates/coordinates.hpp"
#include "interface/variable_state.hpp"
#include "kokkos_abstraction.hpp"
#include "mesh/domain.hpp"
#include "mesh/forest/logical_coordinate_transformation.hpp"
#include "utils/communication_buffer.hpp"
Expand Down Expand Up @@ -137,11 +138,11 @@ struct ProResInfo {
int GetBufferSize(const MeshBlock *const pmb, const NeighborBlock &nb,
std::shared_ptr<Variable<Real>> v);

using BndInfoArr_t = ParArray1D<BndInfo>;
using BndInfoArr_t = ParArray1DRaw<BndInfo>;
using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror;

using ProResInfoArr_t = ParArray1D<ProResInfo>;
using ProResInfoArrHost_t = typename ParArray1D<ProResInfo>::HostMirror;
using ProResInfoArr_t = ParArray1DRaw<ProResInfo>;
using ProResInfoArrHost_t = typename ProResInfoArr_t::HostMirror;
class StateDescriptor;
struct ProResCache_t {
ProResInfoArr_t prores_info{};
Expand Down
5 changes: 3 additions & 2 deletions src/bvals/comms/bvals_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "bvals/comms/bnd_info.hpp"
#include "bvals/comms/bvals_in_one.hpp"
#include "interface/variable.hpp"
#include "kokkos_abstraction.hpp"
#include "mesh/domain.hpp"
#include "mesh/mesh.hpp"
#include "mesh/meshblock.hpp"
Expand Down Expand Up @@ -221,8 +222,8 @@ inline void RebuildBufferCache(std::shared_ptr<MeshData<Real>> md, int nbound,
using namespace loops;
using namespace loops::shorthands;
BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER);
cache.bnd_info = BndInfoArr_t("bnd_info", nbound);
cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info);
cache.bnd_info = BndInfoArr_t(ViewOfViewAlloc("bnd_info"), nbound);
cache.bnd_info_h = create_view_of_view_mirror(cache.bnd_info);

// prolongation/restriction sub-sets
// TODO(JMM): Right now I exclude fluxcorrection boundaries but if
Expand Down
5 changes: 3 additions & 2 deletions src/interface/mesh_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "interface/sparse_pack_base.hpp"
#include "interface/swarm_pack_base.hpp"
#include "interface/variable_pack.hpp"
#include "kokkos_abstraction.hpp"
#include "mesh/domain.hpp"
#include "mesh/meshblock.hpp"
#include "mesh/meshblock_pack.hpp"
Expand Down Expand Up @@ -149,8 +150,8 @@ const MeshBlockPack<P> &PackOnMesh(M &map, BlockDataList_t<Real> &block_data_,
}

if (make_new_pack) {
ParArray1D<P> packs("MeshData::PackVariables::packs", nblocks);
auto packs_host = Kokkos::create_mirror_view(packs);
ParArray1DRaw<P> packs(ViewOfViewAlloc("MeshData::PackVariables::packs"), nblocks);
auto packs_host = create_view_of_view_mirror(packs);

for (size_t i = 0; i < nblocks; i++) {
const auto &pack = packing_function(block_data_[i], this_map, this_key);
Expand Down
Loading

0 comments on commit b573337

Please sign in to comment.