diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
index f0062b4..1c9dbcd 100644
--- a/.github/workflows/codecov.yml
+++ b/.github/workflows/codecov.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   codecov:
     runs-on: ubuntu-latest
-    timeout-minutes: 10
+    timeout-minutes: 5
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4
diff --git a/.github/workflows/linux-clang.yml b/.github/workflows/linux-clang.yml
index 6948fea..b510d06 100644
--- a/.github/workflows/linux-clang.yml
+++ b/.github/workflows/linux-clang.yml
@@ -13,6 +13,7 @@ env:
 jobs:
   linux-clang:
     runs-on: ubuntu-latest
+    timeout-minutes: 10
     strategy:
       matrix:
         clang-version: [14, 15, 16, 17, 18]
@@ -75,7 +76,8 @@ jobs:
             -DCMAKE_C_COMPILER=$CC \
             -DCMAKE_CXX_COMPILER=$CXX \
             -DDYNAMPI_BUILD_TESTS=ON \
-            -DDYNAMPI_BUILD_BENCHMARKS=ON"
+            -DDYNAMPI_BUILD_BENCHMARKS=ON \
+            -DDYNAMPI_MAX_MPI_RANK=8"
 
           if [ "$MPI_TYPE" == "MPICH" ]; then
             CMAKE_ARGS="$CMAKE_ARGS -DMPI_C_COMPILER=$CC -DMPI_CXX_COMPILER=$CXX"
@@ -102,4 +104,5 @@ jobs:
           fi
           cd build
           echo "Testing with $MPI_TYPE"
-          ctest --output-on-failure --parallel -C ${{ matrix.build-type }}
+          export DYNAMPI_MAX_MPI_RANK=8
+          ctest --output-on-failure -j 1 --timeout 180 -C ${{ matrix.build-type }}
diff --git a/.github/workflows/linux-gcc.yml b/.github/workflows/linux-gcc.yml
index d4baf48..b1a63c3 100644
--- a/.github/workflows/linux-gcc.yml
+++ b/.github/workflows/linux-gcc.yml
@@ -13,6 +13,7 @@ env:
 jobs:
   linux-gcc:
     runs-on: ubuntu-latest
+    timeout-minutes: 10
     strategy:
       matrix:
         gcc-version: [11, 12, 13, 14]
@@ -77,7 +78,8 @@ jobs:
             -DCMAKE_C_COMPILER=$CC \
             -DCMAKE_CXX_COMPILER=$CXX \
             -DDYNAMPI_BUILD_TESTS=ON \
-            -DDYNAMPI_BUILD_BENCHMARKS=ON"
+            -DDYNAMPI_BUILD_BENCHMARKS=ON \
+            -DDYNAMPI_MAX_MPI_RANK=8"
           if [ "$MPI_TYPE" == "MPICH" ]; then
             CMAKE_ARGS="$CMAKE_ARGS -DMPI_C_COMPILER=$MPI_C_COMPILER -DMPI_CXX_COMPILER=$MPI_CXX_COMPILER"
           fi
@@ -97,4 +99,5 @@ jobs:
           fi
           cd build
           echo "Testing with $MPI_TYPE"
-          ctest --output-on-failure --parallel -C ${{ matrix.build-type }}
+          export DYNAMPI_MAX_MPI_RANK=8
+          ctest --output-on-failure -j 1 --timeout 180 -C ${{ matrix.build-type }}
diff --git a/.github/workflows/linux-intel.yml b/.github/workflows/linux-intel.yml
index 967aae4..2737702 100644
--- a/.github/workflows/linux-intel.yml
+++ b/.github/workflows/linux-intel.yml
@@ -13,6 +13,7 @@ env:
 jobs:
   linux-intel:
     runs-on: ubuntu-latest
+    timeout-minutes: 10
     strategy:
       matrix:
         build-type: [Debug, Release]
@@ -37,7 +38,8 @@ jobs:
             -DCMAKE_C_COMPILER=icx \
             -DCMAKE_CXX_COMPILER=icpx \
             -DDYNAMPI_BUILD_TESTS=ON \
-            -DDYNAMPI_BUILD_BENCHMARKS=ON
+            -DDYNAMPI_BUILD_BENCHMARKS=ON \
+            -DDYNAMPI_MAX_MPI_RANK=8
       - name: Build with Intel
         shell: bash
         run: |
@@ -49,4 +51,5 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           cd build
           echo "Testing with $MPI_TYPE"
-          ctest --output-on-failure --parallel -C ${{ matrix.build-type }}
+          export DYNAMPI_MAX_MPI_RANK=8
+          ctest --output-on-failure -j 1 --timeout 180 -C ${{ matrix.build-type }}
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index c05ac0c..d83bff2 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -13,6 +13,7 @@ env:
 jobs:
   macos-clang:
     runs-on: macos-latest
+    timeout-minutes: 5
     strategy:
       matrix:
         build-type: [Debug, Release]
@@ -29,7 +30,8 @@ jobs:
           cmake -B build \
             -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
             -DDYNAMPI_BUILD_TESTS=ON \
-            -DDYNAMPI_BUILD_BENCHMARKS=ON
+            -DDYNAMPI_BUILD_BENCHMARKS=ON \
+            -DDYNAMPI_MAX_MPI_RANK=8
       - name: Build on macOS
         shell: bash
         run: cmake --build build --config ${{ matrix.build-type }} --parallel
@@ -38,4 +40,5 @@ jobs:
         run: |
           cd build
           echo "Testing with $MPI_TYPE"
+          export DYNAMPI_MAX_MPI_RANK=8
           ctest --output-on-failure --parallel -C ${{ matrix.build-type }}
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 02c6dd2..65f9595 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -10,6 +10,7 @@ on:
 jobs:
   pre-commit:
     runs-on: ubuntu-latest
+    timeout-minutes: 5
     steps:
       - uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml
index 5f1811f..19b60b6 100644
--- a/.github/workflows/sanitizers.yml
+++ b/.github/workflows/sanitizers.yml
@@ -13,6 +13,7 @@ env:
 jobs:
   linux-debug-sanitizers:
     runs-on: ubuntu-latest
+    timeout-minutes: 5
     strategy:
       matrix:
         sanitizer: [address, undefined]
@@ -33,6 +34,7 @@ jobs:
             -DCMAKE_CXX_COMPILER=g++-14 \
             -DDYNAMPI_BUILD_TESTS=ON \
             -DDYNAMPI_BUILD_BENCHMARKS=ON \
+            -DDYNAMPI_MAX_MPI_RANK=8 \
             -DCMAKE_CXX_FLAGS="-fsanitize=${{ matrix.sanitizer }} -fno-omit-frame-pointer" \
             -DCMAKE_C_FLAGS="-fsanitize=${{ matrix.sanitizer }} -fno-omit-frame-pointer" \
             -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=${{ matrix.sanitizer }}" \
@@ -46,4 +48,5 @@ jobs:
           cd build
           echo "Testing with $MPI_TYPE and ${{ matrix.sanitizer }} sanitizer"
           export LSAN_OPTIONS=suppressions=${{ github.workspace }}/test/lsan.supp
-          ctest --output-on-failure --parallel -C Debug
+          export DYNAMPI_MAX_MPI_RANK=8
+          ctest --output-on-failure --verbose -C Debug
diff --git a/.github/workflows/smpi.yml b/.github/workflows/smpi.yml
index d6a6b50..144b079 100644
--- a/.github/workflows/smpi.yml
+++ b/.github/workflows/smpi.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   smpi:
     runs-on: ubuntu-latest
-    timeout-minutes: 10
+    timeout-minutes: 5
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4
@@ -46,7 +46,8 @@ jobs:
             -DMPIEXEC_EXECUTABLE=/usr/bin/smpirun \
             -DMPIEXEC_PREFLAGS=-platform\;platform.xml \
             -DDYNAMPI_BUILD_TESTS=ON \
-            -DDYNAMPI_BUILD_BENCHMARKS=OFF"
+            -DDYNAMPI_BUILD_BENCHMARKS=OFF \
+            -DDYNAMPI_MAX_MPI_RANK=8"
 
           echo "CMAKE_ARGS: $CMAKE_ARGS"
 
@@ -81,7 +82,8 @@ jobs:
         run: |
           echo "Testing with $MPI_TYPE"
           cp platform.xml ${{ steps.strings.outputs.build-output-dir }}/test
+          export DYNAMPI_MAX_MPI_RANK=8
           ctest --output-on-failure --parallel
 
-          echo "Additionally running MPI tests with 100 ranks..."
-          smpirun -np 100 -platform platform.xml ./test/mpi_test
+          echo "Additionally running MPI tests with 8 ranks..."
+          smpirun -np 8 -platform platform.xml ./test/mpi_test
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 8556ea9..90d1fc5 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -19,6 +19,7 @@ jobs:
   windows-msmpi:
     name: Windows MS-MPI (MPI 3.1)
     runs-on: windows-latest
+    timeout-minutes: 10
     strategy:
       matrix:
         build-type: [Debug, Release]
@@ -34,7 +35,8 @@ jobs:
           cmake -B build `
             -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} `
             -DDYNAMPI_BUILD_TESTS=ON `
-            -DDYNAMPI_BUILD_BENCHMARKS=ON
+            -DDYNAMPI_BUILD_BENCHMARKS=ON `
+            -DDYNAMPI_MAX_MPI_RANK=8
       - name: Build (MS-MPI)
         shell: pwsh
         run: cmake --build build --config ${{ matrix.build-type }} --parallel
@@ -43,10 +45,12 @@ jobs:
         run: |
           cd build
           echo "Testing with Microsoft MPI (MPI 3.1 features)"
-          ctest --output-on-failure --parallel -C ${{ matrix.build-type }}
+          $env:DYNAMPI_MAX_MPI_RANK = "8"
+          ctest --output-on-failure -j 1 --timeout 180 -C ${{ matrix.build-type }}
   windows-mingw:
     name: Windows MinGW
     runs-on: windows-latest
+    timeout-minutes: 10
     strategy:
       matrix:
         build-type: [Debug, Release]
@@ -97,7 +101,8 @@ jobs:
             -DMPI_C_LIBRARIES=/mingw64/lib/libmsmpi.a \
             -DMPI_CXX_LIBRARIES=/mingw64/lib/libmsmpi.a \
             -DDYNAMPI_BUILD_TESTS=ON \
-            -DDYNAMPI_BUILD_BENCHMARKS=ON
+            -DDYNAMPI_BUILD_BENCHMARKS=ON \
+            -DDYNAMPI_MAX_MPI_RANK=8
       - name: Build (MinGW)
         shell: msys2 {0}
         run: cmake --build build --config ${{ matrix.build-type }} --parallel
@@ -106,10 +111,12 @@ jobs:
         run: |
           cd build
           echo "Testing with $MPI_TYPE"
-          ctest --output-on-failure --parallel -C ${{ matrix.build-type }}
+          export DYNAMPI_MAX_MPI_RANK=8
+          ctest --output-on-failure -j 1 --timeout 180 -C ${{ matrix.build-type }}
   windows-intel-mpi:
     name: Windows Intel MPI (MPI 4.0)
     runs-on: windows-latest
+    timeout-minutes: 10
     strategy:
       matrix:
         build-type: [Debug, Release]
@@ -126,7 +133,8 @@ jobs:
           cmake -B build `
             -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} `
             -DDYNAMPI_BUILD_TESTS=ON `
-            -DDYNAMPI_BUILD_BENCHMARKS=ON
+            -DDYNAMPI_BUILD_BENCHMARKS=ON `
+            -DDYNAMPI_MAX_MPI_RANK=8
       - name: Build (Intel MPI)
         shell: pwsh
         run: cmake --build build --config ${{ matrix.build-type }} --parallel
@@ -135,4 +143,5 @@ jobs:
         run: |
           cd build
           echo "Testing with Intel MPI (MPI 4.0 support)"
-          ctest --output-on-failure --parallel -C ${{ matrix.build-type }}
+          $env:DYNAMPI_MAX_MPI_RANK = "8"
+          ctest --output-on-failure -j 1 --timeout 180 -C ${{ matrix.build-type }}
diff --git a/.gitignore b/.gitignore
index 649aad9..ef46468 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ bin
 Testing
 *.btr
 commands.txt
+core.*
+__pycache__/
+*.png
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 83c7b15..6b7858f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,6 +18,14 @@ repos:
     rev: v20.1.8
     hooks:
       - id: clang-format
+  - repo: local
+    hooks:
+      - id: cppcheck
+        name: cppcheck
+        entry: cppcheck --enable=warning --suppress=missingIncludeSystem -I include
+          --std=c++20 --inline-suppr
+        language: system
+        types_or: [c, c++]
   - repo: https://github.com/google/yamlfmt.git
     rev: v0.17.2
     hooks:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d102c1c..51c19c6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,9 @@ if(MSVC)
 else()
   add_compile_options(-Wall -Wextra -Wpedantic -Werror -fno-ms-extensions)
 endif()
+if(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+  add_compile_options(-diag-disable=10430 -Wno-unknown-warning-option)
+endif()
 
 set(CMAKE_CXX_FLAGS_COVERAGE "-O0 -g -coverage -fprofile-arcs -ftest-coverage \
     -fno-elide-constructors ${CMAKE_CXX_FLAGS_COVERAGE}")
diff --git a/REUSE.toml b/REUSE.toml
new file mode 100644
index 0000000..9e98fec
--- /dev/null
+++ b/REUSE.toml
@@ -0,0 +1,13 @@
+version = 1
+
+[[annotations]]
+path = "benchmark/results/**.csv"
+SPDX-FileCopyrightText = "2025 QDX Technologies"
+SPDX-License-Identifier = "Apache-2.0"
+SPDX-Comment = "Generated benchmark results."
+
+[[annotations]]
+path = "**.png"
+SPDX-FileCopyrightText = "2025 QDX Technologies"
+SPDX-License-Identifier = "Apache-2.0"
+SPDX-Comment = "Generated result plots."
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 6391c99..5b32eaa 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -9,11 +9,21 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(cxxopts)
 
-add_executable(asymptotic_distribution_throughput
-    asymptotic_distribution_throughput.cpp
-)
-target_link_libraries(asymptotic_distribution_throughput
-    PRIVATE
-    dynampi
-    cxxopts::cxxopts
+set(benchmarks
+    asymptotic_distribution_throughput
+    strong_scaling_distribution_rate
+    pingpong
+    timer_resolution
+    naive_shutdown_time
 )
+
+foreach(benchmark IN LISTS benchmarks)
+    add_executable(${benchmark}
+        ${benchmark}.cpp
+    )
+    target_link_libraries(${benchmark}
+        PRIVATE
+        dynampi
+        cxxopts::cxxopts
+    )
+endforeach()
diff --git a/benchmark/asymptotic_distribution_throughput.cpp b/benchmark/asymptotic_distribution_throughput.cpp
index db92dfb..8467103 100644
--- a/benchmark/asymptotic_distribution_throughput.cpp
+++ b/benchmark/asymptotic_distribution_throughput.cpp
@@ -74,11 +74,12 @@ static double run_single_benchmark(const BenchmarkOptions& opts) {
   MPI_Barrier(MPI_COMM_WORLD);
 
   using Task = size_t;
-  using Result = std::vector<std::byte>;
+  // using Result = std::vector<std::byte>;
+  using Result = size_t;
 
-  auto worker_task = [&opts](Task task) -> Result {
-    return std::vector<std::byte>(opts.message_size, std::byte(task));
-  };
+  // auto worker_task = [&opts](Task task) -> Result {
+  // return std::vector<std::byte>(opts.message_size, std::byte(task));
+  auto worker_task = [](Task task) -> Result { return task; };
 
   dynampi::Timer dynamic_timer;
   auto dynamic_communicator = make_dynamic_communicator(opts.remove_root_from_distribution);
@@ -103,8 +104,9 @@ static double run_single_benchmark(const BenchmarkOptions& opts) {
     if (work_distributer.is_root_manager()) {
       std::cout << "Dynamic task distribution completed successfully." << std::endl;
       const auto& stats = work_distributer.get_statistics();
-      for (size_t i = 0; i < stats.worker_task_counts.size(); i++) {
-        std::cout << "Rank " << i << ": " << "Tasks: " << stats.worker_task_counts[i] << std::endl;
+      for (size_t i = 0; i < stats.worker_task_counts->size(); i++) {
+        std::cout << "Rank " << i << ": " << "Tasks: " << stats.worker_task_counts->at(i)
+                  << std::endl;
       }
       std::cout << "Total messages sent: " << stats.comm_statistics.send_count << std::endl;
       std::cout << "Total messages received: " << stats.comm_statistics.recv_count << std::endl;
diff --git a/benchmark/aurora/aurora_compile.sh b/benchmark/aurora/aurora_compile.sh
new file mode 100755
index 0000000..7506333
--- /dev/null
+++ b/benchmark/aurora/aurora_compile.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+module load cmake
+
+# Default values (overridden by -d / -t)
+BUILD_DIR="build"
+BUILD_TYPE="Release"
+
+usage() {  # print help, then exit with status $1 (defaults to 1 for usage errors)
+    echo "Usage: $0 [-d build_dir] [-t build_type] [--clean]"
+    echo
+    echo "Options:"
+    echo "  -d DIR    Build directory (default: build)"
+    echo "  -t TYPE   Build type: Release, Debug, RelWithDebInfo, MinSizeRel (default: Release)"
+    echo "  --clean   Remove build directory before configuring"
+    exit "${1:-1}"
+}
+
+# Parse args; ${2:?...} aborts with a clear message when an option has no value
+CLEAN=0
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -d) BUILD_DIR="${2:?missing value for -d}"; shift 2 ;;
+        -t) BUILD_TYPE="${2:?missing value for -t}"; shift 2 ;;
+        --clean) CLEAN=1; shift ;;
+        -h|--help) usage 0 ;;
+        *) echo "Unknown option: $1" >&2; usage ;;
+    esac
+done
+
+if [[ $CLEAN -eq 1 && -d "$BUILD_DIR" ]]; then
+    echo "Cleaning $BUILD_DIR"
+    rm -rf "$BUILD_DIR"
+fi
+
+echo "Configuring with CMake..."
+cmake -DCMAKE_BUILD_TYPE="$BUILD_TYPE" -B "$BUILD_DIR"
+
+echo "Building..."
+cmake --build "$BUILD_DIR" --parallel "$(nproc)"
+
+echo "✅ Build finished in $BUILD_DIR ($BUILD_TYPE)"
diff --git a/benchmark/naive_shutdown_time.cpp b/benchmark/naive_shutdown_time.cpp
new file mode 100644
index 0000000..ac3d27a
--- /dev/null
+++ b/benchmark/naive_shutdown_time.cpp
@@ -0,0 +1,174 @@
+/*
+ * SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <mpi.h>
+
+#include <cstdint>
+#include <cxxopts.hpp>
+#include <dynampi/impl/naive_distributor.hpp>
+#include <dynampi/mpi/mpi_communicator.hpp>
+#include <dynampi/utilities/timer.hpp>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+using Task = uint32_t;
+using Result = uint32_t;
+
+struct BenchmarkOptions {
+  uint64_t nodes = 0;       // node count used to label output; 0 is replaced by the world size
+  std::string system;       // system label written to the CSV (--system)
+  std::string output_path;  // CSV file that results are appended to; empty = no file output
+};
+
+struct BenchmarkResult {
+  uint64_t workers = 0;               // world_size - 1, or 1 when there is a single rank
+  uint64_t world_size = 0;            // number of ranks in the benchmarked communicator
+  double time_per_shutdown_us = 0.0;  // mean finish_remaining_tasks() time on the root
+  uint64_t iterations = 0;            // distributor create/shutdown cycles that were timed
+};
+
+static void write_csv_header(std::ostream& os) {  // column order mirrors write_csv_row()
+  os << "system,nodes,world_size,workers,time_per_shutdown_us,iterations\n";
+}
+
+static void write_csv_row(std::ostream& os, const BenchmarkOptions& opts,
+                          const BenchmarkResult& result) {  // columns follow write_csv_header
+  os << opts.system << ',' << opts.nodes << ',' << result.world_size << ',' << result.workers
+     << ',' << result.time_per_shutdown_us << ',' << result.iterations << '\n';
+}
+
+static BenchmarkResult run_benchmark([[maybe_unused]] const BenchmarkOptions& opts, MPI_Comm comm) {
+  dynampi::MPICommunicator<> comm_wrapper(comm, dynampi::MPICommunicator<>::Ownership::Reference);
+  int rank = 0;
+  int size = 0;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  const uint64_t num_workers = (size == 1) ? 1 : static_cast<uint64_t>(size - 1);
+
+  // Identity worker: hands the task straight back, so no per-task work is part of the timing
+  auto worker_function = [](Task task) -> Result { return static_cast<Result>(task); };
+
+  MPI_Barrier(comm_wrapper);
+
+  // Repeat distributor create/shutdown cycles until target_duration_s of wall time has passed
+  dynampi::Timer overall_timer(dynampi::Timer::AutoStart::Yes);
+  const double target_duration_s = 10.0;
+
+  // Per-iteration timer; only the root manager starts and stops it
+  dynampi::Timer iteration_timer(dynampi::Timer::AutoStart::No);
+
+  double total_shutdown_time = 0.0;
+  uint64_t iterations = 0;
+
+  while (true) {
+    bool should_continue = overall_timer.elapsed().count() < target_duration_s;
+    comm_wrapper.broadcast(should_continue);  // presumably adopts the root's decision; confirm
+    if (!should_continue) {
+      break;
+    }
+    // Ensure all workers are ready
+    MPI_Barrier(comm_wrapper);
+
+    {
+      dynampi::NaiveMPIWorkDistributor<Task, Result> distributor(
+          worker_function, {.comm = comm, .manager_rank = 0, .auto_run_workers = true});
+
+      if (distributor.is_root_manager()) {
+        iteration_timer.reset(dynampi::Timer::AutoStart::Yes);
+        auto _ = distributor.finish_remaining_tasks();  // no tasks were queued before this call
+        (void)_;                                        // results are intentionally discarded
+        iteration_timer.stop();
+        total_shutdown_time += iteration_timer.elapsed().count();
+        iterations++;
+      }
+    }  // distributor goes out of scope here, before the barrier below
+
+    // Barrier to ensure all processes complete shutdown before next iteration
+    MPI_Barrier(comm_wrapper);
+  }
+
+  // Mean shutdown time scaled by 1e6 (assumes elapsed() reports seconds; TODO confirm)
+  const double avg_shutdown_time_us =
+      (iterations > 0) ? (total_shutdown_time / static_cast<double>(iterations)) * 1'000'000.0
+                       : 0.0;
+
+  return BenchmarkResult{num_workers, static_cast<uint64_t>(size), avg_shutdown_time_us,
+                         iterations};
+}
+
+int main(int argc, char** argv) {
+  MPI_Init(&argc, &argv);
+  int world_rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);  // used to restrict console output to rank 0
+
+  cxxopts::Options options("naive_shutdown_time",
+                           "Benchmark naive distributor shutdown time with no tasks");
+  options.add_options()("n,nodes", "Number of nodes for labeling output (defaults to world size)",
+                        cxxopts::value<uint64_t>()->default_value("0"))(
+      "S,system", "System label for plotting (frontier, aurora, ...)",
+      cxxopts::value<std::string>()->default_value(""))(
+      "o,output", "Append results to CSV file", cxxopts::value<std::string>()->default_value(""))(
+      "h,help", "Print usage");
+
+  cxxopts::ParseResult args;
+  try {
+    args = options.parse(argc, argv);
+  } catch (const std::exception& e) {
+    if (world_rank == 0) {
+      std::cerr << "Error parsing options: " << e.what() << "\n" << options.help() << std::endl;
+    }
+    MPI_Finalize();
+    return 1;
+  }
+
+  if (args.count("help")) {
+    if (world_rank == 0) {
+      std::cout << options.help() << std::endl;
+    }
+    MPI_Finalize();
+    return 0;
+  }
+
+  BenchmarkOptions opts;
+  opts.nodes = args["nodes"].as<uint64_t>();
+  opts.system = args["system"].as<std::string>();
+  opts.output_path = args["output"].as<std::string>();
+
+  {
+    MPI_Comm comm = MPI_COMM_WORLD;
+    int rank = 0;
+    int size = 0;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &size);
+    if (opts.nodes == 0) {
+      opts.nodes = static_cast<uint64_t>(size);  // --nodes omitted: label with the world size
+    }
+
+    BenchmarkResult result = run_benchmark(opts, comm);
+
+    if (rank == 0) {
+      std::cout << "RESULT"
+                << " nodes=" << opts.nodes << " world_size=" << result.world_size
+                << " workers=" << result.workers
+                << " time_per_shutdown_us=" << result.time_per_shutdown_us
+                << " iterations=" << result.iterations << std::endl;
+      if (!opts.output_path.empty()) {
+        std::ifstream check(opts.output_path);  // a missing or empty file still needs the header
+        const bool needs_header = !check || check.peek() == std::ifstream::traits_type::eof();
+        check.close();
+        std::ofstream out(opts.output_path, std::ios::app);
+        if (!out) std::cerr << "Error: cannot open " << opts.output_path << " for appending\n";
+        if (needs_header) {
+          write_csv_header(out);
+        }
+        write_csv_row(out, opts, result);
+      }
+    }
+  }
+  MPI_Finalize();
+  return 0;
+}
diff --git a/benchmark/pingpong.cpp b/benchmark/pingpong.cpp
new file mode 100644
index 0000000..daf72c3
--- /dev/null
+++ b/benchmark/pingpong.cpp
@@ -0,0 +1,474 @@
+// SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+// SPDX-License-Identifier: Apache-2.0
+
+// pingpong.cpp: pairwise MPI ping-pong benchmark (the usage text still calls it mpi_pair_bench)
+#ifdef _WIN32
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+#include <mpi.h>
+
+#include <algorithm>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <optional>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+enum class Method { SEND, ISEND, BSEND, SSEND };  // MPI_Send / MPI_Isend / MPI_Bsend / MPI_Ssend
+
+struct Options {
+  std::size_t min_bytes = 1;                   // smallest message size tested (--min-bytes)
+  std::size_t max_bytes = 1u << 25;            // 32 MiB
+  int factor = 2;                              // geometric progression; use 1 for linear
+  int warmup = 10;                             // untimed iterations per size (--warmup)
+  int iters = 100;                             // timed iterations per size (--iters)
+  int only_rank = -1;                          // if >=0, test only pairs involving this rank
+  std::vector<Method> methods;                 // default: all
+  std::string outfile = "mpi_pair_bench.csv";  // CSV output path (--outfile)
+};
+
+struct PingResult {  // only the sender fills these in; other ranks keep ping_once's -1.0
+  double avg_rtt_s;          // average round-trip time (per message)
+  double send_call_total_s;  // total time spent inside send() calls across timed iterations
+};
+
+static void die(int rank, const std::string &msg) {  // report on rank 0, then call MPI_Abort
+  if (rank == 0) std::cerr << "Error: " << msg << std::endl;
+  MPI_Abort(MPI_COMM_WORLD, 1);  // error code 1 on MPI_COMM_WORLD
+}
+
+static std::string method_name(Method m) {  // lowercase token, same spelling parse_method takes
+  const char *label = "?";  // fallback for a value outside the enum
+  if (m == Method::SEND) {
+    label = "send";
+  } else if (m == Method::ISEND) {
+    label = "isend";
+  } else if (m == Method::BSEND) {
+    label = "bsend";
+  } else if (m == Method::SSEND) {
+    label = "ssend";
+  }
+  return label;
+}
+
+static std::optional<Method> parse_method(const std::string &s) {
+  // Inverse of method_name(): accepts exactly "send", "isend", "bsend" or "ssend"
+  // (case-sensitive) and yields std::nullopt for anything else.
+  std::optional<Method> parsed;  // stays empty until a token matches
+  if (s == "send") {
+    parsed = Method::SEND;
+  } else if (s == "isend") {
+    parsed = Method::ISEND;
+  } else if (s == "bsend") {
+    parsed = Method::BSEND;
+  } else if (s == "ssend") {
+    parsed = Method::SSEND;
+  }
+  return parsed;
+}
+
+static void parse_unsigned_arg(int rank, int &i, int argc, char **argv, const char *name,
+                               std::size_t &out) {  // consumes argv[i + 1]; dies on bad input
+  if (i + 1 >= argc) die(rank, std::string("missing value for ") + name);
+  try {
+    std::size_t used = 0;  // characters consumed by std::stoull
+    out = std::stoull(argv[++i], &used);
+    if (argv[i][used] || std::strchr(argv[i], '-')) throw std::invalid_argument("expected digits");
+  } catch (const std::logic_error &e) {  // base of invalid_argument and out_of_range
+    die(rank, std::string("invalid value for ") + name + ": " + argv[i] + ": " + e.what());
+  }
+}
+
+static void parse_int_arg(int rank, int &i, int argc, char **argv, const char *name, int &out) {
+  if (i + 1 >= argc) die(rank, std::string("missing value for ") + name);
+  try {
+    std::size_t used = 0;  // characters consumed by std::stoi
+    out = std::stoi(argv[++i], &used);
+    if (argv[i][used] != '\0') throw std::invalid_argument("trailing characters");  // e.g. "10x"
+  } catch (const std::logic_error &e) {  // base of invalid_argument and out_of_range
+    die(rank, std::string("invalid value for ") + name + ": " + argv[i] + ": " + e.what());
+  }
+}
+
+static Options parse_args(int argc, char **argv, int rank) {  // invalid input ends in die()
+  Options opt;
+  bool methods_specified = false;  // false => all four methods are selected after the loop
+
+  for (int i = 1; i < argc; ++i) {
+    std::string a = argv[i];
+    if (a == "--min-bytes") {
+      parse_unsigned_arg(rank, i, argc, argv, "--min-bytes", opt.min_bytes);
+    } else if (a == "--max-bytes") {
+      parse_unsigned_arg(rank, i, argc, argv, "--max-bytes", opt.max_bytes);
+    } else if (a == "--factor") {
+      parse_int_arg(rank, i, argc, argv, "--factor", opt.factor);
+    } else if (a == "--warmup") {
+      parse_int_arg(rank, i, argc, argv, "--warmup", opt.warmup);
+    } else if (a == "--iters") {
+      parse_int_arg(rank, i, argc, argv, "--iters", opt.iters);
+    } else if (a == "--outfile") {
+      if (i + 1 >= argc) die(rank, "missing value for --outfile");
+      opt.outfile = argv[++i];
+    } else if (a == "--only-rank") {
+      parse_int_arg(rank, i, argc, argv, "--only-rank", opt.only_rank);
+    } else if (a == "--methods") {
+      if (i + 1 >= argc) die(rank, "missing value for --methods");
+      ++i;
+      methods_specified = true;
+      opt.methods.clear();  // a repeated --methods replaces the earlier list
+      std::string list = argv[i];
+      size_t start = 0;
+      while (start <= list.size()) {  // <= so a trailing comma yields an empty, rejected token
+        size_t comma = list.find(',', start);
+        std::string tok =
+            (comma == std::string::npos) ? list.substr(start) : list.substr(start, comma - start);
+        auto m = parse_method(tok);
+        if (!m) die(rank, "unknown method in --methods: " + tok);
+        opt.methods.push_back(*m);
+        if (comma == std::string::npos) break;
+        start = comma + 1;
+      }
+    } else if (a == "-h" || a == "--help") {
+      if (rank == 0) {
+        std::cout
+            << "MPI pairwise bandwidth/latency benchmark\n\n"
+               "Usage: mpirun -n <P> ./mpi_pair_bench [options]\n\n"
+               "Options:\n"
+               "  --min-bytes N        starting message size (default 1)\n"
+               "  --max-bytes N        maximum message size (default 33554432 = 32 MiB)\n"
+               "  --factor K           size multiplier per step (default 2; use 1 for linear)\n"
+               "  --warmup W           warmup iterations per size (default 10)\n"
+               "  --iters I            timed iterations per size (default 100)\n"
+               "  --methods LIST       subset of: send,isend,bsend,ssend (default: all)\n"
+               "  --only-rank R        only test pairs involving rank R (default: all pairs)\n"
+               "  --outfile PATH       CSV output file (default mpi_pair_bench.csv)\n";
+      }
+      MPI_Finalize();  // help is a clean exit, not an abort
+      std::exit(0);
+    } else {
+      die(rank, "unknown argument: " + a);
+    }
+  }
+
+  if (!methods_specified) {
+    opt.methods = {Method::SEND, Method::ISEND, Method::BSEND, Method::SSEND};
+  }
+  if (opt.min_bytes == 0) die(rank, "--min-bytes must be >= 1");
+  if (opt.max_bytes < opt.min_bytes) die(rank, "--max-bytes must be >= --min-bytes");
+  if (opt.max_bytes > INT_MAX) die(rank, "--max-bytes must be <= INT_MAX");  // MPI counts are int
+  if (opt.factor < 1) die(rank, "--factor must be >= 1");
+  if (opt.iters <= 0 || opt.warmup < 0) die(rank, "--iters must be >= 1 and --warmup >= 0");
+  return opt;
+}
+
+// What do_send() returns: the request of a non-blocking send plus the optional call duration
+struct SendResult {
+  MPI_Request request;  // MPI_REQUEST_NULL unless the method was ISEND
+  double elapsed_time;  // seconds spent inside the send call; 0.0 when timing was not requested
+};
+
+static SendResult do_send(Method method, const void *buf, int count, MPI_Datatype datatype,
+                          int dest, int tag, MPI_Comm comm, bool track_time) {
+  SendResult outcome{MPI_REQUEST_NULL, 0.0};  // request stays null unless MPI_Isend fills it
+  const double started = track_time ? MPI_Wtime() : 0.0;
+  switch (method) {
+    case Method::ISEND:
+      MPI_Isend(buf, count, datatype, dest, tag, comm, &outcome.request);
+      break;
+    case Method::SEND:
+      MPI_Send(buf, count, datatype, dest, tag, comm);
+      break;
+    case Method::BSEND:
+      MPI_Bsend(buf, count, datatype, dest, tag, comm);
+      break;
+    default:  // Method::SSEND
+      MPI_Ssend(buf, count, datatype, dest, tag, comm);
+  }
+  if (track_time) outcome.elapsed_time = MPI_Wtime() - started;  // the send call only
+  return outcome;
+}
+
+// Ping-pong between `sender` and `receiver`; only the sender times it (other ranks get -1.0):
+// sender:   for i: send(); recv();   then if isend -> Waitall
+// receiver: for i: recv(); send();   then if isend -> Waitall
+static PingResult ping_once(int sender, int receiver, int me, std::size_t bytes, int warmup,
+                            int iters, std::vector<char> &buf, Method method) {
+  const int tag = 4242;  // must stay <= 32767, the smallest MPI_TAG_UB the standard guarantees
+
+  // ---- Warmup (no timing) ----
+  if (warmup > 0) {
+    if (me == sender) {
+      std::vector<MPI_Request> sreq;
+      sreq.reserve(method == Method::ISEND ? warmup : 0);
+      for (int w = 0; w < warmup; ++w) {
+        auto send_res =
+            do_send(method, buf.data(), (int)bytes, MPI_CHAR, receiver, tag, MPI_COMM_WORLD, false);
+        if (method == Method::ISEND) {  // NOTE(review): buf is reused before Waitall completes it
+          sreq.push_back(send_res.request);
+        }
+        MPI_Recv(buf.data(), (int)bytes, MPI_CHAR, receiver, tag, MPI_COMM_WORLD,
+                 MPI_STATUS_IGNORE);
+      }
+      if (method == Method::ISEND && !sreq.empty())
+        MPI_Waitall((int)sreq.size(), sreq.data(), MPI_STATUSES_IGNORE);
+    } else if (me == receiver) {
+      std::vector<MPI_Request> sreq;
+      sreq.reserve(method == Method::ISEND ? warmup : 0);
+      for (int w = 0; w < warmup; ++w) {
+        MPI_Recv(buf.data(), (int)bytes, MPI_CHAR, sender, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        auto send_res =
+            do_send(method, buf.data(), (int)bytes, MPI_CHAR, sender, tag, MPI_COMM_WORLD, false);
+        if (method == Method::ISEND) {
+          sreq.push_back(send_res.request);
+        }
+      }
+      if (method == Method::ISEND && !sreq.empty())
+        MPI_Waitall((int)sreq.size(), sreq.data(), MPI_STATUSES_IGNORE);
+    }
+  }
+
+  // ---- Timed phase ----
+  PingResult res{-1.0, -1.0};  // sentinel returned unchanged by every rank except the sender
+  if (me == sender) {
+    std::vector<MPI_Request> sreq;
+    sreq.reserve(method == Method::ISEND ? iters : 0);
+    double send_call_total = 0.0;
+
+    double t0 = MPI_Wtime();
+    for (int i = 0; i < iters; ++i) {
+      auto send_res =
+          do_send(method, buf.data(), (int)bytes, MPI_CHAR, receiver, tag, MPI_COMM_WORLD, true);
+      send_call_total += send_res.elapsed_time;
+      if (method == Method::ISEND) {
+        sreq.push_back(send_res.request);
+      }
+
+      MPI_Recv(buf.data(), (int)bytes, MPI_CHAR, receiver, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    }
+    if (method == Method::ISEND && !sreq.empty())
+      MPI_Waitall((int)sreq.size(), sreq.data(), MPI_STATUSES_IGNORE);
+    double t1 = MPI_Wtime();  // taken after Waitall, so isend completion is part of the total
+
+    res.avg_rtt_s = (t1 - t0) / (double)iters;
+    res.send_call_total_s = send_call_total;
+  } else if (me == receiver) {
+    std::vector<MPI_Request> sreq;
+    sreq.reserve(method == Method::ISEND ? iters : 0);
+    for (int i = 0; i < iters; ++i) {
+      MPI_Recv(buf.data(), (int)bytes, MPI_CHAR, sender, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+      auto send_res =
+          do_send(method, buf.data(), (int)bytes, MPI_CHAR, sender, tag, MPI_COMM_WORLD, false);
+      if (method == Method::ISEND) {
+        sreq.push_back(send_res.request);
+      }
+    }
+    if (method == Method::ISEND && !sreq.empty())
+      MPI_Waitall((int)sreq.size(), sreq.data(), MPI_STATUSES_IGNORE);
+  }
+  return res;
+}
+
+// Benchmark driver: times ping-pong exchanges between rank pairs and writes a CSV report.
+//
+// All ranks walk the same (pair x message size x send method) schedule. For each entry the
+// pair (a, b) runs ping_once() in both directions, separated by world-wide barriers. The
+// lower rank of the pair formats both measurements as CSV rows; afterwards the per-rank CSV
+// text is gathered on rank 0, which writes it to opt.outfile below a header line.
+//
+// Returns 0 on success and 1 when launched with fewer than two ranks. Output-file failures
+// on rank 0 abort the job via MPI_Abort with error code 2.
+int main(int argc, char **argv) {
+  MPI_Init(&argc, &argv);
+  int world, me;
+  MPI_Comm_size(MPI_COMM_WORLD, &world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &me);
+
+  if (world < 2) {
+    if (me == 0) std::cerr << "Run with at least 2 ranks.\n";
+    MPI_Finalize();
+    return 1;
+  }
+
+  Options opt = parse_args(argc, argv, me);
+
+  if (opt.only_rank >= world) {
+    die(me, "--only-rank " + std::to_string(opt.only_rank) + " is out of range for world size " +
+                std::to_string(world));
+  }
+
+  // Message sizes are handed to MPI as int counts (see the casts below and in ping_once),
+  // so anything above INT_MAX would be silently truncated; reject it on every rank.
+  if (opt.max_bytes > static_cast<std::size_t>(INT_MAX)) {
+    die(me, "maximum message size " + std::to_string(opt.max_bytes) +
+                " exceeds INT_MAX, the largest count MPI accepts");
+  }
+
+  // Gather processor names for locality classification
+  char myname[MPI_MAX_PROCESSOR_NAME] = {};
+  int mylen = 0;
+  MPI_Get_processor_name(myname, &mylen);
+  // Size and index the name table in std::size_t so the products cannot overflow int.
+  const std::size_t name_stride = MPI_MAX_PROCESSOR_NAME;
+  std::vector<char> allnames(static_cast<std::size_t>(world) * name_stride, 0);
+  MPI_Allgather(myname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, allnames.data(), MPI_MAX_PROCESSOR_NAME,
+                MPI_CHAR, MPI_COMM_WORLD);
+  auto rank_name = [&](int r) -> std::string {
+    const char *p = &allnames[static_cast<std::size_t>(r) * name_stride];
+    return std::string(p);  // buffer is zero-padded
+  };
+
+  // Prepare message sizes: every size from min_bytes to max_bytes when factor == 1,
+  // otherwise a geometric progression starting at min_bytes and capped at max_bytes.
+  std::vector<std::size_t> sizes;
+  {
+    std::size_t s = opt.min_bytes;
+    if (opt.factor == 1) {
+      for (; s <= opt.max_bytes; ++s) sizes.push_back(s);
+    } else {
+      const std::size_t step = (std::size_t)opt.factor;
+      while (s <= opt.max_bytes) {
+        sizes.push_back(s);
+        if (s == 0) {
+          // 0 * step stays 0 and would loop forever; continue the progression from 1 byte.
+          s = 1;
+          continue;
+        }
+        // A step below 2 cannot grow the size (and 0 would divide by zero), so stop there.
+        // Otherwise stop before the multiplication could pass max_bytes or overflow.
+        if (step < 2 || s > opt.max_bytes / step) break;
+        s *= step;
+      }
+    }
+  }
+
+  // Reusable buffer
+  std::vector<char> buffer(opt.max_bytes, 0);
+
+  // Attach a Bsend buffer if BSEND is in use (supports one outstanding bsend at a time)
+  std::vector<char> bsend_storage;
+  const bool have_bsend =
+      std::find(opt.methods.begin(), opt.methods.end(), Method::BSEND) != opt.methods.end();
+  if (have_bsend) {
+    int pack_max = 0;
+    MPI_Pack_size((int)opt.max_bytes, MPI_CHAR, MPI_COMM_WORLD, &pack_max);
+    // MPI_Buffer_attach takes an int size: add the overhead in 64 bits and range-check it.
+    const long long bsend_bytes = static_cast<long long>(pack_max) + MPI_BSEND_OVERHEAD;
+    if (pack_max < 0 || bsend_bytes > INT_MAX) {
+      die(me, "Bsend buffer size does not fit in an int");
+    }
+    const int bsz = static_cast<int>(bsend_bytes);
+    bsend_storage.resize((size_t)bsz);
+    if (MPI_Buffer_attach(bsend_storage.data(), bsz) != MPI_SUCCESS) {
+      die(me, "MPI_Buffer_attach failed");
+    }
+  }
+
+  // CSV accumulation (only lower rank logs)
+  std::ostringstream local_csv;
+  // Precision is sticky, so set it once; it only affects the floating-point columns.
+  local_csv << std::setprecision(12);
+  auto add_line = [&](int src, int dst, const char *direction, const char *locality,
+                      std::size_t bytes, int iters, const PingResult &res, Method method) {
+    double latency_s = res.avg_rtt_s / 2.0;  // one-way latency taken as half the round trip
+    double bw_MBps = (2.0 * (double)bytes / res.avg_rtt_s) / 1.0e6;  // MB/s (1e6)
+    local_csv << src << ',' << dst << ',' << method_name(method) << ',' << direction << ','
+              << locality << ',' << bytes << ',' << iters << ',' << res.avg_rtt_s << ','
+              << latency_s << ',' << bw_MBps << ',' << res.send_call_total_s << '\n';
+  };
+
+  // With --only-rank unset (negative) every pair runs; otherwise only pairs containing it.
+  auto pair_is_enabled = [&](int a, int b) -> bool {
+    if (opt.only_rank < 0) return true;
+    return (a == opt.only_rank) || (b == opt.only_rank);
+  };
+
+  const int TAG_B_TO_A_RESULT = 88001;
+
+  // Main sweep: pairs × sizes × methods
+  // Every rank executes every iteration (including the barriers), whether or not it is
+  // part of the pair being measured.
+  for (int a = 0; a < world; ++a) {
+    for (int b = a + 1; b < world; ++b) {
+      if (!pair_is_enabled(a, b)) continue;
+
+      const bool same_node = (rank_name(a) == rank_name(b));
+      const char *locality = same_node ? "intranode" : "internode";
+
+      for (std::size_t bytes : sizes) {
+        for (Method m : opt.methods) {
+          // a->b
+          MPI_Barrier(MPI_COMM_WORLD);
+          PingResult rtt_ab = ping_once(a, b, me, bytes, opt.warmup, opt.iters, buffer, m);
+
+          // b->a
+          MPI_Barrier(MPI_COMM_WORLD);
+          PingResult rtt_b_to_a = ping_once(b, a, me, bytes, opt.warmup, opt.iters, buffer, m);
+
+          // Ship b->a sender's measurement to logger (rank a)
+          if (me == b) {
+            double payload[2] = {rtt_b_to_a.avg_rtt_s, rtt_b_to_a.send_call_total_s};
+            MPI_Send(payload, 2, MPI_DOUBLE, a, TAG_B_TO_A_RESULT, MPI_COMM_WORLD);
+          }
+
+          if (me == a) {
+            double payload[2];
+            MPI_Recv(payload, 2, MPI_DOUBLE, b, TAG_B_TO_A_RESULT, MPI_COMM_WORLD,
+                     MPI_STATUS_IGNORE);
+            PingResult rtt_b_to_a_from_b{payload[0], payload[1]};
+            add_line(a, b, "a->b", locality, bytes, opt.iters, rtt_ab, m);  // measured by a
+            add_line(a, b, "b->a", locality, bytes, opt.iters, rtt_b_to_a_from_b,
+                     m);  // measured by b
+          }
+
+          MPI_Barrier(MPI_COMM_WORLD);
+        }
+      }
+    }
+  }
+
+  // Hand the Bsend buffer back to the application before bsend_storage is destroyed.
+  if (have_bsend) {
+    void *bufptr = nullptr;
+    int size = 0;
+    MPI_Buffer_detach(&bufptr, &size);
+  }
+
+  // Gather CSV chunks to rank 0
+  // Step 1: collect every rank's chunk length so rank 0 can lay out the receive buffer.
+  std::string chunk = local_csv.str();
+  long long local_len = (long long)chunk.size();
+  std::vector<long long> all_lens(world, 0);
+  MPI_Gather(&local_len, 1, MPI_LONG_LONG, all_lens.data(), 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
+
+  // Step 2: rank 0 turns the lengths into the int counts/displacements MPI_Gatherv needs,
+  // refusing output that does not fit in an int. Other ranks leave these vectors empty.
+  std::vector<int> recvcounts, displs;
+  std::vector<char> recvbuf;
+  if (me == 0) {
+    recvcounts.resize(world);
+    displs.resize(world);
+    int64_t offset = 0;
+    for (int r = 0; r < world; ++r) {
+      if (all_lens[r] > INT_MAX) {
+        die(0, "CSV output too large for MPI_Gatherv");
+      }
+      recvcounts[r] = static_cast<int>(all_lens[r]);
+      displs[r] = static_cast<int>(offset);
+      offset += recvcounts[r];
+    }
+    if (offset > INT_MAX) {
+      die(0, "CSV output too large for MPI_Gatherv");
+    }
+    recvbuf.resize(static_cast<size_t>(offset));
+  }
+
+  // Step 3: concatenate the chunks in rank order on rank 0.
+  MPI_Gatherv(chunk.data(), (int)local_len, MPI_CHAR, recvbuf.data(), recvcounts.data(),
+              displs.data(), MPI_CHAR, 0, MPI_COMM_WORLD);
+
+  if (me == 0) {
+    std::string_view header =
+        "src_rank,dst_rank,method,direction,locality,msg_bytes,iters,avg_rtt_seconds,latency_"
+        "seconds,bandwidth_MBps,send_call_total_seconds\n";
+    FILE *fp = std::fopen(opt.outfile.c_str(), "wb");
+    if (!fp) {
+      std::cerr << "Failed to open output file: " << opt.outfile << std::endl;
+      MPI_Abort(MPI_COMM_WORLD, 2);
+      return 2;  // never touch the null FILE* should MPI_Abort return
+    }
+    // Once one step fails the remaining writes are skipped, but the file is always closed.
+    bool write_error = false;
+    if (std::fwrite(header.data(), 1, header.size(), fp) != header.size()) {
+      std::cerr << "Failed to write header to " << opt.outfile << std::endl;
+      write_error = true;
+    }
+    if (!write_error && !recvbuf.empty()) {
+      if (std::fwrite(recvbuf.data(), 1, recvbuf.size(), fp) != recvbuf.size()) {
+        std::cerr << "Failed to write data to " << opt.outfile << std::endl;
+        write_error = true;
+      }
+    }
+    if (!write_error && std::fflush(fp) != 0) {
+      std::cerr << "Failed to flush " << opt.outfile << std::endl;
+      write_error = true;
+    }
+    if (std::fclose(fp) != 0) {
+      std::cerr << "Failed to close " << opt.outfile << std::endl;
+      write_error = true;
+    }
+    if (write_error) {
+      MPI_Abort(MPI_COMM_WORLD, 2);
+      return 2;  // do not report success should MPI_Abort return
+    }
+    std::cout << "Wrote results to " << opt.outfile << std::endl;
+  }
+
+  MPI_Finalize();
+  return 0;
+}
diff --git a/benchmark/results/aurora/1-dynampi_shutdown_aurora_1-8273278.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/1-dynampi_shutdown_aurora_1-8273278.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..dba4d7f
--- /dev/null
+++ b/benchmark/results/aurora/1-dynampi_shutdown_aurora_1-8273278.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,1,102,101,56.2148,47544
diff --git a/benchmark/results/aurora/1-dynampi_ss_aurora_1-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/1-dynampi_ss_aurora_1-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..7b4a711
--- /dev/null
+++ b/benchmark/results/aurora/1-dynampi_ss_aurora_1-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,1,102,101,3292035,10,329203
+aurora,naive,fixed,10,10,1,102,101,3038956,10.0132,303495
+aurora,naive,fixed,100,10,1,102,101,3046127,10.0791,302222
+aurora,naive,fixed,1000,10,1,102,101,697881,10.0214,69638.8
+aurora,naive,fixed,10000,10,1,102,101,72117,10.0723,7159.91
+aurora,naive,fixed,100000,10,1,102,101,8130,10.2931,789.851
+aurora,naive,fixed,1000000,10,1,102,101,998,10.0016,99.7837
+aurora,naive,random,1,10,1,102,101,3372090,10,337207
+aurora,naive,random,10,10,1,102,101,3538674,10,353866
+aurora,naive,random,100,10,1,102,101,3737838,10.0001,373780
+aurora,naive,random,1000,10,1,102,101,688195,10.0871,68225.1
+aurora,naive,random,10000,10,1,102,101,74008,10.0392,7371.88
+aurora,naive,random,100000,10,1,102,101,8253,10.263,804.148
+aurora,naive,random,1000000,10,1,102,101,1037,11.9386,86.8612
+aurora,hierarchical,fixed,1,10,1,102,101,25600214,10.0285,2.55275e+06
+aurora,hierarchical,fixed,10,10,1,102,101,19734672,10.1209,1.94989e+06
+aurora,hierarchical,fixed,100,10,1,102,101,5325657,10.0002,532556
+aurora,hierarchical,fixed,1000,10,1,102,101,631484,10.1692,62097.9
+aurora,hierarchical,fixed,10000,10,1,102,101,65076,10.0286,6489.06
+aurora,hierarchical,fixed,100000,10,1,102,101,7275,10.2691,708.435
+aurora,hierarchical,fixed,1000000,10,1,102,101,864,10.0021,86.3822
+aurora,hierarchical,random,1,10,1,102,101,26725337,10,2.67252e+06
+aurora,hierarchical,random,10,10,1,102,101,18693436,10,1.86933e+06
+aurora,hierarchical,random,100,10,1,102,101,4084427,10.0003,408431
+aurora,hierarchical,random,1000,10,1,102,101,455397,10.0021,45530.1
+aurora,hierarchical,random,10000,10,1,102,101,46179,10.0178,4609.71
+aurora,hierarchical,random,100000,10,1,102,101,5248,10.51,499.332
+aurora,hierarchical,random,1000000,10,1,102,101,768,13.7015,56.0524
diff --git a/benchmark/results/aurora/128-dynampi_shutdown_aurora_128-8273285.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/128-dynampi_shutdown_aurora_128-8273285.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..800c81a
--- /dev/null
+++ b/benchmark/results/aurora/128-dynampi_shutdown_aurora_128-8273285.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,128,13056,13055,1.3148e+06,7
diff --git a/benchmark/results/aurora/128-dynampi_ss_aurora_128-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/128-dynampi_ss_aurora_128-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..cbd49a9
--- /dev/null
+++ b/benchmark/results/aurora/128-dynampi_ss_aurora_128-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,128,13056,13055,3759459,10.1804,369286
+aurora,naive,fixed,10,10,128,13056,13055,4135601,10.1817,406181
+aurora,naive,fixed,100,10,128,13056,13055,3857471,10.1774,379025
+aurora,naive,fixed,1000,10,128,13056,13055,4391850,10.2552,428256
+aurora,naive,fixed,10000,10,128,13056,13055,862438,11.5385,74744.2
+aurora,naive,fixed,100000,10,128,13056,13055,1247116,10.3682,120283
+aurora,naive,fixed,1000000,10,128,13056,13055,137318,12.6656,10841.8
+aurora,naive,random,1,10,128,13056,13055,3185549,10.2509,310758
+aurora,naive,random,10,10,128,13056,13055,4089673,10.0479,407019
+aurora,naive,random,100,10,128,13056,13055,4510086,10.2189,441349
+aurora,naive,random,1000,10,128,13056,13055,4382619,10.2858,426084
+aurora,naive,random,10000,10,128,13056,13055,813690,10.1783,79943.7
+aurora,naive,random,100000,10,128,13056,13055,1162931,10.4364,111430
+aurora,naive,random,1000000,10,128,13056,13055,111302,12.1867,9133.04
+aurora,hierarchical,fixed,1,10,128,13056,13055,143076556,10.2942,1.38988e+07
+aurora,hierarchical,fixed,10,10,128,13056,13055,138750651,10.2194,1.35772e+07
+aurora,hierarchical,fixed,100,10,128,13056,13055,139061615,10.2833,1.3523e+07
+aurora,hierarchical,fixed,1000,10,128,13056,13055,64695180,10.2085,6.33736e+06
+aurora,hierarchical,fixed,10000,10,128,13056,13055,8711435,10.2067,853504
+aurora,hierarchical,fixed,100000,10,128,13056,13055,1059270,10.4242,101616
+aurora,hierarchical,fixed,1000000,10,128,13056,13055,146950,12.3262,11921.8
+aurora,hierarchical,random,1,10,128,13056,13055,136173192,10.2541,1.32799e+07
+aurora,hierarchical,random,10,10,128,13056,13055,138530865,10.273,1.3485e+07
+aurora,hierarchical,random,100,10,128,13056,13055,138680080,10.3062,1.34559e+07
+aurora,hierarchical,random,1000,10,128,13056,13055,47825162,10.2542,4.66395e+06
+aurora,hierarchical,random,10000,10,128,13056,13055,5700593,10.2524,556028
+aurora,hierarchical,random,100000,10,128,13056,13055,759388,10.4392,72743.8
+aurora,hierarchical,random,1000000,10,128,13056,13055,101932,13.4684,7568.24
diff --git a/benchmark/results/aurora/16-dynampi_shutdown_aurora_16-8273282.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/16-dynampi_shutdown_aurora_16-8273282.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..477a342
--- /dev/null
+++ b/benchmark/results/aurora/16-dynampi_shutdown_aurora_16-8273282.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,16,1632,1631,65732.5,52
diff --git a/benchmark/results/aurora/16-dynampi_ss_aurora_16-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/16-dynampi_ss_aurora_16-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..4ab3dcc
--- /dev/null
+++ b/benchmark/results/aurora/16-dynampi_ss_aurora_16-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,16,1632,1631,926576,10.1276,91490.6
+aurora,naive,fixed,10,10,16,1632,1631,1014478,10.1828,99626.4
+aurora,naive,fixed,100,10,16,1632,1631,970322,10.2285,94864.7
+aurora,naive,fixed,1000,10,16,1632,1631,899363,10.1916,88245.9
+aurora,naive,fixed,10000,10,16,1632,1631,710556,10.0606,70627.4
+aurora,naive,fixed,100000,10,16,1632,1631,155556,10.3381,15046.9
+aurora,naive,fixed,1000000,10,16,1632,1631,17505,11.1251,1573.46
+aurora,naive,random,1,10,16,1632,1631,958883,10.0515,95396.8
+aurora,naive,random,10,10,16,1632,1631,1057683,10.1523,104182
+aurora,naive,random,100,10,16,1632,1631,1100268,10.1276,108640
+aurora,naive,random,1000,10,16,1632,1631,921259,10.194,90372.8
+aurora,naive,random,10000,10,16,1632,1631,787837,10.2133,77138.1
+aurora,naive,random,100000,10,16,1632,1631,157827,10.4086,15163.2
+aurora,naive,random,1000000,10,16,1632,1631,17253,11.9111,1448.48
+aurora,hierarchical,fixed,1,10,16,1632,1631,95909487,10.0591,9.53461e+06
+aurora,hierarchical,fixed,10,10,16,1632,1631,96384034,10.4675,9.20791e+06
+aurora,hierarchical,fixed,100,10,16,1632,1631,62607757,10.2033,6.136e+06
+aurora,hierarchical,fixed,1000,10,16,1632,1631,10707158,10.2557,1.04402e+06
+aurora,hierarchical,fixed,10000,10,16,1632,1631,1173716,10.276,114219
+aurora,hierarchical,fixed,100000,10,16,1632,1631,132710,10.2848,12903.6
+aurora,hierarchical,fixed,1000000,10,16,1632,1631,17872,12.4028,1440.96
+aurora,hierarchical,random,1,10,16,1632,1631,97037464,10.1521,9.55841e+06
+aurora,hierarchical,random,10,10,16,1632,1631,94803173,10.2254,9.27134e+06
+aurora,hierarchical,random,100,10,16,1632,1631,54551098,10.1787,5.35933e+06
+aurora,hierarchical,random,1000,10,16,1632,1631,7264168,10.2942,705658
+aurora,hierarchical,random,10000,10,16,1632,1631,767200,10.2683,74715.7
+aurora,hierarchical,random,100000,10,16,1632,1631,94483,10.6022,8911.62
+aurora,hierarchical,random,1000000,10,16,1632,1631,12728,13.4645,945.3
diff --git a/benchmark/results/aurora/2-dynampi_shutdown_aurora_2-8273279.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/2-dynampi_shutdown_aurora_2-8273279.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..5a96d51
--- /dev/null
+++ b/benchmark/results/aurora/2-dynampi_shutdown_aurora_2-8273279.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,2,204,203,232.767,16092
diff --git a/benchmark/results/aurora/2-dynampi_ss_aurora_2-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/2-dynampi_ss_aurora_2-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..0ed3c02
--- /dev/null
+++ b/benchmark/results/aurora/2-dynampi_ss_aurora_2-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,2,204,203,2320234,10,232023
+aurora,naive,fixed,10,10,2,204,203,2323683,10.0797,230532
+aurora,naive,fixed,100,10,2,204,203,2432285,10.1231,240271
+aurora,naive,fixed,1000,10,2,204,203,963304,10.052,95832.5
+aurora,naive,fixed,10000,10,2,204,203,122452,10.0034,12241
+aurora,naive,fixed,100000,10,2,204,203,16835,10.2284,1645.9
+aurora,naive,fixed,1000000,10,2,204,203,2200,11.0373,199.324
+aurora,naive,random,1,10,2,204,203,2558997,10.0916,253576
+aurora,naive,random,10,10,2,204,203,2852009,10,285201
+aurora,naive,random,100,10,2,204,203,2704860,10.0422,269349
+aurora,naive,random,1000,10,2,204,203,1015491,10.2629,98947.6
+aurora,naive,random,10000,10,2,204,203,122936,10.0804,12195.5
+aurora,naive,random,100000,10,2,204,203,17047,10.3119,1653.14
+aurora,naive,random,1000000,10,2,204,203,2137,11.8315,180.62
+aurora,hierarchical,fixed,1,10,2,204,203,35501354,10.1075,3.51236e+06
+aurora,hierarchical,fixed,10,10,2,204,203,33540165,10.203,3.28728e+06
+aurora,hierarchical,fixed,100,10,2,204,203,10089910,10.0002,1.00897e+06
+aurora,hierarchical,fixed,1000,10,2,204,203,1213536,10.1795,119214
+aurora,hierarchical,fixed,10000,10,2,204,203,129458,10.1806,12716.2
+aurora,hierarchical,fixed,100000,10,2,204,203,14734,10.2973,1430.86
+aurora,hierarchical,fixed,1000000,10,2,204,203,1929,12.671,152.237
+aurora,hierarchical,random,1,10,2,204,203,37185358,10.2636,3.62302e+06
+aurora,hierarchical,random,10,10,2,204,203,31345762,10.1917,3.07562e+06
+aurora,hierarchical,random,100,10,2,204,203,7656038,10.019,764152
+aurora,hierarchical,random,1000,10,2,204,203,857171,10.0783,85050.8
+aurora,hierarchical,random,10000,10,2,204,203,89629,10.1354,8843.2
+aurora,hierarchical,random,100000,10,2,204,203,10820,10.5683,1023.82
+aurora,hierarchical,random,1000000,10,2,204,203,1568,13.527,115.916
diff --git a/benchmark/results/aurora/256-dynampi_shutdown_aurora_256-8273286.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/256-dynampi_shutdown_aurora_256-8273286.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..9697998
--- /dev/null
+++ b/benchmark/results/aurora/256-dynampi_shutdown_aurora_256-8273286.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,256,26112,26111,3.22161e+06,4
diff --git a/benchmark/results/aurora/256-dynampi_ss_aurora_256-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/256-dynampi_ss_aurora_256-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..72326d3
--- /dev/null
+++ b/benchmark/results/aurora/256-dynampi_ss_aurora_256-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,256,26112,26111,4619072,11.966,386017
+aurora,naive,fixed,10,10,256,26112,26111,4641837,12.4846,371806
+aurora,naive,fixed,100,10,256,26112,26111,3237586,10.3013,314288
+aurora,naive,fixed,1000,10,256,26112,26111,4237900,10.2809,412212
+aurora,naive,fixed,10000,10,256,26112,26111,795236,10.2324,77717.2
+aurora,naive,fixed,100000,10,256,26112,26111,1280802,11.8403,108173
+aurora,naive,fixed,1000000,10,256,26112,26111,266610,11.6867,22813.1
+aurora,naive,random,1,10,256,26112,26111,4781788,13.4572,355334
+aurora,naive,random,10,10,256,26112,26111,5233230,12.882,406243
+aurora,naive,random,100,10,256,26112,26111,3550527,10.2433,346619
+aurora,naive,random,1000,10,256,26112,26111,4100379,10.2172,401320
+aurora,naive,random,10000,10,256,26112,26111,685419,11.7954,58108.9
+aurora,naive,random,100000,10,256,26112,26111,2266274,11.2013,202322
+aurora,naive,random,1000000,10,256,26112,26111,171232,11.9497,14329.4
+aurora,hierarchical,fixed,1,10,256,26112,26111,145731858,10.23,1.42456e+07
+aurora,hierarchical,fixed,10,10,256,26112,26111,144252187,10.248,1.40761e+07
+aurora,hierarchical,fixed,100,10,256,26112,26111,150907610,10.2015,1.47927e+07
+aurora,hierarchical,fixed,1000,10,256,26112,26111,100576835,10.2351,9.82667e+06
+aurora,hierarchical,fixed,10000,10,256,26112,26111,15747075,10.2446,1.53711e+06
+aurora,hierarchical,fixed,100000,10,256,26112,26111,2070254,10.5322,196564
+aurora,hierarchical,fixed,1000000,10,256,26112,26111,292292,12.2982,23767
+aurora,hierarchical,random,1,10,256,26112,26111,147387588,10.2316,1.44052e+07
+aurora,hierarchical,random,10,10,256,26112,26111,151057427,10.1957,1.48158e+07
+aurora,hierarchical,random,100,10,256,26112,26111,149040594,10.2656,1.45185e+07
+aurora,hierarchical,random,1000,10,256,26112,26111,78431231,10.2522,7.65015e+06
+aurora,hierarchical,random,10000,10,256,26112,26111,10525990,10.2869,1.02324e+06
+aurora,hierarchical,random,100000,10,256,26112,26111,1501539,10.4575,143585
+aurora,hierarchical,random,1000000,10,256,26112,26111,207806,15.6579,13271.6
diff --git a/benchmark/results/aurora/32-dynampi_shutdown_aurora_32-8273283.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/32-dynampi_shutdown_aurora_32-8273283.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..a7ade02
--- /dev/null
+++ b/benchmark/results/aurora/32-dynampi_shutdown_aurora_32-8273283.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,32,3264,3263,173000,23
diff --git a/benchmark/results/aurora/32-dynampi_ss_aurora_32-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/32-dynampi_ss_aurora_32-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..3819060
--- /dev/null
+++ b/benchmark/results/aurora/32-dynampi_ss_aurora_32-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,32,3264,3263,1971708,10.203,193248
+aurora,naive,fixed,10,10,32,3264,3263,1874286,10.2216,183366
+aurora,naive,fixed,100,10,32,3264,3263,1998433,10.2637,194709
+aurora,naive,fixed,1000,10,32,3264,3263,1780555,10.2429,173832
+aurora,naive,fixed,10000,10,32,3264,3263,1574554,10.3053,152791
+aurora,naive,fixed,100000,10,32,3264,3263,303456,10.4089,29153.6
+aurora,naive,fixed,1000000,10,32,3264,3263,34672,11.1478,3110.22
+aurora,naive,random,1,10,32,3264,3263,2053313,10.1821,201659
+aurora,naive,random,10,10,32,3264,3263,2051489,10.1645,201828
+aurora,naive,random,100,10,32,3264,3263,1986143,10.204,194643
+aurora,naive,random,1000,10,32,3264,3263,1648332,10.2367,161023
+aurora,naive,random,10000,10,32,3264,3263,1562902,10.2551,152402
+aurora,naive,random,100000,10,32,3264,3263,301658,10.4567,28848.4
+aurora,naive,random,1000000,10,32,3264,3263,34149,12.0491,2834.16
+aurora,hierarchical,fixed,1,10,32,3264,3263,109144885,10.268,1.06296e+07
+aurora,hierarchical,fixed,10,10,32,3264,3263,108544974,10.203,1.06385e+07
+aurora,hierarchical,fixed,100,10,32,3264,3263,89223525,10.189,8.75682e+06
+aurora,hierarchical,fixed,1000,10,32,3264,3263,19786955,10.189,1.942e+06
+aurora,hierarchical,fixed,10000,10,32,3264,3263,2232458,10.3681,215319
+aurora,hierarchical,fixed,100000,10,32,3264,3263,264786,10.6206,24931.3
+aurora,hierarchical,fixed,1000000,10,32,3264,3263,35784,12.2509,2920.93
+aurora,hierarchical,random,1,10,32,3264,3263,110723722,10.2266,1.08271e+07
+aurora,hierarchical,random,10,10,32,3264,3263,107073128,10.0836,1.06186e+07
+aurora,hierarchical,random,100,10,32,3264,3263,84979742,10.2703,8.27432e+06
+aurora,hierarchical,random,1000,10,32,3264,3263,13545663,10.1751,1.33126e+06
+aurora,hierarchical,random,10000,10,32,3264,3263,1483126,10.3192,143725
+aurora,hierarchical,random,100000,10,32,3264,3263,189678,10.4472,18155.9
+aurora,hierarchical,random,1000000,10,32,3264,3263,25876,15.121,1711.27
diff --git a/benchmark/results/aurora/4-dynampi_shutdown_aurora_4-8273280.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/4-dynampi_shutdown_aurora_4-8273280.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..cb008d4
--- /dev/null
+++ b/benchmark/results/aurora/4-dynampi_shutdown_aurora_4-8273280.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,4,408,407,415.061,11331
diff --git a/benchmark/results/aurora/4-dynampi_ss_aurora_4-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/4-dynampi_ss_aurora_4-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..176597d
--- /dev/null
+++ b/benchmark/results/aurora/4-dynampi_ss_aurora_4-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,4,408,407,2770958,10.1144,273961
+aurora,naive,fixed,10,10,4,408,407,2728827,10.0002,272878
+aurora,naive,fixed,100,10,4,408,407,2656957,10.0001,265693
+aurora,naive,fixed,1000,10,4,408,407,2319196,10.0363,231081
+aurora,naive,fixed,10000,10,4,408,407,295337,10.0086,29508.4
+aurora,naive,fixed,100000,10,4,408,407,39834,10.159,3921.07
+aurora,naive,fixed,1000000,10,4,408,407,4409,11.0021,400.742
+aurora,naive,random,1,10,4,408,407,2775929,10,277593
+aurora,naive,random,10,10,4,408,407,3080768,10,308076
+aurora,naive,random,100,10,4,408,407,3260474,10.0029,325952
+aurora,naive,random,1000,10,4,408,407,2255570,10.0497,224442
+aurora,naive,random,10000,10,4,408,407,311848,10.0166,31133.3
+aurora,naive,random,100000,10,4,408,407,40522,10.1908,3976.34
+aurora,naive,random,1000000,10,4,408,407,4325,11.8484,365.027
+aurora,hierarchical,fixed,1,10,4,408,407,63171445,10.0368,6.29397e+06
+aurora,hierarchical,fixed,10,10,4,408,407,59782951,10.0001,5.97826e+06
+aurora,hierarchical,fixed,100,10,4,408,407,24326641,10.0249,2.42663e+06
+aurora,hierarchical,fixed,1000,10,4,408,407,3287894,10.0021,328721
+aurora,hierarchical,fixed,10000,10,4,408,407,340081,10.0203,33939.1
+aurora,hierarchical,fixed,100000,10,4,408,407,37755,10.2044,3699.88
+aurora,hierarchical,fixed,1000000,10,4,408,407,4017,12.0263,334.019
+aurora,hierarchical,random,1,10,4,408,407,62514471,10,6.25142e+06
+aurora,hierarchical,random,10,10,4,408,407,60507379,10.0001,6.0507e+06
+aurora,hierarchical,random,100,10,4,408,407,20013832,10.2073,1.96073e+06
+aurora,hierarchical,random,1000,10,4,408,407,2236408,10.0036,223560
+aurora,hierarchical,random,10000,10,4,408,407,233038,10.079,23121.2
+aurora,hierarchical,random,100000,10,4,408,407,25626,10.2339,2504.03
+aurora,hierarchical,random,1000000,10,4,408,407,3216,13.9369,230.755
diff --git a/benchmark/results/aurora/512-dynampi_shutdown_aurora_512-8273287.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/512-dynampi_shutdown_aurora_512-8273287.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..c86ec4e
--- /dev/null
+++ b/benchmark/results/aurora/512-dynampi_shutdown_aurora_512-8273287.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,512,52224,52223,1.32215e+07,1
diff --git a/benchmark/results/aurora/512-dynampi_ss_aurora_512-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/512-dynampi_ss_aurora_512-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..30a709d
--- /dev/null
+++ b/benchmark/results/aurora/512-dynampi_ss_aurora_512-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,512,52224,52223,4097144,31.191,131357
+aurora,naive,fixed,10,10,512,52224,52223,3723083,25.4862,146082
+aurora,naive,fixed,100,10,512,52224,52223,2141347,17.8665,119853
+aurora,naive,fixed,1000,10,512,52224,52223,209409,13.8726,15095.2
+aurora,naive,fixed,10000,10,512,52224,52223,209242,14.9043,14039.1
+aurora,naive,fixed,100000,10,512,52224,52223,209197,13.2934,15736.9
+aurora,naive,fixed,1000000,10,512,52224,52223,209095,29.7586,7026.37
+aurora,naive,random,1,10,512,52224,52223,4088630,29.857,136940
+aurora,naive,random,10,10,512,52224,52223,4582756,26.5137,172845
+aurora,naive,random,100,10,512,52224,52223,2340349,16.9888,137759
+aurora,naive,random,1000,10,512,52224,52223,209282,13.3642,15659.9
+aurora,naive,random,10000,10,512,52224,52223,209134,13.8665,15082
+aurora,naive,random,100000,10,512,52224,52223,209302,12.8027,16348.3
+aurora,naive,random,1000000,10,512,52224,52223,209204,51.0611,4097.13
+aurora,hierarchical,fixed,1,10,512,52224,52223,142287911,10.2518,1.38792e+07
+aurora,hierarchical,fixed,10,10,512,52224,52223,139946758,10.387,1.34733e+07
+aurora,hierarchical,fixed,100,10,512,52224,52223,139594506,10.2802,1.3579e+07
+aurora,hierarchical,fixed,1000,10,512,52224,52223,107690133,10.2897,1.04658e+07
+aurora,hierarchical,fixed,10000,10,512,52224,52223,24945068,10.3131,2.41878e+06
+aurora,hierarchical,fixed,100000,10,512,52224,52223,3685997,10.4885,351433
+aurora,hierarchical,fixed,1000000,10,512,52224,52223,573943,12.435,46155.5
+aurora,hierarchical,random,1,10,512,52224,52223,138737200,10.2986,1.34715e+07
+aurora,hierarchical,random,10,10,512,52224,52223,139848889,10.2705,1.36165e+07
+aurora,hierarchical,random,100,10,512,52224,52223,141593060,10.3642,1.36617e+07
+aurora,hierarchical,random,1000,10,512,52224,52223,94863386,10.3386,9.17563e+06
+aurora,hierarchical,random,10000,10,512,52224,52223,17146858,10.2664,1.67019e+06
+aurora,hierarchical,random,100000,10,512,52224,52223,2780472,10.6128,261993
+aurora,hierarchical,random,1000000,10,512,52224,52223,416372,16.0025,26019.2
diff --git a/benchmark/results/aurora/64-dynampi_shutdown_aurora_64-8273284.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/64-dynampi_shutdown_aurora_64-8273284.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..58b6b20
--- /dev/null
+++ b/benchmark/results/aurora/64-dynampi_shutdown_aurora_64-8273284.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,64,6528,6527,429295,12
diff --git a/benchmark/results/aurora/64-dynampi_ss_aurora_64-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/64-dynampi_ss_aurora_64-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..82b0839
--- /dev/null
+++ b/benchmark/results/aurora/64-dynampi_ss_aurora_64-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,64,6528,6527,3153458,10.2011,309128
+aurora,naive,fixed,10,10,64,6528,6527,3070733,10.1625,302162
+aurora,naive,fixed,100,10,64,6528,6527,3012121,10.2078,295081
+aurora,naive,fixed,1000,10,64,6528,6527,3005823,10.2481,293306
+aurora,naive,fixed,10000,10,64,6528,6527,1140339,10.3641,110028
+aurora,naive,fixed,100000,10,64,6528,6527,373045,10.9557,34050.2
+aurora,naive,fixed,1000000,10,64,6528,6527,69040,11.3344,6091.17
+aurora,naive,random,1,10,64,6528,6527,2972987,10.1104,294054
+aurora,naive,random,10,10,64,6528,6527,3188762,10.3839,307087
+aurora,naive,random,100,10,64,6528,6527,3508625,10.2335,342856
+aurora,naive,random,1000,10,64,6528,6527,3127984,10.2974,303764
+aurora,naive,random,10000,10,64,6528,6527,1206366,10.6337,113447
+aurora,naive,random,100000,10,64,6528,6527,289137,10.5432,27424.1
+aurora,naive,random,1000000,10,64,6528,6527,68086,12.1331,5611.58
+aurora,hierarchical,fixed,1,10,64,6528,6527,117185788,10.4487,1.12154e+07
+aurora,hierarchical,fixed,10,10,64,6528,6527,113120908,10.2455,1.1041e+07
+aurora,hierarchical,fixed,100,10,64,6528,6527,106274811,10.2038,1.04152e+07
+aurora,hierarchical,fixed,1000,10,64,6528,6527,33179042,10.257,3.23478e+06
+aurora,hierarchical,fixed,10000,10,64,6528,6527,3832028,10.269,373163
+aurora,hierarchical,fixed,100000,10,64,6528,6527,483621,10.5397,45885.6
+aurora,hierarchical,fixed,1000000,10,64,6528,6527,72473,12.2945,5894.73
+aurora,hierarchical,random,1,10,64,6528,6527,114058699,10.4063,1.09605e+07
+aurora,hierarchical,random,10,10,64,6528,6527,113958357,10.2227,1.11475e+07
+aurora,hierarchical,random,100,10,64,6528,6527,100452348,10.475,9.58975e+06
+aurora,hierarchical,random,1000,10,64,6528,6527,22216175,10.2638,2.16452e+06
+aurora,hierarchical,random,10000,10,64,6528,6527,2512161,10.2109,246028
+aurora,hierarchical,random,100000,10,64,6528,6527,358523,10.4799,34210.6
+aurora,hierarchical,random,1000000,10,64,6528,6527,48783,13.4007,3640.33
diff --git a/benchmark/results/aurora/8-dynampi_shutdown_aurora_8-8273281.aurora/naive_shutdown_aurora.csv b/benchmark/results/aurora/8-dynampi_shutdown_aurora_8-8273281.aurora/naive_shutdown_aurora.csv
new file mode 100644
index 0000000..f08ac80
--- /dev/null
+++ b/benchmark/results/aurora/8-dynampi_shutdown_aurora_8-8273281.aurora/naive_shutdown_aurora.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+aurora,8,816,815,1716.62,1756
diff --git a/benchmark/results/aurora/8-dynampi_ss_aurora_8-manual/strong_scaling_aurora.csv b/benchmark/results/aurora/8-dynampi_ss_aurora_8-manual/strong_scaling_aurora.csv
new file mode 100644
index 0000000..aed3c39
--- /dev/null
+++ b/benchmark/results/aurora/8-dynampi_ss_aurora_8-manual/strong_scaling_aurora.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+aurora,naive,fixed,1,10,8,816,815,1228409,10.075,121926
+aurora,naive,fixed,10,10,8,816,815,1206907,10.0815,119715
+aurora,naive,fixed,100,10,8,816,815,1227390,10.0271,122407
+aurora,naive,fixed,1000,10,8,816,815,1092887,10.032,108940
+aurora,naive,fixed,10000,10,8,816,815,414988,10.0574,41262.2
+aurora,naive,fixed,100000,10,8,816,815,79431,10.0807,7879.55
+aurora,naive,fixed,1000000,10,8,816,815,8059,10.0985,798.039
+aurora,naive,random,1,10,8,816,815,1234459,10.0221,123174
+aurora,naive,random,10,10,8,816,815,1504660,10.1793,147815
+aurora,naive,random,100,10,8,816,815,1429906,10.1001,141574
+aurora,naive,random,1000,10,8,816,815,1164652,10.1162,115127
+aurora,naive,random,10000,10,8,816,815,434861,10.1444,42867.1
+aurora,naive,random,100000,10,8,816,815,79430,10.2573,7743.73
+aurora,naive,random,1000000,10,8,816,815,8670,12.031,720.641
+aurora,hierarchical,fixed,1,10,8,816,815,79011039,10.2284,7.72464e+06
+aurora,hierarchical,fixed,10,10,8,816,815,77370138,10.1479,7.62428e+06
+aurora,hierarchical,fixed,100,10,8,816,815,40474774,10.1982,3.9688e+06
+aurora,hierarchical,fixed,1000,10,8,816,815,6013964,10.2077,589158
+aurora,hierarchical,fixed,10000,10,8,816,815,634971,10.241,62002.7
+aurora,hierarchical,fixed,100000,10,8,816,815,70335,10.3061,6824.63
+aurora,hierarchical,fixed,1000000,10,8,816,815,8884,12.129,732.461
+aurora,hierarchical,random,1,10,8,816,815,77582110,10.151,7.64282e+06
+aurora,hierarchical,random,10,10,8,816,815,75719678,10.1162,7.485e+06
+aurora,hierarchical,random,100,10,8,816,815,33532886,10.0429,3.33897e+06
+aurora,hierarchical,random,1000,10,8,816,815,4024376,10.0437,400685
+aurora,hierarchical,random,10000,10,8,816,815,422833,10.0998,41865.5
+aurora,hierarchical,random,100000,10,8,816,815,49633,10.3575,4791.96
+aurora,hierarchical,random,1000000,10,8,816,815,6340,15.0156,422.228
diff --git a/benchmark/results/frontier/1-dynampi_shutdown_frontier_1-4058166/naive_shutdown_frontier.csv b/benchmark/results/frontier/1-dynampi_shutdown_frontier_1-4058166/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..22c0f23
--- /dev/null
+++ b/benchmark/results/frontier/1-dynampi_shutdown_frontier_1-4058166/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,1,56,55,24.2909,165568
diff --git a/benchmark/results/frontier/1-dynampi_ss_frontier_1-4054467/strong_scaling_frontier.csv b/benchmark/results/frontier/1-dynampi_ss_frontier_1-4054467/strong_scaling_frontier.csv
new file mode 100644
index 0000000..df6e1ad
--- /dev/null
+++ b/benchmark/results/frontier/1-dynampi_ss_frontier_1-4054467/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_ns,round_target_ms,duration_s,nodes,world_size,workers,total_tasks,total_subtasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1000,200,10,1,56,55,23186674,23186674,18.7768,1.23486e+06
+frontier,naive,fixed,10000,200,10,1,56,55,13768572,13768572,10.8496,1.26904e+06
+frontier,naive,fixed,100000,200,10,1,56,55,5477490,5477490,10.1831,537900
+frontier,naive,fixed,1000000,200,10,1,56,55,547414,547414,10.1623,53867.1
+frontier,naive,fixed,10000000,200,10,1,56,55,46916,46916,10.1724,4612.08
+frontier,naive,fixed,100000000,200,10,1,56,55,2854,2854,10.1013,282.539
+frontier,naive,fixed,1000000000,200,10,1,56,55,550,550,10.0013,54.993
+frontier,naive,poisson,1000,200,10,1,56,55,23350494,23350494,18.7675,1.2442e+06
+frontier,naive,poisson,10000,200,10,1,56,55,13699510,13699510,10.8446,1.26325e+06
+frontier,naive,poisson,100000,200,10,1,56,55,5440040,5440040,10.1841,534169
+frontier,naive,poisson,1000000,200,10,1,56,55,547428,547428,10.1661,53848.4
+frontier,naive,poisson,10000000,200,10,1,56,55,46950,46950,10.1725,4615.37
+frontier,naive,poisson,100000000,200,10,1,56,55,2910,2910,10.3,282.524
+frontier,naive,poisson,1000000000,200,10,1,56,55,550,550,10.0013,54.993
+frontier,hierarchical,fixed,1000,200,10,1,56,55,103808000,103808000,11.1631,9.29918e+06
+frontier,hierarchical,fixed,10000,200,10,1,56,55,35839000,35839000,10.2613,3.49263e+06
+frontier,hierarchical,fixed,100000,200,10,1,56,55,2787000,2787000,10.3762,268594
+frontier,hierarchical,fixed,1000000,200,10,1,56,55,110000,110000,10.0183,10979.9
+frontier,hierarchical,fixed,10000000,200,10,1,56,55,7700,7700,11.0621,696.071
+frontier,hierarchical,fixed,100000000,200,10,1,56,55,770,770,11.2013,68.7421
+frontier,hierarchical,fixed,1000000000,200,10,1,56,55,110,110,16.0005,6.87479
+frontier,hierarchical,poisson,1000,200,10,1,56,55,103554000,103554000,11.1279,9.30583e+06
+frontier,hierarchical,poisson,10000,200,10,1,56,55,34642000,34642000,10.2254,3.38783e+06
+frontier,hierarchical,poisson,100000,200,10,1,56,55,2744000,2744000,10.2129,268680
+frontier,hierarchical,poisson,1000000,200,10,1,56,55,110000,110000,10.0234,10974.3
+frontier,hierarchical,poisson,10000000,200,10,1,56,55,7700,7700,11.0625,696.047
+frontier,hierarchical,poisson,100000000,200,10,1,56,55,770,770,11.2009,68.7445
+frontier,hierarchical,poisson,1000000000,200,10,1,56,55,110,110,16.0004,6.87481
diff --git a/benchmark/results/frontier/1-dynampi_ss_frontier_1-4054787/strong_scaling_frontier.csv b/benchmark/results/frontier/1-dynampi_ss_frontier_1-4054787/strong_scaling_frontier.csv
new file mode 100644
index 0000000..7761c24
--- /dev/null
+++ b/benchmark/results/frontier/1-dynampi_ss_frontier_1-4054787/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,1,56,55,14009820,10.0002,1.40096e+06
+frontier,naive,fixed,10,10,1,56,55,14745060,10.0001,1.47449e+06
+frontier,naive,fixed,100,10,1,56,55,4609660,10.0003,460952
+frontier,naive,fixed,1000,10,1,56,55,539110,10.0021,53899.9
+frontier,naive,fixed,10000,10,1,56,55,55000,10.0248,5486.39
+frontier,naive,fixed,100000,10,1,56,55,2964,10.3015,287.725
+frontier,naive,fixed,1000000,10,1,56,55,444,13.0005,34.1525
+frontier,naive,random,1,10,1,56,55,13885960,10.0001,1.38858e+06
+frontier,naive,random,10,10,1,56,55,13543750,10.0002,1.35435e+06
+frontier,naive,random,100,10,1,56,55,3353020,10.0006,335281
+frontier,naive,random,1000,10,1,56,55,347930,10.0048,34776.2
+frontier,naive,random,10000,10,1,56,55,35090,10.0356,3496.57
+frontier,naive,random,100000,10,1,56,55,3523,10.5376,334.325
+frontier,naive,random,1000000,10,1,56,55,511,16.1204,31.699
+frontier,hierarchical,fixed,1,10,1,56,55,18667330,10.0001,1.86671e+06
+frontier,hierarchical,fixed,10,10,1,56,55,8643580,10.0002,864340
+frontier,hierarchical,fixed,100,10,1,56,55,1863620,10.0011,186342
+frontier,hierarchical,fixed,1000,10,1,56,55,183260,10.0093,18308.9
+frontier,hierarchical,fixed,10000,10,1,56,55,15730,10.1382,1551.56
+frontier,hierarchical,fixed,100000,10,1,56,55,2834,10.4018,272.453
+frontier,hierarchical,fixed,1000000,10,1,56,55,440,16.0006,27.499
+frontier,hierarchical,random,1,10,1,56,55,18438310,10.0001,1.84381e+06
+frontier,hierarchical,random,10,10,1,56,55,7449860,10.0003,744967
+frontier,hierarchical,random,100,10,1,56,55,951610,10.0017,95145.2
+frontier,hierarchical,random,1000,10,1,56,55,99550,10.0156,9939.47
+frontier,hierarchical,random,10000,10,1,56,55,11068,10.1014,1095.69
+frontier,hierarchical,random,100000,10,1,56,55,2410,10.7022,225.188
+frontier,hierarchical,random,1000000,10,1,56,55,440,19.8574,22.158
diff --git a/benchmark/results/frontier/1-dynampi_ss_frontier_1-4058681/strong_scaling_frontier.csv b/benchmark/results/frontier/1-dynampi_ss_frontier_1-4058681/strong_scaling_frontier.csv
new file mode 100644
index 0000000..0be314e
--- /dev/null
+++ b/benchmark/results/frontier/1-dynampi_ss_frontier_1-4058681/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,1,56,55,12040519,10,1.20405e+06
+frontier,naive,fixed,10,10,1,56,55,12620115,10,1.26201e+06
+frontier,naive,fixed,100,10,1,56,55,5395057,10.0001,539500
+frontier,naive,fixed,1000,10,1,56,55,548975,10.0009,54892.7
+frontier,naive,fixed,10000,10,1,56,55,55034,10.0132,5496.15
+frontier,naive,fixed,100000,10,1,56,55,5499,10.0021,549.786
+frontier,naive,fixed,1000000,10,1,56,55,543,10.0008,54.2955
+frontier,naive,random,1,10,1,56,55,12448955,10,1.24489e+06
+frontier,naive,random,10,10,1,56,55,13253391,10,1.32534e+06
+frontier,naive,random,100,10,1,56,55,5389311,10.0002,538922
+frontier,naive,random,1000,10,1,56,55,549678,10.0018,54957.7
+frontier,naive,random,10000,10,1,56,55,54805,10.0184,5470.42
+frontier,naive,random,100000,10,1,56,55,5575,10.1503,549.245
+frontier,naive,random,1000000,10,1,56,55,577,11.639,49.5747
+frontier,hierarchical,fixed,1,10,1,56,55,16132218,10,1.61322e+06
+frontier,hierarchical,fixed,10,10,1,56,55,16973914,10,1.69739e+06
+frontier,hierarchical,fixed,100,10,1,56,55,4543143,10.0001,454311
+frontier,hierarchical,fixed,1000,10,1,56,55,529400,10.0003,52938.4
+frontier,hierarchical,fixed,10000,10,1,56,55,53877,10.0028,5386.2
+frontier,hierarchical,fixed,100000,10,1,56,55,5397,10.0047,539.445
+frontier,hierarchical,fixed,1000000,10,1,56,55,539,10.0013,53.8932
+frontier,hierarchical,random,1,10,1,56,55,15936845,10,1.59368e+06
+frontier,hierarchical,random,10,10,1,56,55,15838898,10,1.58389e+06
+frontier,hierarchical,random,100,10,1,56,55,3247533,10.0001,324751
+frontier,hierarchical,random,1000,10,1,56,55,341366,10.0002,34135.8
+frontier,hierarchical,random,10000,10,1,56,55,34134,10.0153,3408.17
+frontier,hierarchical,random,100000,10,1,56,55,3464,10.1463,341.404
+frontier,hierarchical,random,1000000,10,1,56,55,335,11.3834,29.4289
diff --git a/benchmark/results/frontier/1024-dynampi_shutdown_frontier_1024-4058261/naive_shutdown_frontier.csv b/benchmark/results/frontier/1024-dynampi_shutdown_frontier_1024-4058261/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..e59bb8c
--- /dev/null
+++ b/benchmark/results/frontier/1024-dynampi_shutdown_frontier_1024-4058261/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,1024,57344,57343,498323,20
diff --git a/benchmark/results/frontier/1024-dynampi_ss_frontier_1024-4058697/strong_scaling_frontier.csv b/benchmark/results/frontier/1024-dynampi_ss_frontier_1024-4058697/strong_scaling_frontier.csv
new file mode 100644
index 0000000..a0b041b
--- /dev/null
+++ b/benchmark/results/frontier/1024-dynampi_ss_frontier_1024-4058697/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,1024,57344,57343,12646757,10.2384,1.23523e+06
+frontier,naive,fixed,10,10,1024,57344,57343,12728450,10.2096,1.24672e+06
+frontier,naive,fixed,100,10,1024,57344,57343,3121891,10.0535,310529
+frontier,naive,fixed,1000,10,1024,57344,57343,1508945,10.0329,150399
+frontier,naive,fixed,10000,10,1024,57344,57343,1372316,10.0463,136599
+frontier,naive,fixed,100000,10,1024,57344,57343,1322242,10.3211,128111
+frontier,naive,fixed,1000000,10,1024,57344,57343,523136,11.0993,47132.5
+frontier,naive,random,1,10,1024,57344,57343,12534290,10.2092,1.22774e+06
+frontier,naive,random,10,10,1024,57344,57343,12858648,10.2077,1.2597e+06
+frontier,naive,random,100,10,1024,57344,57343,3184408,10.0367,317275
+frontier,naive,random,1000,10,1024,57344,57343,1510168,10.0331,150519
+frontier,naive,random,10000,10,1024,57344,57343,1367983,10.0574,136018
+frontier,naive,random,100000,10,1024,57344,57343,1057967,10.1691,104037
+frontier,naive,random,1000000,10,1024,57344,57343,587912,11.9787,49079.8
+frontier,hierarchical,fixed,1,10,1024,57344,57343,103594651,10.0043,1.0355e+07
+frontier,hierarchical,fixed,10,10,1024,57344,57343,104492874,10.0042,1.04449e+07
+frontier,hierarchical,fixed,100,10,1024,57344,57343,102410818,10.0042,1.02368e+07
+frontier,hierarchical,fixed,1000,10,1024,57344,57343,100673795,10.0042,1.00632e+07
+frontier,hierarchical,fixed,10000,10,1024,57344,57343,56022791,10.0194,5.59145e+06
+frontier,hierarchical,fixed,100000,10,1024,57344,57343,5662051,10.2188,554083
+frontier,hierarchical,fixed,1000000,10,1024,57344,57343,584370,12.0055,48675
+frontier,hierarchical,random,1,10,1024,57344,57343,103763301,10.0043,1.03719e+07
+frontier,hierarchical,random,10,10,1024,57344,57343,104267975,10.0041,1.04225e+07
+frontier,hierarchical,random,100,10,1024,57344,57343,99933459,10.0043,9.98907e+06
+frontier,hierarchical,random,1000,10,1024,57344,57343,97689946,10.0043,9.7648e+06
+frontier,hierarchical,random,10000,10,1024,57344,57343,765278,18.8932,40505.5
+frontier,hierarchical,random,100000,10,1024,57344,57343,3634573,10.3729,350393
+frontier,hierarchical,random,1000000,10,1024,57344,57343,454428,15.3959,29516.1
diff --git a/benchmark/results/frontier/128-dynampi_shutdown_frontier_128-4058173/naive_shutdown_frontier.csv b/benchmark/results/frontier/128-dynampi_shutdown_frontier_128-4058173/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..1d77434
--- /dev/null
+++ b/benchmark/results/frontier/128-dynampi_shutdown_frontier_128-4058173/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,128,7168,7167,27280.3,329
diff --git a/benchmark/results/frontier/128-dynampi_ss_frontier_128-4054794/strong_scaling_frontier.csv b/benchmark/results/frontier/128-dynampi_ss_frontier_128-4054794/strong_scaling_frontier.csv
new file mode 100644
index 0000000..b72bf5f
--- /dev/null
+++ b/benchmark/results/frontier/128-dynampi_ss_frontier_128-4054794/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,128,7168,7167,3683838,10.067,365933
+frontier,naive,fixed,10,10,128,7168,7167,3769842,10.0639,374591
+frontier,naive,fixed,100,10,128,7168,7167,3311154,10.058,329205
+frontier,naive,fixed,1000,10,128,7168,7167,2465448,10.0633,244994
+frontier,naive,fixed,10000,10,128,7168,7167,2092764,10.1019,207164
+frontier,naive,fixed,100000,10,128,7168,7167,379900,10.3373,36750.5
+frontier,naive,fixed,1000000,10,128,7168,7167,57340,13.0328,4399.66
+frontier,naive,random,1,10,128,7168,7167,3755508,10.0738,372800
+frontier,naive,random,10,10,128,7168,7167,3726840,10.0406,371177
+frontier,naive,random,100,10,128,7168,7167,3325488,10.0798,329916
+frontier,naive,random,1000,10,128,7168,7167,2465448,10.0884,244384
+frontier,naive,random,10000,10,128,7168,7167,2322108,10.0742,230500
+frontier,naive,random,100000,10,128,7168,7167,414730,10.4981,39505.2
+frontier,naive,random,1000000,10,128,7168,7167,58757,14.5895,4027.36
+frontier,hierarchical,fixed,1,10,128,7168,7167,71784672,10.0038,7.17575e+06
+frontier,hierarchical,fixed,10,10,128,7168,7167,94575732,10.003,9.45478e+06
+frontier,hierarchical,fixed,100,10,128,7168,7167,97657542,10.0015,9.76425e+06
+frontier,hierarchical,fixed,1000,10,128,7168,7167,29943726,10.0067,2.99237e+06
+frontier,hierarchical,fixed,10000,10,128,7168,7167,3540498,10.0705,351570
+frontier,hierarchical,fixed,100000,10,128,7168,7167,371388,10.4116,35670.5
+frontier,hierarchical,fixed,1000000,10,128,7168,7167,57228,16.0056,3575.5
+frontier,hierarchical,random,1,10,128,7168,7167,70222266,10.0028,7.02028e+06
+frontier,hierarchical,random,10,10,128,7168,7167,65119362,10.0036,6.50959e+06
+frontier,hierarchical,random,100,10,128,7168,7167,28424322,10.0089,2.8399e+06
+frontier,hierarchical,random,1000,10,128,7168,7167,7425012,10.0551,738436
+frontier,hierarchical,random,10000,10,128,7168,7167,2221770,10.1115,219727
+frontier,hierarchical,random,100000,10,128,7168,7167,257148,10.6039,24250.2
+frontier,hierarchical,random,1000000,10,128,7168,7167,42948,17.984,2388.12
diff --git a/benchmark/results/frontier/128-dynampi_ss_frontier_128-4058688/strong_scaling_frontier.csv b/benchmark/results/frontier/128-dynampi_ss_frontier_128-4058688/strong_scaling_frontier.csv
new file mode 100644
index 0000000..1c55eae
--- /dev/null
+++ b/benchmark/results/frontier/128-dynampi_ss_frontier_128-4058688/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,128,7168,7167,5679185,10.0082,567456
+frontier,naive,fixed,10,10,128,7168,7167,5710363,10.0157,570141
+frontier,naive,fixed,100,10,128,7168,7167,6428132,10.0282,641002
+frontier,naive,fixed,1000,10,128,7168,7167,2740884,10.0283,273315
+frontier,naive,fixed,10000,10,128,7168,7167,2397737,10.0278,239109
+frontier,naive,fixed,100000,10,128,7168,7167,716464,10.0321,71417
+frontier,naive,fixed,1000000,10,128,7168,7167,70868,10.0287,7066.49
+frontier,naive,random,1,10,128,7168,7167,5715286,10.0175,570529
+frontier,naive,random,10,10,128,7168,7167,5753891,10.0078,574939
+frontier,naive,random,100,10,128,7168,7167,6498880,10.0284,648049
+frontier,naive,random,1000,10,128,7168,7167,2730826,10.0286,272303
+frontier,naive,random,10000,10,128,7168,7167,2385026,10.0283,237828
+frontier,naive,random,100000,10,128,7168,7167,720111,10.1958,70627.9
+frontier,naive,random,1000000,10,128,7168,7167,76502,11.9879,6381.58
+frontier,hierarchical,fixed,1,10,128,7168,7167,103336119,10.0006,1.0333e+07
+frontier,hierarchical,fixed,10,10,128,7168,7167,104483546,10.0005,1.04478e+07
+frontier,hierarchical,fixed,100,10,128,7168,7167,101868200,10.0006,1.01863e+07
+frontier,hierarchical,fixed,1000,10,128,7168,7167,67608419,10.0019,6.75959e+06
+frontier,hierarchical,fixed,10000,10,128,7168,7167,7014044,10.0215,699902
+frontier,hierarchical,fixed,100000,10,128,7168,7167,703107,10.0087,70249.7
+frontier,hierarchical,fixed,1000000,10,128,7168,7167,73127,12.0027,6092.54
+frontier,hierarchical,random,1,10,128,7168,7167,103058588,10.0006,1.03052e+07
+frontier,hierarchical,random,10,10,128,7168,7167,104183225,10.0005,1.04178e+07
+frontier,hierarchical,random,100,10,128,7168,7167,99624543,10.0006,9.96186e+06
+frontier,hierarchical,random,1000,10,128,7168,7167,44051991,10.0032,4.40379e+06
+frontier,hierarchical,random,10000,10,128,7168,7167,4469070,10.0327,445451
+frontier,hierarchical,random,100000,10,128,7168,7167,453946,10.3398,43902.8
+frontier,hierarchical,random,1000000,10,128,7168,7167,56714,15.0959,3756.91
diff --git a/benchmark/results/frontier/16-dynampi_shutdown_frontier_16-4058170/naive_shutdown_frontier.csv b/benchmark/results/frontier/16-dynampi_shutdown_frontier_16-4058170/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..ee52ddc
--- /dev/null
+++ b/benchmark/results/frontier/16-dynampi_shutdown_frontier_16-4058170/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,16,896,895,3387.91,2616
diff --git a/benchmark/results/frontier/16-dynampi_ss_frontier_16-4054471/strong_scaling_frontier.csv b/benchmark/results/frontier/16-dynampi_ss_frontier_16-4054471/strong_scaling_frontier.csv
new file mode 100644
index 0000000..64dddc5
--- /dev/null
+++ b/benchmark/results/frontier/16-dynampi_ss_frontier_16-4054471/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_ns,round_target_ms,duration_s,nodes,world_size,workers,total_tasks,total_subtasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1000,200,10,16,896,895,190523267,190523267,144.346,1.31991e+06
+frontier,naive,fixed,10000,200,10,16,896,895,30305359,30305359,23.2854,1.30148e+06
+frontier,naive,fixed,100000,200,10,16,896,895,8057025,8057025,12.6688,635973
+frontier,naive,fixed,1000000,200,10,16,896,895,2935707,2935707,10.6162,276530
+frontier,naive,fixed,10000000,200,10,16,896,895,760916,760916,10.1801,74745.7
+frontier,naive,fixed,100000000,200,10,16,896,895,45694,45694,10.1061,4521.41
+frontier,naive,fixed,1000000000,200,10,16,896,895,8950,8950,10.0356,891.829
+frontier,naive,poisson,1000,200,10,16,896,895,190398492,190398492,146.189,1.30242e+06
+frontier,naive,poisson,10000,200,10,16,896,895,30366935,30366935,23.207,1.30852e+06
+frontier,naive,poisson,100000,200,10,16,896,895,8046895,8046895,12.6731,634960
+frontier,naive,poisson,1000000,200,10,16,896,895,2941516,2941516,10.6153,277101
+frontier,naive,poisson,10000000,200,10,16,896,895,760918,760918,10.1805,74743
+frontier,naive,poisson,100000000,200,10,16,896,895,45694,45694,10.1053,4521.79
+frontier,naive,poisson,1000000000,200,10,16,896,895,8950,8950,10.0353,891.849
+frontier,hierarchical,fixed,1000,200,10,16,896,895,214670000,214670000,38.1494,5.62708e+06
+frontier,hierarchical,fixed,10000,200,10,16,896,895,55310000,55310000,12.7119,4.35103e+06
+frontier,hierarchical,fixed,100000,200,10,16,896,895,38330000,38330000,10.3212,3.71373e+06
+frontier,hierarchical,fixed,1000000,200,10,16,896,895,1790000,1790000,10.1593,176194
+frontier,hierarchical,fixed,10000000,200,10,16,896,895,35800,35800,12.3791,2891.96
+frontier,hierarchical,fixed,100000000,200,10,16,896,895,3580,3580,12.4027,288.647
+frontier,hierarchical,fixed,1000000000,200,10,16,896,895,895,895,31.0012,28.8698
+frontier,hierarchical,poisson,1000,200,10,16,896,895,214670000,214670000,38.0652,5.63954e+06
+frontier,hierarchical,poisson,10000,200,10,16,896,895,56180000,56180000,12.8342,4.37738e+06
+frontier,hierarchical,poisson,100000,200,10,16,896,895,38330000,38330000,10.3506,3.70315e+06
+frontier,hierarchical,poisson,1000000,200,10,16,896,895,1790000,1790000,10.166,176077
+frontier,hierarchical,poisson,10000000,200,10,16,896,895,35800,35800,12.38,2891.77
+frontier,hierarchical,poisson,100000000,200,10,16,896,895,3580,3580,12.403,288.64
+frontier,hierarchical,poisson,1000000000,200,10,16,896,895,895,895,36.0012,24.8603
diff --git a/benchmark/results/frontier/16-dynampi_ss_frontier_16-4054791/strong_scaling_frontier.csv b/benchmark/results/frontier/16-dynampi_ss_frontier_16-4054791/strong_scaling_frontier.csv
new file mode 100644
index 0000000..5d1686b
--- /dev/null
+++ b/benchmark/results/frontier/16-dynampi_ss_frontier_16-4054791/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,16,896,895,3979170,10.0046,397735
+frontier,naive,fixed,10,10,16,896,895,3995280,10.0089,399173
+frontier,naive,fixed,100,10,16,896,895,3476180,10.0093,347295
+frontier,naive,fixed,1000,10,16,896,895,2373540,10.0151,236997
+frontier,naive,fixed,10000,10,16,896,895,762540,10.044,75920.2
+frontier,naive,fixed,100000,10,16,896,895,47484,10.3065,4607.19
+frontier,naive,fixed,1000000,10,16,896,895,7164,13.006,550.822
+frontier,naive,random,1,10,16,896,895,3984540,10.0067,398185
+frontier,naive,random,10,10,16,896,895,3989910,10.005,398790
+frontier,naive,random,100,10,16,896,895,3495870,10.0084,349293
+frontier,naive,random,1000,10,16,896,895,2597290,10.0091,259492
+frontier,naive,random,10000,10,16,896,895,537000,10.0455,53456.7
+frontier,naive,random,100000,10,16,896,895,53780,10.6237,5062.28
+frontier,naive,random,1000000,10,16,896,895,7348,14.3291,512.804
+frontier,hierarchical,fixed,1,10,16,896,895,32431220,10.0013,3.2427e+06
+frontier,hierarchical,fixed,10,10,16,896,895,16992470,10.0016,1.69898e+06
+frontier,hierarchical,fixed,100,10,16,896,895,25949630,10.0008,2.59475e+06
+frontier,hierarchical,fixed,1000,10,16,896,895,4176070,10.0052,417389
+frontier,hierarchical,fixed,10000,10,16,896,895,447500,10.0652,44460.2
+frontier,hierarchical,fixed,100000,10,16,896,895,45340,10.4097,4355.54
+frontier,hierarchical,fixed,1000000,10,16,896,895,7060,16.0025,441.18
+frontier,hierarchical,random,1,10,16,896,895,22303400,10.0011,2.23008e+06
+frontier,hierarchical,random,10,10,16,896,895,16752610,10.002,1.67493e+06
+frontier,hierarchical,random,100,10,16,896,895,3887880,10.0069,388521
+frontier,hierarchical,random,1000,10,16,896,895,619340,10.0629,61546.9
+frontier,hierarchical,random,10000,10,16,896,895,201736,10.1296,19915.4
+frontier,hierarchical,random,100000,10,16,896,895,33160,10.71,3096.17
+frontier,hierarchical,random,1000000,10,16,896,895,5320,17.524,303.583
diff --git a/benchmark/results/frontier/16-dynampi_ss_frontier_16-4058685/strong_scaling_frontier.csv b/benchmark/results/frontier/16-dynampi_ss_frontier_16-4058685/strong_scaling_frontier.csv
new file mode 100644
index 0000000..b1686c5
--- /dev/null
+++ b/benchmark/results/frontier/16-dynampi_ss_frontier_16-4058685/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,16,896,895,5924765,10,592474
+frontier,naive,fixed,10,10,16,896,895,5969337,10.0018,596824
+frontier,naive,fixed,100,10,16,896,895,6565573,10.0034,656337
+frontier,naive,fixed,1000,10,16,896,895,2849830,10.0033,284888
+frontier,naive,fixed,10000,10,16,896,895,894147,10.0099,89326.7
+frontier,naive,fixed,100000,10,16,896,895,89478,10.0063,8942.16
+frontier,naive,fixed,1000000,10,16,896,895,8850,10.0049,884.57
+frontier,naive,random,1,10,16,896,895,5946061,10.0002,594593
+frontier,naive,random,10,10,16,896,895,6013833,10.0004,601357
+frontier,naive,random,100,10,16,896,895,6631642,10.0034,662939
+frontier,naive,random,1000,10,16,896,895,2851279,10.0033,285033
+frontier,naive,random,10000,10,16,896,895,893936,10.0187,89226.5
+frontier,naive,random,100000,10,16,896,895,90058,10.1919,8836.25
+frontier,naive,random,1000000,10,16,896,895,9543,11.9244,800.295
+frontier,hierarchical,fixed,1,10,16,896,895,101244806,10.0002,1.01243e+07
+frontier,hierarchical,fixed,10,10,16,896,895,103394285,10.0002,1.03393e+07
+frontier,hierarchical,fixed,100,10,16,896,895,64932987,10.0003,6.49313e+06
+frontier,hierarchical,fixed,1000,10,16,896,895,8486748,10.0026,848454
+frontier,hierarchical,fixed,10000,10,16,896,895,875752,10.0193,87406.6
+frontier,hierarchical,fixed,100000,10,16,896,895,88622,10.2067,8682.72
+frontier,hierarchical,fixed,1000000,10,16,896,895,9158,12.0026,763
+frontier,hierarchical,random,1,10,16,896,895,101751609,10.0002,1.0175e+07
+frontier,hierarchical,random,10,10,16,896,895,103652771,10.0001,1.03652e+07
+frontier,hierarchical,random,100,10,16,896,895,49042617,10.0004,4.90409e+06
+frontier,hierarchical,random,1000,10,16,896,895,5510857,10.0027,550936
+frontier,hierarchical,random,10000,10,16,896,895,558116,10.0303,55642.8
+frontier,hierarchical,random,100000,10,16,896,895,56944,10.3491,5502.29
+frontier,hierarchical,random,1000000,10,16,896,895,6990,12.9946,537.914
diff --git a/benchmark/results/frontier/2-dynampi_shutdown_frontier_2-4058167/naive_shutdown_frontier.csv b/benchmark/results/frontier/2-dynampi_shutdown_frontier_2-4058167/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..d4ff365
--- /dev/null
+++ b/benchmark/results/frontier/2-dynampi_shutdown_frontier_2-4058167/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,2,112,111,241.62,32133
diff --git a/benchmark/results/frontier/2-dynampi_ss_frontier_2-4054468/strong_scaling_frontier.csv b/benchmark/results/frontier/2-dynampi_ss_frontier_2-4054468/strong_scaling_frontier.csv
new file mode 100644
index 0000000..7a55094
--- /dev/null
+++ b/benchmark/results/frontier/2-dynampi_ss_frontier_2-4054468/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_ns,round_target_ms,duration_s,nodes,world_size,workers,total_tasks,total_subtasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1000,200,10,2,112,111,35298152,35298152,26.4428,1.33488e+06
+frontier,naive,fixed,10000,200,10,2,112,111,15634096,15634096,11.6174,1.34574e+06
+frontier,naive,fixed,100000,200,10,2,112,111,6789104,6789104,10.2955,659425
+frontier,naive,fixed,1000000,200,10,2,112,111,1101815,1101815,10.1538,108512
+frontier,naive,fixed,10000000,200,10,2,112,111,94516,94516,10.1749,9289.14
+frontier,naive,fixed,100000000,200,10,2,112,111,5710,5710,10.1012,565.277
+frontier,naive,fixed,1000000000,200,10,2,112,111,1110,1110,10.0039,110.957
+frontier,naive,poisson,1000,200,10,2,112,111,35176771,35176771,26.607,1.32209e+06
+frontier,naive,poisson,10000,200,10,2,112,111,15495802,15495802,11.6326,1.3321e+06
+frontier,naive,poisson,100000,200,10,2,112,111,6764652,6764652,10.2972,656939
+frontier,naive,poisson,1000000,200,10,2,112,111,1101344,1101344,10.154,108464
+frontier,naive,poisson,10000000,200,10,2,112,111,94517,94517,10.1752,9288.99
+frontier,naive,poisson,100000000,200,10,2,112,111,5710,5710,10.1015,565.261
+frontier,naive,poisson,1000000000,200,10,2,112,111,1110,1110,10.0039,110.957
+frontier,hierarchical,fixed,1000,200,10,2,112,111,67410000,67410000,13.2779,5.07687e+06
+frontier,hierarchical,fixed,10000,200,10,2,112,111,34410000,34410000,10.5455,3.26301e+06
+frontier,hierarchical,fixed,100000,200,10,2,112,111,5502000,5502000,10.3741,530359
+frontier,hierarchical,fixed,1000000,200,10,2,112,111,222000,222000,10.0937,21994
+frontier,hierarchical,fixed,10000000,200,10,2,112,111,2220,2220,10.0154,221.659
+frontier,hierarchical,fixed,100000000,200,10,2,112,111,1110,1110,11.502,96.5049
+frontier,hierarchical,fixed,1000000000,200,10,2,112,111,111,111,12.0007,9.24947
+frontier,hierarchical,poisson,1000,200,10,2,112,111,66470000,66470000,13.3679,4.97237e+06
+frontier,hierarchical,poisson,10000,200,10,2,112,111,34410000,34410000,10.4962,3.27832e+06
+frontier,hierarchical,poisson,100000,200,10,2,112,111,5392000,5392000,10.2123,527993
+frontier,hierarchical,poisson,1000000,200,10,2,112,111,222000,222000,10.0446,22101.4
+frontier,hierarchical,poisson,10000000,200,10,2,112,111,11100,11100,11.1065,999.419
+frontier,hierarchical,poisson,100000000,200,10,2,112,111,222,222,22.204,9.99821
+frontier,hierarchical,poisson,1000000000,200,10,2,112,111,111,111,105.002,1.05712
diff --git a/benchmark/results/frontier/2-dynampi_ss_frontier_2-4054788/strong_scaling_frontier.csv b/benchmark/results/frontier/2-dynampi_ss_frontier_2-4054788/strong_scaling_frontier.csv
new file mode 100644
index 0000000..d24e646
--- /dev/null
+++ b/benchmark/results/frontier/2-dynampi_ss_frontier_2-4054788/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,2,112,111,5964918,10.0005,596460
+frontier,naive,fixed,10,10,2,112,111,6012204,10.0005,601190
+frontier,naive,fixed,100,10,2,112,111,4395156,10.0007,439485
+frontier,naive,fixed,1000,10,2,112,111,971028,10.0042,97061.6
+frontier,naive,fixed,10000,10,2,112,111,109668,10.0262,10938.1
+frontier,naive,fixed,100000,10,2,112,111,5932,10.3019,575.816
+frontier,naive,fixed,1000000,10,2,112,111,892,13.0023,68.6034
+frontier,naive,random,1,10,2,112,111,5974242,10.0005,597394
+frontier,naive,random,10,10,2,112,111,5954706,10.0005,595443
+frontier,naive,random,100,10,2,112,111,4828500,10.0006,482819
+frontier,naive,random,1000,10,2,112,111,682650,10.0067,68219.2
+frontier,naive,random,10000,10,2,112,111,69264,10.0634,6882.74
+frontier,naive,random,100000,10,2,112,111,7004,10.4999,667.051
+frontier,naive,random,1000000,10,2,112,111,1035,16.194,63.9127
+frontier,hierarchical,fixed,1,10,2,112,111,11324664,10.0003,1.13243e+06
+frontier,hierarchical,fixed,10,10,2,112,111,7390602,10.0005,739024
+frontier,hierarchical,fixed,100,10,2,112,111,2929956,10.0012,292960
+frontier,hierarchical,fixed,1000,10,2,112,111,279276,10.0127,27892.1
+frontier,hierarchical,fixed,10000,10,2,112,111,55722,10.0573,5540.44
+frontier,hierarchical,fixed,100000,10,2,112,111,5724,10.4027,550.239
+frontier,hierarchical,fixed,1000000,10,2,112,111,884,16.0009,55.2468
+frontier,hierarchical,random,1,10,2,112,111,11346864,10.0004,1.13465e+06
+frontier,hierarchical,random,10,10,2,112,111,6774552,10.0004,677426
+frontier,hierarchical,random,100,10,2,112,111,1307136,10.0028,130676
+frontier,hierarchical,random,1000,10,2,112,111,140304,10.0254,13994.9
+frontier,hierarchical,random,10000,10,2,112,111,19826,10.2549,1933.31
+frontier,hierarchical,random,100000,10,2,112,111,4624,10.5091,440
+frontier,hierarchical,random,1000000,10,2,112,111,664,15.1035,43.9634
diff --git a/benchmark/results/frontier/2-dynampi_ss_frontier_2-4058682/strong_scaling_frontier.csv b/benchmark/results/frontier/2-dynampi_ss_frontier_2-4058682/strong_scaling_frontier.csv
new file mode 100644
index 0000000..fd10791
--- /dev/null
+++ b/benchmark/results/frontier/2-dynampi_ss_frontier_2-4058682/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,2,112,111,7893820,10,789379
+frontier,naive,fixed,10,10,2,112,111,8001886,10.0002,800175
+frontier,naive,fixed,100,10,2,112,111,6614880,10.0003,661471
+frontier,naive,fixed,1000,10,2,112,111,1101559,10.0011,110144
+frontier,naive,fixed,10000,10,2,112,111,110970,10.012,11083.7
+frontier,naive,fixed,100000,10,2,112,111,11097,10.0028,1109.39
+frontier,naive,fixed,1000000,10,2,112,111,1097,10.0025,109.673
+frontier,naive,random,1,10,2,112,111,7953784,10,795377
+frontier,naive,random,10,10,2,112,111,8114285,10.0001,811417
+frontier,naive,random,100,10,2,112,111,6675135,10.0003,667496
+frontier,naive,random,1000,10,2,112,111,1102007,10.0019,110180
+frontier,naive,random,10000,10,2,112,111,111022,10.0195,11080.6
+frontier,naive,random,100000,10,2,112,111,11116,10.1779,1092.17
+frontier,naive,random,1000000,10,2,112,111,1182,11.9545,98.875
+frontier,hierarchical,fixed,1,10,2,112,111,21664284,10.0001,2.16642e+06
+frontier,hierarchical,fixed,10,10,2,112,111,21791375,10.0001,2.17911e+06
+frontier,hierarchical,fixed,100,10,2,112,111,8286109,10.0002,828596
+frontier,hierarchical,fixed,1000,10,2,112,111,1055918,10.0009,105582
+frontier,hierarchical,fixed,10000,10,2,112,111,108646,10.0139,10849.6
+frontier,hierarchical,fixed,100000,10,2,112,111,10891,10.0064,1088.4
+frontier,hierarchical,fixed,1000000,10,2,112,111,1062,10.0024,106.174
+frontier,hierarchical,random,1,10,2,112,111,21626884,10.0001,2.16267e+06
+frontier,hierarchical,random,10,10,2,112,111,21805222,10.0002,2.18049e+06
+frontier,hierarchical,random,100,10,2,112,111,6253514,10.0003,625335
+frontier,hierarchical,random,1000,10,2,112,111,686021,10.0007,68597.4
+frontier,hierarchical,random,10000,10,2,112,111,69032,10.0185,6890.42
+frontier,hierarchical,random,100000,10,2,112,111,6945,10.1851,681.875
+frontier,hierarchical,random,1000000,10,2,112,111,776,12.9328,60.0026
diff --git a/benchmark/results/frontier/2048-dynampi_shutdown_frontier_2048-4058262/naive_shutdown_frontier.csv b/benchmark/results/frontier/2048-dynampi_shutdown_frontier_2048-4058262/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..d14582a
--- /dev/null
+++ b/benchmark/results/frontier/2048-dynampi_shutdown_frontier_2048-4058262/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,2048,114688,114687,1.80677e+06,6
diff --git a/benchmark/results/frontier/256-dynampi_shutdown_frontier_256-4058174/naive_shutdown_frontier.csv b/benchmark/results/frontier/256-dynampi_shutdown_frontier_256-4058174/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..7faf780
--- /dev/null
+++ b/benchmark/results/frontier/256-dynampi_shutdown_frontier_256-4058174/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,256,14336,14335,57377,158
diff --git a/benchmark/results/frontier/256-dynampi_ss_frontier_256-4054795/strong_scaling_frontier.csv b/benchmark/results/frontier/256-dynampi_ss_frontier_256-4054795/strong_scaling_frontier.csv
new file mode 100644
index 0000000..3280998
--- /dev/null
+++ b/benchmark/results/frontier/256-dynampi_ss_frontier_256-4054795/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,256,14336,14335,3641090,10.1474,358820
+frontier,naive,fixed,10,10,256,14336,14335,3641090,10.1216,359734
+frontier,naive,fixed,100,10,256,14336,14335,3211040,10.1722,315668
+frontier,naive,fixed,1000,10,256,14336,14335,2348163,10.1686,230924
+frontier,naive,fixed,10000,10,256,14336,14335,2144825,10.1629,211044
+frontier,naive,fixed,100000,10,256,14336,14335,759804,10.3724,73252.5
+frontier,naive,fixed,1000000,10,256,14336,14335,114684,13.0679,8776
+frontier,naive,random,1,10,256,14336,14335,3612420,10.083,358270
+frontier,naive,random,10,10,256,14336,14335,3641090,10.1019,360435
+frontier,naive,random,100,10,256,14336,14335,3182370,10.1686,312962
+frontier,naive,random,1000,10,256,14336,14335,2348381,10.2191,229803
+frontier,naive,random,10000,10,256,14336,14335,2201986,10.2039,215798
+frontier,naive,random,100000,10,256,14336,14335,820751,10.5145,78058.6
+frontier,naive,random,1000000,10,256,14336,14335,117328,14.6075,8032.06
+frontier,hierarchical,fixed,1,10,256,14336,14335,38303120,10.0181,3.8234e+06
+frontier,hierarchical,fixed,10,10,256,14336,14335,36869620,10.0111,3.68287e+06
+frontier,hierarchical,fixed,100,10,256,14336,14335,27437190,10.0386,2.73318e+06
+frontier,hierarchical,fixed,1000,10,256,14336,14335,48423630,10.0116,4.83676e+06
+frontier,hierarchical,fixed,10000,10,256,14336,14335,7081490,10.0751,702871
+frontier,hierarchical,fixed,100000,10,256,14336,14335,742780,10.4188,71292.1
+frontier,hierarchical,fixed,1000000,10,256,14336,14335,114460,16.007,7150.62
+frontier,hierarchical,random,1,10,256,14336,14335,37529030,10.0192,3.7457e+06
+frontier,hierarchical,random,10,10,256,14336,14335,36525580,10.0133,3.6477e+06
+frontier,hierarchical,random,100,10,256,14336,14335,25602310,10.0402,2.54998e+06
+frontier,hierarchical,random,1000,10,256,14336,14335,16169880,10.0606,1.60725e+06
+frontier,hierarchical,random,10000,10,256,14336,14335,4501190,10.1248,444573
+frontier,hierarchical,random,100000,10,256,14336,14335,514300,10.9927,46785.5
+frontier,hierarchical,random,1000000,10,256,14336,14335,85900,18.5246,4637.09
diff --git a/benchmark/results/frontier/256-dynampi_ss_frontier_256-4058689/strong_scaling_frontier.csv b/benchmark/results/frontier/256-dynampi_ss_frontier_256-4058689/strong_scaling_frontier.csv
new file mode 100644
index 0000000..837d89a
--- /dev/null
+++ b/benchmark/results/frontier/256-dynampi_ss_frontier_256-4058689/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,256,14336,14335,7591021,10.0474,755523
+frontier,naive,fixed,10,10,256,14336,14335,8784967,10.0542,873762
+frontier,naive,fixed,100,10,256,14336,14335,6293517,10.0575,625755
+frontier,naive,fixed,1000,10,256,14336,14335,2566865,10.0564,255248
+frontier,naive,fixed,10000,10,256,14336,14335,2244467,10.0564,223188
+frontier,naive,fixed,100000,10,256,14336,14335,1432879,10.061,142419
+frontier,naive,fixed,1000000,10,256,14336,14335,141744,10.056,14095.4
+frontier,naive,random,1,10,256,14336,14335,8762174,10.0575,871208
+frontier,naive,random,10,10,256,14336,14335,8914463,10.0481,887176
+frontier,naive,random,100,10,256,14336,14335,6361136,10.0574,632485
+frontier,naive,random,1000,10,256,14336,14335,2543940,10.0565,252966
+frontier,naive,random,10000,10,256,14336,14335,2225075,10.0565,221258
+frontier,naive,random,100000,10,256,14336,14335,1437946,10.1976,141009
+frontier,naive,random,1000000,10,256,14336,14335,152165,11.9725,12709.6
+frontier,hierarchical,fixed,1,10,256,14336,14335,102776227,10.0011,1.02765e+07
+frontier,hierarchical,fixed,10,10,256,14336,14335,104438745,10.0011,1.04427e+07
+frontier,hierarchical,fixed,100,10,256,14336,14335,102038288,10.0013,1.02025e+07
+frontier,hierarchical,fixed,1000,10,256,14336,14335,99991272,10.0022,9.99695e+06
+frontier,hierarchical,fixed,10000,10,256,14336,14335,14029522,10.0223,1.39983e+06
+frontier,hierarchical,fixed,100000,10,256,14336,14335,1406124,10.0113,140454
+frontier,hierarchical,fixed,1000000,10,256,14336,14335,146216,12.0033,12181.4
+frontier,hierarchical,random,1,10,256,14336,14335,102651296,10.0011,1.0264e+07
+frontier,hierarchical,random,10,10,256,14336,14335,103828615,10.001,1.03818e+07
+frontier,hierarchical,random,100,10,256,14336,14335,99609765,10.0011,9.95988e+06
+frontier,hierarchical,random,1000,10,256,14336,14335,87244278,10.0034,8.7215e+06
+frontier,hierarchical,random,10000,10,256,14336,14335,8937662,10.0328,890845
+frontier,hierarchical,random,100000,10,256,14336,14335,909211,10.3464,87877
+frontier,hierarchical,random,1000000,10,256,14336,14335,113106,15.0599,7510.41
diff --git a/benchmark/results/frontier/32-dynampi_shutdown_frontier_32-4058171/naive_shutdown_frontier.csv b/benchmark/results/frontier/32-dynampi_shutdown_frontier_32-4058171/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..017eef8
--- /dev/null
+++ b/benchmark/results/frontier/32-dynampi_shutdown_frontier_32-4058171/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,32,1792,1791,6760.42,1318
diff --git a/benchmark/results/frontier/32-dynampi_ss_frontier_32-4054472/strong_scaling_frontier.csv b/benchmark/results/frontier/32-dynampi_ss_frontier_32-4054472/strong_scaling_frontier.csv
new file mode 100644
index 0000000..0d9bc15
--- /dev/null
+++ b/benchmark/results/frontier/32-dynampi_ss_frontier_32-4054472/strong_scaling_frontier.csv
@@ -0,0 +1,20 @@
+system,distributor,mode,expected_ns,round_target_ms,duration_s,nodes,world_size,workers,total_tasks,total_subtasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1000,200,10,32,1792,1791,367958397,367958397,289.072,1.2729e+06
+frontier,naive,fixed,10000,200,10,32,1792,1791,47254631,47254631,36.9228,1.27982e+06
+frontier,naive,fixed,100000,200,10,32,1792,1791,9557970,9557970,15.4676,617935
+frontier,naive,fixed,1000000,200,10,32,1792,1791,3103406,3103406,11.2552,275730
+frontier,naive,fixed,10000000,200,10,32,1792,1791,1522516,1522516,10.1892,149424
+frontier,naive,fixed,100000000,200,10,32,1792,1791,91390,91390,10.1104,9039.25
+frontier,naive,fixed,1000000000,200,10,32,1792,1791,17910,17910,10.0724,1778.13
+frontier,naive,poisson,1000,200,10,32,1792,1791,368041162,368041162,286.459,1.2848e+06
+frontier,naive,poisson,10000,200,10,32,1792,1791,47195308,47195308,36.9883,1.27595e+06
+frontier,naive,poisson,100000,200,10,32,1792,1791,9544842,9544842,15.4791,616628
+frontier,naive,poisson,1000000,200,10,32,1792,1791,3099899,3099899,11.2648,275185
+frontier,naive,poisson,10000000,200,10,32,1792,1791,1522516,1522516,10.1869,149458
+frontier,naive,poisson,100000000,200,10,32,1792,1791,91390,91390,10.1095,9040
+frontier,naive,poisson,1000000000,200,10,32,1792,1791,17910,17910,10.072,1778.19
+frontier,hierarchical,fixed,1000,200,10,32,1792,1791,437019000,437019000,44.8812,9.73723e+06
+frontier,hierarchical,fixed,10000,200,10,32,1792,1791,123579000,123579000,13.5003,9.15383e+06
+frontier,hierarchical,fixed,100000,200,10,32,1792,1791,74622000,74622000,10.4339,7.15189e+06
+frontier,hierarchical,fixed,1000000,200,10,32,1792,1791,3582000,3582000,10.1708,352183
+frontier,hierarchical,fixed,10000000,200,10,32,1792,1791,71640,71640,17.0866,4192.76
diff --git a/benchmark/results/frontier/32-dynampi_ss_frontier_32-4054792/strong_scaling_frontier.csv b/benchmark/results/frontier/32-dynampi_ss_frontier_32-4054792/strong_scaling_frontier.csv
new file mode 100644
index 0000000..c6158d8
--- /dev/null
+++ b/benchmark/results/frontier/32-dynampi_ss_frontier_32-4054792/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,32,1792,1791,3864978,10.0167,385854
+frontier,naive,fixed,10,10,32,1792,1791,3900798,10.0098,389696
+frontier,naive,fixed,100,10,32,1792,1791,3406482,10.0207,339944
+frontier,naive,fixed,1000,10,32,1792,1791,2493072,10.0179,248861
+frontier,naive,fixed,10000,10,32,1792,1791,1318176,10.0496,131166
+frontier,naive,fixed,100000,10,32,1792,1791,94972,10.311,9210.77
+frontier,naive,fixed,1000000,10,32,1792,1791,14332,13.0091,1101.69
+frontier,naive,random,1,10,32,1792,1791,3882888,10.0106,387878
+frontier,naive,random,10,10,32,1792,1791,3875724,10.0126,387085
+frontier,naive,random,100,10,32,1792,1791,3420810,10.0194,341419
+frontier,naive,random,1000,10,32,1792,1791,2532474,10.0271,252563
+frontier,naive,random,10000,10,32,1792,1791,1063854,10.0585,105767
+frontier,naive,random,100000,10,32,1792,1791,104501,10.4119,10036.7
+frontier,naive,random,1000000,10,32,1792,1791,14678,14.4154,1018.22
+frontier,hierarchical,fixed,1,10,32,1792,1791,87418710,10.0008,8.74118e+06
+frontier,hierarchical,fixed,10,10,32,1792,1791,86935140,10.0004,8.69315e+06
+frontier,hierarchical,fixed,100,10,32,1792,1791,46444212,10.001,4.64398e+06
+frontier,hierarchical,fixed,1000,10,32,1792,1791,8245764,10.0071,823995
+frontier,hierarchical,fixed,10000,10,32,1792,1791,891918,10.0464,88779.8
+frontier,hierarchical,fixed,100000,10,32,1792,1791,92562,10.4094,8892.19
+frontier,hierarchical,fixed,1000000,10,32,1792,1791,14328,16.0028,895.344
+frontier,hierarchical,random,1,10,32,1792,1791,87411546,10.0005,8.74074e+06
+frontier,hierarchical,random,10,10,32,1792,1791,87741090,10.0005,8.77367e+06
+frontier,hierarchical,random,100,10,32,1792,1791,13092210,10.0032,1.3088e+06
+frontier,hierarchical,random,1000,10,32,1792,1791,1855476,10.0476,184669
+frontier,hierarchical,random,10000,10,32,1792,1791,489948,10.1839,48110.3
+frontier,hierarchical,random,100000,10,32,1792,1791,67998,11.1207,6114.54
+frontier,hierarchical,random,1000000,10,32,1792,1791,10746,17.0003,632.108
diff --git a/benchmark/results/frontier/32-dynampi_ss_frontier_32-4058686/strong_scaling_frontier.csv b/benchmark/results/frontier/32-dynampi_ss_frontier_32-4058686/strong_scaling_frontier.csv
new file mode 100644
index 0000000..3881a00
--- /dev/null
+++ b/benchmark/results/frontier/32-dynampi_ss_frontier_32-4058686/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,32,1792,1791,5889064,10.0008,588862
+frontier,naive,fixed,10,10,32,1792,1791,5936899,10.0017,593586
+frontier,naive,fixed,100,10,32,1792,1791,6554789,10.0069,655026
+frontier,naive,fixed,1000,10,32,1792,1791,2836200,10.0069,283425
+frontier,naive,fixed,10000,10,32,1792,1791,1788319,10.0095,178662
+frontier,naive,fixed,100000,10,32,1792,1791,179056,10.01,17887.7
+frontier,naive,fixed,1000000,10,32,1792,1791,17709,10.008,1769.48
+frontier,naive,random,1,10,32,1792,1791,5906985,10.0029,590527
+frontier,naive,random,10,10,32,1792,1791,5985074,10.0036,598293
+frontier,naive,random,100,10,32,1792,1791,6623444,10.0069,661888
+frontier,naive,random,1000,10,32,1792,1791,2837172,10.0068,283524
+frontier,naive,random,10000,10,32,1792,1791,1788483,10.0194,178502
+frontier,naive,random,100000,10,32,1792,1791,180255,10.1969,17677.4
+frontier,naive,random,1000000,10,32,1792,1791,19076,11.9674,1594
+frontier,hierarchical,fixed,1,10,32,1792,1791,102595667,10.0001,1.02594e+07
+frontier,hierarchical,fixed,10,10,32,1792,1791,104528982,10.0002,1.04527e+07
+frontier,hierarchical,fixed,100,10,32,1792,1791,101830529,10.0003,1.01828e+07
+frontier,hierarchical,fixed,1000,10,32,1792,1791,16973417,10.0026,1.6969e+06
+frontier,hierarchical,fixed,10000,10,32,1792,1791,1753021,10.0208,174938
+frontier,hierarchical,fixed,100000,10,32,1792,1791,177339,10.2067,17374.8
+frontier,hierarchical,fixed,1000000,10,32,1792,1791,18298,12.0026,1524.5
+frontier,hierarchical,random,1,10,32,1792,1791,102172549,10.0002,1.02171e+07
+frontier,hierarchical,random,10,10,32,1792,1791,104224311,10.0002,1.04223e+07
+frontier,hierarchical,random,100,10,32,1792,1791,94917455,10.0003,9.49141e+06
+frontier,hierarchical,random,1000,10,32,1792,1791,11023487,10.0034,1.10197e+06
+frontier,hierarchical,random,10000,10,32,1792,1791,1115757,10.0299,111243
+frontier,hierarchical,random,100000,10,32,1792,1791,113342,10.3054,10998.3
+frontier,hierarchical,random,1000000,10,32,1792,1791,14090,13.6719,1030.58
diff --git a/benchmark/results/frontier/4-dynampi_shutdown_frontier_4-4058168/naive_shutdown_frontier.csv b/benchmark/results/frontier/4-dynampi_shutdown_frontier_4-4058168/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..2000758
--- /dev/null
+++ b/benchmark/results/frontier/4-dynampi_shutdown_frontier_4-4058168/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,4,224,223,672.43,12561
diff --git a/benchmark/results/frontier/4-dynampi_ss_frontier_4-4054469/strong_scaling_frontier.csv b/benchmark/results/frontier/4-dynampi_ss_frontier_4-4054469/strong_scaling_frontier.csv
new file mode 100644
index 0000000..c36f4c3
--- /dev/null
+++ b/benchmark/results/frontier/4-dynampi_ss_frontier_4-4054469/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_ns,round_target_ms,duration_s,nodes,world_size,workers,total_tasks,total_subtasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1000,200,10,4,224,223,57163885,57163885,43.7275,1.30728e+06
+frontier,naive,fixed,10000,200,10,4,224,223,17713679,17713679,13.2581,1.33606e+06
+frontier,naive,fixed,100000,200,10,4,224,223,7005558,7005558,10.6738,656331
+frontier,naive,fixed,1000000,200,10,4,224,223,2194351,2194351,10.1571,216041
+frontier,naive,fixed,10000000,200,10,4,224,223,189716,189716,10.1758,18643.9
+frontier,naive,fixed,100000000,200,10,4,224,223,11422,11422,10.1027,1130.59
+frontier,naive,fixed,1000000000,200,10,4,224,223,2230,2230,10.0086,222.809
+frontier,naive,poisson,1000,200,10,4,224,223,57156290,57156290,43.9642,1.30006e+06
+frontier,naive,poisson,10000,200,10,4,224,223,17576611,17576611,13.2932,1.32222e+06
+frontier,naive,poisson,100000,200,10,4,224,223,6989526,6989526,10.6756,654719
+frontier,naive,poisson,1000000,200,10,4,224,223,2193603,2193603,10.157,215970
+frontier,naive,poisson,10000000,200,10,4,224,223,189718,189718,10.1766,18642.5
+frontier,naive,poisson,100000000,200,10,4,224,223,11422,11422,10.1021,1130.66
+frontier,naive,poisson,1000000000,200,10,4,224,223,2230,2230,10.0083,222.814
+frontier,hierarchical,fixed,1000,200,10,4,224,223,96134000,96134000,17.9539,5.3545e+06
+frontier,hierarchical,fixed,10000,200,10,4,224,223,29702000,29702000,11.0214,2.69494e+06
+frontier,hierarchical,fixed,100000,200,10,4,224,223,10526000,10526000,10.4088,1.01126e+06
+frontier,hierarchical,fixed,1000000,200,10,4,224,223,446000,446000,10.1488,43946.3
+frontier,hierarchical,fixed,10000000,200,10,4,224,223,17840,17840,12.7677,1397.28
+frontier,hierarchical,fixed,100000000,200,10,4,224,223,1784,1784,12.8029,139.344
+frontier,hierarchical,fixed,1000000000,200,10,4,224,223,223,223,16.0009,13.9367
+frontier,hierarchical,poisson,1000,200,10,4,224,223,96036000,96036000,17.9532,5.34925e+06
+frontier,hierarchical,poisson,10000,200,10,4,224,223,45606000,45606000,11.0327,4.13373e+06
+frontier,hierarchical,poisson,100000,200,10,4,224,223,10316000,10316000,10.2543,1.00602e+06
+frontier,hierarchical,poisson,1000000,200,10,4,224,223,446000,446000,10.159,43902
+frontier,hierarchical,poisson,10000000,200,10,4,224,223,13380,13380,10.307,1298.15
+frontier,hierarchical,poisson,100000000,200,10,4,224,223,1784,1784,12.8031,139.341
+frontier,hierarchical,poisson,1000000000,200,10,4,224,223,223,223,17.0007,13.1171
diff --git a/benchmark/results/frontier/4-dynampi_ss_frontier_4-4054789/strong_scaling_frontier.csv b/benchmark/results/frontier/4-dynampi_ss_frontier_4-4054789/strong_scaling_frontier.csv
new file mode 100644
index 0000000..8be8e38
--- /dev/null
+++ b/benchmark/results/frontier/4-dynampi_ss_frontier_4-4054789/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,4,224,223,4547862,10.0013,454728
+frontier,naive,fixed,10,10,4,224,223,4590232,10.0013,458964
+frontier,naive,fixed,100,10,4,224,223,3794122,10.0023,379327
+frontier,naive,fixed,1000,10,4,224,223,1621656,10.0036,162108
+frontier,naive,fixed,10000,10,4,224,223,215418,10.0275,21482.8
+frontier,naive,fixed,100000,10,4,224,223,11868,10.3026,1151.94
+frontier,naive,fixed,1000000,10,4,224,223,1788,13.0022,137.515
+frontier,naive,random,1,10,4,224,223,4574176,10.0019,457331
+frontier,naive,random,10,10,4,224,223,4588448,10.0013,458784
+frontier,naive,random,100,10,4,224,223,3928814,10.0012,392836
+frontier,naive,random,1000,10,4,224,223,1337108,10.0046,133649
+frontier,naive,random,10000,10,4,224,223,136922,10.0662,13602.1
+frontier,naive,random,100000,10,4,224,223,13407,10.3749,1292.25
+frontier,naive,random,1000000,10,4,224,223,1835,13.9344,131.689
+frontier,hierarchical,fixed,1,10,4,224,223,14149350,10.0004,1.41487e+06
+frontier,hierarchical,fixed,10,10,4,224,223,9828948,10.0008,982815
+frontier,hierarchical,fixed,100,10,4,224,223,8409330,10.001,840852
+frontier,hierarchical,fixed,1000,10,4,224,223,1080212,10.0062,107955
+frontier,hierarchical,fixed,10000,10,4,224,223,111946,10.0774,11108.7
+frontier,hierarchical,fixed,100000,10,4,224,223,10972,10.4031,1054.68
+frontier,hierarchical,fixed,1000000,10,4,224,223,1732,16.0011,108.243
+frontier,hierarchical,random,1,10,4,224,223,13635112,10.0005,1.36344e+06
+frontier,hierarchical,random,10,10,4,224,223,9365108,10.0007,936447
+frontier,hierarchical,random,100,10,4,224,223,1833952,10.0044,183315
+frontier,hierarchical,random,1000,10,4,224,223,221662,10.0397,22078.5
+frontier,hierarchical,random,10000,10,4,224,223,44568,10.2315,4355.97
+frontier,hierarchical,random,100000,10,4,224,223,8872,10.8119,820.581
+frontier,hierarchical,random,1000000,10,4,224,223,1312,16.1668,81.1539
diff --git a/benchmark/results/frontier/4-dynampi_ss_frontier_4-4058683/strong_scaling_frontier.csv b/benchmark/results/frontier/4-dynampi_ss_frontier_4-4058683/strong_scaling_frontier.csv
new file mode 100644
index 0000000..6d56b94
--- /dev/null
+++ b/benchmark/results/frontier/4-dynampi_ss_frontier_4-4058683/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,4,224,223,6717086,10.0004,671682
+frontier,naive,fixed,10,10,4,224,223,6774977,10.0005,677466
+frontier,naive,fixed,100,10,4,224,223,6577294,10.0007,657682
+frontier,naive,fixed,1000,10,4,224,223,2201640,10.001,220141
+frontier,naive,fixed,10000,10,4,224,223,222888,10.015,22255.4
+frontier,naive,fixed,100000,10,4,224,223,22295,10.0034,2228.74
+frontier,naive,fixed,1000000,10,4,224,223,2205,10.0026,220.442
+frontier,naive,random,1,10,4,224,223,6764440,10.0003,676421
+frontier,naive,random,10,10,4,224,223,6795236,10,679521
+frontier,naive,random,100,10,4,224,223,6659825,10.0007,665936
+frontier,naive,random,1000,10,4,224,223,2196061,10.002,219562
+frontier,naive,random,10000,10,4,224,223,223015,10.0188,22259.7
+frontier,naive,random,100000,10,4,224,223,22357,10.185,2195.09
+frontier,naive,random,1000000,10,4,224,223,2384,11.8718,200.812
+frontier,hierarchical,fixed,1,10,4,224,223,41529685,10.0001,4.15292e+06
+frontier,hierarchical,fixed,10,10,4,224,223,41717861,10.0001,4.17175e+06
+frontier,hierarchical,fixed,100,10,4,224,223,16354178,10.0001,1.63539e+06
+frontier,hierarchical,fixed,1000,10,4,224,223,2116933,10.0016,211659
+frontier,hierarchical,fixed,10000,10,4,224,223,218263,10.0136,21796.6
+frontier,hierarchical,fixed,100000,10,4,224,223,21879,10.0066,2186.46
+frontier,hierarchical,fixed,1000000,10,4,224,223,2304,12.0027,191.957
+frontier,hierarchical,random,1,10,4,224,223,41343892,10.0001,4.13435e+06
+frontier,hierarchical,random,10,10,4,224,223,41410367,10.0001,4.14098e+06
+frontier,hierarchical,random,100,10,4,224,223,12398434,10.0002,1.23981e+06
+frontier,hierarchical,random,1000,10,4,224,223,1375607,10.002,137533
+frontier,hierarchical,random,10000,10,4,224,223,139093,10.0195,13882.2
+frontier,hierarchical,random,100000,10,4,224,223,14076,10.1846,1382.08
+frontier,hierarchical,random,1000000,10,4,224,223,1499,12.7281,117.771
diff --git a/benchmark/results/frontier/4096-dynampi_shutdown_frontier_4096-4058263/naive_shutdown_frontier.csv b/benchmark/results/frontier/4096-dynampi_shutdown_frontier_4096-4058263/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..cc88034
--- /dev/null
+++ b/benchmark/results/frontier/4096-dynampi_shutdown_frontier_4096-4058263/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,4096,229376,229375,8.7203e+06,2
diff --git a/benchmark/results/frontier/512-dynampi_shutdown_frontier_512-4058175/naive_shutdown_frontier.csv b/benchmark/results/frontier/512-dynampi_shutdown_frontier_512-4058175/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..35fe6f1
--- /dev/null
+++ b/benchmark/results/frontier/512-dynampi_shutdown_frontier_512-4058175/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,512,28672,28671,147610,63
diff --git a/benchmark/results/frontier/512-dynampi_ss_frontier_512-4054796/strong_scaling_frontier.csv b/benchmark/results/frontier/512-dynampi_ss_frontier_512-4054796/strong_scaling_frontier.csv
new file mode 100644
index 0000000..b74af59
--- /dev/null
+++ b/benchmark/results/frontier/512-dynampi_ss_frontier_512-4054796/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,512,28672,28671,1593917,10.2563,155408
+frontier,naive,fixed,10,10,512,28672,28671,1595820,10.2644,155471
+frontier,naive,fixed,100,10,512,28672,28671,1607061,10.4485,153808
+frontier,naive,fixed,1000,10,512,28672,28671,1589578,10.6948,148631
+frontier,naive,fixed,10000,10,512,28672,28671,1691253,10.6247,159181
+frontier,naive,fixed,100000,10,512,28672,28671,1165716,10.5506,110488
+frontier,naive,fixed,1000000,10,512,28672,28671,230474,14.3831,16023.9
+frontier,naive,random,1,10,512,28672,28671,1595884,10.2749,155319
+frontier,naive,random,10,10,512,28672,28671,1614193,10.3073,156606
+frontier,naive,random,100,10,512,28672,28671,1608566,10.4461,153987
+frontier,naive,random,1000,10,512,28672,28671,1555596,10.715,145179
+frontier,naive,random,10000,10,512,28672,28671,1649861,10.6681,154654
+frontier,naive,random,100000,10,512,28672,28671,1591839,10.5636,150691
+frontier,naive,random,1000000,10,512,28672,28671,222850,14.6966,15163.4
+frontier,hierarchical,fixed,1,10,512,28672,28671,58144788,10.0262,5.79928e+06
+frontier,hierarchical,fixed,10,10,512,28672,28671,54532242,10.0341,5.43471e+06
+frontier,hierarchical,fixed,100,10,512,28672,28671,53614770,10.0315,5.34463e+06
+frontier,hierarchical,fixed,1000,10,512,28672,28671,100864578,10.0109,1.00755e+07
+frontier,hierarchical,fixed,10000,10,512,28672,28671,13876764,10.0646,1.37877e+06
+frontier,hierarchical,fixed,100000,10,512,28672,28671,1488060,10.4317,142648
+frontier,hierarchical,fixed,1000000,10,512,28672,28671,229368,16.0094,14327.1
+frontier,hierarchical,random,1,10,512,28672,28671,60209100,10.0223,6.00754e+06
+frontier,hierarchical,random,10,10,512,28672,28671,54417558,10.0345,5.42302e+06
+frontier,hierarchical,random,100,10,512,28672,28671,53156034,10.0623,5.28268e+06
+frontier,hierarchical,random,1000,10,512,28672,28671,41171556,10.0471,4.09785e+06
+frontier,hierarchical,random,10000,10,512,28672,28671,9174720,10.0892,909362
+frontier,hierarchical,random,100000,10,512,28672,28671,1032038,11.2,92145.9
+frontier,hierarchical,random,1000000,10,512,28672,28671,172026,18.6474,9225.22
diff --git a/benchmark/results/frontier/512-dynampi_ss_frontier_512-4058690/strong_scaling_frontier.csv b/benchmark/results/frontier/512-dynampi_ss_frontier_512-4058690/strong_scaling_frontier.csv
new file mode 100644
index 0000000..791aa64
--- /dev/null
+++ b/benchmark/results/frontier/512-dynampi_ss_frontier_512-4058690/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,512,28672,28671,8789361,10.0652,873239
+frontier,naive,fixed,10,10,512,28672,28671,9261687,10.0858,918294
+frontier,naive,fixed,100,10,512,28672,28671,3197232,10.0818,317128
+frontier,naive,fixed,1000,10,512,28672,28671,1553875,10.0182,155105
+frontier,naive,fixed,10000,10,512,28672,28671,1398919,10.0312,139457
+frontier,naive,fixed,100000,10,512,28672,28671,2319601,10.1141,229344
+frontier,naive,fixed,1000000,10,512,28672,28671,301158,11.1981,26893.7
+frontier,naive,random,1,10,512,28672,28671,8129334,10.1092,804154
+frontier,naive,random,10,10,512,28672,28671,9481135,10.084,940216
+frontier,naive,random,100,10,512,28672,28671,3279858,10.0788,325421
+frontier,naive,random,1000,10,512,28672,28671,1564771,10.0017,156450
+frontier,naive,random,10000,10,512,28672,28671,1424268,10.0149,142214
+frontier,naive,random,100000,10,512,28672,28671,2328254,10.1959,228352
+frontier,naive,random,1000000,10,512,28672,28671,302850,11.986,25267
+frontier,hierarchical,fixed,1,10,512,28672,28671,103676556,10.0021,1.03654e+07
+frontier,hierarchical,fixed,10,10,512,28672,28671,104640022,10.0022,1.04617e+07
+frontier,hierarchical,fixed,100,10,512,28672,28671,102250175,10.0022,1.02228e+07
+frontier,hierarchical,fixed,1000,10,512,28672,28671,100203438,10.0021,1.00183e+07
+frontier,hierarchical,fixed,10000,10,512,28672,28671,28054792,10.0233,2.79897e+06
+frontier,hierarchical,fixed,100000,10,512,28672,28671,2811605,10.0166,280696
+frontier,hierarchical,fixed,1000000,10,512,28672,28671,267492,10.0065,26731.7
+frontier,hierarchical,random,1,10,512,28672,28671,103940686,10.0022,1.03918e+07
+frontier,hierarchical,random,10,10,512,28672,28671,104776703,10.0021,1.04755e+07
+frontier,hierarchical,random,100,10,512,28672,28671,100195281,10.0021,1.00174e+07
+frontier,hierarchical,random,1000,10,512,28672,28671,97940942,10.0032,9.79096e+06
+frontier,hierarchical,random,10000,10,512,28672,28671,17868532,10.0335,1.78088e+06
+frontier,hierarchical,random,100000,10,512,28672,28671,1816371,10.3556,175400
+frontier,hierarchical,random,1000000,10,512,28672,28671,226880,15.1788,14947.2
diff --git a/benchmark/results/frontier/64-dynampi_shutdown_frontier_64-4058172/naive_shutdown_frontier.csv b/benchmark/results/frontier/64-dynampi_shutdown_frontier_64-4058172/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..ffc5a16
--- /dev/null
+++ b/benchmark/results/frontier/64-dynampi_shutdown_frontier_64-4058172/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,64,3584,3583,14580.9,618
diff --git a/benchmark/results/frontier/64-dynampi_ss_frontier_64-4054473/strong_scaling_frontier.csv b/benchmark/results/frontier/64-dynampi_ss_frontier_64-4054473/strong_scaling_frontier.csv
new file mode 100644
index 0000000..7246e50
--- /dev/null
+++ b/benchmark/results/frontier/64-dynampi_ss_frontier_64-4054473/strong_scaling_frontier.csv
@@ -0,0 +1,8 @@
+system,distributor,mode,expected_ns,round_target_ms,duration_s,nodes,world_size,workers,total_tasks,total_subtasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1000,200,10,64,3584,3583,724123159,724123159,555.985,1.30242e+06
+frontier,naive,fixed,10000,200,10,64,3584,3583,81807696,81807696,63.4709,1.2889e+06
+frontier,naive,fixed,100000,200,10,64,3584,3583,12619811,12619811,20.968,601861
+frontier,naive,fixed,1000000,200,10,64,3584,3583,3369470,3369470,12.5931,267565
+frontier,naive,fixed,10000000,200,10,64,3584,3583,2129961,2129961,10.2558,207684
+frontier,naive,fixed,100000000,200,10,64,3584,3583,182782,182782,10.119,18063.2
+frontier,naive,fixed,1000000000,200,10,64,3584,3583,35830,35830,10.1455,3531.63
diff --git a/benchmark/results/frontier/64-dynampi_ss_frontier_64-4054793/strong_scaling_frontier.csv b/benchmark/results/frontier/64-dynampi_ss_frontier_64-4054793/strong_scaling_frontier.csv
new file mode 100644
index 0000000..6641545
--- /dev/null
+++ b/benchmark/results/frontier/64-dynampi_ss_frontier_64-4054793/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,64,3584,3583,3769316,10.0241,376024
+frontier,naive,fixed,10,10,64,3584,3583,3819478,10.0346,380629
+frontier,naive,fixed,100,10,64,3584,3583,3360854,10.0344,334934
+frontier,naive,fixed,1000,10,64,3584,3583,2508100,10.0548,249443
+frontier,naive,fixed,10000,10,64,3584,3583,1891824,10.0659,187943
+frontier,naive,fixed,100000,10,64,3584,3583,189948,10.3198,18406.1
+frontier,naive,fixed,1000000,10,64,3584,3583,28668,13.0166,2202.42
+frontier,naive,random,1,10,64,3584,3583,3783648,10.0193,377636
+frontier,naive,random,10,10,64,3584,3583,3790814,10.0365,377703
+frontier,naive,random,100,10,64,3584,3583,3368020,10.0229,336032
+frontier,naive,random,1000,10,64,3584,3583,2500934,10.0419,249049
+frontier,naive,random,10000,10,64,3584,3583,1992148,10.0493,198238
+frontier,naive,random,100000,10,64,3584,3583,208382,10.4557,19929.9
+frontier,naive,random,1000000,10,64,3584,3583,29371,14.5497,2018.67
+frontier,hierarchical,fixed,1,10,64,3584,3583,25768936,10.0039,2.57588e+06
+frontier,hierarchical,fixed,10,10,64,3584,3583,25797600,10.0054,2.57836e+06
+frontier,hierarchical,fixed,100,10,64,3584,3583,37600002,10.0085,3.7568e+06
+frontier,hierarchical,fixed,1000,10,64,3584,3583,10870822,10.0084,1.08617e+06
+frontier,hierarchical,fixed,10000,10,64,3584,3583,1777168,10.0764,176369
+frontier,hierarchical,fixed,100000,10,64,3584,3583,184252,10.4152,17690.7
+frontier,hierarchical,fixed,1000000,10,64,3584,3583,28492,16.0038,1780.33
+frontier,hierarchical,random,1,10,64,3584,3583,25991082,10.0051,2.59779e+06
+frontier,hierarchical,random,10,10,64,3584,3583,26550030,10.0043,2.65388e+06
+frontier,hierarchical,random,100,10,64,3584,3583,9322966,10.0112,931254
+frontier,hierarchical,random,1000,10,64,3584,3583,2780408,10.1031,275204
+frontier,hierarchical,random,10000,10,64,3584,3583,1053402,10.1678,103601
+frontier,hierarchical,random,100000,10,64,3584,3583,127612,10.7208,11903.2
+frontier,hierarchical,random,1000000,10,64,3584,3583,21412,17.7611,1205.56
diff --git a/benchmark/results/frontier/64-dynampi_ss_frontier_64-4058687/strong_scaling_frontier.csv b/benchmark/results/frontier/64-dynampi_ss_frontier_64-4058687/strong_scaling_frontier.csv
new file mode 100644
index 0000000..9a1b85f
--- /dev/null
+++ b/benchmark/results/frontier/64-dynampi_ss_frontier_64-4058687/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,64,3584,3583,5808317,10.0098,580261
+frontier,naive,fixed,10,10,64,3584,3583,5803394,10.0008,580294
+frontier,naive,fixed,100,10,64,3584,3583,6500955,10.0141,649181
+frontier,naive,fixed,1000,10,64,3584,3583,2811407,10.014,280748
+frontier,naive,fixed,10000,10,64,3584,3583,2446281,10.0134,244300
+frontier,naive,fixed,100000,10,64,3584,3583,358202,10.0176,35757.2
+frontier,naive,fixed,1000000,10,64,3584,3583,35429,10.0151,3537.54
+frontier,naive,random,1,10,64,3584,3583,5818141,10.0004,581793
+frontier,naive,random,10,10,64,3584,3583,5843557,10.0034,584155
+frontier,naive,random,100,10,64,3584,3583,6593945,10.014,658474
+frontier,naive,random,1000,10,64,3584,3583,2817589,10.014,281365
+frontier,naive,random,10000,10,64,3584,3583,2450596,10.0199,244573
+frontier,naive,random,100000,10,64,3584,3583,360218,10.1953,35331.8
+frontier,naive,random,1000000,10,64,3584,3583,38099,11.9544,3187.03
+frontier,hierarchical,fixed,1,10,64,3584,3583,102416518,10.0003,1.02413e+07
+frontier,hierarchical,fixed,10,10,64,3584,3583,103691289,10.0003,1.03688e+07
+frontier,hierarchical,fixed,100,10,64,3584,3583,102024761,10.0003,1.02022e+07
+frontier,hierarchical,fixed,1000,10,64,3584,3583,33945141,10.002,3.39382e+06
+frontier,hierarchical,fixed,10000,10,64,3584,3583,3505327,10.0201,349828
+frontier,hierarchical,fixed,100000,10,64,3584,3583,351531,10.0073,35127.4
+frontier,hierarchical,fixed,1000000,10,64,3584,3583,36575,12.0023,3047.34
+frontier,hierarchical,random,1,10,64,3584,3583,102870506,10.0003,1.02867e+07
+frontier,hierarchical,random,10,10,64,3584,3583,104275771,10.0003,1.04273e+07
+frontier,hierarchical,random,100,10,64,3584,3583,99362749,10.0004,9.93592e+06
+frontier,hierarchical,random,1000,10,64,3584,3583,22053877,10.0035,2.20461e+06
+frontier,hierarchical,random,10000,10,64,3584,3583,2232854,10.0331,222548
+frontier,hierarchical,random,100000,10,64,3584,3583,227356,10.3569,21952.1
+frontier,hierarchical,random,1000000,10,64,3584,3583,28298,14.8682,1903.26
diff --git a/benchmark/results/frontier/8-dynampi_shutdown_frontier_8-4058169/naive_shutdown_frontier.csv b/benchmark/results/frontier/8-dynampi_shutdown_frontier_8-4058169/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..f985193
--- /dev/null
+++ b/benchmark/results/frontier/8-dynampi_shutdown_frontier_8-4058169/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,8,448,447,1550.36,5607
diff --git a/benchmark/results/frontier/8-dynampi_ss_frontier_8-4054470/strong_scaling_frontier.csv b/benchmark/results/frontier/8-dynampi_ss_frontier_8-4054470/strong_scaling_frontier.csv
new file mode 100644
index 0000000..f658708
--- /dev/null
+++ b/benchmark/results/frontier/8-dynampi_ss_frontier_8-4054470/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_ns,round_target_ms,duration_s,nodes,world_size,workers,total_tasks,total_subtasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1000,200,10,8,448,447,101776305,101776305,77.0221,1.32139e+06
+frontier,naive,fixed,10000,200,10,8,448,447,21899094,21899094,16.565,1.32201e+06
+frontier,naive,fixed,100000,200,10,8,448,447,7336212,7336212,11.3312,647436
+frontier,naive,fixed,1000000,200,10,8,448,447,2867904,2867904,10.2727,279177
+frontier,naive,fixed,10000000,200,10,8,448,447,380116,380116,10.1772,37349.6
+frontier,naive,fixed,100000000,200,10,8,448,447,22846,22846,10.104,2261.09
+frontier,naive,fixed,1000000000,200,10,8,448,447,4470,4470,10.0177,446.208
+frontier,naive,poisson,1000,200,10,8,448,447,101587556,101587556,78.056,1.30147e+06
+frontier,naive,poisson,10000,200,10,8,448,447,21799432,21799432,16.592,1.31385e+06
+frontier,naive,poisson,100000,200,10,8,448,447,7326251,7326251,11.3322,646498
+frontier,naive,poisson,1000000,200,10,8,448,447,2864316,2864316,10.2738,278799
+frontier,naive,poisson,10000000,200,10,8,448,447,380116,380116,10.1773,37349.5
+frontier,naive,poisson,100000000,200,10,8,448,447,22846,22846,10.1032,2261.26
+frontier,naive,poisson,1000000000,200,10,8,448,447,4470,4470,10.0175,446.218
+frontier,hierarchical,fixed,1000,200,10,8,448,447,145209000,145209000,22.6738,6.40425e+06
+frontier,hierarchical,fixed,10000,200,10,8,448,447,62094000,62094000,11.3898,5.45172e+06
+frontier,hierarchical,fixed,100000,200,10,8,448,447,21903000,21903000,10.2302,2.14101e+06
+frontier,hierarchical,fixed,1000000,200,10,8,448,447,894000,894000,10.1507,88073.1
+frontier,hierarchical,fixed,10000000,200,10,8,448,447,26820,26820,12.8001,2095.3
+frontier,hierarchical,fixed,100000000,200,10,8,448,447,2682,2682,12.9028,207.862
+frontier,hierarchical,fixed,1000000000,200,10,8,448,447,447,447,22.0012,20.3171
+frontier,hierarchical,poisson,1000,200,10,8,448,447,145209000,145209000,22.6182,6.42001e+06
+frontier,hierarchical,poisson,10000,200,10,8,448,447,62604000,62604000,11.4339,5.47531e+06
+frontier,hierarchical,poisson,100000,200,10,8,448,447,21903000,21903000,10.2694,2.13285e+06
+frontier,hierarchical,poisson,1000000,200,10,8,448,447,894000,894000,10.1579,88010.1
+frontier,hierarchical,poisson,10000000,200,10,8,448,447,26820,26820,12.8009,2095.16
+frontier,hierarchical,poisson,100000000,200,10,8,448,447,2682,2682,12.9029,207.86
+frontier,hierarchical,poisson,1000000000,200,10,8,448,447,447,447,22.0009,20.3174
diff --git a/benchmark/results/frontier/8-dynampi_ss_frontier_8-4054790/strong_scaling_frontier.csv b/benchmark/results/frontier/8-dynampi_ss_frontier_8-4054790/strong_scaling_frontier.csv
new file mode 100644
index 0000000..2956ca4
--- /dev/null
+++ b/benchmark/results/frontier/8-dynampi_ss_frontier_8-4054790/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,8,448,447,4159782,10.0037,415825
+frontier,naive,fixed,10,10,8,448,447,4188390,10.0026,418730
+frontier,naive,fixed,100,10,8,448,447,3606396,10.0036,360509
+frontier,naive,fixed,1000,10,8,448,447,2225166,10.0048,222410
+frontier,naive,fixed,10000,10,8,448,447,413922,10.0388,41232.1
+frontier,naive,fixed,100000,10,8,448,447,23740,10.304,2303.97
+frontier,naive,fixed,1000000,10,8,448,447,3580,13.0029,275.323
+frontier,naive,random,1,10,8,448,447,4169616,10.003,416838
+frontier,naive,random,10,10,8,448,447,4181238,10.0025,418017
+frontier,naive,random,100,10,8,448,447,3650202,10.0041,364871
+frontier,naive,random,1000,10,8,448,447,2414694,10.0064,241316
+frontier,naive,random,10000,10,8,448,447,270882,10.0408,26978.2
+frontier,naive,random,100000,10,8,448,447,27023,10.5424,2563.28
+frontier,naive,random,1000000,10,8,448,447,4143,16.294,254.266
+frontier,hierarchical,fixed,1,10,8,448,447,44793870,10.0003,4.47926e+06
+frontier,hierarchical,fixed,10,10,8,448,447,26263932,10.0007,2.62622e+06
+frontier,hierarchical,fixed,100,10,8,448,447,16459434,10.0009,1.64579e+06
+frontier,hierarchical,fixed,1000,10,8,448,447,2182254,10.0042,218133
+frontier,hierarchical,fixed,10000,10,8,448,447,224394,10.0671,22289.7
+frontier,hierarchical,fixed,100000,10,8,448,447,22584,10.4036,2170.78
+frontier,hierarchical,fixed,1000000,10,8,448,447,3546,16.0012,221.608
+frontier,hierarchical,random,1,10,8,448,447,44670498,10.0002,4.46696e+06
+frontier,hierarchical,random,10,10,8,448,447,26506206,10.0004,2.65051e+06
+frontier,hierarchical,random,100,10,8,448,447,6727350,10.0024,672572
+frontier,hierarchical,random,1000,10,8,448,447,714306,10.0235,71262.8
+frontier,hierarchical,random,10000,10,8,448,447,113538,10.1494,11186.7
+frontier,hierarchical,random,100000,10,8,448,447,17790,10.9489,1624.82
+frontier,hierarchical,random,1000000,10,8,448,447,2682,16.0222,167.393
diff --git a/benchmark/results/frontier/8-dynampi_ss_frontier_8-4058684/strong_scaling_frontier.csv b/benchmark/results/frontier/8-dynampi_ss_frontier_8-4058684/strong_scaling_frontier.csv
new file mode 100644
index 0000000..d71cd79
--- /dev/null
+++ b/benchmark/results/frontier/8-dynampi_ss_frontier_8-4058684/strong_scaling_frontier.csv
@@ -0,0 +1,29 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+frontier,naive,fixed,1,10,8,448,447,6177757,10.0006,617737
+frontier,naive,fixed,10,10,8,448,447,6248681,10.0003,624852
+frontier,naive,fixed,100,10,8,448,447,6571127,10.0015,657011
+frontier,naive,fixed,1000,10,8,448,447,2866258,10.0016,286581
+frontier,naive,fixed,10000,10,8,448,447,446655,10.015,44598.5
+frontier,naive,fixed,100000,10,8,448,447,44690,10.0044,4467.02
+frontier,naive,fixed,1000000,10,8,448,447,4420,10.0033,441.854
+frontier,naive,random,1,10,8,448,447,6195082,10.0002,619495
+frontier,naive,random,10,10,8,448,447,6278178,10.001,627753
+frontier,naive,random,100,10,8,448,447,6646979,10.0016,664593
+frontier,naive,random,1000,10,8,448,447,2868032,10.0018,286752
+frontier,naive,random,10000,10,8,448,447,446647,10.0193,44578.5
+frontier,naive,random,100000,10,8,448,447,44964,10.1863,4414.17
+frontier,naive,random,1000000,10,8,448,447,4781,11.9527,399.992
+frontier,hierarchical,fixed,1,10,8,448,447,80386914,10.0001,8.03859e+06
+frontier,hierarchical,fixed,10,10,8,448,447,80806349,10.0001,8.08052e+06
+frontier,hierarchical,fixed,100,10,8,448,447,32512772,10.0002,3.2512e+06
+frontier,hierarchical,fixed,1000,10,8,448,447,4237689,10.0029,423648
+frontier,hierarchical,fixed,10000,10,8,448,447,437522,10.0198,43665.9
+frontier,hierarchical,fixed,100000,10,8,448,447,43858,10.0067,4382.86
+frontier,hierarchical,fixed,1000000,10,8,448,447,4589,12.0025,382.337
+frontier,hierarchical,random,1,10,8,448,447,80384681,10.0002,8.03834e+06
+frontier,hierarchical,random,10,10,8,448,447,80660773,10.0001,8.06596e+06
+frontier,hierarchical,random,100,10,8,448,447,24679835,10.0003,2.4679e+06
+frontier,hierarchical,random,1000,10,8,448,447,2752429,10.003,275161
+frontier,hierarchical,random,10000,10,8,448,447,278803,10.0319,27791.6
+frontier,hierarchical,random,100000,10,8,448,447,28160,10.2183,2755.84
+frontier,hierarchical,random,1000000,10,8,448,447,3437,12.9098,266.231
diff --git a/benchmark/results/frontier/8192-dynampi_shutdown_frontier_8192-4058264/naive_shutdown_frontier.csv b/benchmark/results/frontier/8192-dynampi_shutdown_frontier_8192-4058264/naive_shutdown_frontier.csv
new file mode 100644
index 0000000..befcc98
--- /dev/null
+++ b/benchmark/results/frontier/8192-dynampi_shutdown_frontier_8192-4058264/naive_shutdown_frontier.csv
@@ -0,0 +1,2 @@
+system,nodes,world_size,workers,time_per_shutdown_us,iterations
+frontier,8192,458752,458751,3.55936e+07,1
diff --git a/benchmark/results/naive_shutdown_local.csv b/benchmark/results/naive_shutdown_local.csv
new file mode 100644
index 0000000..518d327
--- /dev/null
+++ b/benchmark/results/naive_shutdown_local.csv
@@ -0,0 +1,45 @@
+system,nodes,world_size,workers,shutdown_time_s
+local,1,1,1,5.48e-07
+local,1,1,1,4e-07
+local,1,1,1,3.14e-07
+local,1,1,1,1.577e-06
+local,1,1,1,1.127e-06
+local,1,1,1,1.332e-06
+local,1,2,1,3.322e-05
+local,1,4,3,1.4165e-05
+local,1,8,7,7.699e-05
+local,1,12,11,9.5137e-05
+local,1,1,1,1.316e-06
+local,1,2,1,4.623e-06
+local,1,4,3,1.4413e-05
+local,1,8,7,0.0120517
+local,1,12,11,0.000663585
+local,1,1,1,3.029e-06
+local,1,2,1,4.582e-06
+local,1,4,3,0.0113531
+local,1,8,7,0.0159826
+local,1,12,11,9.6134e-05
+local,1,1,1,1.075e-06,0
+local,1,2,1,6.343e-06,0
+local,1,4,3,1.8276e-05,0
+local,1,8,7,6.1534e-05,0
+local,1,12,11,9.8477e-05,0
+local,1,1,1,0.214144,1422282
+local,1,2,1,0.514655,587015
+local,1,4,3,0.565046,490827
+local,1,1,1,190888,1573402
+local,1,2,1,464309,653690
+local,1,4,3,472562,374580
+local,1,1,1,0.235832,1551905
+local,1,2,1,1.64211,605344
+local,1,4,3,3.47449,414542
+local,1,1,1,0.361011,457441
+local,1,2,1,1.32542,434603
+local,1,4,3,2.58194,452985
+local,1,8,7,4.15406,310733
+local,1,12,11,10.4031,138796
+local,1,1,1,0.245685,1545135
+local,1,2,1,1.20228,776076
+local,1,4,3,3.01309,412864
+local,1,8,7,3.61101,381144
+local,1,12,11,5.33916,288527
diff --git a/benchmark/results/strong_scaling_local.csv b/benchmark/results/strong_scaling_local.csv
new file mode 100644
index 0000000..48d7ffa
--- /dev/null
+++ b/benchmark/results/strong_scaling_local.csv
@@ -0,0 +1,148 @@
+system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,elapsed_s,throughput_tasks_per_s
+local,naive,fixed,1,10,1,1,1,4936879,10,493688
+local,naive,fixed,10,10,1,1,1,820802,10,82080.2
+local,naive,fixed,100,10,1,1,1,88892,10,8889.2
+local,naive,fixed,1000,10,1,1,1,8975,10,897.5
+local,naive,fixed,10000,10,1,1,1,900,10,90
+local,naive,fixed,100000,10,1,1,1,90,10,9
+local,naive,fixed,1000000,10,1,1,1,9,10,0.9
+local,naive,random,1,10,1,1,1,4842714,10,484271
+local,naive,random,10,10,1,1,1,820360,10,82036
+local,naive,random,100,10,1,1,1,89034,10,8903.4
+local,naive,random,1000,10,1,1,1,8928,10,892.8
+local,naive,random,10000,10,1,1,1,895,10,89.5
+local,naive,random,100000,10,1,1,1,76,10,7.6
+local,naive,random,1000000,10,1,1,1,11,13.2314,0.831355
+local,hierarchical,fixed,1,10,1,1,1,4700619,10,470062
+local,hierarchical,fixed,10,10,1,1,1,814999,10,81499.9
+local,hierarchical,fixed,100,10,1,1,1,88762,10,8876.2
+local,hierarchical,fixed,1000,10,1,1,1,8973,10,897.3
+local,hierarchical,fixed,10000,10,1,1,1,900,10,90
+local,hierarchical,fixed,100000,10,1,1,1,90,10,9
+local,hierarchical,fixed,1000000,10,1,1,1,9,10,0.9
+local,hierarchical,random,1,10,1,1,1,4638143,10,463814
+local,hierarchical,random,10,10,1,1,1,814497,10,81449.7
+local,hierarchical,random,100,10,1,1,1,88578,10,8857.8
+local,hierarchical,random,1000,10,1,1,1,9035,10,903.499
+local,hierarchical,random,10000,10,1,1,1,889,10,88.9
+local,hierarchical,random,100000,10,1,1,1,93,10,9.3
+local,hierarchical,random,1000000,10,1,1,1,18,15.0561,1.19553
+local,naive,fixed,1,10,2,2,1,3492706,10,349271
+local,naive,fixed,10,10,2,2,1,758488,10,75848.8
+local,naive,fixed,100,10,2,2,1,87606,10,8760.6
+local,naive,fixed,1000,10,2,2,1,8945,10,894.5
+local,naive,fixed,10000,10,2,2,1,900,10,90
+local,naive,fixed,100000,10,2,2,1,90,10,9
+local,naive,fixed,1000000,10,2,2,1,9,10,0.9
+local,naive,random,1,10,2,2,1,3357918,10,335792
+local,naive,random,10,10,2,2,1,758674,10,75867.4
+local,naive,random,100,10,2,2,1,87810,10,8781
+local,naive,random,1000,10,2,2,1,8875,10,887.5
+local,naive,random,10000,10,2,2,1,865,10,86.5
+local,naive,random,100000,10,2,2,1,97,10,9.7
+local,naive,random,1000000,10,2,2,1,10,10.0505,0.994972
+local,hierarchical,fixed,1,10,2,2,1,3354387,10,335439
+local,hierarchical,fixed,10,10,2,2,1,748999,10,74899.9
+local,hierarchical,fixed,100,10,2,2,1,87505,10,8750.5
+local,hierarchical,fixed,1000,10,2,2,1,8943,10,894.3
+local,hierarchical,fixed,10000,10,2,2,1,900,10,90
+local,hierarchical,fixed,100000,10,2,2,1,90,10,9
+local,hierarchical,fixed,1000000,10,2,2,1,9,10,0.9
+local,hierarchical,random,1,10,2,2,1,3190185,10,319018
+local,hierarchical,random,10,10,2,2,1,753267,10,75326.7
+local,hierarchical,random,100,10,2,2,1,86942,10,8694.2
+local,hierarchical,random,1000,10,2,2,1,9063,10,906.3
+local,hierarchical,random,10000,10,2,2,1,857,10,85.7
+local,hierarchical,random,100000,10,2,2,1,85,10,8.5
+local,hierarchical,random,1000000,10,2,2,1,14,13.7515,1.01807
+local,naive,fixed,1,10,4,4,3,8343450,10,834343
+local,naive,fixed,10,10,4,4,3,2315964,10,231595
+local,naive,fixed,100,10,4,4,3,289128,10.0004,28911.8
+local,naive,fixed,1000,10,4,4,3,29790,10.0034,2978
+local,naive,fixed,10000,10,4,4,3,3006,10.0315,299.656
+local,naive,fixed,100000,10,4,4,3,208,10.3024,20.1894
+local,naive,fixed,1000000,10,4,4,3,28,13.0005,2.15377
+local,naive,random,1,10,4,4,3,8392890,10,839288
+local,naive,random,10,10,4,4,3,2020056,10,202005
+local,naive,random,100,10,4,4,3,234588,10.0005,23457.7
+local,naive,random,1000,10,4,4,3,23946,10.0031,2393.86
+local,naive,random,10000,10,4,4,3,2430,10.0364,242.119
+local,naive,random,100000,10,4,4,3,237,10.2858,23.0415
+local,naive,random,1000000,10,4,4,3,33,13.5085,2.4429
+local,hierarchical,fixed,1,10,4,4,3,3501084,10,350108
+local,hierarchical,fixed,10,10,4,4,3,823800,10.0001,82379.1
+local,hierarchical,fixed,100,10,4,4,3,96918,10.0007,9691.11
+local,hierarchical,fixed,1000,10,4,4,3,11922,10.0052,1191.58
+local,hierarchical,fixed,10000,10,4,4,3,1206,10.0622,119.855
+local,hierarchical,fixed,100000,10,4,4,3,126,10.5028,11.9968
+local,hierarchical,fixed,1000000,10,4,4,3,18,18.0008,0.999957
+local,hierarchical,random,1,10,4,4,3,3967542,10,396753
+local,hierarchical,random,10,10,4,4,3,976926,10.0001,97691.7
+local,hierarchical,random,100,10,4,4,3,116142,10.0007,11613.4
+local,hierarchical,random,1000,10,4,4,3,11832,10.007,1182.37
+local,hierarchical,random,10000,10,4,4,3,1194,10.0701,118.569
+local,hierarchical,random,100000,10,4,4,3,120,11.0632,10.8468
+local,hierarchical,random,1000000,10,4,4,3,18,17.9428,1.00319
+local,naive,fixed,1,10,8,8,7,11848858,10,1.18488e+06
+local,naive,fixed,10,10,8,8,7,4682244,10.0001,468222
+local,naive,fixed,100,10,8,8,7,655046,10.0003,65502.6
+local,naive,fixed,1000,10,8,8,7,69244,10.0031,6922.22
+local,naive,fixed,10000,10,8,8,7,7014,10.037,698.817
+local,naive,fixed,100000,10,8,8,7,420,10.3031,40.7645
+local,naive,fixed,1000000,10,8,8,7,60,13.0006,4.61519
+local,naive,random,1,10,8,8,7,11858854,10,1.18588e+06
+local,naive,random,10,10,8,8,7,4099536,10,409952
+local,naive,random,100,10,8,8,7,493136,10.0005,49311
+local,naive,random,1000,10,8,8,7,50358,10.0041,5033.73
+local,naive,random,10000,10,8,8,7,5124,10.0377,510.477
+local,naive,random,100000,10,8,8,7,533,10.4457,51.0258
+local,naive,random,1000000,10,8,8,7,72,15.6575,4.59843
+local,hierarchical,fixed,1,10,8,8,7,3546732,10.0001,354671
+local,hierarchical,fixed,10,10,8,8,7,811972,10.0002,81195.6
+local,hierarchical,fixed,100,10,8,8,7,192766,10.0008,19275
+local,hierarchical,fixed,1000,10,8,8,7,19866,10.0097,1984.67
+local,hierarchical,fixed,10000,10,8,8,7,1092,10.1567,107.516
+local,hierarchical,fixed,100000,10,8,8,7,126,11.7033,10.7662
+local,hierarchical,fixed,1000000,10,8,8,7,28,26.0097,1.07652
+local,hierarchical,random,1,10,8,8,7,3300248,10,330023
+local,hierarchical,random,10,10,8,8,7,809284,10.0003,80926
+local,hierarchical,random,100,10,8,8,7,96670,10.0017,9665.38
+local,hierarchical,random,1000,10,8,8,7,9870,10.0117,985.849
+local,hierarchical,random,10000,10,8,8,7,1876,10.1227,185.326
+local,hierarchical,random,100000,10,8,8,7,224,11.0743,20.2269
+local,hierarchical,random,1000000,10,8,8,7,42,34.674,1.21128
+local,naive,fixed,1,10,12,12,11,12340922,10.0001,1.23409e+06
+local,naive,fixed,10,10,12,12,11,6491958,10.0001,649193
+local,naive,fixed,100,10,12,12,11,1009448,10.0003,100942
+local,naive,fixed,1000,10,12,12,11,108724,10.0023,10869.9
+local,naive,fixed,10000,10,12,12,11,11022,10.04,1097.81
+local,naive,fixed,100000,10,12,12,11,632,10.3033,61.3398
+local,naive,fixed,1000000,10,12,12,11,92,13.0007,7.07652
+local,naive,random,1,10,12,12,11,12441572,10,1.24415e+06
+local,naive,random,10,10,12,12,11,5863704,10.0001,586367
+local,naive,random,100,10,12,12,11,738232,10.0005,73819.8
+local,naive,random,1000,10,12,12,11,76098,10.0058,7605.43
+local,naive,random,10000,10,12,12,11,7678,10.0415,764.628
+local,naive,random,100000,10,12,12,11,756,10.3338,73.1582
+local,naive,random,1000000,10,12,12,11,107,15.4668,6.91806
+local,hierarchical,fixed,1,10,12,12,11,8097496,10.0001,809744
+local,hierarchical,fixed,10,10,12,12,11,1604834,10.0002,160479
+local,hierarchical,fixed,100,10,12,12,11,265936,10.0016,26589.3
+local,hierarchical,fixed,1000,10,12,12,11,27346,10.0155,2730.37
+local,hierarchical,fixed,10000,10,12,12,11,2772,10.0941,274.615
+local,hierarchical,fixed,100000,10,12,12,11,308,11.2037,27.4909
+local,hierarchical,fixed,1000000,10,12,12,11,66,24.001,2.74988
+local,hierarchical,random,1,10,12,12,11,7607182,10,760715
+local,hierarchical,random,10,10,12,12,11,2208470,10.0002,220844
+local,hierarchical,random,100,10,12,12,11,266860,10.0013,26682.4
+local,hierarchical,random,1000,10,12,12,11,27896,10.0129,2786.02
+local,hierarchical,random,10000,10,12,12,11,2838,10.1515,279.565
+local,hierarchical,random,100000,10,12,12,11,308,10.7896,28.546
+local,hierarchical,random,1000000,10,12,12,11,66,19.8541,3.32425
+local,naive,fixed,1,10,1,1,1,4508789,10,450879
+local,naive,fixed,10,10,1,1,1,825894,10,82589
+local,naive,fixed,1,10,1,1,1,4512569,10,451257
+local,naive,fixed,10000,10,1,8,7,6998,10.0104,699.073
+local,naive,random,10000,10,1,8,7,6509,10.0115,650.153
+local,hierarchical,fixed,10000,10,1,8,7,3995,10.0167,398.833
+local,hierarchical,random,10000,10,1,8,7,3439,10.0041,343.761
diff --git a/benchmark/scripts/aurora_queue_utils.sh b/benchmark/scripts/aurora_queue_utils.sh
new file mode 100755
index 0000000..51f969d
--- /dev/null
+++ b/benchmark/scripts/aurora_queue_utils.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aurora PBS queue helpers: enforce "only 1 job <256 nodes in queue" and "at most 2 running".
+# Source this from submit_aurora_*.sh. Set SKIP_QUEUE_POLL=1 to disable waiting.
+
+# Seconds to sleep between queue polls: 60 unless AURORA_QUEUE_POLL_INTERVAL is already set to a non-empty value.
+: "${AURORA_QUEUE_POLL_INTERVAL:=60}"
+
+# Count my jobs: running (state R). Assumes qstat -u output has state as second-to-last column.
+_aurora_running_count() {
+  qstat -u "${USER}" 2>/dev/null | awk '
+    NR > 5 && NF >= 2 && $(NF-1) == "R" { n++ }
+    END { print 0 + n }
+  '
+}
+
+# Count my jobs in debug-scaling (queued + running). Queue name is last column.
+_aurora_debug_scaling_count() {
+  qstat -u "${USER}" 2>/dev/null | awk '
+    NR > 5 && NF >= 2 && $NF == "debug-scaling" { n++ }
+    END { print 0 + n }
+  '
+}
+
+# Block (polling every AURORA_QUEUE_POLL_INTERVAL seconds) until a job of the given size may be submitted.
+# $1: node count of the job (required). Returns 0 once allowed; any non-empty SKIP_QUEUE_POLL returns at once.
+# Limits checked: fewer than 2 of my jobs running, and for jobs under 256 nodes no job of mine in debug-scaling.
+wait_for_aurora_queue_space() {
+  local requested_nodes="${1:?}"
+  local active_jobs debug_jobs blocker
+  [[ -z "${SKIP_QUEUE_POLL:-}" ]] || return 0
+  while :; do
+    # blocker holds the message for whichever limit is currently hit; empty means clear to submit.
+    blocker=""
+    active_jobs="$(_aurora_running_count)"
+    if [[ "${active_jobs}" -ge 2 ]]; then
+      blocker="Aurora: ${active_jobs} jobs running (max 2); waiting ${AURORA_QUEUE_POLL_INTERVAL}s ..."
+    elif [[ "${requested_nodes}" -lt 256 ]]; then
+      # The debug-scaling count is only queried once the running-job limit has passed.
+      debug_jobs="$(_aurora_debug_scaling_count)"
+      if [[ "${debug_jobs}" -ge 1 ]]; then
+        blocker="Aurora: ${debug_jobs} job(s) already in debug-scaling (max 1); waiting ${AURORA_QUEUE_POLL_INTERVAL}s ..."
+      fi
+    fi
+    if [[ -z "${blocker}" ]]; then
+      return 0
+    fi
+    echo "${blocker}"
+    sleep "${AURORA_QUEUE_POLL_INTERVAL}"
+  done
+}
diff --git a/benchmark/scripts/check_timer_resolution.sh b/benchmark/scripts/check_timer_resolution.sh
new file mode 100755
index 0000000..9d747da
--- /dev/null
+++ b/benchmark/scripts/check_timer_resolution.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Example usage:
+#   ./benchmark/scripts/check_timer_resolution.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+APP="${APP:-${ROOT_DIR}/build/benchmark/timer_resolution}"
+
+if [[ ! -f "${APP}" ]]; then
+    echo "Error: ${APP} not found. Please build the benchmark first." >&2
+    echo "Run: cmake --build build --target timer_resolution" >&2
+    exit 1
+fi
+
+"${APP}"
diff --git a/benchmark/scripts/launch_aurora_naive_shutdown.sh b/benchmark/scripts/launch_aurora_naive_shutdown.sh
new file mode 100755
index 0000000..5669844
--- /dev/null
+++ b/benchmark/scripts/launch_aurora_naive_shutdown.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Example usage (PBS, qsub):
+#   qsub -l select=512:ncpus=102:mpiprocs=102 -l walltime=00:15:00 launch_aurora_naive_shutdown.sh
+# Or use the submit script: ./benchmark/scripts/submit_aurora_naive_shutdown.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # two directories above this script
+APP="${APP:-${ROOT_DIR}/build/benchmark/naive_shutdown_time}"  # benchmark binary; a non-empty APP overrides
+OUTPUT_DIR="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"  # where the CSV is written; OUTPUT_DIR overrides
+SYSTEM="aurora"  # passed to the benchmark as --system and used in the CSV file name
+# The settings below may be preset in the environment as space-separated strings; each is split into a bash array.
+IFS=' ' read -r -a NODE_LIST <<< "${NODE_LIST:-1 2 4 8 16 32 64 128 256 512 1024 2048}"  # node counts to sweep
+IFS=' ' read -r -a RANKS_PER_NODE_LIST <<< "${RANKS_PER_NODE_LIST:-core}"  # "core"/"cores" or explicit rank counts
+LAUNCHER="${LAUNCHER:-}"  # empty means auto-detect below
+IFS=' ' read -r -a LAUNCHER_ARGS <<< "${LAUNCHER_ARGS:-}"  # extra arguments placed right after the launcher
+# With no LAUNCHER preset, adopt the first of srun, mpiexec, mpirun that `command -v` can resolve,
+# probing in that order.
+if [[ -z "${LAUNCHER}" ]]; then
+  for launcher_candidate in srun mpiexec mpirun; do
+    if command -v "${launcher_candidate}" >/dev/null 2>&1; then
+      LAUNCHER="${launcher_candidate}"
+      break
+    fi
+  done
+fi
+# Still empty here means nothing was preset and nothing was found.
+[[ -n "${LAUNCHER}" ]] || { echo "No launcher found. Install srun, mpiexec, or mpirun." >&2; exit 1; }
+
+get_allocated_cores_per_node() {
+  if [[ -n "${PBS_NCPUS:-}" ]]; then
+    echo "${PBS_NCPUS}"
+    return
+  fi
+  if [[ -n "${CORES_PER_NODE:-}" ]]; then
+    echo "${CORES_PER_NODE}"
+    return
+  fi
+  if [[ -n "${NCPUS_PER_NODE:-}" ]]; then
+    echo "${NCPUS_PER_NODE}"
+    return
+  fi
+  echo 102
+}
+
+ALLOC_CORES_PER_NODE="$(get_allocated_cores_per_node)"
+echo "Allocated cores per node: ${ALLOC_CORES_PER_NODE}"
+
+export FI_CXI_RX_MATCH_MODE=software
+
+mkdir -p "${OUTPUT_DIR}"
+CSV="${OUTPUT_DIR}/naive_shutdown_${SYSTEM}.csv"
+
+for nodes in "${NODE_LIST[@]}"; do
+  for rpn in "${RANKS_PER_NODE_LIST[@]}"; do
+    if [[ "${rpn}" == "core" || "${rpn}" == "cores" ]]; then
+      ranks_per_node="${CORES_PER_NODE:-102}"
+    else
+      ranks_per_node="${rpn}"
+    fi
+    if [[ "${ranks_per_node}" -gt "${ALLOC_CORES_PER_NODE}" ]]; then
+      echo "Requested ranks_per_node=${ranks_per_node} exceeds allocation ${ALLOC_CORES_PER_NODE}" >&2
+      exit 1
+    fi
+    total_ranks=$((nodes * ranks_per_node))
+    echo "Running ${SYSTEM} nodes=${nodes} ranks_per_node=${ranks_per_node}"
+    launcher_base="$(basename "${LAUNCHER}")"
+    if [[ "${launcher_base}" == mpiexec || "${launcher_base}" == mpirun ]]; then
+      "${LAUNCHER}" "${LAUNCHER_ARGS[@]}" -n "${total_ranks}" --ppn "${ranks_per_node}" \
+        "${APP}" \
+        --nodes "${nodes}" \
+        --system "${SYSTEM}" \
+        --output "${CSV}"
+    else
+      "${LAUNCHER}" "${LAUNCHER_ARGS[@]}" -N "${nodes}" -n "${total_ranks}" \
+        --ntasks-per-node="${ranks_per_node}" \
+        "${APP}" \
+        --nodes "${nodes}" \
+        --system "${SYSTEM}" \
+        --output "${CSV}"
+    fi
+  done
+done
+
+echo "Results written to ${CSV}"
diff --git a/benchmark/scripts/launch_aurora_strong_scaling.sh b/benchmark/scripts/launch_aurora_strong_scaling.sh
new file mode 100755
index 0000000..af93865
--- /dev/null
+++ b/benchmark/scripts/launch_aurora_strong_scaling.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Example usage (PBS, qsub):
+#   qsub -l select=1:ncpus=102:mpiprocs=102 -l walltime=02:00:00 launch_aurora_strong_scaling.sh
+# Or use the submit script: ./benchmark/scripts/submit_aurora_strong_scaling.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # absolute path two levels above this script's directory
+APP="${APP:-${ROOT_DIR}/build/benchmark/strong_scaling_distribution_rate}"  # benchmark executable; APP from the environment wins
+OUTPUT_DIR="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"  # results directory; OUTPUT_DIR from the environment wins
+SYSTEM="aurora"  # passed to the benchmark via --system and used in the CSV file name
+
+IFS=' ' read -r -a NODE_LIST <<< "${NODE_LIST:-1 2 4 8 16 32 64 128 256 512}"  # each list: space-separated env value (or the default) split into an array
+IFS=' ' read -r -a TASK_US_LIST <<< "${TASK_US_LIST:-1 10 100 1000 10000 100000 1000000}"
+IFS=' ' read -r -a DISTRIBUTIONS <<< "${DISTRIBUTIONS:-naive hierarchical}"
+IFS=' ' read -r -a MODES <<< "${MODES:-fixed random}"
+DURATION_S="${DURATION_S:-10}"  # forwarded to the benchmark as --duration_s
+IFS=' ' read -r -a RANKS_PER_NODE_LIST <<< "${RANKS_PER_NODE_LIST:-core}"  # core / cores is resolved to a number in the run loop
+LAUNCHER="${LAUNCHER:-}"  # empty means auto-detect below
+IFS=' ' read -r -a LAUNCHER_ARGS <<< "${LAUNCHER_ARGS:-}"  # extra launcher arguments; may end up an empty array
+if [[ -z "${LAUNCHER}" ]]; then  # probe order: srun, then mpiexec, then mpirun
+  if command -v srun >/dev/null 2>&1; then
+    LAUNCHER="srun"
+  elif command -v mpiexec >/dev/null 2>&1; then
+    LAUNCHER="mpiexec"
+  elif command -v mpirun >/dev/null 2>&1; then
+    LAUNCHER="mpirun"
+  else
+    echo "No launcher found. Install srun, mpiexec, or mpirun." >&2
+    exit 1
+  fi
+fi
+
+get_allocated_cores_per_node() {
+  if [[ -n "${PBS_NCPUS:-}" ]]; then
+    echo "${PBS_NCPUS}"
+    return
+  fi
+  if [[ -n "${CORES_PER_NODE:-}" ]]; then
+    echo "${CORES_PER_NODE}"
+    return
+  fi
+  if [[ -n "${NCPUS_PER_NODE:-}" ]]; then
+    echo "${NCPUS_PER_NODE}"
+    return
+  fi
+  echo 102
+}
+
+ALLOC_CORES_PER_NODE="$(get_allocated_cores_per_node)"
+echo "Allocated cores per node: ${ALLOC_CORES_PER_NODE}"
+
+export FI_CXI_RX_MATCH_MODE=software  # NOTE(review): presumably a libfabric CXI provider setting; confirm
+
+mkdir -p "${OUTPUT_DIR}"
+CSV="${OUTPUT_DIR}/strong_scaling_${SYSTEM}.csv"
+# ${LAUNCHER_ARGS[@]+...} below expands to nothing for an empty array; a bare "${LAUNCHER_ARGS[@]}" aborts under `set -u` on bash older than 4.4.
+for nodes in "${NODE_LIST[@]}"; do  # sweep: nodes x ranks-per-node x distribution x mode x task duration
+  for rpn in "${RANKS_PER_NODE_LIST[@]}"; do
+    if [[ "${rpn}" == "core" || "${rpn}" == "cores" ]]; then
+      ranks_per_node="${CORES_PER_NODE:-102}"  # keyword entry: CORES_PER_NODE, else 102
+    else
+      ranks_per_node="${rpn}"
+    fi
+    if [[ "${ranks_per_node}" -gt "${ALLOC_CORES_PER_NODE}" ]]; then  # never exceed the allocated cores
+      echo "Requested ranks_per_node=${ranks_per_node} exceeds allocation ${ALLOC_CORES_PER_NODE}" >&2
+      exit 1
+    fi
+    total_ranks=$((nodes * ranks_per_node))
+    for dist in "${DISTRIBUTIONS[@]}"; do
+      # For Aurora, restrict to hierarchical distributor on 2048 nodes and above
+      if [[ "${SYSTEM}" == "aurora" && "${nodes}" -ge 2048 && "${dist}" != "hierarchical" ]]; then
+        continue
+      fi
+      for mode in "${MODES[@]}"; do
+        for expected_us in "${TASK_US_LIST[@]}"; do
+          echo "Running ${SYSTEM} nodes=${nodes} ranks_per_node=${ranks_per_node} dist=${dist} mode=${mode} expected_us=${expected_us}"
+          launcher_base="$(basename "${LAUNCHER}")"  # compare by file name so a full path also matches
+          if [[ "${launcher_base}" == mpiexec || "${launcher_base}" == mpirun ]]; then  # -n/--ppn here, -N/-n/--ntasks-per-node otherwise
+            "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -n "${total_ranks}" --ppn "${ranks_per_node}" \
+              "${APP}" \
+              --distribution "${dist}" \
+              --mode "${mode}" \
+              --expected_us "${expected_us}" \
+              --duration_s "${DURATION_S}" \
+              --nodes "${nodes}" \
+              --system "${SYSTEM}" \
+              --output "${CSV}"
+          else
+            "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -N "${nodes}" -n "${total_ranks}" \
+              --ntasks-per-node="${ranks_per_node}" \
+              "${APP}" \
+              --distribution "${dist}" \
+              --mode "${mode}" \
+              --expected_us "${expected_us}" \
+              --duration_s "${DURATION_S}" \
+              --nodes "${nodes}" \
+              --system "${SYSTEM}" \
+              --output "${CSV}"
+          fi
+        done
+      done
+    done
+  done
+done
diff --git a/benchmark/scripts/launch_frontier_naive_shutdown.sh b/benchmark/scripts/launch_frontier_naive_shutdown.sh
new file mode 100755
index 0000000..50145b8
--- /dev/null
+++ b/benchmark/scripts/launch_frontier_naive_shutdown.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Example usage:
+#   sbatch --nodes=512 --time=00:15:00 launch_frontier_naive_shutdown.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # absolute path two levels above this script's directory
+APP="${APP:-${ROOT_DIR}/build/benchmark/naive_shutdown_time}"  # benchmark executable; APP from the environment wins
+OUTPUT_DIR="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"  # results directory; OUTPUT_DIR from the environment wins
+SYSTEM="frontier"  # passed to the benchmark via --system and used in the CSV file name
+
+IFS=' ' read -r -a NODE_LIST <<< "${NODE_LIST:-1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192}"  # space-separated env value (or the default) split into an array
+IFS=' ' read -r -a RANKS_PER_NODE_LIST <<< "${RANKS_PER_NODE_LIST:-core}"  # core / cores is resolved to a number in the run loop
+LAUNCHER="${LAUNCHER:-}"  # empty means auto-detect below
+IFS=' ' read -r -a LAUNCHER_ARGS <<< "${LAUNCHER_ARGS:-}"  # extra launcher arguments; may end up an empty array
+if [[ -z "${LAUNCHER}" ]]; then  # probe order: srun, then mpiexec, then mpirun
+  if command -v srun >/dev/null 2>&1; then
+    LAUNCHER="srun"
+  elif command -v mpiexec >/dev/null 2>&1; then
+    LAUNCHER="mpiexec"
+  elif command -v mpirun >/dev/null 2>&1; then
+    LAUNCHER="mpirun"
+  else
+    echo "No launcher found. Install srun, mpiexec, or mpirun." >&2
+    exit 1
+  fi
+fi
+
+mkdir -p "${OUTPUT_DIR}"
+CSV="${OUTPUT_DIR}/naive_shutdown_${SYSTEM}.csv"
+# ${LAUNCHER_ARGS[@]+...} below expands to nothing for an empty array; a bare "${LAUNCHER_ARGS[@]}" aborts under `set -u` on bash older than 4.4.
+for nodes in "${NODE_LIST[@]}"; do  # sweep every (node count, ranks-per-node) pair
+  for rpn in "${RANKS_PER_NODE_LIST[@]}"; do
+    if [[ "${rpn}" == "core" || "${rpn}" == "cores" ]]; then
+      if [[ -n "${SLURM_JOB_CPUS_PER_NODE:-}" ]]; then
+        ranks_per_node="${SLURM_JOB_CPUS_PER_NODE%%(*}"  # keep the text before the first "("
+        ranks_per_node="${ranks_per_node%%,*}"  # then the text before the first ","
+      else
+        ranks_per_node="${CORES_PER_NODE:-56}"  # outside Slurm: CORES_PER_NODE, else 56
+      fi
+    else
+      ranks_per_node="${rpn}"
+    fi
+    total_ranks=$((nodes * ranks_per_node))
+    echo "Running ${SYSTEM} nodes=${nodes} ranks_per_node=${ranks_per_node}"
+    launcher_base="$(basename "${LAUNCHER}")"  # compare by file name so a full path also matches
+    if [[ "${launcher_base}" == mpiexec || "${launcher_base}" == mpirun ]]; then  # -n/--ppn here, -N/-n/--ntasks-per-node otherwise
+      "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -n "${total_ranks}" --ppn "${ranks_per_node}" \
+        "${APP}" \
+        --nodes "${nodes}" \
+        --system "${SYSTEM}" \
+        --output "${CSV}"
+    else
+      "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -N "${nodes}" -n "${total_ranks}" \
+        --ntasks-per-node="${ranks_per_node}" \
+        "${APP}" \
+        --nodes "${nodes}" \
+        --system "${SYSTEM}" \
+        --output "${CSV}"
+    fi
+  done
+done
+
+echo "Results written to ${CSV}"
diff --git a/benchmark/scripts/launch_frontier_strong_scaling.sh b/benchmark/scripts/launch_frontier_strong_scaling.sh
new file mode 100755
index 0000000..7179eb9
--- /dev/null
+++ b/benchmark/scripts/launch_frontier_strong_scaling.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Example usage:
+#   sbatch --nodes=8096 --time=02:00:00 launch_frontier_strong_scaling.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # absolute path two levels above this script's directory
+APP="${APP:-${ROOT_DIR}/build/benchmark/strong_scaling_distribution_rate}"  # benchmark executable; APP from the environment wins
+OUTPUT_DIR="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"  # results directory; OUTPUT_DIR from the environment wins
+SYSTEM="frontier"  # passed to the benchmark via --system and used in the CSV file name
+
+IFS=' ' read -r -a NODE_LIST <<< "${NODE_LIST:-1 2 4 8 16 32 64 128 256 512}"  # each list: space-separated env value (or the default) split into an array
+IFS=' ' read -r -a TASK_US_LIST <<< "${TASK_US_LIST:-1 10 100 1000 10000 100000 1000000}"
+IFS=' ' read -r -a DISTRIBUTIONS <<< "${DISTRIBUTIONS:-naive hierarchical}"
+IFS=' ' read -r -a MODES <<< "${MODES:-fixed random}"
+DURATION_S="${DURATION_S:-10}"  # forwarded to the benchmark as --duration_s
+IFS=' ' read -r -a RANKS_PER_NODE_LIST <<< "${RANKS_PER_NODE_LIST:-core}"  # core / cores is resolved to a number in the run loop
+LAUNCHER="${LAUNCHER:-}"  # empty means auto-detect below
+IFS=' ' read -r -a LAUNCHER_ARGS <<< "${LAUNCHER_ARGS:-}"  # extra launcher arguments; may end up an empty array
+if [[ -z "${LAUNCHER}" ]]; then  # probe order: srun, then mpiexec, then mpirun
+  if command -v srun >/dev/null 2>&1; then
+    LAUNCHER="srun"
+  elif command -v mpiexec >/dev/null 2>&1; then
+    LAUNCHER="mpiexec"
+  elif command -v mpirun >/dev/null 2>&1; then
+    LAUNCHER="mpirun"
+  else
+    echo "No launcher found. Install srun, mpiexec, or mpirun." >&2
+    exit 1
+  fi
+fi
+
+mkdir -p "${OUTPUT_DIR}"
+CSV="${OUTPUT_DIR}/strong_scaling_${SYSTEM}.csv"
+# ${LAUNCHER_ARGS[@]+...} below expands to nothing for an empty array; a bare "${LAUNCHER_ARGS[@]}" aborts under `set -u` on bash older than 4.4.
+for nodes in "${NODE_LIST[@]}"; do  # sweep: nodes x ranks-per-node x distribution x mode x task duration
+  for rpn in "${RANKS_PER_NODE_LIST[@]}"; do
+    if [[ "${rpn}" == "core" || "${rpn}" == "cores" ]]; then
+      if [[ -n "${SLURM_JOB_CPUS_PER_NODE:-}" ]]; then
+        ranks_per_node="${SLURM_JOB_CPUS_PER_NODE%%(*}"  # keep the text before the first "("
+        ranks_per_node="${ranks_per_node%%,*}"  # then the text before the first ","
+      else
+        ranks_per_node="${CORES_PER_NODE:-56}"  # outside Slurm: CORES_PER_NODE, else 56
+      fi
+    else
+      ranks_per_node="${rpn}"
+    fi
+    total_ranks=$((nodes * ranks_per_node))
+    for dist in "${DISTRIBUTIONS[@]}"; do
+      for mode in "${MODES[@]}"; do
+        for expected_us in "${TASK_US_LIST[@]}"; do
+          echo "Running ${SYSTEM} nodes=${nodes} ranks_per_node=${ranks_per_node} dist=${dist} mode=${mode} expected_us=${expected_us}"
+          launcher_base="$(basename "${LAUNCHER}")"  # compare by file name so a full path also matches
+          if [[ "${launcher_base}" == mpiexec || "${launcher_base}" == mpirun ]]; then  # -n/--ppn here, -N/-n/--ntasks-per-node otherwise
+            "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -n "${total_ranks}" --ppn "${ranks_per_node}" \
+              "${APP}" \
+              --distribution "${dist}" \
+              --mode "${mode}" \
+              --expected_us "${expected_us}" \
+              --duration_s "${DURATION_S}" \
+              --nodes "${nodes}" \
+              --system "${SYSTEM}" \
+              --output "${CSV}"
+          else
+            "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -N "${nodes}" -n "${total_ranks}" \
+              --ntasks-per-node="${ranks_per_node}" \
+              "${APP}" \
+              --distribution "${dist}" \
+              --mode "${mode}" \
+              --expected_us "${expected_us}" \
+              --duration_s "${DURATION_S}" \
+              --nodes "${nodes}" \
+              --system "${SYSTEM}" \
+              --output "${CSV}"
+          fi
+        done
+      done
+    done
+  done
+done
diff --git a/benchmark/scripts/launch_local_naive_shutdown.sh b/benchmark/scripts/launch_local_naive_shutdown.sh
new file mode 100755
index 0000000..e731a82
--- /dev/null
+++ b/benchmark/scripts/launch_local_naive_shutdown.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Example usage:
+#   ./benchmark/scripts/launch_local_naive_shutdown.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # absolute path two levels above this script's directory
+APP="${APP:-${ROOT_DIR}/build/benchmark/naive_shutdown_time}"  # benchmark executable; APP from the environment wins
+OUTPUT_DIR="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"  # results directory; OUTPUT_DIR from the environment wins
+SYSTEM="local"  # passed to the benchmark via --system and used in the CSV file name
+
+IFS=' ' read -r -a RANK_LIST <<< "${RANK_LIST:-1 2 4 8 12}"  # space-separated env value (or the default) split into an array
+LAUNCHER="${LAUNCHER:-}"  # empty means auto-detect below
+IFS=' ' read -r -a LAUNCHER_ARGS <<< "${LAUNCHER_ARGS:-}"  # extra launcher arguments; may end up an empty array
+
+if [[ -z "${LAUNCHER}" ]]; then  # probe order: mpirun, then mpiexec
+  if command -v mpirun >/dev/null 2>&1; then
+    LAUNCHER="mpirun"
+  elif command -v mpiexec >/dev/null 2>&1; then
+    LAUNCHER="mpiexec"
+  else
+    echo "No launcher found. Install mpirun or mpiexec." >&2
+    exit 1
+  fi
+fi
+
+mkdir -p "${OUTPUT_DIR}"
+CSV="${OUTPUT_DIR}/naive_shutdown_${SYSTEM}.csv"
+
+for ranks in "${RANK_LIST[@]}"; do
+  echo "Running ${SYSTEM} ranks=${ranks}"
+  launcher_base="$(basename "${LAUNCHER}")"
+  if [[ "${launcher_base}" == mpiexec ]]; then
+    "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -n "${ranks}" \
+      "${APP}" \
+      --nodes 1 \
+      --system "${SYSTEM}" \
+      --output "${CSV}"
+  else
+    "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -np "${ranks}" \
+      "${APP}" \
+      --nodes 1 \
+      --system "${SYSTEM}" \
+      --output "${CSV}"
+  fi
+done
+
+echo "Results written to ${CSV}"
diff --git a/benchmark/scripts/launch_local_strong_scaling.sh b/benchmark/scripts/launch_local_strong_scaling.sh
new file mode 100755
index 0000000..dae075c
--- /dev/null
+++ b/benchmark/scripts/launch_local_strong_scaling.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Example usage:
+#   ./benchmark/scripts/launch_local_strong_scaling.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # absolute path two levels above this script's directory
+APP="${APP:-${ROOT_DIR}/build/benchmark/strong_scaling_distribution_rate}"  # benchmark executable; APP from the environment wins
+OUTPUT_DIR="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"  # results directory; OUTPUT_DIR from the environment wins
+SYSTEM="local"  # passed to the benchmark via --system and used in the CSV file name
+
+IFS=' ' read -r -a RANK_LIST <<< "${RANK_LIST:-1 2 4 8 12}"  # each list: space-separated env value (or the default) split into an array
+IFS=' ' read -r -a TASK_US_LIST <<< "${TASK_US_LIST:-1 10 100 1000 10000 100000 1000000}"
+IFS=' ' read -r -a DISTRIBUTIONS <<< "${DISTRIBUTIONS:-naive hierarchical}"
+IFS=' ' read -r -a MODES <<< "${MODES:-fixed random}"
+DURATION_S="${DURATION_S:-10}"  # forwarded to the benchmark as --duration_s
+LAUNCHER="${LAUNCHER:-}"  # empty means auto-detect below
+IFS=' ' read -r -a LAUNCHER_ARGS <<< "${LAUNCHER_ARGS:-}"  # extra launcher arguments; may end up an empty array
+
+if [[ -z "${LAUNCHER}" ]]; then  # probe order: mpirun, then mpiexec
+  if command -v mpirun >/dev/null 2>&1; then
+    LAUNCHER="mpirun"
+  elif command -v mpiexec >/dev/null 2>&1; then
+    LAUNCHER="mpiexec"
+  else
+    echo "No launcher found. Install mpirun or mpiexec." >&2
+    exit 1
+  fi
+fi
+
+mkdir -p "${OUTPUT_DIR}"
+CSV="${OUTPUT_DIR}/strong_scaling_${SYSTEM}.csv"
+
+for ranks in "${RANK_LIST[@]}"; do
+  for dist in "${DISTRIBUTIONS[@]}"; do
+    for mode in "${MODES[@]}"; do
+      for expected_us in "${TASK_US_LIST[@]}"; do
+        echo "Running ${SYSTEM} ranks=${ranks} dist=${dist} mode=${mode} expected_us=${expected_us}"
+        launcher_base="$(basename "${LAUNCHER}")"
+        if [[ "${launcher_base}" == mpiexec ]]; then
+          "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -n "${ranks}" \
+            "${APP}" \
+            --distribution "${dist}" \
+            --mode "${mode}" \
+            --expected_us "${expected_us}" \
+            --duration_s "${DURATION_S}" \
+            --nodes 1 \
+            --system "${SYSTEM}" \
+            --output "${CSV}"
+        else
+          "${LAUNCHER}" ${LAUNCHER_ARGS[@]+"${LAUNCHER_ARGS[@]}"} -np "${ranks}" \
+            "${APP}" \
+            --distribution "${dist}" \
+            --mode "${mode}" \
+            --expected_us "${expected_us}" \
+            --duration_s "${DURATION_S}" \
+            --nodes 1 \
+            --system "${SYSTEM}" \
+            --output "${CSV}"
+        fi
+      done
+    done
+  done
+done
+
+echo "Results written to ${CSV}"
diff --git a/benchmark/scripts/plot_shutdown_time.py b/benchmark/scripts/plot_shutdown_time.py
new file mode 100755
index 0000000..a8e4a71
--- /dev/null
+++ b/benchmark/scripts/plot_shutdown_time.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import csv
+import os
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+from matplotlib.ticker import FixedLocator, FuncFormatter
+import scienceplots  # noqa: F401  # registers matplotlib styles
+
+# IEEE styling parameters
+IEEE_FIG_WIDTH = 3.5  # Single column width in inches
+IEEE_FIG_HEIGHT = 3.5  # Height in inches
+
+# Hollow marker shapes for different series
+MARKER_SHAPES = ['o', 's', '^', 'v', 'D', 'p', '*', 'h', 'X', '<', '>', 'd']
+
+
+def collect_csv_paths(inputs):
+    paths = []
+    for raw in inputs:
+        for entry in raw.split(","):
+            entry = entry.strip()
+            if not entry:
+                continue
+            if os.path.isdir(entry):
+                for root, _, files in os.walk(entry):
+                    for name in files:
+                        # Only collect shutdown-related CSV files
+                        if name.endswith(".csv") and "shutdown" in name.lower():
+                            paths.append(os.path.join(root, name))
+            else:
+                # Only add if it's a shutdown CSV file
+                if "shutdown" in os.path.basename(entry).lower():
+                    paths.append(entry)
+    return paths
+
+
+def parse_rows(paths):
+    rows = []
+    for path in paths:
+        file_mtime = os.path.getmtime(path)
+        with open(path, "r", encoding="utf-8") as handle:
+            reader = csv.DictReader(handle)
+            for row in reader:
+                # Skip rows that don't have the time_per_shutdown_us column
+                if "time_per_shutdown_us" not in row:
+                    continue
+                nodes = int(float(row.get("nodes", 0)))
+                world_size = int(float(row.get("world_size", 0)))
+                workers = int(float(row.get("workers", 0)))
+                time_per_shutdown_us = float(row.get("time_per_shutdown_us", 0.0))
+                # Skip rows with zero or invalid shutdown times
+                if time_per_shutdown_us <= 0.0:
+                    continue
+                rows.append(
+                    {
+                        "system": row.get("system", "").strip() or "unknown",
+                        "nodes": nodes,
+                        "world_size": world_size,
+                        "workers": workers,
+                        "time_per_shutdown_us": time_per_shutdown_us,
+                        "file_mtime": file_mtime,
+                    }
+                )
+    return rows
+
+
+def group_rows(rows):
+    # First, filter to keep only newest results for each unique configuration
+    # Key: (system, nodes)
+    # Value: (time_per_shutdown_us, file_mtime)
+    newest_by_config = {}
+    for row in rows:
+        config_key = (
+            row["system"],
+            row["nodes"],
+        )
+        if config_key not in newest_by_config:
+            newest_by_config[config_key] = (row["time_per_shutdown_us"], row["file_mtime"])
+        else:
+            # Keep the one from the newest file
+            _, existing_mtime = newest_by_config[config_key]
+            if row["file_mtime"] > existing_mtime:
+                newest_by_config[config_key] = (row["time_per_shutdown_us"], row["file_mtime"])
+
+    # Now group by system for plotting
+    grouped = defaultdict(list)
+    for (system, nodes), (time_per_shutdown_us, _) in newest_by_config.items():
+        grouped[system].append((nodes, time_per_shutdown_us))
+    return grouped
+
+
+def plot_all_systems(grouped, output_dir, image_format):
+    # Draw shutdown time vs. nodes for every non-"local" system on one figure (scienceplots IEEE style)
+    with plt.style.context(['science', 'ieee']):
+        fig, ax = plt.subplots(figsize=(IEEE_FIG_WIDTH, IEEE_FIG_HEIGHT))
+
+        # Filter out "local" system
+        systems = sorted([s for s in grouped.keys() if s != "local"])
+        all_nodes = set()  # union of node counts over all plotted systems; becomes the x ticks
+        handles = []
+        labels = []
+
+        # Plot each system with different markers/colors
+        for idx, system in enumerate(systems):
+            points = grouped[system]  # list of (nodes, time_per_shutdown_us) pairs
+            points_sorted = sorted(points, key=lambda x: x[0])
+            nodes = [p[0] for p in points_sorted]
+            time_per_shutdown_us = [p[1] for p in points_sorted]
+            # Convert microseconds to seconds
+            time_per_shutdown_s = [t / 1_000_000.0 for t in time_per_shutdown_us]
+
+            all_nodes.update(nodes)
+
+            marker = MARKER_SHAPES[idx % len(MARKER_SHAPES)]  # wrap around when there are more systems than shapes
+            color = plt.cm.tab10(idx % 10)
+
+            # Plot data
+            line, = ax.plot(nodes, time_per_shutdown_s, marker=marker, fillstyle='none',
+                           markeredgewidth=1.0, linewidth=1.0, color=color, label=system.capitalize())
+            handles.append(line)
+            labels.append(system.capitalize())
+
+        ax.set_xlabel("Nodes")
+        ax.set_ylabel("Shutdown time (s)")
+        ax.set_xscale("log", base=2)
+        ax.set_yscale("log")
+
+        # Show actual node counts (1, 2, 4, 8, 16, ...) rather than 2^n formatting.
+        # Keep the log2 spacing but format ticks as plain integers.
+        if all_nodes:
+            node_ticks = sorted(all_nodes)
+            ax.xaxis.set_major_locator(FixedLocator(node_ticks))
+            ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}"))
+
+        # Add very light grey underlying grid
+        ax.grid(True, which="both", linestyle="-", linewidth=0.5, color='lightgrey', alpha=0.5, zorder=0)
+
+        # Add legend
+        ax.legend(handles, labels, frameon=False, loc='best')
+
+        filename = f"shutdown_time_combined.{image_format}"  # single combined figure, written into output_dir
+        fig.tight_layout()
+        fig.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
+        plt.close(fig)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Plot shutdown time vs number of nodes.")
+    parser.add_argument(
+        "--input",
+        required=True,
+        action="append",
+        help="CSV file or directory (can be passed multiple times)",
+    )
+    parser.add_argument(
+        "--output-dir", required=True, help="Directory to write output plots"
+    )
+    parser.add_argument(
+        "--format", default="png", choices=["png", "pdf", "svg"], help="Output image format"
+    )
+    args = parser.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    input_paths = collect_csv_paths(args.input)
+    rows = parse_rows(input_paths)
+    grouped = group_rows(rows)
+
+    # Plot all systems on the same figure
+    plot_all_systems(grouped, args.output_dir, args.format)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/scripts/plot_strong_scaling.py b/benchmark/scripts/plot_strong_scaling.py
new file mode 100755
index 0000000..26ace5c
--- /dev/null
+++ b/benchmark/scripts/plot_strong_scaling.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import csv
+import os
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+from matplotlib.ticker import FixedLocator, FuncFormatter
+import scienceplots  # noqa: F401  # registers matplotlib styles
+
+# IEEE styling parameters
+IEEE_FIG_WIDTH = 3.5  # Single column width in inches
+IEEE_FIG_HEIGHT = 3.5  # Height in inches (increased for bottom legend)
+
+# Hollow marker shapes for different series
+MARKER_SHAPES = ['o', 's', '^', 'v', 'D', 'p', '*', 'h', 'X', '<', '>', 'd']
+
+
+def format_duration(expected_ns):
+    if expected_ns <= 0:
+        return "0 ns"
+    if expected_ns >= 1_000_000_000:
+        return f"{expected_ns / 1_000_000_000:g} s"
+    if expected_ns >= 1_000_000:
+        return f"{expected_ns / 1_000_000:g} ms"
+    if expected_ns >= 1_000:
+        return f"{expected_ns / 1_000:g} us"
+    return f"{expected_ns:g} ns"
+
+
+def collect_csv_paths(inputs):
+    paths = []
+    for raw in inputs:
+        for entry in raw.split(","):
+            entry = entry.strip()
+            if not entry:
+                continue
+            if os.path.isdir(entry):
+                for root, _, files in os.walk(entry):
+                    for name in files:
+                        # Only collect strong scaling CSV files
+                        if name.endswith(".csv") and "strong_scaling" in name.lower():
+                            paths.append(os.path.join(root, name))
+            else:
+                # Only add if it's a strong scaling CSV file
+                if "strong_scaling" in os.path.basename(entry).lower():
+                    paths.append(entry)
+    return paths
+
+
+def parse_rows(paths):
+    rows = []
+    for path in paths:
+        file_mtime = os.path.getmtime(path)
+        with open(path, "r", encoding="utf-8") as handle:
+            reader = csv.DictReader(handle)
+            for row in reader:
+                expected_ns_raw = row.get("expected_ns", "").strip()
+                expected_us_raw = row.get("expected_us", "").strip()
+                if expected_ns_raw:
+                    expected_ns = int(float(expected_ns_raw))
+                elif expected_us_raw:
+                    expected_ns = int(float(expected_us_raw) * 1000)
+                else:
+                    expected_ns = 0
+                nodes = int(float(row.get("nodes", 0)))
+                world_size = int(float(row.get("world_size", 0)))
+                ranks_per_node = int(round(world_size / nodes)) if nodes else 0
+                rows.append(
+                    {
+                        "system": row.get("system", "").strip() or "unknown",
+                        "distributor": row.get("distributor", "").strip(),
+                        "mode": row.get("mode", "").strip(),
+                        "expected_ns": expected_ns,
+                        "nodes": nodes,
+                        "ranks_per_node": ranks_per_node,
+                        "throughput": float(row.get("throughput_tasks_per_s", 0.0)),
+                        "file_mtime": file_mtime,
+                    }
+                )
+    return rows
+
+
+def group_rows(rows):
+    # First, filter to keep only newest results for each unique configuration
+    # Key: (system, distributor, mode, expected_ns, ranks_per_node, nodes)
+    # Value: (throughput, file_mtime)
+    # Note: Normalize mode for backward compatibility (poisson -> random)
+    newest_by_config = {}
+    for row in rows:
+        # Handle backward compatibility: treat "poisson" as "random"
+        normalized_mode = "random" if row["mode"] == "poisson" else row["mode"]
+        config_key = (
+            row["system"],
+            row["distributor"],
+            normalized_mode,
+            row["expected_ns"],
+            row["ranks_per_node"],
+            row["nodes"],
+        )
+        if config_key not in newest_by_config:
+            newest_by_config[config_key] = (row["throughput"], row["file_mtime"])
+        else:
+            # Keep the one from the newest file
+            _, existing_mtime = newest_by_config[config_key]
+            if row["file_mtime"] > existing_mtime:
+                newest_by_config[config_key] = (row["throughput"], row["file_mtime"])
+
+    # Now group by (system, distributor, mode, expected_ns, ranks_per_node) for plotting
+    grouped = defaultdict(list)
+    for (system, distributor, mode, expected_ns, ranks_per_node, nodes), (throughput, _) in newest_by_config.items():
+        key = (system, distributor, mode, expected_ns, ranks_per_node)
+        grouped[key].append((nodes, throughput))
+    return grouped
+
+
+def plot_distributor(system, distributor, grouped, output_dir, image_format):
+    modes = ["fixed", "random"]
+
+    # Create separate plots for each mode
+    for mode in modes:
+        # Use scienceplots IEEE style
+        with plt.style.context(['science', 'ieee']):
+            fig, ax = plt.subplots(figsize=(IEEE_FIG_WIDTH, IEEE_FIG_HEIGHT))
+
+            series = []
+            all_nodes = set()
+            ranks_per_node_value = None
+            for (
+                sys_name,
+                dist,
+                mode_name,
+                expected_ns,
+                ranks_per_node,
+            ), points in grouped.items():
+                # Handle backward compatibility: treat "poisson" as "random"
+                normalized_mode = "random" if mode_name == "poisson" else mode_name
+                if sys_name != system or dist != distributor or normalized_mode != mode:
+                    continue
+                points_sorted = sorted(points, key=lambda x: x[0])
+                nodes = [p[0] for p in points_sorted]
+                throughput = [p[1] for p in points_sorted]
+                all_nodes.update(nodes)
+                if ranks_per_node_value is None:
+                    ranks_per_node_value = ranks_per_node
+                series.append((expected_ns, ranks_per_node, nodes, throughput))
+
+            # Skip creating plot if there's no data
+            if not series:
+                plt.close(fig)
+                continue
+
+            # Sort series by expected_ns (duration) to ensure proper ordering
+            series_sorted = sorted(series, key=lambda x: x[0])  # Sort by expected_ns only
+            handles = []
+            labels = []
+
+            # Plot actual data first to establish axis limits
+            for idx, (expected_ns, ranks_per_node, nodes, throughput) in enumerate(series_sorted):
+                # Remove rpn from legend label, only show duration
+                label = format_duration(expected_ns)
+                marker = MARKER_SHAPES[idx % len(MARKER_SHAPES)]
+                color = plt.cm.tab10(idx % 10)
+                # Use matplotlib's default color cycle for different colors
+                line, = ax.plot(nodes, throughput, marker=marker, label=label,
+                               fillstyle='none', markeredgewidth=1.0,
+                               color=color)
+                handles.append(line)
+                labels.append(label)
+
+            ax.set_xlabel("Nodes")
+            ax.set_ylabel("Tasks per second")
+            ax.set_xscale("log", base=2)
+            ax.set_yscale("log")
+            # Show actual node counts (2, 4, 8, 16, ...) rather than 2^n formatting.
+            # Keep the log2 spacing but format ticks as plain integers.
+            if all_nodes:
+                node_ticks = sorted(all_nodes)
+                ax.xaxis.set_major_locator(FixedLocator(node_ticks))
+                ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}"))
+
+            # Add very light grey underlying grid
+            ax.grid(True, which="both", linestyle="-", linewidth=0.5, color='lightgrey', alpha=0.5, zorder=0)
+
+            # Store axis limits before plotting ideal lines
+            xlim = ax.get_xlim()
+            ylim = ax.get_ylim()
+
+            # Plot ideal scaling lines without affecting axis limits
+            for idx, (expected_ns, ranks_per_node, nodes, throughput) in enumerate(series_sorted):
+                color = plt.cm.tab10(idx % 10)
+                # Add ideal scaling line: throughput = nodes * ranks_per_node * 1e9 / expected_ns
+                if all_nodes:
+                    ideal_nodes = sorted(all_nodes)
+                    ideal_throughput = [n * ranks_per_node * 1e9 / expected_ns for n in ideal_nodes]
+                    ax.plot(ideal_nodes, ideal_throughput, linestyle='--', color=color,
+                           linewidth=1.0, alpha=0.5, zorder=0)
+
+            # Restore axis limits to those determined by actual data
+            ax.set_xlim(xlim)
+            ax.set_ylim(ylim)
+
+            # Reorder handles and labels to go across columns first (row-major)
+            # Matplotlib's legend with ncol fills column-major (down columns first),
+            # so we need to transpose the order to get row-major display
+            ncol = 4
+            n_items = len(handles)
+            n_rows = (n_items + ncol - 1) // ncol  # Ceiling division
+
+            # Create reordered lists: transpose so matplotlib's column-major fill gives row-major display
+            reordered_handles = []
+            reordered_labels = []
+            for col in range(ncol):
+                for row in range(n_rows):
+                    idx = row * ncol + col
+                    if idx < n_items:
+                        reordered_handles.append(handles[idx])
+                        reordered_labels.append(labels[idx])
+
+            # Compact legend with increased column spacing - 4 columns at bottom, no border
+            # Items ordered by duration (1us, 10us, 100us, ...) going across columns first
+            ax.legend(reordered_handles, reordered_labels,
+                      frameon=False,
+                      ncol=ncol, columnspacing=0.8,
+                      loc='upper center', bbox_to_anchor=(0.5, -0.15))
+
+            # Add rpn to filename
+            rpn_str = f"_{ranks_per_node_value}rpn" if ranks_per_node_value else ""
+            filename = f"strong_scaling_{system}_{distributor}_{mode}{rpn_str}.{image_format}"
+            fig.tight_layout(rect=[0, 0.12, 1, 1])  # Leave space at bottom for legend
+            fig.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
+            plt.close(fig)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Plot strong scaling distribution throughput.")
+    parser.add_argument(
+        "--input",
+        required=True,
+        action="append",
+        help="CSV file or directory (can be passed multiple times)",
+    )
+    parser.add_argument(
+        "--output-dir", required=True, help="Directory to write output plots"
+    )
+    parser.add_argument(
+        "--format", default="png", choices=["png", "pdf", "svg"], help="Output image format"
+    )
+    args = parser.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    input_paths = collect_csv_paths(args.input)
+    rows = parse_rows(input_paths)
+    grouped = group_rows(rows)
+
+    systems = sorted({row["system"] for row in rows})
+    # Filter out empty distributors and only include those with actual data
+    distributors = sorted({row["distributor"] for row in rows if row["distributor"].strip()})
+    for system in systems:
+        for distributor in distributors:
+            plot_distributor(system, distributor, grouped, args.output_dir, args.format)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/scripts/submit_aurora_naive_shutdown.sh b/benchmark/scripts/submit_aurora_naive_shutdown.sh
new file mode 100755
index 0000000..50dfe62
--- /dev/null
+++ b/benchmark/scripts/submit_aurora_naive_shutdown.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Submit one PBS job per node count to avoid long serial waits.
+# Environment overrides: NODE_LIST and QSUB_ARGS (space-separated), ACCOUNT, FILESYSTEMS,
+# NCPUS_PER_NODE, WALLTIME, LAUNCHER, LAUNCHER_ARGS, OUTPUT_DIR.
+# Example:
+#   ./benchmark/scripts/submit_aurora_naive_shutdown.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=benchmark/scripts/aurora_queue_utils.sh
+source "${SCRIPT_DIR}/aurora_queue_utils.sh"
+SYSTEM="aurora"
+SCRIPT="${ROOT_DIR}/benchmark/scripts/launch_aurora_naive_shutdown.sh"
+
+IFS=' ' read -r -a NODE_LIST <<< "${NODE_LIST:-1 2 4 8 16 32 64 128 256 512}"
+IFS=' ' read -r -a QSUB_ARGS <<< "${QSUB_ARGS:-}"
+ACCOUNT="${ACCOUNT:-DynaMPI}"
+FILESYSTEMS="${FILESYSTEMS:-flare}"
+NCPUS_PER_NODE="${NCPUS_PER_NODE:-102}"
+WALLTIME="${WALLTIME:-00:15:00}"
+LAUNCHER="${LAUNCHER:-}"
+LAUNCHER_ARGS="${LAUNCHER_ARGS:-}"
+OUTPUT_BASE="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"
+
+for nodes in "${NODE_LIST[@]}"; do
+  job_name="dynampi_shutdown_${SYSTEM}_${nodes}"
+  # Guarded expansion: under `set -u`, bash < 4.4 rejects "${arr[@]}" for an empty array.
+  submit_args=(${QSUB_ARGS[@]+"${QSUB_ARGS[@]}"})
+  if [[ -n "${ACCOUNT}" ]]; then
+    submit_args+=(-A "${ACCOUNT}")
+  fi
+  queue="prod"
+  if [[ "${nodes}" -lt 256 ]]; then queue="debug-scaling"; fi
+  submit_args+=(-q "${queue}")
+  wait_for_aurora_queue_space "${nodes}"
+  # Fall back to the standard PBS_JOBID when PBS_JOBID_SHORT is not defined in the job.
+  job_script="#!/usr/bin/env bash
+#PBS -j oe
+set -euo pipefail
+cd \"${ROOT_DIR}\"
+export NODE_LIST=\"${nodes}\"
+export LAUNCHER=\"${LAUNCHER}\"
+export LAUNCHER_ARGS=\"${LAUNCHER_ARGS}\"
+export CORES_PER_NODE=\"${NCPUS_PER_NODE}\"
+export OUTPUT_DIR=\"${OUTPUT_BASE}/${SYSTEM}/${nodes}-${job_name}-\${PBS_JOBID_SHORT:-\${PBS_JOBID:-manual}}\"
+\"${SCRIPT}\"
+"
+  echo "qsub ${submit_args[*]} -N \"${job_name}\" -l \"select=${nodes}:ncpus=${NCPUS_PER_NODE}:mpiprocs=${NCPUS_PER_NODE}\" -l \"walltime=${WALLTIME}\" -l \"filesystems=${FILESYSTEMS}\" <<'QSUBEOF'"
+  printf '%s\nQSUBEOF\n' "${job_script}"
+  qsub "${submit_args[@]}" -N "${job_name}" -l "select=${nodes}:ncpus=${NCPUS_PER_NODE}:mpiprocs=${NCPUS_PER_NODE}" -l "walltime=${WALLTIME}" \
+    -l "filesystems=${FILESYSTEMS}" <<< "${job_script}"
+done
diff --git a/benchmark/scripts/submit_aurora_strong_scaling.sh b/benchmark/scripts/submit_aurora_strong_scaling.sh
new file mode 100755
index 0000000..ad5c7de
--- /dev/null
+++ b/benchmark/scripts/submit_aurora_strong_scaling.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Submit one PBS job per node count to avoid long serial waits.
+# Environment overrides: NODE_LIST and QSUB_ARGS (space-separated), ACCOUNT, FILESYSTEMS,
+# NCPUS_PER_NODE, WALLTIME, LAUNCHER, LAUNCHER_ARGS, OUTPUT_DIR.
+# Example:
+#   ./benchmark/scripts/submit_aurora_strong_scaling.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=benchmark/scripts/aurora_queue_utils.sh
+source "${SCRIPT_DIR}/aurora_queue_utils.sh"
+SYSTEM="aurora"
+SCRIPT="${ROOT_DIR}/benchmark/scripts/launch_aurora_strong_scaling.sh"
+
+IFS=' ' read -r -a NODE_LIST <<< "${NODE_LIST:-1 2 4 8 16 32 64 128 256 512}"
+IFS=' ' read -r -a QSUB_ARGS <<< "${QSUB_ARGS:-}"
+ACCOUNT="${ACCOUNT:-DynaMPI}"
+FILESYSTEMS="${FILESYSTEMS:-flare}"
+NCPUS_PER_NODE="${NCPUS_PER_NODE:-102}"
+WALLTIME="${WALLTIME:-00:15:00}"
+LAUNCHER="${LAUNCHER:-}"
+LAUNCHER_ARGS="${LAUNCHER_ARGS:-}"
+OUTPUT_BASE="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"
+
+for nodes in "${NODE_LIST[@]}"; do
+  job_name="dynampi_ss_${SYSTEM}_${nodes}"
+  # Guarded expansion: under `set -u`, bash < 4.4 rejects "${arr[@]}" for an empty array.
+  submit_args=(${QSUB_ARGS[@]+"${QSUB_ARGS[@]}"})
+  if [[ -n "${ACCOUNT}" ]]; then
+    submit_args+=(-A "${ACCOUNT}")
+  fi
+  queue="prod"
+  if [[ "${nodes}" -lt 256 ]]; then queue="debug-scaling"; fi
+  submit_args+=(-q "${queue}")
+  wait_for_aurora_queue_space "${nodes}"
+  # Fall back to the standard PBS_JOBID when PBS_JOBID_SHORT is not defined in the job.
+  job_script="#!/usr/bin/env bash
+#PBS -j oe
+set -euo pipefail
+cd \"${ROOT_DIR}\"
+export NODE_LIST=\"${nodes}\"
+export LAUNCHER=\"${LAUNCHER}\"
+export LAUNCHER_ARGS=\"${LAUNCHER_ARGS}\"
+export CORES_PER_NODE=\"${NCPUS_PER_NODE}\"
+export OUTPUT_DIR=\"${OUTPUT_BASE}/${SYSTEM}/${nodes}-${job_name}-\${PBS_JOBID_SHORT:-\${PBS_JOBID:-manual}}\"
+\"${SCRIPT}\"
+"
+  echo "qsub ${submit_args[*]} -N \"${job_name}\" -l \"select=${nodes}:ncpus=${NCPUS_PER_NODE}:mpiprocs=${NCPUS_PER_NODE}\" -l \"walltime=${WALLTIME}\" -l \"filesystems=${FILESYSTEMS}\" <<'QSUBEOF'"
+  printf '%s\nQSUBEOF\n' "${job_script}"
+  qsub "${submit_args[@]}" -N "${job_name}" -l "select=${nodes}:ncpus=${NCPUS_PER_NODE}:mpiprocs=${NCPUS_PER_NODE}" -l "walltime=${WALLTIME}" \
+    -l "filesystems=${FILESYSTEMS}" <<< "${job_script}"
+done
diff --git a/benchmark/scripts/submit_frontier_naive_shutdown.sh b/benchmark/scripts/submit_frontier_naive_shutdown.sh
new file mode 100755
index 0000000..cc588aa
--- /dev/null
+++ b/benchmark/scripts/submit_frontier_naive_shutdown.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Submit one Slurm job per node count to avoid long serial waits.
+# Environment overrides: NODE_LIST and SBATCH_ARGS (space-separated), ACCOUNT, WALLTIME,
+# LAUNCHER, LAUNCHER_ARGS, OUTPUT_DIR.
+# Example:
+#   ./benchmark/scripts/submit_frontier_naive_shutdown.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+SYSTEM="frontier"
+SCRIPT="${ROOT_DIR}/benchmark/scripts/launch_frontier_naive_shutdown.sh"
+
+# Read SBATCH_ARGS straight from the environment; resetting it first would discard the override.
+IFS=' ' read -r -a NODE_LIST <<< "${NODE_LIST:-1 2 4 8 16 32 64 128 256 512}"
+IFS=' ' read -r -a SBATCH_ARGS <<< "${SBATCH_ARGS:-}"
+ACCOUNT="${ACCOUNT:-chm213}"
+
+WALLTIME="${WALLTIME:-00:15:00}"
+LAUNCHER="${LAUNCHER:-}"
+LAUNCHER_ARGS="${LAUNCHER_ARGS:-}"
+OUTPUT_BASE="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"
+
+for nodes in "${NODE_LIST[@]}"; do
+  job_name="dynampi_shutdown_${SYSTEM}_${nodes}"
+  submit_args=(${SBATCH_ARGS[@]+"${SBATCH_ARGS[@]}"})
+  if [[ -n "${ACCOUNT}" ]]; then
+    submit_args+=(--account="${ACCOUNT}")
+  fi
+  # Quote the paths so a checkout directory containing spaces still works inside --wrap.
+  wrap="cd \"${ROOT_DIR}\" && OUTPUT_DIR=\"${OUTPUT_BASE}/${SYSTEM}/${nodes}-${job_name}-\${SLURM_JOB_ID:-manual}\" \"${SCRIPT}\""
+  echo "sbatch ${submit_args[*]} --job-name=\"${job_name}\" --nodes=${nodes} --time=${WALLTIME} --export=ALL,NODE_LIST=${nodes},LAUNCHER=${LAUNCHER},LAUNCHER_ARGS=${LAUNCHER_ARGS} --wrap=\"${wrap}\""
+  sbatch "${submit_args[@]}" \
+    --job-name="${job_name}" \
+    --nodes="${nodes}" \
+    --time="${WALLTIME}" \
+    --export=ALL,NODE_LIST="${nodes}",LAUNCHER="${LAUNCHER}",LAUNCHER_ARGS="${LAUNCHER_ARGS}" \
+    --wrap="${wrap}"
+done
diff --git a/benchmark/scripts/submit_frontier_strong_scaling.sh b/benchmark/scripts/submit_frontier_strong_scaling.sh
new file mode 100755
index 0000000..9b69bea
--- /dev/null
+++ b/benchmark/scripts/submit_frontier_strong_scaling.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+# SPDX-License-Identifier: Apache-2.0
+set -euo pipefail
+
+# Submit one Slurm job per node count to avoid long serial waits.
+# Environment overrides: NODE_LIST and SBATCH_ARGS (space-separated), ACCOUNT, WALLTIME,
+# LAUNCHER, LAUNCHER_ARGS, OUTPUT_DIR.
+# Example:
+#   ./benchmark/scripts/submit_frontier_strong_scaling.sh
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+SYSTEM="frontier"
+SCRIPT="${ROOT_DIR}/benchmark/scripts/launch_frontier_strong_scaling.sh"
+
+# Read SBATCH_ARGS straight from the environment; resetting it first would discard the override.
+IFS=' ' read -r -a NODE_LIST <<< "${NODE_LIST:-1 2 4 8 16 32 64 128 256 512}"
+IFS=' ' read -r -a SBATCH_ARGS <<< "${SBATCH_ARGS:-}"
+ACCOUNT="${ACCOUNT:-chm213}"
+
+WALLTIME="${WALLTIME:-00:15:00}"
+LAUNCHER="${LAUNCHER:-}"
+LAUNCHER_ARGS="${LAUNCHER_ARGS:-}"
+OUTPUT_BASE="${OUTPUT_DIR:-${ROOT_DIR}/benchmark/results}"
+
+for nodes in "${NODE_LIST[@]}"; do
+  job_name="dynampi_ss_${SYSTEM}_${nodes}"
+  submit_args=(${SBATCH_ARGS[@]+"${SBATCH_ARGS[@]}"})
+  if [[ -n "${ACCOUNT}" ]]; then
+    submit_args+=(--account="${ACCOUNT}")
+  fi
+  # Quote the paths so a checkout directory containing spaces still works inside --wrap.
+  wrap="cd \"${ROOT_DIR}\" && OUTPUT_DIR=\"${OUTPUT_BASE}/${SYSTEM}/${nodes}-${job_name}-\${SLURM_JOB_ID:-manual}\" \"${SCRIPT}\""
+  echo "sbatch ${submit_args[*]} --job-name=\"${job_name}\" --nodes=${nodes} --time=${WALLTIME} --export=ALL,NODE_LIST=${nodes},LAUNCHER=${LAUNCHER},LAUNCHER_ARGS=${LAUNCHER_ARGS} --wrap=\"${wrap}\""
+  sbatch "${submit_args[@]}" \
+    --job-name="${job_name}" \
+    --nodes="${nodes}" \
+    --time="${WALLTIME}" \
+    --export=ALL,NODE_LIST="${nodes}",LAUNCHER="${LAUNCHER}",LAUNCHER_ARGS="${LAUNCHER_ARGS}" \
+    --wrap="${wrap}"
+done
diff --git a/benchmark/strong_scaling_distribution_rate.cpp b/benchmark/strong_scaling_distribution_rate.cpp
new file mode 100644
index 0000000..14d6e0a
--- /dev/null
+++ b/benchmark/strong_scaling_distribution_rate.cpp
@@ -0,0 +1,283 @@
+/*
+ * SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <mpi.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cxxopts.hpp>
+#include <dynampi/impl/hierarchical_distributor.hpp>
+#include <dynampi/impl/naive_distributor.hpp>
+#include <dynampi/mpi/mpi_communicator.hpp>
+#include <dynampi/utilities/timer.hpp>
+#include <fstream>
+#include <iostream>
+#include <optional>
+#include <random>
+#include <string>
+
+using Task = uint32_t;
+
+enum class DistributorKind { Naive, Hierarchical };
+enum class DurationMode { Fixed, Poisson };
+
+struct BenchmarkOptions {
+  uint64_t expected_us = 1;  // expected task duration in microseconds (--expected_us)
+  double duration_s = 10.0;  // target benchmark duration in seconds (--duration_s)
+  DistributorKind distributor = DistributorKind::Hierarchical;  // --distribution
+  DurationMode duration_mode = DurationMode::Fixed;             // --mode
+  uint64_t nodes = 0;       // node count used to label output; main() replaces 0 by world size
+  std::string system;       // system label written to the CSV (--system)
+  std::string output_path;  // CSV file results are appended to; empty disables CSV output
+};
+
+struct BenchmarkResult {
+  uint64_t total_tasks = 0;  // number of task results collected by the root manager
+  uint64_t workers = 0;      // size - 1 ranks, or 1 when the communicator has a single rank
+  uint64_t world_size = 0;   // number of ranks in the benchmark communicator
+  double elapsed_s = 0.0;    // timer.elapsed().count() of the run; presumably seconds
+};
+
+static DistributorKind parse_distributor(const std::string& value) {  // throws on unknown names
+  if (value == "hierarchical") return DistributorKind::Hierarchical;
+  if (value != "naive") throw std::runtime_error("Unknown distributor: " + value);
+  return DistributorKind::Naive;
+}
+
+static DurationMode parse_duration_mode(const std::string& value) {  // "random" == "poisson"
+  if (value == "poisson" || value == "random") return DurationMode::Poisson;
+  if (value != "fixed") throw std::runtime_error("Unknown duration mode: " + value);
+  return DurationMode::Fixed;
+}
+
+static std::string to_string(DistributorKind kind) {
+  return (kind != DistributorKind::Naive) ? "hierarchical" : "naive";  // label for CSV/log output
+}
+
+static std::string to_string(DurationMode mode) {
+  return (mode != DurationMode::Fixed) ? "random" : "fixed";  // Poisson is reported as "random"
+}
+
+static void spin_wait(std::chrono::microseconds duration) {  // busy-waits for `duration`
+  const auto start = std::chrono::steady_clock::now();  // monotonic: immune to clock adjustments
+  while (std::chrono::steady_clock::now() - start < duration) {
+  }
+}
+
+static void write_csv_header(std::ostream& os) {
+  // Column names, in the same order in which write_csv_row() emits the values.
+  os << "system,distributor,mode,expected_us,duration_s,nodes,world_size,workers,total_tasks,"
+     << "elapsed_s,throughput_tasks_per_s\n";
+}
+
+static void write_csv_row(std::ostream& os, const BenchmarkOptions& opts,
+                          const BenchmarkResult& result) {
+  const double tasks = static_cast<double>(result.total_tasks);  // avoid integer division
+  const double throughput = (result.elapsed_s > 0.0) ? tasks / result.elapsed_s : 0.0;
+  os << opts.system << ',' << to_string(opts.distributor) << ',' << to_string(opts.duration_mode)
+     << ',' << opts.expected_us << ',' << opts.duration_s << ',' << opts.nodes << ','
+     << result.world_size << ',' << result.workers << ',' << result.total_tasks << ','
+     << result.elapsed_s << ',' << throughput << '\n';
+}
+
+template <typename Distributor>  // Runs the timed benchmark with the given distributor type.
+static BenchmarkResult run_benchmark(const BenchmarkOptions& opts, MPI_Comm comm) {
+  dynampi::MPICommunicator<> comm_wrapper(comm, dynampi::MPICommunicator<>::Ownership::Reference);
+  int rank = 0;
+  int size = 0;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+  // A single-rank communicator is still counted as one worker.
+  const uint64_t num_workers = (size == 1) ? 1 : static_cast<uint64_t>(size - 1);
+
+  struct WorkerFunctor {  // task body: busy-waits for the task duration, then squares the id
+    std::mt19937_64 rng;
+    std::uniform_int_distribution<uint64_t> uniform;
+    uint64_t expected_us;
+    DurationMode duration_mode;
+
+    WorkerFunctor(int rank, uint64_t expected_us, DurationMode mode)
+        : rng([rank]() {
+            std::random_device rd;
+            std::mt19937_64 seed_gen(rd());
+            return seed_gen() + static_cast<uint64_t>(rank);
+          }()),
+          uniform(0, 2 * expected_us),  // uniform on [0, 2 * expected_us]: mean is expected_us
+          expected_us(expected_us),
+          duration_mode(mode) {}
+
+    uint32_t operator()(Task task) {
+      uint32_t value = task;
+      uint64_t duration_us = expected_us;
+      if (duration_mode == DurationMode::Poisson) {  // drawn uniformly, despite the enum name
+        duration_us = uniform(rng);
+      }
+      spin_wait(std::chrono::microseconds(duration_us));
+      const uint64_t squared = static_cast<uint64_t>(value) * static_cast<uint64_t>(value);
+      return static_cast<uint32_t>(squared);  // narrowing keeps only the low 32 bits
+    }
+  };
+
+  WorkerFunctor worker_function(rank, opts.expected_us, opts.duration_mode);
+
+  MPI_Barrier(comm_wrapper);
+  dynampi::Timer timer(dynampi::Timer::AutoStart::No);  // started only on the root manager
+  uint64_t total_tasks = 0;
+  // NOTE(review): non-manager ranks presumably run their worker loop inside this constructor.
+  Distributor distributor(worker_function, {.comm = comm, .manager_rank = 0});
+
+  if (distributor.is_root_manager()) {
+    timer.start();
+
+    const uint64_t target_queue_size = num_workers * 4;  // aim for 4 queued tasks per worker
+    while (timer.elapsed().count() < opts.duration_s) {
+      const uint64_t remaining = distributor.remaining_tasks_count();
+      uint64_t to_insert = 0;
+      if (remaining < target_queue_size) {
+        to_insert = target_queue_size - remaining;
+      }
+      if (timer.elapsed().count() > opts.duration_s / 2.0 && total_tasks > 0) {
+        double current_rate = static_cast<double>(total_tasks) / timer.elapsed().count();
+        double estimated_total_tasks = current_rate * opts.duration_s;
+        if (estimated_total_tasks > static_cast<double>(total_tasks) && current_rate > 0.0) {
+          double remaining_time = opts.duration_s - timer.elapsed().count();
+          uint64_t can_complete_tasks_remaining =
+              static_cast<uint64_t>(current_rate * remaining_time);
+          if (can_complete_tasks_remaining > remaining) {
+            uint64_t max_to_insert = can_complete_tasks_remaining - remaining;
+            to_insert = std::min(to_insert, max_to_insert);
+          } else {
+            // The queue already holds more than the observed rate can finish: insert nothing.
+            to_insert = 0;
+          }
+        }
+      }
+      // Never insert more than target_queue_size tasks in one iteration (to_insert is unsigned).
+      to_insert = std::min(to_insert, target_queue_size);
+
+      if (to_insert > 0) {
+        std::vector<Task> tasks;
+        tasks.reserve(to_insert);
+        for (uint64_t i = 0; i < to_insert; ++i) {
+          tasks.push_back(static_cast<Task>(total_tasks + i));  // ids follow the completed count
+        }
+        distributor.insert_tasks(tasks);
+      }
+      auto results =
+          distributor.run_tasks({.target_num_tasks = num_workers * 2, .max_seconds = 0.1});
+      total_tasks += results.size();
+    }
+    {
+      auto results = distributor.finish_remaining_tasks();
+      total_tasks += results.size();
+    }
+    timer.stop();
+    distributor.finalize();
+  }
+
+  return BenchmarkResult{total_tasks, num_workers, static_cast<uint64_t>(size),
+                         timer.elapsed().count()};
+}
+
+int main(int argc, char** argv) {
+  // Parses the CLI, runs the selected distributor and reports the throughput from rank 0.
+  MPI_Init(&argc, &argv);
+  int world_rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+  cxxopts::Options options("strong_scaling_distribution_rate",
+                           "Benchmark strong scaling task distribution throughput");
+  options.add_options()("t,expected_us", "Expected task duration in microseconds",
+                        cxxopts::value<uint64_t>()->default_value("1"))(
+      "d,duration_s", "Target duration in seconds", cxxopts::value<double>()->default_value("10"))(
+      "D,distribution", "Distribution strategy: naive or hierarchical",
+      cxxopts::value<std::string>()->default_value("hierarchical"))(
+      "m,mode", "Duration mode: fixed or random (uniform 0-2x expected)",
+      cxxopts::value<std::string>()->default_value("fixed"))(
+      "n,nodes", "Number of nodes for labeling output (defaults to world size)",
+      cxxopts::value<uint64_t>()->default_value("0"))(
+      "S,system", "System label for plotting (frontier, aurora, ...)",
+      cxxopts::value<std::string>()->default_value(""))(
+      "o,output", "Append results to CSV file", cxxopts::value<std::string>()->default_value(""))(
+      "h,help", "Print usage");
+
+  // Option values are converted inside the try block as well: a malformed value or an unknown
+  // distributor/mode name throws, and would otherwise abort the job without MPI_Finalize.
+  cxxopts::ParseResult args;
+  BenchmarkOptions opts;
+  try {
+    args = options.parse(argc, argv);
+    if (!args.count("help")) {
+      opts.expected_us = args["expected_us"].as<uint64_t>();
+      opts.duration_s = args["duration_s"].as<double>();
+      opts.distributor = parse_distributor(args["distribution"].as<std::string>());
+      opts.duration_mode = parse_duration_mode(args["mode"].as<std::string>());
+      opts.nodes = args["nodes"].as<uint64_t>();
+      opts.system = args["system"].as<std::string>();
+      opts.output_path = args["output"].as<std::string>();
+    }
+  } catch (const std::exception& e) {
+    if (world_rank == 0) {
+      std::cerr << "Error parsing options: " << e.what() << "\n" << options.help() << std::endl;
+    }
+    MPI_Finalize();
+    return 1;
+  }
+
+  if (args.count("help")) {
+    if (world_rank == 0) {
+      std::cout << options.help() << std::endl;
+    }
+    MPI_Finalize();
+    return 0;
+  }
+
+  if (opts.expected_us == 0) {
+    if (world_rank == 0) {
+      std::cerr << "Expected task duration must be >= 1 microsecond." << std::endl;
+    }
+    MPI_Finalize();
+    return 1;
+  }
+
+  {
+    MPI_Comm comm = MPI_COMM_WORLD;
+    int size = 0;
+    MPI_Comm_size(comm, &size);
+    if (opts.nodes == 0) {
+      opts.nodes = static_cast<uint64_t>(size);
+    }
+    BenchmarkResult result;
+    if (opts.distributor == DistributorKind::Naive) {
+      result = run_benchmark<dynampi::NaiveMPIWorkDistributor<Task, uint32_t>>(opts, comm);
+    } else {
+      result = run_benchmark<dynampi::HierarchicalMPIWorkDistributor<Task, uint32_t>>(opts, comm);
+    }
+
+    if (world_rank == 0) {
+      const double throughput =
+          result.elapsed_s > 0.0 ? static_cast<double>(result.total_tasks) / result.elapsed_s : 0.0;
+      std::cout << "RESULT"
+                << " distributor=" << to_string(opts.distributor)
+                << " mode=" << to_string(opts.duration_mode) << " expected_us=" << opts.expected_us
+                << " nodes=" << opts.nodes << " world_size=" << result.world_size
+                << " total_tasks=" << result.total_tasks << " elapsed_s=" << result.elapsed_s
+                << " throughput_tasks_per_s=" << throughput << std::endl;
+      if (!opts.output_path.empty()) {
+        // peek() yields eof for a missing or empty file, i.e. when a header is still needed.
+        const bool needs_header =
+            std::ifstream(opts.output_path).peek() == std::ifstream::traits_type::eof();
+        std::ofstream out(opts.output_path, std::ios::app);
+        if (!out) std::cerr << "Failed to open output file: " << opts.output_path << std::endl;
+        if (needs_header) {
+          write_csv_header(out);
+        }
+        write_csv_row(out, opts, result);
+      }
+    }
+  }
+  MPI_Finalize();
+  return 0;
+}
diff --git a/benchmark/timer_resolution.cpp b/benchmark/timer_resolution.cpp
new file mode 100644
index 0000000..1b7b586
--- /dev/null
+++ b/benchmark/timer_resolution.cpp
@@ -0,0 +1,94 @@
+/*
+ * SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <algorithm>
+#include <chrono>
+#include <dynampi/utilities/timer.hpp>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+void print_resolution_stats(std::vector<double>& deltas, int iterations) {
+  // Prints min/median/mean of the recorded steps (ns); note that `deltas` is sorted in place.
+  if (deltas.empty()) {
+    std::cout << "  Measured resolution: < 1 ns (no measurable difference in " << iterations
+              << " iterations)\n";
+    return;
+  }
+  std::sort(deltas.begin(), deltas.end());
+  const auto count = deltas.size();
+  const auto mid = count / 2;
+  // An even-sized sample takes the average of its two middle values.
+  const double median = (count % 2 == 0) ? (deltas[mid - 1] + deltas[mid]) / 2.0 : deltas[mid];
+  const double mean = std::accumulate(deltas.begin(), deltas.end(), 0.0) / count;
+  std::cout << "  Measured resolution (min): " << deltas.front() << " ns\n";
+  std::cout << "  Measured resolution (median): " << median << " ns\n";
+  std::cout << "  Measured resolution (mean): " << mean << " ns\n";
+  std::cout << "  Non-zero measurements: " << count << "/" << iterations << "\n";
+}
+
+template <typename GetTimePoint>
+std::vector<double> measure_resolution(GetTimePoint&& get_time_point, int iterations) {
+  // Samples the time source `iterations` times; each sample polls until the reading strictly
+  // advances and records that first observable step in nanoseconds (zero steps are dropped).
+  using std::chrono::nanoseconds;
+  std::vector<double> steps;
+
+  for (int pass = 0; pass < iterations; ++pass) {
+    const auto before = get_time_point();
+    auto after = get_time_point();
+    while (after <= before) {
+      after = get_time_point();
+    }
+    const auto step_ns = std::chrono::duration_cast<nanoseconds>(after - before).count();
+    if (step_ns > 0) steps.push_back(static_cast<double>(step_ns));
+  }
+
+  return steps;
+}
+
+template <typename Clock>
+void test_clock_resolution(const char* name) {
+  // Prints, for one std::chrono clock: its nominal tick period, the measured step statistics
+  // and whether the clock declares itself steady.
+  using Period = typename Clock::duration::period;
+
+  std::cout << "\n" << name << ":\n";
+  std::cout << "  Period: " << Period::num;
+  if constexpr (Period::den != 1) {
+    std::cout << "/" << Period::den;
+  }
+  std::cout << " seconds\n";
+
+  constexpr int kIterations = 10000;
+  auto step_sizes = measure_resolution([]() { return Clock::now(); }, kIterations);
+  print_resolution_stats(step_sizes, kIterations);
+
+  const char* steady_label = Clock::is_steady ? "yes" : "no";
+  std::cout << "  Is steady: " << steady_label << "\n";
+}
+
+void test_timer_resolution() {
+  // Same measurement as for the std clocks, but sampling dynampi::Timer::elapsed().
+  constexpr int kIterations = 10000;
+  std::cout << "\nDynaMPI Timer:\n";
+  dynampi::Timer stopwatch(dynampi::Timer::AutoStart::No);
+  stopwatch.start();
+  auto steps = measure_resolution([&stopwatch]() { return stopwatch.elapsed(); }, kIterations);
+  print_resolution_stats(steps, kIterations);
+}
+
+int main() {
+  // Reports the resolution of each std::chrono clock, then of dynampi::Timer.
+  std::cout << "Timer Resolution Test\n"
+            << "====================\n";
+  test_clock_resolution<std::chrono::high_resolution_clock>("high_resolution_clock");
+  test_clock_resolution<std::chrono::steady_clock>("steady_clock");
+  test_clock_resolution<std::chrono::system_clock>("system_clock");
+  test_timer_resolution();
+
+  std::cout << "\n";
+  return 0;
+}
diff --git a/include/dynampi/impl/hierarchical_distributor.hpp b/include/dynampi/impl/hierarchical_distributor.hpp
index 6725f9c..5a20f1d 100644
--- a/include/dynampi/impl/hierarchical_distributor.hpp
+++ b/include/dynampi/impl/hierarchical_distributor.hpp
@@ -7,18 +7,23 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cstdint>
+#include <cmath>
 #include <functional>
+#include <iostream>
 #include <iterator>
+#include <limits>
 #include <ranges>
 #include <span>
 #include <stack>
+#include <thread>
 #include <type_traits>
 #include <vector>
 
 #include "../mpi/mpi_communicator.hpp"
 #include "../mpi/mpi_types.hpp"
 #include "dynampi/impl/base_distributor.hpp"
+#include "dynampi/utilities/assert.hpp"
+#include "dynampi/utilities/timer.hpp"
 
 namespace dynampi {
 
@@ -32,34 +37,202 @@ class HierarchicalMPIWorkDistributor : public BaseMPIWorkDistributor<TaskT, Resu
     int manager_rank = 0;
     bool auto_run_workers = true;
     std::optional<size_t> message_batch_size = std::nullopt;
-    size_t max_workers_per_coordinator = 2;
+    std::optional<int> max_workers_per_coordinator = std::nullopt;
+    int batch_size_multiplier = 2;
+
+    // If true, topology is strictly mapped to physical nodes:
+    // Manager <-> Node Coordinators <-> Local Workers
+    // Note: Manager is excluded from its node's Local Comm to separate duties.
+    bool coordinator_per_node = true;
+  };
+
+  struct RunConfig {
+    // Stop once we have at least this many results ready to return.
+    size_t target_num_tasks = std::numeric_limits<size_t>::max();
+
+    // If false, strictly clips the return vector to `target_num_tasks`.
+    // Excess results are buffered for the next call.
+    bool allow_more_than_target_tasks = true;
+
+    // Stop if this much time has passed.
+    std::optional<double> max_seconds = std::nullopt;
   };
 
   static constexpr bool prioritize_tasks = Base::prioritize_tasks;
+  static const bool ordered = false;
 
  private:
-  typename Base::QueueT _unallocated_task_queue;
-  std::vector<int64_t> _worker_current_task_indices;
-  std::vector<ResultT> _results;
-  std::stack<int, std::vector<int>> _free_worker_indices;
+  typename Base::QueueT m_unallocated_task_queue;
+  std::vector<ResultT> m_results;
+
+  enum class CommLayer { Global, Local, Leader };
+
+  struct TaskRequest {
+    int worker_rank;
+    CommLayer source_layer = CommLayer::Global;  // Which comm did this come from?
+    std::optional<int> num_tasks_requested = std::nullopt;
+  };
+  static constexpr int kMaxTasksRequested = 1'000'000;  // guard against pathological reserve()
+  std::stack<TaskRequest, std::vector<TaskRequest>> m_free_worker_indices;
+
+  size_t m_tasks_sent_to_child = 0;
+  size_t m_results_received_from_child = 0;
+  size_t m_results_sent_to_parent = 0;
+  size_t m_tasks_received_from_parent = 0;
+  size_t m_tasks_executed = 0;
+  size_t m_results_returned = 0;
 
-  size_t _tasks_sent = 0;
-  size_t _results_received = 0;
-  bool _finalized = false;
+  bool m_finalized = false;
+  bool m_done = false;
 
   static constexpr StatisticsMode statistics_mode =
       get_option_value<track_statistics_t, Options...>();
 
   using MPICommunicator = dynampi::MPICommunicator<track_statistics<statistics_mode>>;
-  MPICommunicator _communicator;
-  std::function<ResultT(TaskT)> _worker_function;
-  Config _config;
 
-  enum Tag : int { TASK = 0, DONE = 1, RESULT = 2, REQUEST = 3, ERROR = 4 };
+  MPICommunicator m_communicator;  // Global communicator
+  MPIGroup m_world_group;          // Group for the global communicator (for rank translation)
+  std::optional<MPIGroup> m_local_group;  // Intra-node group (Shared Memory, excludes manager)
+  std::optional<MPIGroup>
+      m_leader_group;  // Inter-node group (Leaders only: manager + node coordinators)
+
+  std::function<ResultT(TaskT)> m_worker_function;
+  Config m_config;
+
+  // Cached parent target to avoid repeated MPI_Group_translate_ranks calls
+  mutable std::optional<std::pair<int, CommLayer>> m_cached_parent_target;
+
+  // --- Topology Helper Methods ---
+
+  inline int max_workers_per_coordinator() const {
+    // Fan-out: the configured value when set, otherwise max(2, sqrt(#ranks)); never below 1.
+    const int fallback = std::max(2, static_cast<int>(std::sqrt(m_communicator.size())));
+    return std::max(1, m_config.max_workers_per_coordinator.value_or(fallback));
+  }
+
+  // Returns {parent_rank, communicator_layer}; computed on the first call and cached afterwards.
+  inline std::pair<int, CommLayer> get_parent_target() const {
+    // Return cached value if available
+    if (m_cached_parent_target.has_value()) {
+      return m_cached_parent_target.value();
+    }
+
+    std::pair<int, CommLayer> result;
+    DYNAMPI_ASSERT(!is_root_manager(), "Root manager should not have a parent");
+    if (m_config.coordinator_per_node) {
+      DYNAMPI_ASSERT(m_local_group.has_value() || m_leader_group.has_value(),
+                     "Local or leader group should be present");
+      if (m_local_group && m_local_group->rank() > 0) {
+        // Case 1: I am a Local Worker (Rank > 0 in Local Group)
+        // Parent is the Node Coordinator (Local Rank 0).
+        // Translate local rank 0 to world rank
+        int node_coord_world_rank = m_local_group->translate_rank(0, m_world_group);
+        result = {node_coord_world_rank, CommLayer::Local};
+      } else {
+        // Case 2: I am a Node Coordinator (Local Rank 0).
+        // Parent is the Global Manager.
+        // With the new topology, Manager is ALWAYS in the leader group.
+        // m_config.manager_rank is used directly as the parent rank; no translation is done.
+        int global_manager = m_config.manager_rank;
+        result = std::make_pair(global_manager, CommLayer::Leader);
+      }
+    } else {
+      // Virtual-rank tree: parent of virtual rank v is (v - 1) / max_workers_per_coordinator().
+      int rank = m_communicator.rank();
+      int virtual_rank = rank == m_config.manager_rank ? 0 : idx_for_worker(rank) + 1;
+      int virtual_parent = (virtual_rank - 1) / max_workers_per_coordinator();
+      int parent_rank =
+          virtual_parent == 0 ? m_config.manager_rank : worker_for_idx(virtual_parent - 1);
+      result = {parent_rank, CommLayer::Global};
+    }
+
+    // Cache the result (m_cached_parent_target is mutable, so this works in a const method)
+    m_cached_parent_target = result;
+    return result;
+  }
+
+  inline int total_num_children(int rank) const {
+    if (m_config.coordinator_per_node) {
+      DYNAMPI_UNIMPLEMENTED("Recursive child counting not supported/needed in Node topology mode");
+      return 0;
+    }
+    // Each existing child slot contributes the child itself plus all of its descendants.
+    const int self = (rank == m_config.manager_rank) ? 0 : idx_for_worker(rank) + 1;
+    const int fan_out = max_workers_per_coordinator();
+    int descendants = 0;
+    for (int child = self * fan_out + 1; child <= self * fan_out + fan_out; ++child) {
+      if (child >= m_communicator.size()) break;  // No more children
+      descendants += 1 + total_num_children(worker_for_idx(child - 1));
+    }
+    return descendants;
+  }
+
+  // Number of ranks that report directly to this rank under the active topology.
+  inline int num_direct_children() const {
+    if (!m_config.coordinator_per_node) {
+      // Virtual-rank tree: count the child slots of this rank that map to an existing rank.
+      const int rank = m_communicator.rank();
+      const int fan_out = max_workers_per_coordinator();
+      const int self = (rank == m_config.manager_rank) ? 0 : idx_for_worker(rank) + 1;
+      int existing = 0;
+      for (int slot = 1; slot <= fan_out; ++slot) {
+        if (self * fan_out + slot < m_communicator.size()) {
+          ++existing;
+        }
+      }
+      return existing;
+    }
+
+    // Node topology: local and remote children are counted independently and summed.
+    int children = 0;
+
+    // Local rank 0: every other member of the local group is a child.
+    if (m_local_group && m_local_group->rank() == 0) {
+      children += (m_local_group->size() - 1);
+    }
+    // Root manager: every other member of the leader group is a child.
+    if (is_root_manager() && m_leader_group) {
+      children += (m_leader_group->size() - 1);
+    }
+    return children;
+  }
+
+  bool is_leaf_worker() const {
+    if (!m_config.coordinator_per_node) {
+      // Virtual-rank tree: a rank is a leaf when its first child slot lies past the last rank.
+      const int rank = m_communicator.rank();
+      const int fan_out = max_workers_per_coordinator();
+      const int self = (rank == m_config.manager_rank) ? 0 : idx_for_worker(rank) + 1;
+      const int first_child_slot = self * fan_out + 1;
+      return first_child_slot >= m_communicator.size();
+    }
+
+    // Node topology: the root manager is never a leaf.
+    if (is_root_manager()) {
+      return false;
+    }
+
+    // A rank without a local group (safety fallback) or with a non-zero local rank is a leaf.
+    if (!m_local_group || m_local_group->rank() > 0) {
+      return true;
+    }
+    // Local rank 0 (node coordinator): a leaf only when it has no children of its own.
+    return num_direct_children() == 0;
+  }
+
+  enum Tag : int {
+    TASK = 0,
+    DONE = 1,
+    RESULT = 2,
+    REQUEST = 3,
+    TASK_BATCH = 4,
+    RESULT_BATCH = 5,
+    REQUEST_BATCH = 6
+  };
 
   struct Statistics {
     const CommStatistics& comm_statistics;
-    std::vector<size_t> worker_task_counts;
+    std::optional<std::vector<size_t>> worker_task_counts = {};
   };
 
   using StatisticsT =
@@ -69,7 +242,7 @@ class HierarchicalMPIWorkDistributor : public BaseMPIWorkDistributor<TaskT, Resu
 
   static StatisticsT create_statistics(const MPICommunicator& comm) {
     if constexpr (statistics_mode != StatisticsMode::None) {
-      return Statistics{comm.get_statistics(), {}};
+      return Statistics{.comm_statistics = comm.get_statistics()};
     } else {
       return {};
     }
@@ -78,76 +251,159 @@ class HierarchicalMPIWorkDistributor : public BaseMPIWorkDistributor<TaskT, Resu
  public:
   explicit HierarchicalMPIWorkDistributor(std::function<ResultT(TaskT)> worker_function,
                                           Config runtime_config = Config{})
-      : _communicator(runtime_config.comm, MPICommunicator::Duplicate),
-        _worker_function(worker_function),
-        _config(runtime_config),
-        _statistics{create_statistics(_communicator)} {
-    if (is_root_manager()) _worker_current_task_indices.resize(_communicator.size() - 1, -1);
-    if (_config.auto_run_workers && _communicator.rank() != _config.manager_rank) {
-      run_worker();
+      : m_communicator(runtime_config.comm, MPICommunicator::Duplicate),
+        m_world_group(m_communicator),
+        m_worker_function(worker_function),
+        m_config(runtime_config),
+        _statistics{create_statistics(m_communicator)} {
+    // --- Initialize Topology Groups ---
+    if (m_config.coordinator_per_node) {
+      // 1. Identify physical nodes via split_by_node
+      MPICommunicator node_comm = m_communicator.split_by_node();
+
+      // 2. Create Local Group: Exclude Manager!
+      // If I am Manager, color is Undefined (I don't participate in local worker pool).
+      // Everyone else participates.
+      int local_color = (m_communicator.rank() == m_config.manager_rank) ? MPI_UNDEFINED : 0;
+
+      auto local_comm_opt = node_comm.split(local_color, m_communicator.rank());
+      if (local_comm_opt.has_value()) {
+        // Extract group from the temporary communicator, then let it be freed
+        m_local_group.emplace(*local_comm_opt);
+      }
+
+      // 3. Create Leader Group
+      // Who joins?
+      // A: The Manager (Always)
+      // B: The Node Coordinators (Rank 0 of the *Local* Comm)
+      bool is_manager = (m_communicator.rank() == m_config.manager_rank);
+      // Check if we're rank 0 in the local group (node coordinator)
+      bool is_node_coordinator = false;
+      if (m_local_group.has_value()) {
+        int my_local_rank = m_local_group->rank();
+        is_node_coordinator = (my_local_rank == 0);
+      }
+
+      int leader_color = (is_manager || is_node_coordinator) ? 0 : MPI_UNDEFINED;
+
+      // Key is global rank to maintain global ordering among leaders
+      auto leader_comm_opt = m_communicator.split(leader_color, m_communicator.rank());
+      if (leader_comm_opt.has_value()) {
+        // Extract group from the temporary communicator, then let it be freed
+        m_leader_group.emplace(*leader_comm_opt);
+      }
     }
-    if constexpr (statistics_mode >= StatisticsMode::Aggregated) {
-      if (is_root_manager()) _statistics.worker_task_counts.resize(_communicator.size(), 0);
+
+    if (m_config.auto_run_workers && m_communicator.rank() != m_config.manager_rank) {
+      run_worker();
     }
   }
 
   const StatisticsT& get_statistics() const
     requires(statistics_mode != StatisticsMode::None)
   {
-    assert(is_root_manager() && "Only the manager can access statistics");
+    DYNAMPI_ASSERT(is_root_manager(), "Only the manager can access statistics");
     return _statistics;
   }
 
   void run_worker() {
-    assert(_communicator.rank() != _config.manager_rank && "Worker cannot run on the manager rank");
-    using task_type = MPI_Type<TaskT>;
-    _communicator.send(nullptr, _config.manager_rank, Tag::REQUEST);
-    while (true) {
-      MPI_Status status;
-      DYNAMPI_MPI_CHECK(MPI_Probe, (MPI_ANY_SOURCE, MPI_ANY_TAG, _communicator.get(), &status));
-      if (status.MPI_TAG == Tag::DONE) {
-        _communicator.recv_empty_message(_config.manager_rank, Tag::DONE);
-        break;
+    DYNAMPI_ASSERT(m_communicator.rank() != m_config.manager_rank,
+                   "Worker cannot run on the manager rank");
+    if (is_leaf_worker()) {
+      // Leaf workers (usually local ranks > 0) just request from parent
+      send_to_parent(nullptr, Tag::REQUEST);
+      while (!m_done) {
+        receive_from_anyone();
       }
-      int count;
-      DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, task_type::value, &count));
-      TaskT message;
-      task_type::resize(message, count);
-      _communicator.recv(message, _config.manager_rank, Tag::TASK);
-      _tasks_sent++;
-      ResultT result = _worker_function(message);
-      _communicator.send(result, _config.manager_rank, Tag::RESULT);
-      _results_received++;
+    } else {
+      // Intermediate nodes (Node Coordinators)
+      int num_children = num_direct_children();
+      int prefetch = num_children * m_config.batch_size_multiplier;
+
+      // Initial request to parent (Manager)
+      send_to_parent(prefetch, Tag::REQUEST_BATCH);
+
+      while (!m_done) {
+        // If we have no tasks to give, wait for tasks from parent
+        while (!m_done && m_unallocated_task_queue.empty()) {
+          receive_from_anyone();
+        }
+
+        size_t num_tasks_should_be_received = m_unallocated_task_queue.size();
+
+        // Process tasks: Give to workers or execute ourselves if needed
+        while (!m_unallocated_task_queue.empty()) {
+          if (m_done) break;
+
+          if (m_free_worker_indices.empty()) {
+            // Must wait for a worker to become free
+            receive_from_anyone();
+          } else {
+            allocate_task_to_child();
+          }
+        }
+
+        // Wait for results from children
+        while (m_tasks_sent_to_child > m_results_received_from_child) {
+          receive_from_anyone();
+        }
+
+        if (m_done) break;
+
+        (void)num_tasks_should_be_received;
+        DYNAMPI_ASSERT_EQ(m_results.size(), num_tasks_should_be_received);
+
+        return_results_and_request_next_batch_from_manager();
+      }
+      send_done_to_children_when_free();
     }
   }
 
-  bool is_root_manager() const { return _communicator.rank() == _config.manager_rank; }
+  // Flushes all buffered results to the parent as a single RESULT_BATCH message.
+  void return_results_and_request_next_batch_from_manager() {
+    DYNAMPI_ASSERT(!is_leaf_worker(), "Leaf workers should not return results directly");
+    DYNAMPI_ASSERT_NE(m_communicator.rank(), m_config.manager_rank,
+                      "Manager should not request tasks from itself");
+    std::vector<ResultT> results;
+    results.swap(m_results);  // take the buffer without copying; leaves m_results empty
+    send_to_parent(results, Tag::RESULT_BATCH);  // parent re-queues us for results.size() tasks
+    m_results_sent_to_parent += results.size();
+  }
+
+  bool is_root_manager() const { return m_communicator.rank() == m_config.manager_rank; }
 
   size_t remaining_tasks_count() const {
-    assert(_communicator.rank() == _config.manager_rank &&
-           "Only the manager can check remaining tasks");
-    return _unallocated_task_queue.size();
+    DYNAMPI_ASSERT_EQ(m_communicator.rank(), m_config.manager_rank,
+                      "Only the manager can check remaining tasks");
+    return m_unallocated_task_queue.size();
   }
 
   void insert_task(TaskT task)
     requires(!prioritize_tasks)
   {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can distribute tasks");
-    _unallocated_task_queue.push_back(task);
+    DYNAMPI_ASSERT_EQ(m_communicator.rank(), m_config.manager_rank,
+                      "Only the manager can distribute tasks");
+    m_unallocated_task_queue.push_back(std::move(task));  // by-value parameter: move, don't copy
+    m_tasks_received_from_parent++;  // balanced against results returned by run_tasks()
   }
   void insert_task(const TaskT& task, double priority)
     requires(prioritize_tasks)
   {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can distribute tasks");
-    _unallocated_task_queue.emplace(priority, task);
+    DYNAMPI_ASSERT_EQ(m_communicator.rank(), m_config.manager_rank,
+                      "Only the manager can distribute tasks");
+    m_unallocated_task_queue.emplace(priority, task);
+    m_tasks_received_from_parent++;
   }
 
   template <typename Range>
     requires std::ranges::input_range<Range> && (!prioritize_tasks)
   void insert_tasks(const Range& tasks) {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can distribute tasks");
+    DYNAMPI_ASSERT_EQ(m_communicator.rank(), m_config.manager_rank,
+                      "Only the manager can distribute tasks");
+    const size_t queued_before = m_unallocated_task_queue.size();  // for counting by size delta
     std::copy(std::ranges::begin(tasks), std::ranges::end(tasks),
-              std::back_inserter(_unallocated_task_queue));
+              std::back_inserter(m_unallocated_task_queue));
+    m_tasks_received_from_parent += m_unallocated_task_queue.size() - queued_before;  // one pass
   }
   void insert_tasks(const std::vector<TaskT>& tasks)
     requires(!prioritize_tasks)
@@ -155,120 +411,377 @@ class HierarchicalMPIWorkDistributor : public BaseMPIWorkDistributor<TaskT, Resu
     insert_tasks(std::span<const TaskT>(tasks));
   }
 
-  void get_task_and_allocate() {
-    const TaskT task = get_next_task_to_send();
-    if (_communicator.size() > 1) {
-      if (_free_worker_indices.empty()) {
-        // If no free workers, wait for a result to be received
-        receive_from_any_worker();
-      }
-      int worker = _free_worker_indices.top();
-      _free_worker_indices.pop();
-      _worker_current_task_indices[idx_for_worker(worker)] = _tasks_sent;
-      if constexpr (statistics_mode >= StatisticsMode::Aggregated) {
-        _statistics.worker_task_counts[worker]++;
+  void allocate_task_to_child() {
+    if (m_communicator.size() > 1) {
+      DYNAMPI_ASSERT(!m_free_worker_indices.empty(), "Cannot allocate task with no free workers");
+
+      TaskRequest request = m_free_worker_indices.top();
+      m_free_worker_indices.pop();
+
+      // Determine target and communicator based on request source
+      int worker_rank = request.worker_rank;
+      CommLayer layer = request.source_layer;
+
+      if (request.num_tasks_requested.has_value()) {
+        std::vector<TaskT> tasks;
+        int num_tasks = request.num_tasks_requested.value();
+
+        const int actual_num_tasks =
+            std::min<int>(num_tasks, static_cast<int>(m_unallocated_task_queue.size()));
+        tasks.reserve(actual_num_tasks);
+        if constexpr (std::is_same_v<decltype(m_unallocated_task_queue), std::deque<TaskT>>) {
+          tasks.assign(m_unallocated_task_queue.begin(),
+                       m_unallocated_task_queue.begin() + actual_num_tasks);
+          m_unallocated_task_queue.erase(m_unallocated_task_queue.begin(),
+                                         m_unallocated_task_queue.begin() + actual_num_tasks);
+        } else {
+          for (int i = 0; i < actual_num_tasks; ++i) {
+            tasks.push_back(std::move(m_unallocated_task_queue.top().second));
+            m_unallocated_task_queue.pop();
+          }
+        }
+
+        send_to_worker(tasks, worker_rank, Tag::TASK_BATCH, layer);
+        m_tasks_sent_to_child += tasks.size();
+      } else {
+        const TaskT task = get_next_task_to_send();
+        send_to_worker(task, worker_rank, Tag::TASK, layer);
+        m_tasks_sent_to_child++;
       }
-      _communicator.send(task, worker, Tag::TASK);
     } else {
-      // If there's only one process, we just run the worker function directly
-      _results.emplace_back(_worker_function(task));
-      _results_received++;
+      const TaskT task = get_next_task_to_send();
+      m_results.emplace_back(m_worker_function(task));
+      m_tasks_executed++;
     }
-    _tasks_sent++;
   }
 
-  [[nodiscard]] std::vector<ResultT> finish_remaining_tasks() {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can distribute tasks");
-    while (!_unallocated_task_queue.empty()) {
-      get_task_and_allocate();
+  [[nodiscard]] std::vector<ResultT> run_tasks(const RunConfig& config = RunConfig{}) {
+    DYNAMPI_ASSERT_EQ(m_communicator.rank(), m_config.manager_rank,
+                      "Only the manager can finish remaining tasks");
+    Timer timer;
+
+    while (true) {
+      // A. Target reached
+      if (m_results.size() >= config.target_num_tasks) {
+        break;
+      }
+
+      // B. Time limit
+      if (config.max_seconds && timer.elapsed().count() >= *config.max_seconds) {
+        break;
+      }
+
+      // C. Exhaustion
+      size_t active_tasks = m_tasks_sent_to_child - m_results_received_from_child;
+      if (m_unallocated_task_queue.empty() && active_tasks == 0) {
+        break;
+      }
+
+      bool tasks_available = !m_unallocated_task_queue.empty();
+      bool workers_available = !m_free_worker_indices.empty();
+      bool is_single_proc = (m_communicator.size() == 1);
+
+      if (tasks_available && (is_single_proc || workers_available)) {
+        allocate_task_to_child();
+      } else if (active_tasks > 0 || (tasks_available && !workers_available)) {
+        receive_from_anyone();
+      }
     }
-    while (_free_worker_indices.size() + 1 < static_cast<size_t>(_communicator.size())) {
-      receive_from_any_worker();
+
+    // --- Return Logic ---
+    std::vector<ResultT> batch;
+
+    size_t available = m_results.size();
+    size_t count_to_return = available;
+
+    if (!config.allow_more_than_target_tasks) {
+      count_to_return = std::min(available, config.target_num_tasks);
     }
-    assert(_results_received == _tasks_sent && "Not all tasks were processed by workers");
-    assert(_results.size() == _tasks_sent && "Results size should match tasks sent");
-    return _results;
+
+    batch.reserve(count_to_return);
+    auto end_it = m_results.begin() + count_to_return;
+    std::move(m_results.begin(), end_it, std::back_inserter(batch));
+    m_results.erase(m_results.begin(), end_it);
+
+    m_results_sent_to_parent += batch.size();
+    return batch;
+  }
+
+  [[nodiscard]] std::vector<ResultT> finish_remaining_tasks() {
+    RunConfig cfg;
+    cfg.target_num_tasks = std::numeric_limits<size_t>::max();
+    return run_tasks(cfg);
   }
 
   void finalize() {
-    assert(!_finalized && "Work distribution already finalized");
+    DYNAMPI_ASSERT(!m_finalized, "Work distribution already finalized");
     if (is_root_manager()) {
-      send_done_to_workers();
-      _finalized = true;
+      send_done_to_children_when_free();
+    }
+    m_finalized = true;
+    if constexpr (statistics_mode != StatisticsMode::None) {
+      if (is_root_manager()) {
+        _statistics.worker_task_counts = std::vector<size_t>(m_communicator.size(), 0);
+      }
+      m_communicator.gather(m_tasks_executed,
+                            _statistics.worker_task_counts.has_value()
+                                ? &_statistics.worker_task_counts.value()
+                                : nullptr,
+                            m_config.manager_rank);
     }
   }
 
   ~HierarchicalMPIWorkDistributor() {
-    if (!_finalized) {
+    if (!m_finalized) {
       finalize();
     }
-    assert(_tasks_sent == _results_received && "Not all tasks were processed by workers");
+    DYNAMPI_ASSERT_EQ(m_results_received_from_child, m_tasks_sent_to_child,
+                      "All tasks should have been processed by workers before finalizing");
+    DYNAMPI_ASSERT_EQ(m_results_sent_to_parent, m_tasks_received_from_parent,
+                      "All results should have been sent to the parent before finalizing");
+    if (is_leaf_worker())
+      DYNAMPI_ASSERT_EQ(m_results_received_from_child, 0,
+                        "Leaf workers should not receive results from children");
+    else if (m_communicator.size() > 1)
+      DYNAMPI_ASSERT_EQ(m_results_received_from_child + m_tasks_executed, m_results_sent_to_parent,
+                        "Results received from children should match results sent to parent");
   }
 
  private:
   TaskT get_next_task_to_send() {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can get next task");
-    assert(!_unallocated_task_queue.empty() && "There should be tasks available to send");
+    DYNAMPI_ASSERT(is_root_manager() || !is_leaf_worker(),
+                   "Leaf workers should not send tasks directly");
+    DYNAMPI_ASSERT(!m_unallocated_task_queue.empty(), "There should be tasks available to send");
     TaskT task;
-    if constexpr (std::is_same_v<decltype(_unallocated_task_queue), std::deque<TaskT>>) {
-      task = _unallocated_task_queue.front();
-      _unallocated_task_queue.pop_front();
+    if constexpr (std::is_same_v<decltype(m_unallocated_task_queue), std::deque<TaskT>>) {
+      task = m_unallocated_task_queue.front();
+      m_unallocated_task_queue.pop_front();
     } else {
-      task = _unallocated_task_queue.top().second;
-      _unallocated_task_queue.pop();
+      task = m_unallocated_task_queue.top().second;
+      m_unallocated_task_queue.pop();
     }
     return task;
   }
 
-  void send_done_to_workers() {
-    assert(_communicator.rank() == _config.manager_rank &&
-           "Only the manager can finalize the work distribution");
-    assert(_free_worker_indices.size() + 1 == static_cast<size_t>(_communicator.size()) &&
-           "All workers should be free before finalizing");
-    for (int i = 0; i < _communicator.size() - 1; i++) {
-      _communicator.send(nullptr, worker_for_idx(i), Tag::DONE);
-    }
-  }
-
   int idx_for_worker(int worker_rank) const {
-    assert(worker_rank != _config.manager_rank &&
-           "Manager rank should not be used as a worker rank");
-    if (worker_rank < _config.manager_rank) {
+    DYNAMPI_ASSERT_NE(worker_rank, m_config.manager_rank,
+                      "Manager rank should not be used as a worker rank");
+    if (worker_rank < m_config.manager_rank) {
       return worker_rank;
     } else {
       return worker_rank - 1;
     }
   }
 
-  int worker_for_idx(int idx) const { return (idx < _config.manager_rank) ? idx : (idx + 1); }
-
-  void receive_from_any_worker() {
-    assert(_communicator.rank() == _config.manager_rank &&
-           "Only the manager can receive results and send tasks");
-    assert(_communicator.size() > 1 &&
-           "There should be at least one worker to receive results from");
-    using result_type = MPI_Type<ResultT>;
-    MPI_Status status;
-    DYNAMPI_MPI_CHECK(MPI_Probe, (MPI_ANY_SOURCE, MPI_ANY_TAG, _communicator.get(), &status));
-    if (status.MPI_TAG == Tag::RESULT) {
-      int64_t task_idx = _worker_current_task_indices[status.MPI_SOURCE -
-                                                      (status.MPI_SOURCE > _config.manager_rank)];
-      _worker_current_task_indices[status.MPI_SOURCE - (status.MPI_SOURCE > _config.manager_rank)] =
-          -1;
-      assert(task_idx >= 0 && "Task index should be valid");
-      if (static_cast<uint64_t>(task_idx) >= _results.size()) {
-        _results.resize(task_idx + 1);
+  int worker_for_idx(int idx) const { return (idx < m_config.manager_rank) ? idx : (idx + 1); }
+
+  // Classifies a world rank as Local (member of this rank's local group) or Leader otherwise.
+  CommLayer determine_layer_from_world_rank(int world_rank) const {
+    DYNAMPI_ASSERT(m_config.coordinator_per_node);
+    // Local membership is tested first; the manager is never placed in a local group.
+    if (m_local_group) {
+      int local_rank = m_world_group.translate_rank(world_rank, *m_local_group);
+      if (local_rank != MPI_UNDEFINED) {
+        return CommLayer::Local;
+      }
+    }
+    DYNAMPI_ASSERT(m_leader_group.has_value(), "Leader group should be present");
+    [[maybe_unused]] int leader_rank = m_world_group.translate_rank(world_rank, *m_leader_group);
+    DYNAMPI_ASSERT_NE(leader_rank, MPI_UNDEFINED, "Rank should be in leader group");
+    return CommLayer::Leader;  // not Local, so it must be a leader (asserted above)
+  }
+
+  // --- Abstract Send Wrappers ---
+
+  // Sends `data` with `tag` to the rank returned by get_parent_target().
+  template <typename T>
+  void send_to_parent(const T& data, Tag tag) {
+    auto [target, layer] = get_parent_target();
+    DYNAMPI_ASSERT_NE(target, -1, "Root cannot send to parent");
+    // With groups, target is always a world rank, so the global communicator is used.
+    m_communicator.send(data, target, tag);
+  }
+
+  // Sends `data` to world rank `rank` on the global communicator; `layer` is currently unused.
+  template <typename T>
+  void send_to_worker(const T& data, int rank, Tag tag, [[maybe_unused]] CommLayer layer) {
+    m_communicator.send(data, rank, tag);
+  }
+
+  // Sends DONE to every direct child. A child is only told to stop once it has shown up in
+  // the free-worker set, so incoming messages are handled while waiting.
+  void send_done_to_children_when_free() {
+    int children_left = num_direct_children();
+    while (children_left > 0) {
+      // Nobody idle yet: handle incoming messages until some child becomes free.
+      while (m_free_worker_indices.empty()) {
+        receive_from_anyone();
+      }
+      const TaskRequest idle_child = m_free_worker_indices.top();
+      m_free_worker_indices.pop();
+      send_to_worker(nullptr, idle_child.worker_rank, Tag::DONE, idle_child.source_layer);
+      --children_left;
+    }
+  }
+
+  using result_mpi_type = MPI_Type<ResultT>;
+  using task_mpi_type = MPI_Type<TaskT>;
+
+  // Handles RESULT: receives one result into m_results and records the sender as free again.
+  void receive_result_from(MPI_Status status, [[maybe_unused]] MPICommunicator& source_comm,
+                           CommLayer layer) {
+    m_results.push_back(ResultT{});  // slot that the recv below fills in place
+    if (result_mpi_type::resize_required) {
+      DYNAMPI_UNIMPLEMENTED(  // LCOV_EXCL_LINE
+          "Dynamic resizing of results is not supported in hierarchical distribution");
+    }
+    int world_source = status.MPI_SOURCE;
+    if (m_config.coordinator_per_node) {  // per-node mode: derive the layer from the sender
+      layer = determine_layer_from_world_rank(world_source);
+    }
+    m_communicator.recv(m_results.back(), world_source, Tag::RESULT);
+    m_results_received_from_child++;
+    m_free_worker_indices.push(TaskRequest{.worker_rank = world_source, .source_layer = layer});
+  }
+
+  // Handles RESULT_BATCH: stores the results and queues the sender for an equally sized batch.
+  void receive_result_batch_from(MPI_Status status, [[maybe_unused]] MPICommunicator& source_comm,
+                                 CommLayer layer) {
+    using message_type = MPI_Type<std::vector<ResultT>>;
+    int count;
+    DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, message_type::value, &count));
+    std::vector<ResultT> results;
+    message_type::resize(results, count);  // size the buffer from the probed element count
+    int world_source = status.MPI_SOURCE;
+    if (m_config.coordinator_per_node) {  // per-node mode: derive the layer from the sender
+      layer = determine_layer_from_world_rank(world_source);
+    }
+    m_communicator.recv(results, world_source, Tag::RESULT_BATCH);
+    m_free_worker_indices.push({.worker_rank = world_source,
+                                .source_layer = layer,
+                                .num_tasks_requested = static_cast<int>(results.size())});
+    std::copy(results.begin(), results.end(), std::back_inserter(m_results));
+    m_results_received_from_child += results.size();
+  }
+
+  // Handles TASK: receives one task, runs the worker function, and replies with RESULT.
+  void receive_execute_return_task_from(MPI_Status status,
+                                        [[maybe_unused]] MPICommunicator& source_comm,
+                                        [[maybe_unused]] CommLayer layer) {
+    int count;
+    DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, task_mpi_type::value, &count));
+    TaskT message;
+    task_mpi_type::resize(message, count);  // size the buffer from the probed element count
+    int world_source = status.MPI_SOURCE;
+    m_communicator.recv(message, world_source, Tag::TASK);
+    m_tasks_received_from_parent++;
+    ResultT result = m_worker_function(message);
+    m_tasks_executed++;
+    // The result goes straight back to the rank that sent the task.
+    m_communicator.send(result, world_source, Tag::RESULT);
+    m_results_sent_to_parent++;
+  }
+
+  void receive_task_batch_from(MPI_Status status, [[maybe_unused]] MPICommunicator& source_comm,
+                               [[maybe_unused]] CommLayer layer) {
+    if constexpr (prioritize_tasks) {
+      DYNAMPI_UNIMPLEMENTED("Prioritized hierarchical distribution");
+    } else {
+      using message_type = MPI_Type<std::vector<TaskT>>;
       int count;
-      DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, result_type::value, &count));
-      result_type::resize(_results[task_idx], count);
-      _communicator.recv(_results[task_idx], status.MPI_SOURCE, Tag::RESULT);
-      _results_received++;
+      DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, message_type::value, &count));
+      std::vector<TaskT> tasks;
+      message_type::resize(tasks, count);
+      // With groups, always use global communicator
+      int world_source = status.MPI_SOURCE;
+      m_communicator.recv(tasks, world_source, Tag::TASK_BATCH);
+      m_tasks_received_from_parent += tasks.size();
+      for (const auto& task : tasks) {
+        m_unallocated_task_queue.push_back(task);
+      }
+    }
+  }
+
+  // Handles REQUEST: consumes the empty message and records the sender as free for one task.
+  void receive_request_from(MPI_Status status, [[maybe_unused]] MPICommunicator& source_comm,
+                            CommLayer layer) {
+    int world_source = status.MPI_SOURCE;
+    if (m_config.coordinator_per_node) {  // per-node mode: derive the layer from the sender
+      layer = determine_layer_from_world_rank(world_source);
+    }
+    m_communicator.recv_empty_message(world_source, Tag::REQUEST);
+    m_free_worker_indices.push(TaskRequest{.worker_rank = world_source, .source_layer = layer});
+  }
+
+  // Handles REQUEST_BATCH: records the sender as free for a batch of the requested size.
+  void receive_request_batch_from(MPI_Status status, [[maybe_unused]] MPICommunicator& source_comm,
+                                  CommLayer layer) {
+    int world_source = status.MPI_SOURCE;
+    if (m_config.coordinator_per_node) {  // per-node mode: derive the layer from the sender
+      layer = determine_layer_from_world_rank(world_source);
+    }
+    int request_count;  // filled by the recv below
+    m_communicator.recv(request_count, world_source, Tag::REQUEST_BATCH);
+    DYNAMPI_ASSERT_GT(request_count, 0, "Invalid request count");
+    DYNAMPI_ASSERT_LE(request_count, kMaxTasksRequested, "Request count exceeds maximum allowed");
+    m_free_worker_indices.push(TaskRequest{
+        .worker_rank = world_source, .source_layer = layer, .num_tasks_requested = request_count});
+  }
+
+  // Handles DONE: consumes the empty message and sets m_done, which ends run_worker()'s loops.
+  void receive_done_from(MPI_Status status, [[maybe_unused]] MPICommunicator& source_comm,
+                         [[maybe_unused]] CommLayer layer) {
+    int world_source = status.MPI_SOURCE;
+    m_communicator.recv_empty_message(world_source, Tag::DONE);
+    m_done = true;
+  }
+
+  void receive_from_anyone() {
+    DYNAMPI_ASSERT_GT(m_communicator.size(), 1,
+                      "There should be at least one worker to receive results from");
+
+    MPI_Status status{};
+    CommLayer layer = CommLayer::Global;
+
+    if (m_config.coordinator_per_node) {
+      // Poll global communicator non-blocking until a message is available
+      // The layer will be determined from the source rank in the receive methods
+      bool found = false;
+      while (!found) {
+        auto opt_status = m_communicator.iprobe();
+        if (opt_status.has_value()) {
+          status = opt_status.value();
+          found = true;
+          break;
+        }
+        std::this_thread::yield();
+      }
     } else {
-      assert(status.MPI_TAG == Tag::REQUEST && "Unexpected tag received in worker");
-      _communicator.recv_empty_message(status.MPI_SOURCE, Tag::REQUEST);
+      status = m_communicator.probe();
+    }
+
+    // Assert that the tag is a valid Tag enum value before casting
+    DYNAMPI_ASSERT(status.MPI_TAG >= static_cast<int>(Tag::TASK) &&
+                       status.MPI_TAG <= static_cast<int>(Tag::REQUEST_BATCH),
+                   "Received invalid MPI tag: " + std::to_string(status.MPI_TAG));
+    Tag tag = static_cast<Tag>(status.MPI_TAG);
+    // Note: receive methods now use global communicator and determine layer from source rank
+    switch (tag) {
+      case Tag::TASK:
+        return receive_execute_return_task_from(status, m_communicator, layer);
+      case Tag::TASK_BATCH:
+        return receive_task_batch_from(status, m_communicator, layer);
+      case Tag::RESULT:
+        return receive_result_from(status, m_communicator, layer);
+      case Tag::RESULT_BATCH:
+        return receive_result_batch_from(status, m_communicator, layer);
+      case Tag::REQUEST:
+        return receive_request_from(status, m_communicator, layer);
+      case Tag::REQUEST_BATCH:
+        return receive_request_batch_from(status, m_communicator, layer);
+      case Tag::DONE:
+        return receive_done_from(status, m_communicator, layer);
     }
-    _free_worker_indices.push(status.MPI_SOURCE);
   }
 };
 
diff --git a/include/dynampi/impl/naive_distributor.hpp b/include/dynampi/impl/naive_distributor.hpp
index a74a1c3..70523c4 100644
--- a/include/dynampi/impl/naive_distributor.hpp
+++ b/include/dynampi/impl/naive_distributor.hpp
@@ -10,16 +10,18 @@
 #include <cstdint>
 #include <deque>
 #include <functional>
-#include <iterator>
+#include <limits>
+#include <optional>
 #include <queue>
-#include <ranges>
 #include <stack>
 #include <type_traits>
+#include <variant>
 #include <vector>
 
 #include "../mpi/mpi_communicator.hpp"
 #include "../mpi/mpi_types.hpp"
 #include "dynampi/impl/base_distributor.hpp"
+#include "dynampi/utilities/timer.hpp"
 
 namespace dynampi {
 
@@ -36,30 +38,56 @@ class NaiveMPIWorkDistributor {
                                  // message. If a message exceeds this size, behavior is undefined.
   };
 
+  struct RunConfig {
+    // Stop once we have at least this many contiguous results ready to return.
+    size_t target_num_tasks = std::numeric_limits<size_t>::max();
+
+    // If false, strictly clips the return vector to `target_num_tasks`.
+    // Excess results remain in the internal buffer for the next call.
+    bool allow_more_than_target_tasks = true;
+
+    // Stop if this much time has passed.
+    std::optional<double> max_seconds = std::nullopt;
+  };
+
+  static constexpr bool ordered = true;  // constexpr => implicitly inline; safe to ODR-use
+
  private:
   static constexpr bool prioritize_tasks = get_option_value<prioritize_tasks_t, Options...>();
+  static constexpr StatisticsMode statistics_mode =
+      get_option_value<track_statistics_t, Options...>();
+
   using QueueT = std::conditional_t<prioritize_tasks, std::priority_queue<std::pair<double, TaskT>>,
                                     std::deque<TaskT>>;
+  using MPICommunicator = dynampi::MPICommunicator<track_statistics<statistics_mode>>;
 
-  QueueT _unallocated_task_queue;
-  std::vector<int64_t> _worker_current_task_indices;
-  std::vector<ResultT> _results;
-  std::stack<int, std::vector<int>> _free_worker_indices;
+  // --- Member Variables ---
+  Config m_config;
+  MPICommunicator m_communicator;
+  std::function<ResultT(TaskT)> m_worker_function;
 
-  size_t _tasks_sent = 0;
-  size_t _results_received = 0;
-  bool _finalized = false;
+  QueueT m_unallocated_task_queue;
 
-  static constexpr StatisticsMode statistics_mode =
-      get_option_value<track_statistics_t, Options...>();
+  // State tracking
+  std::vector<int64_t> m_worker_current_task_indices;  // Maps worker_idx -> task_id
+  std::stack<int> m_free_worker_ranks;
 
-  using MPICommunicator = dynampi::MPICommunicator<track_statistics<statistics_mode>>;
-  MPICommunicator _communicator;
-  std::function<ResultT(TaskT)> _worker_function;
-  Config _config;
+  // Transient Storage:
+  // We use a vector to store results by task ID, with a bitmap to track validity.
+  // Items are marked invalid as soon as they become contiguous and ready to return.
+  std::vector<ResultT> m_pending_results;
+  std::vector<bool> m_pending_results_valid;
+
+  // Counters
+  size_t m_tasks_sent = 0;        // Total tasks ever sent (acts as the unique ID for the next task)
+  size_t m_front_result_idx = 0;  // The task ID of the result at the front of the vector (index 0)
+  size_t m_known_contiguous_results =
+      0;  // Number of contiguous valid results starting from m_front_result_idx
+  bool m_finalized = false;
 
   enum Tag : int { TASK = 0, DONE = 1, RESULT = 2, REQUEST = 3, ERROR = 4 };
 
+ public:
   struct Statistics {
     const CommStatistics& comm_statistics;
     std::vector<size_t> worker_task_counts;
@@ -68,256 +96,321 @@ class NaiveMPIWorkDistributor {
   using StatisticsT =
       std::conditional_t<statistics_mode == StatisticsMode::Detailed, Statistics, std::monostate>;
 
-  StatisticsT _statistics;
-
-  static StatisticsT create_statistics(const MPICommunicator& comm) {
-    if constexpr (statistics_mode != StatisticsMode::None) {
-      return Statistics{comm.get_statistics(), {}};
-    } else {
-      return {};
-    }
-  }
+ private:
+  StatisticsT m_statistics;
 
  public:
   explicit NaiveMPIWorkDistributor(std::function<ResultT(TaskT)> worker_function,
                                    Config runtime_config = Config{})
-      : _communicator(runtime_config.comm, MPICommunicator::Duplicate),
-        _worker_function(worker_function),
-        _config(runtime_config),
-        _statistics{create_statistics(_communicator)} {
-    if (is_root_manager()) _worker_current_task_indices.resize(_communicator.size() - 1, -1);
-    if (_config.auto_run_workers && _communicator.rank() != _config.manager_rank) {
+      : m_config(runtime_config),
+        m_communicator(runtime_config.comm, MPICommunicator::Duplicate),
+        m_worker_function(worker_function),
+        m_statistics{create_statistics(m_communicator)} {
+    if (is_root_manager()) {
+      m_worker_current_task_indices.resize(num_workers(), -1);
+    }
+
+    if (m_config.auto_run_workers && !is_root_manager()) {
       run_worker();
     }
+
     if constexpr (statistics_mode >= StatisticsMode::Aggregated) {
-      if (is_root_manager()) _statistics.worker_task_counts.resize(_communicator.size(), 0);
+      if (is_root_manager()) m_statistics.worker_task_counts.resize(m_communicator.size(), 0);
     }
   }
 
-  const StatisticsT& get_statistics() const
-    requires(statistics_mode != StatisticsMode::None)
-  {
-    assert(is_root_manager() && "Only the manager can access statistics");
-    return _statistics;
+  ~NaiveMPIWorkDistributor() {
+    if (!m_finalized) finalize();
   }
 
-  void run_worker() {
-    assert(_communicator.rank() != _config.manager_rank && "Worker cannot run on the manager rank");
-    using task_type = MPI_Type<TaskT>;
-    // Send REQUEST as 0 elements of ResultT so manager can recv_any(ResultT&) for both REQUEST and
-    // RESULT
-    _communicator.template send_empty<ResultT>(_config.manager_rank, Tag::REQUEST);
+  // --- Main Interface ---
+
+  [[nodiscard]] std::vector<ResultT> run_tasks(RunConfig config = RunConfig{}) {
+    assert(is_root_manager() && "Only the manager can distribute tasks");
+
+    Timer timer;
+
+    // We loop until one of the exit conditions is met.
     while (true) {
-      MPI_Status status = _communicator.probe();
-      if (status.MPI_TAG == Tag::DONE) {
-        _communicator.recv_empty_message(_config.manager_rank, Tag::DONE);
+      // --- 1. Check Exit Conditions ---
+
+      // A. Have we collected enough contiguous results?
+      if (m_known_contiguous_results >= config.target_num_tasks) {
         break;
       }
-      int count;
-      DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, task_type::value, &count));
-      TaskT message;
-      task_type::resize(message, count);
-      _communicator.recv(message, _config.manager_rank, Tag::TASK);
-      _tasks_sent++;
-      ResultT result = _worker_function(message);
-      _communicator.send(result, _config.manager_rank, Tag::RESULT);
-      _results_received++;
+
+      // B. Time limit check
+      if (config.max_seconds && timer.elapsed().count() >= *config.max_seconds) {
+        break;
+      }
+
+      // C. Total exhaustion check
+      if (m_unallocated_task_queue.empty() && active_worker_count() == 0) {
+        break;
+      }
+
+      // --- 2. Action Logic (Send vs Receive) ---
+
+      // Priority: Keep workers busy
+      if (!m_unallocated_task_queue.empty() && !m_free_worker_ranks.empty()) {
+        send_next_task_to_worker(m_free_worker_ranks.top());
+        m_free_worker_ranks.pop();
+      } else {
+        // Single process mode fallback
+        if (num_workers() == 0 && !m_unallocated_task_queue.empty()) {
+          run_task_locally();
+        }
+        // Standard MPI wait
+        else if (active_worker_count() > 0) {
+          process_incoming_message();
+        }
+      }
+    }
+
+    // --- 3. Return Logic ---
+    size_t limit = std::numeric_limits<size_t>::max();
+    if (!config.allow_more_than_target_tasks) {
+      limit = config.target_num_tasks;
+    }
+
+    return collect_available_results(limit);
+  }
+
+  [[nodiscard]] std::vector<ResultT> finish_remaining_tasks() {
+    RunConfig cfg;
+    cfg.target_num_tasks = std::numeric_limits<size_t>::max();
+    return run_tasks(cfg);
+  }
+
+  void finalize() {
+    assert(!m_finalized && "Work distribution already finalized");
+    if (is_root_manager()) {
+      broadcast_done();
     }
+    m_finalized = true;
   }
 
-  bool is_root_manager() const { return _communicator.rank() == _config.manager_rank; }
+  // --- Public Accessors ---
+
+  bool is_root_manager() const { return m_communicator.rank() == m_config.manager_rank; }
 
   size_t remaining_tasks_count() const {
-    assert(_communicator.rank() == _config.manager_rank &&
-           "Only the manager can check remaining tasks");
-    return _unallocated_task_queue.size();
+    assert(is_root_manager() && "Only the manager can check remaining tasks");
+    return m_unallocated_task_queue.size();
   }
 
+  const StatisticsT& get_statistics() const
+    requires(statistics_mode != StatisticsMode::None)
+  {
+    assert(is_root_manager() && "Only the manager can access statistics");
+    return m_statistics;
+  }
+
+  // --- Task Insertion ---
+
   void insert_task(TaskT task)
     requires(!prioritize_tasks)
   {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can distribute tasks");
-    _unallocated_task_queue.push_back(task);
+    assert(is_root_manager());
+    m_unallocated_task_queue.push_back(std::move(task));
   }
+
   void insert_task(const TaskT& task, double priority)
     requires(prioritize_tasks)
   {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can distribute tasks");
-    _unallocated_task_queue.emplace(priority, task);
+    assert(is_root_manager());
+    m_unallocated_task_queue.emplace(priority, task);
   }
 
-  template <typename Range>
-    requires std::ranges::input_range<Range> && (!prioritize_tasks)
-  void insert_tasks(const Range& tasks) {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can distribute tasks");
-    std::copy(std::ranges::begin(tasks), std::ranges::end(tasks),
-              std::back_inserter(_unallocated_task_queue));
-  }
   void insert_tasks(const std::vector<TaskT>& tasks)
     requires(!prioritize_tasks)
   {
-    insert_tasks(std::span<const TaskT>(tasks));
+    assert(is_root_manager());
+    for (const auto& t : tasks) m_unallocated_task_queue.push_back(t);
   }
 
-  void get_task_and_allocate() {
-    const TaskT task = get_next_task_to_send();
-    if (_communicator.size() > 1) {
-      if (_free_worker_indices.empty()) {
-        // If no free workers, wait for a result to be received
-        receive_from_any_worker();
-      }
-      int worker = _free_worker_indices.top();
-      _free_worker_indices.pop();
-      _worker_current_task_indices[idx_for_worker(worker)] = _tasks_sent;
-      if constexpr (statistics_mode >= StatisticsMode::Aggregated) {
-        _statistics.worker_task_counts[worker]++;
+  // --- Worker Logic ---
+
+  void run_worker() {
+    assert(!is_root_manager());
+    using task_type = MPI_Type<TaskT>;
+
+    // Handshake: announce readiness with a zero-length ResultT message tagged REQUEST, so the
+    // manager can tell it apart from a RESULT by tag and drain it with recv_empty<ResultT>().
+    m_communicator.template send_empty<ResultT>(m_config.manager_rank, Tag::REQUEST);
+
+    while (true) {
+      MPI_Status status = m_communicator.probe();
+
+      if (status.MPI_TAG == Tag::DONE) {
+        m_communicator.recv_empty_message(m_config.manager_rank, Tag::DONE);
+        break;
       }
-      _communicator.send(task, worker, Tag::TASK);
-    } else {
-      // If there's only one process, we just run the worker function directly
-      _results.emplace_back(_worker_function(task));
-      _results_received++;
-    }
-    _tasks_sent++;
-  }
 
-  [[nodiscard]] std::vector<ResultT> finish_remaining_tasks() {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can distribute tasks");
-    while (!_unallocated_task_queue.empty()) {
-      get_task_and_allocate();
-    }
-    while (_free_worker_indices.size() + 1 < static_cast<size_t>(_communicator.size())) {
-      receive_from_any_worker();
+      int count;
+      DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, task_type::value, &count));
+      TaskT message;
+      task_type::resize(message, count);
+      m_communicator.recv(message, m_config.manager_rank, Tag::TASK);
+
+      ResultT result = m_worker_function(std::move(message));
+
+      m_communicator.send(result, m_config.manager_rank, Tag::RESULT);
     }
-    assert(_results_received == _tasks_sent && "Not all tasks were processed by workers");
-    assert(_results.size() == _tasks_sent && "Results size should match tasks sent");
-    return _results;
   }
 
-  void finalize() {
-    assert(!_finalized && "Work distribution already finalized");
-    if (is_root_manager()) {
-      send_done_to_workers();
-      _finalized = true;
-    }
+ private:
+  // --- Helpers ---
+
+  int num_workers() const { return m_communicator.size() - 1; }
+
+  size_t active_worker_count() const {
+    return static_cast<size_t>(num_workers()) - m_free_worker_ranks.size();
   }
 
-  ~NaiveMPIWorkDistributor() {
-    if (!_finalized) {
-      finalize();
-    }
-    assert(_tasks_sent == _results_received && "Not all tasks were processed by workers");
+  int rank_to_worker_idx(int rank) const {
+    return (rank < m_config.manager_rank) ? rank : (rank - 1);
   }
 
- private:
-  TaskT get_next_task_to_send() {
-    assert(_communicator.rank() == _config.manager_rank && "Only the manager can get next task");
-    assert(!_unallocated_task_queue.empty() && "There should be tasks available to send");
+  int worker_idx_to_rank(int idx) const { return (idx < m_config.manager_rank) ? idx : (idx + 1); }
+
+  TaskT pop_next_task() {
     TaskT task;
-    if constexpr (std::is_same_v<QueueT, std::deque<TaskT>>) {
-      task = _unallocated_task_queue.front();
-      _unallocated_task_queue.pop_front();
+    if constexpr (prioritize_tasks) {
+      task = m_unallocated_task_queue.top().second;
+      m_unallocated_task_queue.pop();
     } else {
-      task = _unallocated_task_queue.top().second;
-      _unallocated_task_queue.pop();
+      task = std::move(m_unallocated_task_queue.front());
+      m_unallocated_task_queue.pop_front();
     }
     return task;
   }
 
-  void send_done_to_workers() {
-    assert(_communicator.rank() == _config.manager_rank &&
-           "Only the manager can finalize the work distribution");
-    assert(_free_worker_indices.size() + 1 == static_cast<size_t>(_communicator.size()) &&
-           "All workers should be free before finalizing");
-    for (int i = 0; i < _communicator.size() - 1; i++) {
-      _communicator.send(nullptr, worker_for_idx(i), Tag::DONE);
+  void run_task_locally() {
+    TaskT task = pop_next_task();
+    // Store result directly in vector (using relative indexing)
+    int64_t task_id = static_cast<int64_t>(m_tasks_sent);
+    ensure_result_capacity(task_id - m_front_result_idx + 1);
+    size_t vector_idx = task_id - m_front_result_idx;
+    m_pending_results[vector_idx] = m_worker_function(std::move(task));
+    m_pending_results_valid[vector_idx] = true;
+    m_tasks_sent++;
+    update_contiguous_results_count(task_id);
+  }
+
+  void send_next_task_to_worker(int worker_rank) {
+    TaskT task = pop_next_task();
+    int64_t task_id = static_cast<int64_t>(m_tasks_sent);
+
+    m_worker_current_task_indices[rank_to_worker_idx(worker_rank)] = task_id;
+    if constexpr (statistics_mode >= StatisticsMode::Aggregated) {
+      m_statistics.worker_task_counts[rank_to_worker_idx(worker_rank)]++;
     }
+
+    m_communicator.send(task, worker_rank, Tag::TASK);
+    m_tasks_sent++;
   }
 
-  int idx_for_worker(int worker_rank) const {
-    assert(worker_rank != _config.manager_rank &&
-           "Manager rank should not be used as a worker rank");
-    if (worker_rank < _config.manager_rank) {
-      return worker_rank;
+  void process_incoming_message() {
+    MPI_Status status = m_communicator.probe(MPI_ANY_SOURCE, MPI_ANY_TAG);
+    int source = status.MPI_SOURCE;
+    if (status.MPI_TAG == Tag::RESULT) {
+      handle_result_message(source, status);
     } else {
-      return worker_rank - 1;
+      DYNAMPI_ASSERT_EQ(status.MPI_TAG, Tag::REQUEST, "Unexpected tag received");
+      m_communicator.template recv_empty<ResultT>(source, Tag::REQUEST);
     }
+    m_free_worker_ranks.push(source);
   }
 
-  int worker_for_idx(int idx) const { return (idx < _config.manager_rank) ? idx : (idx + 1); }
+  void handle_result_message(int source, MPI_Status& probe_status) {
+    int worker_idx = rank_to_worker_idx(source);
+    int64_t task_id = m_worker_current_task_indices[worker_idx];
+    m_worker_current_task_indices[worker_idx] = -1;
 
-  void process_result_message(const MPI_Status& status, ResultT&& result, int count) {
     using result_type = MPI_Type<ResultT>;
-    int worker_idx = status.MPI_SOURCE - (status.MPI_SOURCE > _config.manager_rank);
-    int64_t task_idx = _worker_current_task_indices[worker_idx];
-    _worker_current_task_indices[worker_idx] = -1;
-    assert(task_idx >= 0 && "Task index should be valid");
-    if (static_cast<uint64_t>(task_idx) >= _results.size()) {
-      _results.resize(task_idx + 1);
+    int count;
+    DYNAMPI_MPI_CHECK(MPI_Get_count, (&probe_status, result_type::value, &count));
+
+    ResultT result_data;
+    result_type::resize(result_data, count);
+    m_communicator.recv(result_data, source, Tag::RESULT);
+
+    // Store in vector (using relative indexing)
+    size_t vector_idx = task_id - m_front_result_idx;
+    ensure_result_capacity(vector_idx + 1);
+    m_pending_results[vector_idx] = std::move(result_data);
+    m_pending_results_valid[vector_idx] = true;
+    update_contiguous_results_count(task_id);
+  }
+
+  std::vector<ResultT> collect_available_results(size_t limit) {
+    std::vector<ResultT> batch;
+    size_t num_results_to_return = std::min(limit, m_known_contiguous_results);
+    if (num_results_to_return == 0) {
+      return batch;
     }
-    if constexpr (result_type::resize_required) {
-      result_type::resize(_results[task_idx], count);
+
+    batch.reserve(num_results_to_return);
+    // Extract from the beginning of the vectors (which contain contiguous results starting from
+    // m_front_result_idx)
+    batch.insert(batch.end(), std::make_move_iterator(m_pending_results.begin()),
+                 std::make_move_iterator(m_pending_results.begin() + num_results_to_return));
+
+    // Erase the collected results from the beginning
+    m_pending_results_valid.erase(m_pending_results_valid.begin(),
+                                  m_pending_results_valid.begin() + num_results_to_return);
+    m_pending_results.erase(m_pending_results.begin(),
+                            m_pending_results.begin() + num_results_to_return);
+
+    // Update counters: increment m_front_result_idx to reflect the new starting point,
+    // and decrement the contiguous count. The vectors now use relative indexing
+    // where index 0 corresponds to task_id = m_front_result_idx.
+    m_front_result_idx += num_results_to_return;
+    m_known_contiguous_results -= num_results_to_return;
+
+    return batch;
+  }
+
+  void broadcast_done() {
+    for (int i = 0; i < num_workers(); i++) {
+      m_communicator.send(nullptr, worker_idx_to_rank(i), Tag::DONE);
     }
-    _results[task_idx] = std::move(result);
-    _results_received++;
   }
 
-  void receive_from_any_worker() {
-    assert(_communicator.rank() == _config.manager_rank &&
-           "Only the manager can receive results and send tasks");
-    assert(_communicator.size() > 1 &&
-           "There should be at least one worker to receive results from");
-    using result_type = MPI_Type<ResultT>;
-    MPI_Status status;
-
-    if (_config.use_immediate_recv) {
-      // Immediate receive mode: REQUEST and RESULT both use type ResultT (REQUEST = 0 elements).
-      // recv_any(buffer) receives into the same buffer type for both.
-      if constexpr (result_type::resize_required) {
-        ResultT buffer;
-        result_type::resize(buffer, _config.max_result_size);
-        status = _communicator.recv_any(buffer);
-
-        if (status.MPI_TAG == Tag::RESULT) {
-          int count;
-          DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, result_type::value, &count));
-          // Resize buffer to actual received count (may be less than max_result_size)
-          result_type::resize(buffer, count);
-          process_result_message(status, std::move(buffer), count);
-        } else {
-          assert(status.MPI_TAG == Tag::REQUEST && "Unexpected tag received");
-        }
-      } else {
-        ResultT buffer;
-        status = _communicator.recv_any(buffer);
-
-        if (status.MPI_TAG == Tag::RESULT) {
-          int count;
-          DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, result_type::value, &count));
-          process_result_message(status, std::move(buffer), count);
-        } else {
-          assert(status.MPI_TAG == Tag::REQUEST && "Unexpected tag received");
-        }
+  void ensure_result_capacity(size_t required_size) {
+    // required_size is relative to m_front_result_idx
+    if (m_pending_results.size() < required_size) {
+      m_pending_results.resize(required_size);
+      m_pending_results_valid.resize(required_size, false);
+    }
+  }
+
+  // Updates m_known_contiguous_results when a new result arrives.
+  // If the result extends the contiguous sequence, increment and check forward.
+  void update_contiguous_results_count(int64_t task_id) {
+    int64_t expected_task_id =
+        static_cast<int64_t>(m_front_result_idx + m_known_contiguous_results);
+
+    // Only update if this result extends the contiguous sequence
+    if (task_id == expected_task_id) {
+      // Extend the contiguous sequence forward as far as possible
+      // Use relative indexing for vector access
+      size_t vector_idx = expected_task_id - m_front_result_idx;
+      while (vector_idx < m_pending_results.size() && m_pending_results_valid[vector_idx]) {
+        m_known_contiguous_results++;
+        vector_idx++;
       }
+    }
+  }
+
+  static StatisticsT create_statistics(const MPICommunicator& comm) {
+    if constexpr (statistics_mode != StatisticsMode::None) {
+      return Statistics{comm.get_statistics(), {}};
     } else {
-      // Probe mode: use probe to check message size before receiving
-      status = _communicator.probe();
-      if (status.MPI_TAG == Tag::RESULT) {
-        int count;
-        DYNAMPI_MPI_CHECK(MPI_Get_count, (&status, result_type::value, &count));
-        ResultT buffer;
-        if constexpr (result_type::resize_required) {
-          result_type::resize(buffer, count);
-        }
-        _communicator.recv(buffer, status.MPI_SOURCE, Tag::RESULT);
-        process_result_message(status, std::move(buffer), count);
-      } else {
-        assert(status.MPI_TAG == Tag::REQUEST && "Unexpected tag received in worker");
-        _communicator.recv_empty<ResultT>(status.MPI_SOURCE, Tag::REQUEST);
-      }
+      return {};
     }
-    _free_worker_indices.push(status.MPI_SOURCE);
   }
 };
 
-};  // namespace dynampi
+}  // namespace dynampi
diff --git a/include/dynampi/mpi/mpi_communicator.hpp b/include/dynampi/mpi/mpi_communicator.hpp
index bc81f98..316b653 100644
--- a/include/dynampi/mpi/mpi_communicator.hpp
+++ b/include/dynampi/mpi/mpi_communicator.hpp
@@ -10,7 +10,9 @@
 #include <optional>
 #include <variant>
 
+#include "dynampi/mpi/mpi_group.hpp"
 #include "dynampi/mpi/mpi_types.hpp"
+#include "dynampi/utilities/assert.hpp"
 #include "dynampi/utilities/template_options.hpp"
 #include "mpi_error.hpp"
 
@@ -68,8 +70,8 @@ class MPICommunicator {
   };
 
  private:
-  MPI_Comm _comm;
-  Ownership _ownership;
+  MPI_Comm m_comm;
+  Ownership m_ownership;
 
   static constexpr StatisticsMode statistics_mode =
       get_option_value<track_statistics_t, Options...>();
@@ -80,39 +82,39 @@ class MPICommunicator {
 
  public:
   MPICommunicator(MPI_Comm comm, Ownership ownership = Duplicate)
-      : _comm(comm), _ownership(ownership) {
-    if (_ownership == Duplicate) {
-      DYNAMPI_MPI_CHECK(MPI_Comm_dup, (comm, &_comm));
+      : m_comm(comm), m_ownership(ownership) {
+    if (m_ownership == Duplicate) {
+      DYNAMPI_MPI_CHECK(MPI_Comm_dup, (comm, &m_comm));
     }
   }
 
   MPICommunicator(const MPICommunicator& other) = delete;
   MPICommunicator& operator=(const MPICommunicator& other) = delete;
   MPICommunicator(MPICommunicator&& other) noexcept
-      : _comm(other._comm),
-        _ownership(other._ownership),
+      : m_comm(other.m_comm),
+        m_ownership(other.m_ownership),
         _statistics(std::move(other._statistics)) {
-    other._comm = MPI_COMM_NULL;
-    other._ownership = Reference;
+    other.m_comm = MPI_COMM_NULL;
+    other.m_ownership = Reference;
   }
   MPICommunicator& operator=(MPICommunicator&& other) = delete;
 
   ~MPICommunicator() {
-    if (_ownership != Reference) {
-      MPI_Comm_free(&_comm);
+    if (m_ownership != Reference) {
+      MPI_Comm_free(&m_comm);
     }
   }
 
   MPICommunicator split_by_node() const {
     MPI_Comm node_comm;
     DYNAMPI_MPI_CHECK(MPI_Comm_split_type,
-                      (_comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm));
+                      (m_comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm));
     return MPICommunicator(node_comm, Move);
   }
 
   std::optional<MPICommunicator> split(int color, int key = 0) const {
     MPI_Comm new_comm;
-    DYNAMPI_MPI_CHECK(MPI_Comm_split, (_comm, color, key, &new_comm));
+    DYNAMPI_MPI_CHECK(MPI_Comm_split, (m_comm, color, key, &new_comm));
     if (new_comm == MPI_COMM_NULL) {
       return std::nullopt;
     }
@@ -120,7 +122,7 @@ class MPICommunicator {
     return MPICommunicator(new_comm, Move);
   }
 
-  operator MPI_Comm() const { return _comm; }
+  operator MPI_Comm() const { return m_comm; }
 
   const CommStatistics& get_statistics() const
     requires(statistics_mode != StatisticsMode::None)
@@ -130,13 +132,13 @@ class MPICommunicator {
 
   int rank() const {
     int rank;
-    DYNAMPI_MPI_CHECK(MPI_Comm_rank, (_comm, &rank));
+    DYNAMPI_MPI_CHECK(MPI_Comm_rank, (m_comm, &rank));
     return rank;
   }
 
   int size() const {
     int size;
-    DYNAMPI_MPI_CHECK(MPI_Comm_size, (_comm, &size));
+    DYNAMPI_MPI_CHECK(MPI_Comm_size, (m_comm, &size));
     return size;
   }
 
@@ -144,7 +146,7 @@ class MPICommunicator {
   inline void send(const T& data, int dest, int tag = 0) {
     using mpi_type = MPI_Type<T>;
     DYNAMPI_MPI_CHECK(
-        MPI_Send, (mpi_type::ptr(data), mpi_type::count(data), mpi_type::value, dest, tag, _comm));
+        MPI_Send, (mpi_type::ptr(data), mpi_type::count(data), mpi_type::value, dest, tag, m_comm));
     if constexpr (statistics_mode != StatisticsMode::None) {
       _statistics.send_count++;
       int size;
@@ -153,12 +155,28 @@ class MPICommunicator {
     }
   }
 
+  inline MPI_Status probe(int source = MPI_ANY_SOURCE, int tag = MPI_ANY_TAG) {
+    MPI_Status status;
+    DYNAMPI_MPI_CHECK(MPI_Probe, (source, tag, m_comm, &status));
+    return status;
+  }
+
+  inline std::optional<MPI_Status> iprobe(int source = MPI_ANY_SOURCE, int tag = MPI_ANY_TAG) {
+    MPI_Status status;
+    int flag;
+    DYNAMPI_MPI_CHECK(MPI_Iprobe, (source, tag, m_comm, &flag, &status));
+    if (flag) {
+      return status;
+    }
+    return std::nullopt;
+  }
+
   template <typename T>
   inline void recv(T& data, int source, int tag = 0) {
     using mpi_type = MPI_Type<T>;
     MPI_Status status;
     DYNAMPI_MPI_CHECK(MPI_Recv, (mpi_type::ptr(data), mpi_type::count(data), mpi_type::value,
-                                 source, tag, _comm, &status));
+                                 source, tag, m_comm, &status));
     if constexpr (statistics_mode != StatisticsMode::None) {
       _statistics.recv_count++;
       int actual_count;
@@ -169,20 +187,13 @@ class MPICommunicator {
     }
   }
 
-  // Probe for a message, returns status
-  inline MPI_Status probe(int source = MPI_ANY_SOURCE, int tag = MPI_ANY_TAG) {
-    MPI_Status status;
-    DYNAMPI_MPI_CHECK(MPI_Probe, (source, tag, _comm, &status));
-    return status;
-  }
-
   // Receive with MPI_ANY_SOURCE/MPI_ANY_TAG and return status
   template <typename T>
   inline MPI_Status recv_any(T& data, int source = MPI_ANY_SOURCE, int tag = MPI_ANY_TAG) {
     using mpi_type = MPI_Type<T>;
     MPI_Status status;
     DYNAMPI_MPI_CHECK(MPI_Recv, (mpi_type::ptr(data), mpi_type::count(data), mpi_type::value,
-                                 source, tag, _comm, &status));
+                                 source, tag, m_comm, &status));
     if constexpr (statistics_mode != StatisticsMode::None) {
       _statistics.recv_count++;
       int actual_count;
@@ -205,7 +216,7 @@ class MPICommunicator {
       }
     }
     DYNAMPI_MPI_CHECK(MPI_Bcast,
-                      (mpi_type::ptr(data), mpi_type::count(data), mpi_type::value, root, _comm));
+                      (mpi_type::ptr(data), mpi_type::count(data), mpi_type::value, root, m_comm));
     if constexpr (statistics_mode != StatisticsMode::None) {
       _statistics.collective_count++;
     }
@@ -214,7 +225,7 @@ class MPICommunicator {
   inline void recv_empty_message(int source, int tag = 0) {
     using mpi_type = MPI_Type<std::nullptr_t>;
     DYNAMPI_MPI_CHECK(MPI_Recv, (nullptr, mpi_type::count(nullptr), mpi_type::value, source, tag,
-                                 _comm, MPI_STATUS_IGNORE));
+                                 m_comm, MPI_STATUS_IGNORE));
     if constexpr (statistics_mode != StatisticsMode::None) {
       _statistics.recv_count++;
     }
@@ -225,7 +236,7 @@ class MPICommunicator {
   template <typename T>
   inline void send_empty(int dest, int tag = 0) {
     using mpi_type = MPI_Type<T>;
-    DYNAMPI_MPI_CHECK(MPI_Send, (nullptr, 0, mpi_type::value, dest, tag, _comm));
+    DYNAMPI_MPI_CHECK(MPI_Send, (nullptr, 0, mpi_type::value, dest, tag, m_comm));
     if constexpr (statistics_mode != StatisticsMode::None) {
       _statistics.send_count++;
     }
@@ -236,13 +247,29 @@ class MPICommunicator {
   inline void recv_empty(int source, int tag = 0) {
     using mpi_type = MPI_Type<T>;
     DYNAMPI_MPI_CHECK(MPI_Recv,
-                      (nullptr, 0, mpi_type::value, source, tag, _comm, MPI_STATUS_IGNORE));
+                      (nullptr, 0, mpi_type::value, source, tag, m_comm, MPI_STATUS_IGNORE));
     if constexpr (statistics_mode != StatisticsMode::None) {
       _statistics.recv_count++;
     }
   }
 
-  [[nodiscard]] MPI_Comm get() const { return _comm; }
+  template <typename T>
+  inline void gather(const T& data, std::vector<T>* result, int root = 0) {
+    DYNAMPI_ASSERT_EQ(result != nullptr, root == rank(),
+                      "Gather result must be provided only on the root rank");
+    using mpi_type = MPI_Type<T>;
+    DYNAMPI_MPI_CHECK(MPI_Gather, (mpi_type::ptr(data), mpi_type::count(data), mpi_type::value,
+                                   result == nullptr ? nullptr : result->data(),
+                                   mpi_type::count(data), mpi_type::value, root, m_comm));
+    if constexpr (statistics_mode != StatisticsMode::None) {
+      _statistics.collective_count++;
+    }
+  }
+
+  [[nodiscard]] MPI_Comm get() const { return m_comm; }
+
+  // Get the group associated with this communicator
+  [[nodiscard]] MPIGroup get_group() const { return MPIGroup(*this); }
 };
 
 }  // namespace dynampi
diff --git a/include/dynampi/mpi/mpi_error.hpp b/include/dynampi/mpi/mpi_error.hpp
index c4c2d43..b87f860 100644
--- a/include/dynampi/mpi/mpi_error.hpp
+++ b/include/dynampi/mpi/mpi_error.hpp
@@ -9,6 +9,7 @@
 
 #include <string>
 
+// cppcheck-suppress preprocessorErrorDirective
 #if __has_include(<source_location>)
 #include <source_location>
 #if defined(__cpp_lib_source_location)
diff --git a/include/dynampi/mpi/mpi_group.hpp b/include/dynampi/mpi/mpi_group.hpp
new file mode 100644
index 0000000..c8868dc
--- /dev/null
+++ b/include/dynampi/mpi/mpi_group.hpp
@@ -0,0 +1,94 @@
+/*
+ * SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <mpi.h>
+
+#include "dynampi/utilities/assert.hpp"
+#include "mpi_error.hpp"
+
+namespace dynampi {
+
+// Forward declaration
+template <typename... Options>
+class MPICommunicator;
+
+class MPIGroup {
+ private:
+  MPI_Group m_group;
+
+ public:
+  // Create from a communicator (extracts the group)
+  template <typename... Options>
+  explicit MPIGroup(const MPICommunicator<Options...>& comm) {
+    DYNAMPI_MPI_CHECK(MPI_Comm_group, (comm.get(), &m_group));
+  }
+
+  // Non-copyable
+  MPIGroup(const MPIGroup& other) = delete;
+  MPIGroup& operator=(const MPIGroup& other) = delete;
+
+  // Movable
+  MPIGroup(MPIGroup&& other) noexcept : m_group(other.m_group) { other.m_group = MPI_GROUP_NULL; }
+  MPIGroup& operator=(MPIGroup&& other) noexcept {
+    if (this != &other) {
+      if (m_group != MPI_GROUP_NULL) {
+        MPI_Group_free(&m_group);
+      }
+      m_group = other.m_group;
+      other.m_group = MPI_GROUP_NULL;
+    }
+    return *this;
+  }
+
+  ~MPIGroup() {
+    if (m_group != MPI_GROUP_NULL) {
+      MPI_Group_free(&m_group);
+    }
+  }
+
+  // Translate ranks from this group to another group
+  void translate_ranks(const MPIGroup& to_group, int n, const int ranks[],
+                       int translated_ranks[]) const {
+    DYNAMPI_MPI_CHECK(MPI_Group_translate_ranks,
+                      (m_group, n, ranks, to_group.m_group, translated_ranks));
+  }
+
+  // Convenience method for single rank translation
+  int translate_rank(int rank, const MPIGroup& to_group) const {
+    int translated_rank;
+    translate_ranks(to_group, 1, &rank, &translated_rank);
+    return translated_rank;
+  }
+
+  // Get the size of this group
+  int size() const {
+    int size;
+    DYNAMPI_MPI_CHECK(MPI_Group_size, (m_group, &size));
+    return size;
+  }
+
+  // Get the rank of the calling process in this group (MPI_UNDEFINED if not in group)
+  int rank() const {
+    int rank;
+    DYNAMPI_MPI_CHECK(MPI_Group_rank, (m_group, &rank));
+    return rank;
+  }
+
+  // Look up a rank of reference_group (typically the world group) in target_group.
+  // Returns the corresponding rank in target_group, or MPI_UNDEFINED if it has no counterpart.
+  // Note: static helper; the translation goes FROM reference_group TO target_group.
+  static int contains_rank_in_group(int rank_in_reference, const MPIGroup& reference_group,
+                                    const MPIGroup& target_group) {
+    return reference_group.translate_rank(rank_in_reference, target_group);
+  }
+
+  operator MPI_Group() const { return m_group; }
+
+  [[nodiscard]] MPI_Group get() const { return m_group; }
+};
+
+}  // namespace dynampi
diff --git a/include/dynampi/mpi/mpi_types.hpp b/include/dynampi/mpi/mpi_types.hpp
index 5fa4ba9..5614299 100644
--- a/include/dynampi/mpi/mpi_types.hpp
+++ b/include/dynampi/mpi/mpi_types.hpp
@@ -9,6 +9,7 @@
 
 #include <cassert>
 #include <cstddef>
+#include <string>
 #include <type_traits>
 #include <vector>
 
@@ -53,6 +54,9 @@ DYNAMPI_DEFINE_PRIMITIVE_MPI_TYPE(double, MPI_DOUBLE);
 DYNAMPI_DEFINE_PRIMITIVE_MPI_TYPE(long double, MPI_LONG_DOUBLE);
 #if defined(MPI_CXX_BOOL)
 DYNAMPI_DEFINE_PRIMITIVE_MPI_TYPE(bool, MPI_CXX_BOOL);
+#else
+// Fallback for when MPI_CXX_BOOL is not available (e.g. Microsoft-MPI)
+DYNAMPI_DEFINE_PRIMITIVE_MPI_TYPE(bool, MPI_C_BOOL);
 #endif
 
 template <>
diff --git a/include/dynampi/utilities/assert.hpp b/include/dynampi/utilities/assert.hpp
new file mode 100644
index 0000000..14f4cd0
--- /dev/null
+++ b/include/dynampi/utilities/assert.hpp
@@ -0,0 +1,145 @@
+/*
+ * SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <mpi.h>
+
+#include <exception>
+#ifndef _MSC_VER
+#define DYNAMPI_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define DYNAMPI_HAS_BUILTIN(x) 0
+#endif
+
+#ifndef NDEBUG
+#include <iostream>
+#include <optional>
+
+// cppcheck-suppress preprocessorErrorDirective
+#if __has_include(<source_location>)
+#include <source_location>
+#if defined(__cpp_lib_source_location)
+#define DYNAMPI_HAS_SOURCE_LOCATION
+#endif
+#elif __has_include(<experimental/source_location>)
+#include <experimental/source_location>
+namespace std {
+using source_location = std::experimental::source_location;
+}
+#define DYNAMPI_HAS_SOURCE_LOCATION
+#endif
+
+#include <sstream>
+#include <string>
+
+#include "printing.hpp"
+#endif
+
+namespace dynampi {
+
+#ifndef NDEBUG
+// Streams every argument into one string; yields std::nullopt when called with no arguments.
+template <typename... Args>
+std::optional<std::string> OptionalString(Args &&...args) {
+  if constexpr (sizeof...(Args) != 0) {
+    std::stringstream joined;
+    (joined << ... << args);
+    return joined.str();
+  }
+  return std::nullopt;
+}
+
+#define DYNAMPI_ASSERT(condition, ...)                                                 \
+  do {                                                                                 \
+    if (!(condition))                                                                  \
+      dynampi::_DYNAMPI_FAIL_ASSERT(#condition, dynampi::OptionalString(__VA_ARGS__)); \
+  } while (false)
+
+// Reports a failed assertion on stderr and throws it as std::runtime_error (see guard below).
+inline void _DYNAMPI_FAIL_ASSERT(const std::string &condition_str,
+                                 const std::optional<std::string> &message
+#ifdef DYNAMPI_HAS_SOURCE_LOCATION
+                                 ,
+                                 const std::source_location &loc = std::source_location::current()
+#endif
+) {
+  if (std::uncaught_exceptions() > 0) return;  // another exception is in flight: stay silent
+  std::stringstream ss;
+#ifdef DYNAMPI_HAS_SOURCE_LOCATION
+  std::string_view s = loc.file_name();
+  // Keep only the file's base name; '\\' is accepted too since Windows paths may use it.
+  std::string_view filename = s.substr(s.find_last_of("/\\") + 1);
+#endif
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  ss << "DynaMPI assertion failed on rank " << rank << ": " << condition_str
+     << (message ? " " + *message : "")
+#ifdef DYNAMPI_HAS_SOURCE_LOCATION
+     << "\n in " << loc.function_name() << " at " << filename << ":" << loc.line()
+#endif
+     << std::endl;
+  std::cerr << ss.str();
+  throw std::runtime_error(ss.str());
+}
+
+// Asserts `a op b`; operands are copied once into prefixed locals so names like `A` cannot clash.
+#define DYNAMPI_ASSERT_BIN_OP(a, b, op, nop, ...)                                     \
+  do {                                                                                \
+    const auto dynampi_lhs_ = a;                                                      \
+    const auto dynampi_rhs_ = b;                                                      \
+    if (!((dynampi_lhs_)op(dynampi_rhs_)))                                            \
+      dynampi::_DYNAMPI_FAILBinOp((dynampi_lhs_), (dynampi_rhs_), (#a), (#b), (#nop), \
+                                  dynampi::OptionalString(__VA_ARGS__));              \
+  } while (false)
+template <typename A, typename B>  // failure path of DYNAMPI_ASSERT_BIN_OP
+inline void _DYNAMPI_FAILBinOp(const A &a, const B &b, const std::string &a_str,
+                               const std::string &b_str, const std::string &nop,
+                               const std::optional<std::string> &message
+#ifdef DYNAMPI_HAS_SOURCE_LOCATION
+                               ,
+                               const std::source_location &loc = std::source_location::current()
+#endif
+) {
+  std::stringstream ss;
+  ss << a << " " << nop << " " << b;  // evaluated operands with the negated operator, e.g. "3 < 5"
+  dynampi::_DYNAMPI_FAIL_ASSERT(a_str + " " + nop + " " + b_str,  // source text, e.g. "x < 5"
+                                message ? (ss.str() + " " + *message) : ss.str()
+#ifdef DYNAMPI_HAS_SOURCE_LOCATION
+                                    ,
+                                loc
+#endif
+  );
+}
+
+#else
+#define DYNAMPI_ASSERT(condition, ...) \
+  do {                                 \
+  } while (false)
+#define DYNAMPI_ASSERT_BIN_OP(a, b, op, nop, ...) \
+  do {                                            \
+  } while (false)
+#endif
+
+// Always fails. A single do/while statement, so a brace-less `if` guards the whole expansion.
+#define DYNAMPI_FAIL(...)                                           \
+  do { DYNAMPI_ASSERT(false, __VA_ARGS__); DYNAMPI_UNREACHABLE(); } \
+  while (false)  // LCOV_EXCL_LINE
+#define DYNAMPI_UNIMPLEMENTED(...) DYNAMPI_FAIL("DYNAMPI_UNIMPLEMENTED")
+
+#define DYNAMPI_ASSERT_GE(expr, val, ...) DYNAMPI_ASSERT_BIN_OP(expr, val, >=, <, __VA_ARGS__)
+#define DYNAMPI_ASSERT_LE(expr, val, ...) DYNAMPI_ASSERT_BIN_OP(expr, val, <=, >, __VA_ARGS__)
+#define DYNAMPI_ASSERT_GT(expr, val, ...) DYNAMPI_ASSERT_BIN_OP(expr, val, >, <=, __VA_ARGS__)
+#define DYNAMPI_ASSERT_LT(expr, val, ...) DYNAMPI_ASSERT_BIN_OP(expr, val, <, >=, __VA_ARGS__)
+#define DYNAMPI_ASSERT_EQ(expr, val, ...) DYNAMPI_ASSERT_BIN_OP(expr, val, ==, !=, __VA_ARGS__)
+#define DYNAMPI_ASSERT_NE(expr, val, ...) DYNAMPI_ASSERT_BIN_OP(expr, val, !=, ==, __VA_ARGS__)
+
+#if defined(_MSC_VER) && !defined(__clang__)  // MSVC
+#define DYNAMPI_UNREACHABLE() __assume(false)
+#else  // GCC, Clang
+#define DYNAMPI_UNREACHABLE() __builtin_unreachable()
+#endif
+
+}  // namespace dynampi
diff --git a/include/dynampi/utilities/printing.hpp b/include/dynampi/utilities/printing.hpp
new file mode 100644
index 0000000..440ee24
--- /dev/null
+++ b/include/dynampi/utilities/printing.hpp
@@ -0,0 +1,110 @@
+/*
+ * SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <optional>
+#include <set>
+#include <span>
+#include <tuple>
+#include <vector>
+
+namespace dynampi {
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::set<T>& set);
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec);
+template <typename T, std::size_t N>
+inline std::ostream& operator<<(std::ostream& os, const std::array<T, N>& arr);
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::span<T>& vec);
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::optional<T>& op);
+template <typename... Args>
+inline std::ostream& operator<<(std::ostream& os, const std::tuple<Args...>& tup);
+template <typename T, typename U>
+inline std::ostream& operator<<(std::ostream& os, const std::pair<T, U>& pair);
+inline std::ostream& operator<<(std::ostream& os, const std::byte& b);
+
+// --------------- IMPLEMENTATIONS ---------------
+
+inline std::ostream& operator<<(std::ostream& os, const std::byte& b) {
+  return os << std::to_integer<std::uint32_t>(b);  // print the numeric value of the byte
+}
+
+// Streams a span as "[a, b, c]"; an empty span prints "[]".
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::span<T>& vec) {
+  os << "[";
+  const char* separator = "";
+  for (const auto& element : vec) {
+    os << separator << element;
+    separator = ", ";
+  }
+  return os << "]";
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec) {
+  return os << std::span<const T>(vec);  // delegate to the span overload for "[a, b]" output
+}
+
+// Streams a fixed-size array as "[a, b, c]"; a zero-length array prints "[]".
+template <typename T, std::size_t N>
+inline std::ostream& operator<<(std::ostream& os, const std::array<T, N>& arr) {
+  os << "[";
+  const char* separator = "";
+  for (const auto& element : arr) {
+    os << separator << element;
+    separator = ", ";
+  }
+  return os << "]";
+}
+
+// Streams a set as "{a, b, c}", visiting elements in the set's own iteration order.
+// An empty set prints "{}".
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::set<T>& set) {
+  os << "{";
+  bool first = true;
+  for (const T& element : set) {
+    if (!first) os << ", ";
+    os << element;
+    first = false;
+  }
+  return os << "}";
+}
+
+template <typename T, typename U>
+inline std::ostream& operator<<(std::ostream& os, const std::pair<T, U>& pair) {
+  return os << "(" << pair.first << ", " << pair.second << ")";  // prints "(first, second)"
+}
+
+// Streams "Some(<value>)" when the optional holds a value
+// and "None" when it is empty.
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::optional<T>& op) {
+  if (!op) return os << "None";
+  return os << "Some(" << *op << ")";
+}
+
+// Streams a tuple as "(a, b, c)"; an empty tuple prints "()".
+template <typename... Args>
+inline std::ostream& operator<<(std::ostream& os, const std::tuple<Args...>& tup) {
+  os << "(";
+  const auto print_all = [&os](const Args&... elements) {
+    std::size_t printed = 0;
+    ((os << elements << (++printed < sizeof...(Args) ? ", " : "")), ...);
+  };
+  std::apply(print_all, tup);
+  return os << ")";
+}
+
+}  // namespace dynampi
diff --git a/include/dynampi/utilities/timer.hpp b/include/dynampi/utilities/timer.hpp
index a7d50c3..6df8f9b 100644
--- a/include/dynampi/utilities/timer.hpp
+++ b/include/dynampi/utilities/timer.hpp
@@ -14,7 +14,7 @@ namespace dynampi {
 
 class Timer {
   std::optional<std::chrono::time_point<std::chrono::high_resolution_clock>> _start_time;
-  std::chrono::duration<double> _elapsed_time{0.0};
+  std::chrono::nanoseconds _elapsed_time{0};
 
  public:
   enum class AutoStart { Yes, No };
@@ -33,14 +33,15 @@ class Timer {
   std::chrono::duration<double> stop() {
     assert(_start_time.has_value() && "Timer not started");
     auto end_time = std::chrono::high_resolution_clock::now();
-    _elapsed_time += end_time - _start_time.value();
+    _elapsed_time +=
+        std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - _start_time.value());
     _start_time.reset();
-    return _elapsed_time;
+    return std::chrono::duration<double>(_elapsed_time);
   }
 
   void reset(AutoStart auto_start = AutoStart::Yes) {
     _start_time.reset();
-    _elapsed_time = std::chrono::duration<double>(0.0);
+    _elapsed_time = std::chrono::nanoseconds{0};
     if (auto_start == AutoStart::Yes) {
       start();
     }
@@ -48,9 +49,12 @@ class Timer {
 
   [[nodiscard]] std::chrono::duration<double> elapsed() const {
     if (_start_time.has_value()) {
-      return _elapsed_time + (std::chrono::high_resolution_clock::now() - _start_time.value());
+      auto current_elapsed =
+          _elapsed_time + std::chrono::duration_cast<std::chrono::nanoseconds>(
+                              std::chrono::high_resolution_clock::now() - _start_time.value());
+      return std::chrono::duration<double>(current_elapsed);
     }
-    return _elapsed_time;
+    return std::chrono::duration<double>(_elapsed_time);
   }
 
   friend std::ostream& operator<<(std::ostream& os, const Timer& timer) {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ac05872..aefda3f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -79,6 +79,11 @@ endif()
 
 # --------------------------------------------------------------------
 # Add MPI tests with varying ranks
+set(DYNAMPI_MAX_MPI_RANK 64 CACHE STRING "Max MPI ranks for ctest MPI runs")
+if(DEFINED ENV{DYNAMPI_MAX_MPI_RANK})
+  set(DYNAMPI_MAX_MPI_RANK "$ENV{DYNAMPI_MAX_MPI_RANK}")
+endif()
+
 function(add_mpi_test test_name num_procs)
   set(command
       ${MPIEXEC_EXECUTABLE}
@@ -91,6 +96,10 @@ function(add_mpi_test test_name num_procs)
   add_test(NAME ${test_name} COMMAND ${command})
 endfunction()
 
-foreach(rank 1 2 3 4 8)
+foreach(rank 1 2 3 4 8 16 64)
+  if(rank GREATER DYNAMPI_MAX_MPI_RANK)
+    message(STATUS "Skipping mpi_test_${rank}_rank (rank > ${DYNAMPI_MAX_MPI_RANK})")
+  else()
   add_mpi_test(mpi_test_${rank}_rank ${rank})
+  endif()
 endforeach()
diff --git a/test/lsan.supp b/test/lsan.supp
index 3c24107..4e68452 100644
--- a/test/lsan.supp
+++ b/test/lsan.supp
@@ -3,6 +3,20 @@
 leak:ompi_op_base_op_select
 leak:ompi_mpi_init
 leak:ompi_mpi_finalize
+leak:ompi_comm_init_mpi3
+leak:pmix_hash_fetch
+leak:opal_reachable_allocate
+leak:pmix_pointer_array_list
+leak:fetch_nodeinfo
+leak:mca_btl_tcp_proc_create
+leak:opal_vasprintf
+leak:PMIx_Value_create
+leak:fetch_appinfo
+leak:fetch_sessioninfo
+leak:avx_component_op_query
+leak:pmix_pointer_array_init
+leak:pmix_bfrops_base_copy_value
+leak:hwloc__add_info
 leak:orte_finalize
 leak:libevent_core
 leak:event_base_loop
diff --git a/test/mpi/test_distributers.cpp b/test/mpi/test_distributers.cpp
index 4b666f2..7efa4e9 100644
--- a/test/mpi/test_distributers.cpp
+++ b/test/mpi/test_distributers.cpp
@@ -6,26 +6,65 @@
 #include <gtest/gtest.h>
 #include <mpi.h>
 
+#include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <dynampi/dynampi.hpp>
+#include <type_traits>
 #include <vector>
 
 #include "dynampi/impl/hierarchical_distributor.hpp"
 #include "dynampi/mpi/mpi_communicator.hpp"
 #include "mpi_test_environment.hpp"
 
-// --- Configuration Wrapper ---
-template <template <typename, typename, typename...> class DistributorT, bool ImmediateRecv = false>
-struct TestConfig {
+template <template <typename...> class Template, typename T>
+struct is_specialization_of : std::false_type {};
+
+template <template <typename...> class Template, typename... Args>
+struct is_specialization_of<Template, Template<Args...>> : std::true_type {};
+
+template <template <typename, typename, typename...> class TT>
+struct DistributerTypeWrapper {
+  template <typename TaskT, typename ResultT, typename... Options>
+  using type = TT<TaskT, ResultT, Options...>;
+
+  static constexpr bool use_immediate_recv = false;
+  static constexpr size_t max_result_size = 1024;
+
+  template <typename TaskT, typename ResultT, typename... Options>
+  static typename TT<TaskT, ResultT, Options...>::Config get_config() {
+    return typename TT<TaskT, ResultT, Options...>::Config{};
+  }
+};
+
+// Specialized wrapper for HierarchicalMPIWorkDistributor with coordinator_per_node config
+template <bool CoordinatorPerNode>
+struct HierarchicalDistributerTypeWrapper {
   template <typename TaskT, typename ResultT, typename... Options>
-  using type = DistributorT<TaskT, ResultT, Options...>;
+  using type = dynampi::HierarchicalMPIWorkDistributor<TaskT, ResultT, Options...>;
 
-  static constexpr bool use_immediate_recv = ImmediateRecv;
+  static constexpr bool use_immediate_recv = false;
   static constexpr size_t max_result_size = 1024;
+
+  template <typename TaskT, typename ResultT, typename... Options>
+  static typename dynampi::HierarchicalMPIWorkDistributor<TaskT, ResultT, Options...>::Config
+  get_config() {
+    typename dynampi::HierarchicalMPIWorkDistributor<TaskT, ResultT, Options...>::Config config;
+    config.coordinator_per_node = CoordinatorPerNode;
+    return config;
+  }
 };
 
-// --- Unified Test Fixture ---
+// Helper to get config from wrapper
+template <typename Wrapper, typename TaskT, typename ResultT, typename... Options>
+auto get_distributer_config() {
+  return Wrapper::template get_config<TaskT, ResultT, Options...>();
+}
+
+template <typename Wrapper, typename... T>
+using DistributerOf = typename Wrapper::template type<T...>;
+
+// Test fixture
 template <typename T>
 class DynamicDistribution : public ::testing::Test {
  protected:
@@ -49,103 +88,283 @@ class DynamicDistribution : public ::testing::Test {
   }
 };
 
-using DistributerTypes =
-    ::testing::Types<TestConfig<dynampi::NaiveMPIWorkDistributor, false>,
-                     TestConfig<dynampi::NaiveMPIWorkDistributor, true>,
-                     TestConfig<dynampi::HierarchicalMPIWorkDistributor, false>>;
+using DistributerTypes = ::testing::Types<DistributerTypeWrapper<dynampi::NaiveMPIWorkDistributor>,
+                                          HierarchicalDistributerTypeWrapper<true>,
+                                          HierarchicalDistributerTypeWrapper<false>>;
 
 TYPED_TEST_SUITE(DynamicDistribution, DistributerTypes);
 
 // --- Tests are now much leaner ---
 
 TYPED_TEST(DynamicDistribution, BasicFlow) {
-  auto worker_task = [](uint32_t task) -> double { return sqrt(static_cast<double>(task)); };
-  auto dist = this->template make_distributor<uint32_t, double>(worker_task);
+  using TaskT = int;
+  using Distributer = DistributerOf<TypeParam, TaskT, double>;
+  auto worker_task = [](TaskT task) -> double { return sqrt(static_cast<double>(task)); };
 
-  if (dist.is_root_manager()) {
-    for (int i = 0; i < 10; ++i) dist.insert_task(i);
-    auto results = dist.finish_remaining_tasks();
-    EXPECT_EQ(results.size(), 10);
+  auto config = get_distributer_config<TypeParam, TaskT, double>();
+  config.comm = MPI_COMM_WORLD;
+  config.auto_run_workers = false;
+  Distributer distributor(worker_task, config);
+
+  EXPECT_EQ(distributor.is_root_manager(), MPIEnvironment::world_comm_rank() == 0);
+
+  if (distributor.is_root_manager()) {
+    for (int i = 0; i < 10; ++i) distributor.insert_task(i);
+  }
+
+  if (distributor.is_root_manager()) {
+    auto results =
+        distributor.run_tasks({.target_num_tasks = 5, .allow_more_than_target_tasks = false});
+    EXPECT_EQ(results.size(), 5);
+    EXPECT_LE(distributor.remaining_tasks_count(), 5);
+    auto second_results = distributor.finish_remaining_tasks();
+    EXPECT_EQ(second_results.size(), 5);
+    EXPECT_EQ(distributor.remaining_tasks_count(), 0);
+    results.insert(results.end(), second_results.begin(), second_results.end());
+    if (!Distributer::ordered) {
+      std::sort(results.begin(), results.end());
+    }
     for (size_t i = 0; i < results.size(); ++i) {
       EXPECT_DOUBLE_EQ(results[i] * results[i], static_cast<double>(i));
     }
   } else {
-    dist.run_worker();
+    distributor.run_worker();
   }
 }
 
-TYPED_TEST(DynamicDistribution, MultiStageTasks) {
-  using Result = std::vector<int>;
-  auto worker_task = [](int task) -> Result { return {task, task * task, task * task * task}; };
-  auto dist = this->template make_distributor<int, Result>(worker_task);
+TYPED_TEST(DynamicDistribution, Naive2) {
+  using DistributerWrapper = TypeParam;
 
-  if (dist.is_root_manager()) {
-    dist.insert_tasks({1, 2, 3, 4, 5});
-    EXPECT_EQ(dist.finish_remaining_tasks().size(), 5);
+  auto worker_task = [](size_t task) -> char { return "Hi"[task]; };
 
-    dist.insert_tasks({6, 7, 8});
-    auto results = dist.finish_remaining_tasks();
-    EXPECT_EQ(results.size(), 8);
-    EXPECT_EQ(results.back(), (Result{8, 64, 512}));
+  auto result = dynampi::mpi_manager_worker_distribution<char, DistributerWrapper::template type>(
+      2, worker_task);
+
+  if (MPIEnvironment::world_comm_rank() == 0) {
+    ASSERT_TRUE(result.has_value());
+    if constexpr (!DistributerWrapper::template type<int, int>::ordered) {
+      std::sort(result->begin(), result->end());
+    }
+    EXPECT_EQ(result.value(), std::vector<char>({'H', 'i'}));
   } else {
-    dist.run_worker();
+    EXPECT_FALSE(result.has_value());
   }
 }
 
-TYPED_TEST(DynamicDistribution, PriorityQueue) {
-  auto worker_task = [](int task) -> int { return task * task; };
-  // Pass the priority option as a template argument to the factory
-  auto dist =
-      this->template make_distributor<int, int, dynampi::enable_prioritization>(worker_task);
+// Exercises manager_rank != 0 for both Naive and Hierarchical (e.g. idx_for_worker branches).
+TYPED_TEST(DynamicDistribution, ManagerRankNonZero) {
+  if (MPIEnvironment::world_comm_size() < 2) {
+    GTEST_SKIP() << "Need at least 2 ranks for non-zero manager rank";
+  }
+  const int manager_rank = 1;
+  using TaskT = int;
+  using ResultT = double;
+  using Distributer = DistributerOf<TypeParam, TaskT, ResultT>;
+  auto worker_task = [](TaskT task) -> ResultT { return sqrt(static_cast<double>(task)); };
+  auto config = get_distributer_config<TypeParam, TaskT, ResultT>();
+  config.comm = MPI_COMM_WORLD;
+  config.auto_run_workers = false;
+  config.manager_rank = manager_rank;
+  Distributer distributor(worker_task, config);
 
-  if (dist.is_root_manager()) {
-    std::vector<std::pair<int, double>> tasks = {{1, 1.0}, {7, 7.0}, {3, 3.0}, {6, 6.0},
-                                                 {2, 2.0}, {4, 5.0}, {5, 4.0}};
-    for (auto& t : tasks) dist.insert_task(t.first, t.second);
+  EXPECT_EQ(distributor.is_root_manager(), MPIEnvironment::world_comm_rank() == manager_rank);
 
-    auto result = dist.finish_remaining_tasks();
-    EXPECT_EQ(result, (std::vector<int>{49, 36, 16, 25, 9, 4, 1}));
+  if (distributor.is_root_manager()) {
+    for (int i = 0; i < 10; ++i) distributor.insert_task(i);
+    auto results = distributor.finish_remaining_tasks();
+    if (!Distributer::ordered) {
+      std::sort(results.begin(), results.end());
+    }
+    EXPECT_EQ(results.size(), 10u);
+    for (size_t i = 0; i < results.size(); ++i) {
+      EXPECT_DOUBLE_EQ(results[i] * results[i], static_cast<double>(i));
+    }
   } else {
-    dist.run_worker();
+    distributor.run_worker();
   }
 }
 
-TYPED_TEST(DynamicDistribution, Statistics) {
-  auto worker_task = [](int task) -> int { return task * task; };
-  using StatsOpt = dynampi::track_statistics<dynampi::StatisticsMode::Detailed>;
-  auto dist = this->template make_distributor<int, int, StatsOpt>(worker_task);
+TYPED_TEST(DynamicDistribution, Example1) {
+  using DistributerWrapper = TypeParam;
 
-  if (dist.is_root_manager()) {
-    dist.insert_tasks({1, 2, 3, 4, 5});
-    auto results = dist.finish_remaining_tasks();
-
-    size_t expected_size = (MPIEnvironment::world_comm_size() == 1) ? 0 : 5;
-    EXPECT_EQ(results, (std::vector<int>{1, 4, 9, 16, 25}));
-    EXPECT_EQ(dist.get_statistics().comm_statistics.send_count, expected_size);
-    EXPECT_EQ(dist.get_statistics().comm_statistics.bytes_sent, expected_size * sizeof(int));
-    EXPECT_EQ(dist.get_statistics().comm_statistics.recv_count,
-              expected_size + MPIEnvironment::world_comm_size() - 1);
-    EXPECT_EQ(dist.get_statistics().comm_statistics.bytes_received, expected_size * sizeof(int));
-
-    dist.finalize();
-    EXPECT_EQ(dist.get_statistics().comm_statistics.send_count,
-              expected_size + MPIEnvironment::world_comm_size() - 1);
-    EXPECT_EQ(dist.get_statistics().comm_statistics.bytes_sent, expected_size * sizeof(int));
-
-    double expected_num_bytes = 0;
-    if (MPIEnvironment::world_comm_size() > 1) {
-      expected_num_bytes = static_cast<double>(expected_size * sizeof(int)) /
-                           (expected_size + MPIEnvironment::world_comm_size() - 1);
+  for (int manager_rank : {0, MPIEnvironment::world_comm_size() - 1}) {
+    auto worker_task = [](size_t task) -> size_t { return task * task; };
+    auto result =
+        dynampi::mpi_manager_worker_distribution<size_t, DistributerWrapper::template type>(
+            4, worker_task, MPI_COMM_WORLD, manager_rank);
+    if (result.has_value()) {
+      if constexpr (!DistributerWrapper::template type<int, int>::ordered) {
+        std::sort(result->begin(), result->end());
+      }
+      EXPECT_EQ(MPIEnvironment::world_comm_rank(), manager_rank);
+      EXPECT_EQ(result, std::vector<size_t>({0, 1, 4, 9}));
     }
-    EXPECT_DOUBLE_EQ(dist.get_statistics().comm_statistics.average_receive_size(),
-                     expected_num_bytes);
-    EXPECT_DOUBLE_EQ(dist.get_statistics().comm_statistics.average_send_size(), expected_num_bytes);
+  }
+}
+
+TYPED_TEST(DynamicDistribution, Example2) {
+  using Task = int;
+  using Result = std::vector<int>;
+  using Distributer = DistributerOf<TypeParam, Task, Result>;
+  if constexpr (is_specialization_of<dynampi::HierarchicalMPIWorkDistributor, Distributer>::value) {
+    GTEST_SKIP() << "This test is not applicable for HierarchicalMPIWorkDistributor.";
   } else {
-    dist.run_worker();
+    auto worker_task = [](Task task) -> Result {
+      return Result{task, task * task, task * task * task};
+    };
+    {
+      auto config = get_distributer_config<TypeParam, Task, Result>();
+      Distributer work_distributer(worker_task, config);
+      if (work_distributer.is_root_manager()) {
+        work_distributer.insert_tasks({1, 2, 3, 4, 5});
+        auto results = work_distributer.finish_remaining_tasks();
+        EXPECT_EQ(results, (std::vector<std::vector<int>>{
+                               {1, 1, 1}, {2, 4, 8}, {3, 9, 27}, {4, 16, 64}, {5, 25, 125}}));
+        work_distributer.insert_tasks({6, 7, 8});
+        results = work_distributer.finish_remaining_tasks();
+        EXPECT_EQ(results,
+                  (std::vector<std::vector<int>>{{6, 36, 216}, {7, 49, 343}, {8, 64, 512}}));
+      }
+    }
+  }
+}
+
+TYPED_TEST(DynamicDistribution, RunTasksMaxTasks) {  // capped batches of 3 and 4, then the rest
+  using Task = int;
+  using Result = int;
+  using Distributer = DistributerOf<TypeParam, Task, Result>;
+
+  auto worker_task = [](Task task) -> Result { return task * 2; };
+
+  auto config = get_distributer_config<TypeParam, Task, Result>();
+  Distributer work_distributer(worker_task, config);
+  if (work_distributer.is_root_manager()) {  // other ranks only construct the distributor here
+    work_distributer.insert_tasks({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+    typename Distributer::RunConfig run_config;
+    run_config.target_num_tasks = 3;
+    run_config.allow_more_than_target_tasks = false;
+    auto results = work_distributer.run_tasks(run_config);
+    EXPECT_EQ(results.size(), 3u);
+
+    run_config.target_num_tasks = 4;
+    auto more_results = work_distributer.run_tasks(run_config);
+    EXPECT_EQ(more_results.size(), 4u);
+
+    auto remaining_results = work_distributer.run_tasks();  // default config: the last 3 of 10
+    EXPECT_EQ(remaining_results.size(), 3u);
+
+    std::vector<int> all_results;
+    all_results.insert(all_results.end(), results.begin(), results.end());
+    all_results.insert(all_results.end(), more_results.begin(), more_results.end());
+    all_results.insert(all_results.end(), remaining_results.begin(), remaining_results.end());
+    std::sort(all_results.begin(), all_results.end());  // compare independent of arrival order
+    EXPECT_EQ(all_results, (std::vector<int>{2, 4, 6, 8, 10, 12, 14, 16, 18, 20}));
+  }
+}
+
+TYPED_TEST(DynamicDistribution, RunTasksMinTasksWithTimeLimit) {
+  using Task = int;
+  using Result = int;
+  using Distributer = DistributerOf<TypeParam, Task, Result>;
+
+  auto worker_task = [](Task task) -> Result { return task * 3; };
+
+  auto config = get_distributer_config<TypeParam, Task, Result>();
+  Distributer work_distributer(worker_task, config);
+  if (work_distributer.is_root_manager()) {
+    work_distributer.insert_tasks({1, 2, 3, 4, 5});
+
+    typename Distributer::RunConfig run_config;
+    run_config.target_num_tasks = 2;
+    run_config.max_seconds = 0.0;  // zero time budget: the run below must return nothing
+    auto results = work_distributer.run_tasks(run_config);
+    EXPECT_EQ(results.size(), 0u);
+
+    auto remaining_results = work_distributer.run_tasks();
+    EXPECT_EQ(results.size() + remaining_results.size(), 5u);  // all 5 tasks still complete
+  }
+}
+
+TYPED_TEST(DynamicDistribution, PriorityQueue) {
+  using Task = int;
+  using Result = int;
+  using Distributer = DistributerOf<TypeParam, Task, Result, dynampi::enable_prioritization>;
+  if (!Distributer::ordered) {
+    GTEST_SKIP()
+        << "This test requires ordered results, which is not supported by this distributer.";
+  }
+  auto worker_task = [](Task task) -> Result { return task * task; };
+  {
+    auto config = get_distributer_config<TypeParam, Task, Result, dynampi::enable_prioritization>();
+    Distributer work_distributer(worker_task, config);
+    if (work_distributer.is_root_manager()) {
+      work_distributer.insert_task(1, 1.0);  // args: task, then presumably its priority
+      work_distributer.insert_task(7, 7.0);
+      work_distributer.insert_task(3, 3.0);
+      work_distributer.insert_task(6, 6.0);
+      work_distributer.insert_task(2, 2.0);
+      work_distributer.insert_task(4, 5.0);  // 4 and 5 swap places: second argument decides
+      work_distributer.insert_task(5, 4.0);
+      auto result = work_distributer.finish_remaining_tasks();  // squares, largest 2nd arg first
+      EXPECT_EQ(result, (std::vector<int>{49, 36, 16, 25, 9, 4, 1}));
+    }
+  }
+}
+
+TYPED_TEST(DynamicDistribution, Statistics) {
+  using Task = int;
+  using Result = int;
+  using Distributer = DistributerOf<TypeParam, Task, Result,
+                                    dynampi::track_statistics<dynampi::StatisticsMode::Detailed>>;
+  auto worker_task = [](Task task) -> Result { return task * task; };
+  {
+    auto config =
+        get_distributer_config<TypeParam, Task, Result,
+                               dynampi::track_statistics<dynampi::StatisticsMode::Detailed>>();
+    Distributer work_distributer(worker_task, config);
+    if (work_distributer.is_root_manager()) {
+      work_distributer.insert_tasks({1, 2, 3, 4, 5});
+      auto results = work_distributer.finish_remaining_tasks();
+      size_t expected_size = 5;
+      if (MPIEnvironment::world_comm_size() == 1) {
+        expected_size = 0;
+      }
+      if constexpr (!Distributer::ordered) {
+        std::sort(results.begin(), results.end());
+      }
+      EXPECT_EQ(results, (std::vector<int>{1, 4, 9, 16, 25}));
+      EXPECT_EQ(work_distributer.get_statistics().comm_statistics.bytes_sent,
+                expected_size * sizeof(int));
+      if constexpr (is_specialization_of<dynampi::NaiveMPIWorkDistributor, Distributer>::value) {
+        EXPECT_EQ(work_distributer.get_statistics().comm_statistics.send_count, expected_size);
+        EXPECT_EQ(work_distributer.get_statistics().comm_statistics.recv_count,
+                  expected_size + MPIEnvironment::world_comm_size() - 1);
+        EXPECT_EQ(work_distributer.get_statistics().comm_statistics.bytes_received,
+                  expected_size * sizeof(int));
+      }
+      work_distributer.finalize();
+      EXPECT_EQ(work_distributer.get_statistics().comm_statistics.bytes_sent,
+                expected_size * sizeof(int));
+      if constexpr (is_specialization_of<dynampi::NaiveMPIWorkDistributor, Distributer>::value) {
+        EXPECT_EQ(work_distributer.get_statistics().comm_statistics.send_count,
+                  expected_size + MPIEnvironment::world_comm_size() - 1);
+        double expected_num_bytes = 0;
+        if (MPIEnvironment::world_comm_size() > 1) {
+          expected_num_bytes = static_cast<double>(expected_size * sizeof(int)) /
+                               (expected_size + MPIEnvironment::world_comm_size() - 1);
+        }
+        EXPECT_DOUBLE_EQ(work_distributer.get_statistics().comm_statistics.average_receive_size(),
+                         expected_num_bytes);
+        EXPECT_DOUBLE_EQ(work_distributer.get_statistics().comm_statistics.average_send_size(),
+                         expected_num_bytes);
+      }
+    }
   }
 }
 
 TYPED_TEST(DynamicDistribution, AutoRunWorkers) {
+  using Distributer = DistributerOf<TypeParam, int, int>;
   auto worker_task = [](int task) -> int { return task * task; };
   // Test with auto_run_workers = true - workers should start automatically
   auto dist = this->template make_distributor<int, int>(worker_task, true);
@@ -154,6 +373,9 @@ TYPED_TEST(DynamicDistribution, AutoRunWorkers) {
     // Workers should already be running, so we can just insert tasks
     dist.insert_tasks({1, 2, 3, 4, 5});
     auto results = dist.finish_remaining_tasks();
+    if constexpr (!Distributer::ordered) {
+      std::sort(results.begin(), results.end());
+    }
     EXPECT_EQ(results, (std::vector<int>{1, 4, 9, 16, 25}));
   }
   // Workers run automatically in constructor, no need to call run_worker()
diff --git a/test/mpi/test_mpi_wrapper.cpp b/test/mpi/test_mpi_wrapper.cpp
index 0d09252..00c896f 100644
--- a/test/mpi/test_mpi_wrapper.cpp
+++ b/test/mpi/test_mpi_wrapper.cpp
@@ -13,6 +13,8 @@
 #include "dynampi/mpi/mpi_communicator.hpp"
 
 TEST(MPI, ErrorCheck) {
+  MPI_Errhandler previous_handler = MPI_ERRHANDLER_NULL;
+  MPI_Comm_get_errhandler(MPI_COMM_WORLD, &previous_handler);
   MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
   EXPECT_THROW(DYNAMPI_MPI_CHECK(MPI_Comm_rank, (MPI_COMM_NULL, nullptr)), std::runtime_error);
   try {
@@ -20,6 +22,9 @@ TEST(MPI, ErrorCheck) {
   } catch (const std::runtime_error &e) {
     EXPECT_TRUE(std::string(e.what()).find("MPI error in MPI_Comm_rank") != std::string::npos);
   }
+  if (previous_handler != MPI_ERRHANDLER_NULL) {
+    MPI_Comm_set_errhandler(MPI_COMM_WORLD, previous_handler);
+  }
 }
 
 TEST(MPICommunicatorWrapper, RankAndSizeCastGet) {
@@ -62,10 +67,14 @@ TEST(MPICommunicatorWrapper, SendRecvAndStatistics) {
       EXPECT_EQ(stats.send_count, 1);
       EXPECT_EQ(stats.bytes_sent, sizeof(int));
       EXPECT_EQ(stats.recv_count, 0);
+      EXPECT_EQ(stats.average_send_size(), sizeof(int));
+      EXPECT_EQ(stats.average_receive_size(), 0.0);
     } else if (rank == 1) {
       EXPECT_EQ(stats.recv_count, 1);
       EXPECT_EQ(stats.bytes_received, sizeof(int));
       EXPECT_EQ(stats.send_count, 0);
+      EXPECT_EQ(stats.average_receive_size(), sizeof(int));
+      EXPECT_EQ(stats.average_send_size(), 0.0);
     } else {
       EXPECT_EQ(stats.send_count, 0);
       EXPECT_EQ(stats.recv_count, 0);
@@ -149,6 +158,7 @@ TEST(MPICommunicatorWrapper, SplitByNode) {
   }
   node.broadcast(root_node_name, 0);
   EXPECT_EQ(root_node_name, node_name);
+  MPI_Barrier(MPI_COMM_WORLD);
 }
 
 TEST(MPICommunicatorWrapper, RecvEmptyMessage) {
diff --git a/test/unit/test_assert.cpp b/test/unit/test_assert.cpp
new file mode 100644
index 0000000..34bd3e3
--- /dev/null
+++ b/test/unit/test_assert.cpp
@@ -0,0 +1,124 @@
+/*
+ * SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <gtest/gtest.h>
+#include <mpi.h>
+
+#include <stdexcept>
+
+#ifndef NDEBUG
+// Debug builds only: replace MPI_Comm_rank, for all code after this point, with the stub below.
+#define MPI_Comm_rank DYNAMPI_TEST_MPI_Comm_rank
+static inline int DYNAMPI_TEST_MPI_Comm_rank(MPI_Comm /*comm*/, int* rank) {
+  if (rank) *rank = 0;  // always reports rank 0; a null out-pointer is tolerated
+  return MPI_SUCCESS;   // never fails and makes no real MPI call
+}
+#endif
+
+// Include the header-under-test AFTER redefining MPI_Comm_rank.
+#include <dynampi/utilities/assert.hpp>
+
+using namespace dynampi;
+
+// Runs `f` with stderr captured; returns {captured text, whether `f` threw anything}.
+namespace {
+template <typename F>
+std::pair<std::string, bool> CaptureStderrAndDidThrow(F&& f) {
+  bool did_throw = false;
+  testing::internal::CaptureStderr();
+  try {
+    f();
+  } catch (...) {
+    did_throw = true;
+  }
+  // The capture is closed here on both paths, whether or not `f` threw.
+  return {testing::internal::GetCapturedStderr(), did_throw};
+}
+}  // namespace
+
+TEST(OptionalString, NoArgsReturnsNullopt) {
+#ifdef NDEBUG
+  GTEST_SKIP() << "OptionalString only exists in non-NDEBUG builds.";
+#else
+  const auto joined = OptionalString();  // called with no arguments at all
+  EXPECT_FALSE(joined.has_value());
+#endif
+}
+
+TEST(OptionalString, ConcatsMultipleArgs) {
+#ifdef NDEBUG
+  GTEST_SKIP() << "OptionalString only exists in non-NDEBUG builds.";
+#else
+  const auto joined = OptionalString("Value=", 42, ", ok");
+  ASSERT_TRUE(joined.has_value());  // fatal on failure: the dereference below needs a value
+  EXPECT_EQ(*joined, "Value=42, ok");
+#endif
+}
+
+TEST(DynaMPIAssert, TrueConditionDoesNotThrow) {
+#ifdef NDEBUG
+  EXPECT_NO_THROW({ DYNAMPI_ASSERT(false, "no-op in NDEBUG"); });  // expected to be inert
+#else
+  const int a1 = 1, b1 = 1;
+  EXPECT_NO_THROW({ DYNAMPI_ASSERT(a1 == b1, "should not throw"); });
+#endif
+}
+
+TEST(DynaMPIAssert, NoAssertInDestructorDuringThrow) {
+#ifdef NDEBUG
+  GTEST_SKIP() << "DYNAMPI_ASSERT in destructor is a no-op in NDEBUG builds.";
+#else
+  // `probe` is destroyed while the logic_error unwinds; that exception must still propagate.
+  struct Probe {
+    ~Probe() { DYNAMPI_ASSERT(false, "Destructor should not assert if already throwing"); }
+  };
+
+  EXPECT_THROW(
+      {
+        Probe probe;
+        throw std::logic_error("Test exception");
+      },
+      std::logic_error);
+#endif
+}
+
+TEST(DynaMPIAssert, FalseConditionThrowsAndPrints) {
+#ifdef NDEBUG
+  GTEST_SKIP() << "DYNAMPI_ASSERT is a no-op in NDEBUG builds.";
+#else
+  int a1 = 1, b2 = 2;  // keep these names: the condition text "a1 == b2" is matched below
+  auto [err, threw] = CaptureStderrAndDidThrow([=] { DYNAMPI_ASSERT(a1 == b2, "custom message"); });
+  EXPECT_TRUE(threw);
+  EXPECT_NE(err.find("DynaMPI assertion failed"), std::string::npos);  // failure banner
+  EXPECT_NE(err.find("a1 == b2"), std::string::npos);  // stringified condition
+  EXPECT_NE(err.find("custom message"), std::string::npos);  // caller-supplied text
+  EXPECT_NE(err.find("rank "), std::string::npos);  // rank label (the stub reports rank 0)
+#endif
+}
+
+// ---------- Binary-op helpers ----------
+TEST(DynaMPIAssertBinOp, EqFailureShowsValuesAndNegatedOp) {
+#ifdef NDEBUG
+  GTEST_SKIP() << "DYNAMPI_ASSERT_* are no-ops in NDEBUG builds.";
+#else
+  const int a1 = 1, b2 = 2;
+  auto [err, threw] = CaptureStderrAndDidThrow([=] { DYNAMPI_ASSERT_EQ(a1, b2, "boom"); });
+  EXPECT_TRUE(threw);
+  EXPECT_NE(err.find("1 != 2"), std::string::npos);  // values with the negated operator
+  EXPECT_NE(err.find("boom"), std::string::npos);    // caller-supplied text
+  EXPECT_NE(err.find("DynaMPI assertion failed"), std::string::npos);
+#endif
+}
+
+TEST(DynaMPIAssertBinOp, GeAndLtPassWithoutThrow) {
+#ifndef NDEBUG
+  int a5 = 5, b5 = 5, c4 = 4;
+  EXPECT_NO_THROW({ DYNAMPI_ASSERT_GE(a5, b5, "ok"); });  // 5 >= 5 holds
+  EXPECT_NO_THROW({ DYNAMPI_ASSERT_LT(c4, a5, "ok"); });  // 4 < 5 holds
+#else
+  EXPECT_NO_THROW({ DYNAMPI_ASSERT_GE(c4, a5, "no-op"); });  // c4/a5 are undeclared here
+  EXPECT_NO_THROW({ DYNAMPI_ASSERT_LT(can write, anything here); });  // args are not valid C++
+#endif
+}
diff --git a/test/unit/test_printing.cpp b/test/unit/test_printing.cpp
new file mode 100644
index 0000000..8792eeb
--- /dev/null
+++ b/test/unit/test_printing.cpp
@@ -0,0 +1,98 @@
+/*
+ * SPDX-FileCopyrightText: 2025 QDX Technologies. Authored by Ryan Stocks <ryan.stocks00@gmail.com>
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include <array>
+#include <cstddef>
+#include <dynampi/utilities/printing.hpp>
+#include <optional>
+#include <set>
+#include <span>
+#include <sstream>
+#include <tuple>
+#include <vector>
+
+using namespace dynampi;
+
+namespace {
+// Streams `value` through operator<< and returns the text that was produced.
+template <typename T>
+std::string to_str(const T& value) {
+  std::ostringstream buffer;
+  buffer << value;
+  return buffer.str();
+}
+}  // namespace
+
+TEST(ByteOstreamTest, PrintsAsUnsignedInteger) {
+  // Each byte is expected to stream as its unsigned decimal value.
+  const std::byte lowest{0};
+  const std::byte highest{255};
+  const std::byte typical{42};
+  EXPECT_EQ(to_str(lowest), "0");
+  EXPECT_EQ(to_str(highest), "255");
+  EXPECT_EQ(to_str(typical), "42");
+}
+
+TEST(SpanOstreamTest, EmptySpanPrintsBrackets) {
+  const std::vector<int> nothing;
+  const std::span<const int> view(nothing);  // zero-length view over the empty vector
+  EXPECT_EQ(to_str(view), "[]");
+}
+
+TEST(SpanOstreamTest, PrintsCommaSeparatedWithBrackets) {
+  const int values[] = {1, 2, 3};
+  const std::span<const int> view(values);  // covers all three array elements
+  EXPECT_EQ(to_str(view), "[1, 2, 3]");
+}
+
+TEST(VectorOstreamTest, DelegatesToSpanFormat) {
+  const std::vector<int> values = {4, 5, 6};
+  EXPECT_EQ(to_str(values), "[4, 5, 6]");  // same bracketed, comma-separated form as a span
+}
+
+TEST(ArrayOstreamTest, PrintsCommaSeparatedWithBrackets) {
+  const std::array<int, 3> values = {7, 8, 9};
+  EXPECT_EQ(to_str(values), "[7, 8, 9]");  // square brackets, ", " between elements
+}
+
+TEST(SetOstreamTest, PrintsInAscendingOrderWithBraces) {
+  const std::set<int> values = {5, 1, 3};  // inserted out of order on purpose
+  EXPECT_EQ(to_str(values), "{1, 3, 5}");
+}
+
+TEST(PairOstreamTest, PrintsInParensWithCommaSpace) {
+  const auto entry = std::pair<int, std::string>(10, "foo");
+  EXPECT_EQ(to_str(entry), "(10, foo)");  // the string member is printed without quotes
+}
+
+TEST(OptionalOstreamTest, PrintsSome) {
+  const std::optional<int> present{123};  // engaged optional
+  EXPECT_EQ(to_str(present), "Some(123)");
+}
+
+TEST(OptionalOstreamTest, PrintsNone) {
+  const std::optional<int> absent = std::nullopt;  // disengaged optional
+  EXPECT_EQ(to_str(absent), "None");
+}
+
+TEST(NestedContainersTest, WorksRecursively) {
+  // Outer vector is expected in brackets, each inner set in braces with sorted elements.
+  const std::vector<std::set<int>> nested = {{2, 1}, {4, 3}};
+  EXPECT_EQ(to_str(nested), "[{1, 2}, {3, 4}]");
+}
+
+TEST(SpanOfBytesTest, PrintsNumericBytes) {
+  // Same four byte values as 0x00, 0x0A, 0x7F and 0xFF, written in decimal.
+  const std::array<std::byte, 4> raw{std::byte{0}, std::byte{10}, std::byte{127}, std::byte{255}};
+  const std::span<const std::byte> view(raw);
+  EXPECT_EQ(to_str(view), "[0, 10, 127, 255]");
+}
+
+TEST(TupleOstreamTest, PrintsCommaSeparatedInParensNoTrailingComma) {
+  const std::tuple<int, std::string, int> triple{1, "x", 3};
+  EXPECT_EQ(to_str(triple), "(1, x, 3)");  // no comma after the last element
+}