Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions benchmarks/expt/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@ if (CHAI_ENABLE_HIP)
endif ()

if (CHAI_ENABLE_CUDA OR CHAI_ENABLE_HIP)
blt_add_executable(
NAME DualArrayManagerBenchmarks
SOURCES DualArrayManagerBenchmarks.cpp
DEPENDS_ON ${chai_expt_benchmark_depends})

blt_add_benchmark(
NAME DualArrayManagerBenchmarks
COMMAND DualArrayManagerBenchmarks)

blt_add_executable(
NAME UnifiedArrayManagerBenchmarks
SOURCES UnifiedArrayManagerBenchmarks.cpp
Expand Down
206 changes: 206 additions & 0 deletions benchmarks/expt/DualArrayManagerBenchmarks.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
//////////////////////////////////////////////////////////////////////////////
// Copyright (c) Lawrence Livermore National Security, LLC and other CHAI
// contributors. See the CHAI LICENSE and COPYRIGHT files for details.
//
// SPDX-License-Identifier: BSD-3-Clause
//////////////////////////////////////////////////////////////////////////////

#include "benchmark/benchmark.h"

#include <cstddef>

#include "chai/expt/ContextGuard.hpp"
#include "chai/expt/DualArrayManager.hpp"

namespace {
using ::chai::expt::Context;
using ::chai::expt::ContextGuard;
using ::chai::expt::ContextManager;
using ::chai::expt::DualArrayManager;

static void DualArrayManager_DataSequence(benchmark::State& state,
Context initial_context,
bool initial_touch,
Context call_context,
bool call_touch)
{
const auto size = static_cast<std::size_t>(state.range(0));

for (auto _ : state)
{
state.PauseTiming();

auto& contextManager = ContextManager::getInstance();
contextManager.reset();

DualArrayManager<int> manager{size};
{
ContextGuard guard{initial_context};
int* initial_data = manager.data(/*touch=*/initial_touch);
benchmark::DoNotOptimize(initial_data);
}

{
ContextGuard guard{call_context};
state.ResumeTiming(); // measure only the data() call
int* data = manager.data(/*touch=*/call_touch);
benchmark::DoNotOptimize(data);
state.PauseTiming(); // exclude guard destruction
}
}

state.SetItemsProcessed(state.iterations());
state.SetBytesProcessed(state.iterations() * size * sizeof(int));
}

static void DualArrayManager_DefaultConstruct(benchmark::State& state)
{
for (auto _ : state)
{
DualArrayManager<int> manager{};
benchmark::DoNotOptimize(manager);
}
}

BENCHMARK(DualArrayManager_DefaultConstruct);

static void DualArrayManager_SizeConstruct(benchmark::State& state)
{
const auto size = static_cast<std::size_t>(state.range(0));

for (auto _ : state)
{
DualArrayManager<int> manager{size};
benchmark::DoNotOptimize(manager);
}

state.SetBytesProcessed(state.iterations() * size * sizeof(int));
}

BENCHMARK(DualArrayManager_SizeConstruct)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20);

static void DualArrayManager_AfterHostRead_DataHostRead(benchmark::State& state)
{
DualArrayManager_DataSequence(state,
/*initial_context=*/Context::HOST,
/*initial_touch=*/false,
/*call_context=*/Context::HOST,
/*call_touch=*/false);
}

BENCHMARK(DualArrayManager_AfterHostRead_DataHostRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void DualArrayManager_AfterHostWrite_DataHostRead(benchmark::State& state)
{
DualArrayManager_DataSequence(state,
/*initial_context=*/Context::HOST,
/*initial_touch=*/true,
/*call_context=*/Context::HOST,
/*call_touch=*/false);
}

BENCHMARK(DualArrayManager_AfterHostWrite_DataHostRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void DualArrayManager_AfterHostRead_DataDeviceRead(benchmark::State& state)
{
DualArrayManager_DataSequence(state,
/*initial_context=*/Context::HOST,
/*initial_touch=*/false,
/*call_context=*/Context::DEVICE,
/*call_touch=*/false);
}

BENCHMARK(DualArrayManager_AfterHostRead_DataDeviceRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void DualArrayManager_AfterHostWrite_DataDeviceRead(benchmark::State& state)
{
DualArrayManager_DataSequence(state,
/*initial_context=*/Context::HOST,
/*initial_touch=*/true,
/*call_context=*/Context::DEVICE,
/*call_touch=*/false);
}

BENCHMARK(DualArrayManager_AfterHostWrite_DataDeviceRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void DualArrayManager_AfterDeviceRead_DataHostRead(benchmark::State& state)
{
DualArrayManager_DataSequence(state,
/*initial_context=*/Context::DEVICE,
/*initial_touch=*/false,
/*call_context=*/Context::HOST,
/*call_touch=*/false);
}

BENCHMARK(DualArrayManager_AfterDeviceRead_DataHostRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void DualArrayManager_AfterDeviceWrite_DataHostRead(benchmark::State& state)
{
DualArrayManager_DataSequence(state,
/*initial_context=*/Context::DEVICE,
/*initial_touch=*/true,
/*call_context=*/Context::HOST,
/*call_touch=*/false);
}

BENCHMARK(DualArrayManager_AfterDeviceWrite_DataHostRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void DualArrayManager_AfterDeviceRead_DataDeviceRead(benchmark::State& state)
{
DualArrayManager_DataSequence(state,
/*initial_context=*/Context::DEVICE,
/*initial_touch=*/false,
/*call_context=*/Context::DEVICE,
/*call_touch=*/false);
}

BENCHMARK(DualArrayManager_AfterDeviceRead_DataDeviceRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void DualArrayManager_AfterDeviceWrite_DataDeviceRead(benchmark::State& state)
{
DualArrayManager_DataSequence(state,
/*initial_context=*/Context::DEVICE,
/*initial_touch=*/true,
/*call_context=*/Context::DEVICE,
/*call_touch=*/false);
}

BENCHMARK(DualArrayManager_AfterDeviceWrite_DataDeviceRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);
} // namespace

BENCHMARK_MAIN();
27 changes: 26 additions & 1 deletion benchmarks/expt/UnifiedArrayManagerBenchmarks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace {
Context call_context,
bool call_touch)
{
constexpr std::size_t size = 1;
const auto size = static_cast<std::size_t>(state.range(0));

for (auto _ : state)
{
Expand All @@ -50,6 +50,7 @@ namespace {
}

state.SetItemsProcessed(state.iterations());
state.SetBytesProcessed(state.iterations() * size * sizeof(int));
}

static void UnifiedArrayManager_DefaultConstruct(benchmark::State& state)
Expand Down Expand Up @@ -91,6 +92,9 @@ namespace {
}

BENCHMARK(UnifiedArrayManager_AfterHostRead_DataHostRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void UnifiedArrayManager_AfterHostWrite_DataHostRead(benchmark::State& state)
Expand All @@ -103,6 +107,9 @@ namespace {
}

BENCHMARK(UnifiedArrayManager_AfterHostWrite_DataHostRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void UnifiedArrayManager_AfterHostRead_DataDeviceRead(benchmark::State& state)
Expand All @@ -115,6 +122,9 @@ namespace {
}

BENCHMARK(UnifiedArrayManager_AfterHostRead_DataDeviceRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void UnifiedArrayManager_AfterHostWrite_DataDeviceRead(benchmark::State& state)
Expand All @@ -127,6 +137,9 @@ namespace {
}

BENCHMARK(UnifiedArrayManager_AfterHostWrite_DataDeviceRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void UnifiedArrayManager_AfterDeviceRead_DataHostRead(benchmark::State& state)
Expand All @@ -139,6 +152,9 @@ namespace {
}

BENCHMARK(UnifiedArrayManager_AfterDeviceRead_DataHostRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void UnifiedArrayManager_AfterDeviceWrite_DataHostRead(benchmark::State& state)
Expand All @@ -151,6 +167,9 @@ namespace {
}

BENCHMARK(UnifiedArrayManager_AfterDeviceWrite_DataHostRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void UnifiedArrayManager_AfterDeviceRead_DataDeviceRead(benchmark::State& state)
Expand All @@ -163,6 +182,9 @@ namespace {
}

BENCHMARK(UnifiedArrayManager_AfterDeviceRead_DataDeviceRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);

static void UnifiedArrayManager_AfterDeviceWrite_DataDeviceRead(benchmark::State& state)
Expand All @@ -175,6 +197,9 @@ namespace {
}

BENCHMARK(UnifiedArrayManager_AfterDeviceWrite_DataDeviceRead)
->Arg(0)
->RangeMultiplier(8)
->Range(1, 1 << 20)
->Unit(benchmark::kNanosecond);
} // namespace

Expand Down
59 changes: 59 additions & 0 deletions docs/sphinx/expt/design.rst
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,62 @@ of each element on the host (numeric types are initialized to zero).
const int* p = a.data(false);
// Read through p...
}

----------------
DualArrayManager
----------------

``DualArrayManager`` manages separate host and device allocations and keeps them
coherent explicitly. Unlike ``UnifiedArrayManager``, it does not rely on a
single unified-memory pointer that is valid in both contexts. Instead, it
allocates storage lazily in the requested context and copies data between the
two allocations when the authoritative copy lives in the other context.

``DualArrayManager`` relies on :ref:`ContextManager <experimental_design>` in
the same way as ``UnifiedArrayManager``:

- ``data(touch=false)`` returns a pointer suitable for read access in the
current context and synchronizes with the most recent modifying context when
needed.
- ``data(touch=true)`` indicates the caller will modify the array in the current
context; it performs any required synchronization first, then records the
current context as authoritative.

``DualArrayManager`` also performs value initialization of each element when a
new allocation is created. Since it uses raw host/device allocations and
byte-wise copies between them, it is intended for trivially copyable element
types.

This manager is useful when an application wants explicit mirrored allocations
in host and device memory rather than unified memory.

.. code-block:: cpp

#include "chai/expt/Context.hpp"
#include "chai/expt/ContextGuard.hpp"
#include "chai/expt/DualArrayManager.hpp"

const std::size_t N = 1000000;
::chai::expt::DualArrayManager<int> a{N};

{
::chai::expt::ContextGuard guard{::chai::expt::Context::HOST};
int* p = a.data(true);
for (std::size_t i = 0; i < N; ++i) {
p[i] = static_cast<int>(i);
}
}

{
::chai::expt::ContextGuard guard{::chai::expt::Context::DEVICE};
int* p = a.data(true);
// If the most recent modification was on HOST, this call copies to DEVICE first.
// Launch a CUDA/HIP kernel that writes through p...
}

{
::chai::expt::ContextGuard guard{::chai::expt::Context::HOST};
// If the most recent modification was on DEVICE, this call synchronizes and copies first.
const int* p = a.data(false);
// Read through p...
}
1 change: 1 addition & 0 deletions src/chai/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ if(CHAI_ENABLE_EXPERIMENTAL)
expt/Context.hpp
expt/ContextGuard.hpp
expt/ContextManager.hpp
expt/DualArrayManager.hpp
expt/HostArrayManager.hpp
expt/HostSharedPointer.hpp
expt/ManagedArrayPointer.hpp
Expand Down
Loading