Skip to content

Commit

Permalink
Implement cudax::async_buffer (#3460)
Browse files Browse the repository at this point in the history
* Make cudax depend on thrust

* Implement `cudax::heterogeneous_iterator`

* Implement `cudax::async_vector`

This implements `cudax::async_vector` a contiguous container based on async memory resources.

In contrast to `std::async_vector` it is templated on a set of properties and accepts any resource that satisfies these properties.

That makes it suitable for heterogeneous systems, where we need to account for different execution spaces.

* Add an example on how to use cudax::async_vector

* Simplify vector to a buffer

* Drop assignment

* Add `get` and `get_unsynchonized` methods

* Is fake resource_ref to avoid deep copy

* Add `copy_to` method to transfer memory

* Fix spelling

* Drop old dialect checks

* Address review comments

* Rename to `make_async_buffer`

* Fix comparison

* Address review comments

* Properly wait before accessing pointers

* fixup! Address review comments

* Drop superfluous `__Copy_same`

* We need to synchronize after the host_launch 🤷

* Add comments

* Simplify the implementation and add another comment

* Address review comments

* Fix concept issue with ==

* Update cudax/include/cuda/experimental/__container/async_buffer.cuh

Co-authored-by: Eric Niebler <[email protected]>

* Address review comments

* Address review comments on `heterogeneous_iterator`

* Ensure that we are on the right stream for thrust calls

* Make doxygen happy

---------

Co-authored-by: Eric Niebler <[email protected]>
  • Loading branch information
miscco and ericniebler authored Feb 28, 2025
1 parent 52ed31c commit b048cb7
Show file tree
Hide file tree
Showing 28 changed files with 3,445 additions and 25 deletions.
6 changes: 4 additions & 2 deletions cudax/cmake/cudaxBuildCompilerTargets.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
# be linked into the developer build targets, as they include both
# cudax.compiler_interface and cccl.compiler_interface_cppXX.

find_package(libcudacxx ${cudax_VERSION} EXACT CONFIG REQUIRED
find_package(Thrust ${cudax_VERSION} EXACT CONFIG REQUIRED
NO_DEFAULT_PATH # Only check the explicit path in HINTS:
HINTS "${CCCL_SOURCE_DIR}/lib/cmake/libcudacxx/"
HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/"
)

function(cudax_build_compiler_targets)
Expand Down Expand Up @@ -73,6 +73,8 @@ function(cudax_build_compiler_targets)
cccl.compiler_interface_cpp${dialect}
cudax.compiler_interface
libcudacxx::libcudacxx
CUB::CUB
Thrust::Thrust
)
endforeach()

Expand Down
4 changes: 4 additions & 0 deletions cudax/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ function(cudax_add_example target_name_var example_src cudax_target)
${cudax_target}
cudax.examples.thrust
)
target_compile_options(${example_target} PRIVATE
"-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE"
)

cudax_clone_target_properties(${example_target} ${cudax_target})
target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")

Expand Down
90 changes: 90 additions & 0 deletions cudax/examples/async_buffer_add.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 2
* of the programming guide with some additions like error checking.
*/

#include <thrust/generate.h>
#include <thrust/random.h>
#include <thrust/transform.h>

#include <cuda/experimental/container.cuh>
#include <cuda/experimental/memory_resource.cuh>
#include <cuda/experimental/stream.cuh>

#include <iostream>

namespace cudax = cuda::experimental;

constexpr int numElements = 50000;

struct generator
{
thrust::default_random_engine gen{};
thrust::uniform_real_distribution<float> dist{-10.0f, 10.0f};

__host__ __device__ generator(const unsigned seed)
: gen{seed}
{}

__host__ __device__ float operator()() noexcept
{
return dist(gen);
}
};

int main()
{
// A CUDA stream on which to execute the vector addition kernel
cudax::stream stream{};

// The execution policy we want to use to run all work on the same stream
auto policy = thrust::cuda::par_nosync.on(stream.get());

// An environment we use to pass all necessary information to the containers
cudax::env_t<cuda::mr::device_accessible> env{cudax::device_memory_resource{}, stream};

// Allocate the two inputs and output, but do not zero initialize via `cudax::uninit`
cudax::async_device_buffer<float> A{env, numElements, cudax::uninit};
cudax::async_device_buffer<float> B{env, numElements, cudax::uninit};
cudax::async_device_buffer<float> C{env, numElements, cudax::uninit};

// Fill both vectors on stream using a random number generator
thrust::generate(policy, A.begin(), A.end(), generator{42});
thrust::generate(policy, B.begin(), B.end(), generator{1337});

// Add the vectors together
thrust::transform(policy, A.begin(), A.end(), B.begin(), C.begin(), cuda::std::plus<>{});

// Verify that the result vector is correct, by copying it to host
cudax::env_t<cuda::mr::host_accessible> host_env{cudax::pinned_memory_resource{}, stream};
cudax::async_host_buffer<float> h_A{host_env, A};
cudax::async_host_buffer<float> h_B{host_env, B};
cudax::async_host_buffer<float> h_C{host_env, C};

// Do not forget to sync afterwards
stream.wait();

for (int i = 0; i < numElements; ++i)
{
if (cuda::std::abs(h_A.get_unsynchronized(i) + h_B.get_unsynchronized(i) - h_C.get_unsynchronized(i)) > 1e-5)
{
std::cerr << "Result verification failed at element " << i << "\n";
exit(EXIT_FAILURE);
}
}

return 0;
}
1 change: 0 additions & 1 deletion cudax/examples/simple_p2p.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
* Unified Virtual Address Space (UVA) features.
*/

#define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
#include <cuda/memory_resource>

#include <cuda/experimental/algorithm.cuh>
Expand Down
Loading

0 comments on commit b048cb7

Please sign in to comment.