Implement cudax::async_buffer (#3460)

* Make cudax depend on thrust * Implement `cudax::heterogeneous_iterator` * Implement `cudax::async_vector` This implements `cudax::async_vector` a contiguous container based on async memory resources. In contrast to `std::async_vector` it is templated on a set of properties and accepts any resource that satisfies these properties. That makes it suitable for heterogeneous systems, where we need to account for different execution spaces. * Add an example on how to use cudax::async_vector * Simplify vector to a buffer * Drop assignment * Add `get` and `get_unsynchonized` methods * Is fake resource_ref to avoid deep copy * Add `copy_to` method to transfer memory * Fix spelling * Drop old dialect checks * Address review comments * Rename to `make_async_buffer` * Fix comparison * Address review comments * Properly wait before accessing pointers * fixup! Address review comments * Drop superfluous `__Copy_same` * We need to synchronize after the host_launch 🤷 * Add comments * Simplify the implementation and add another comment * Address review comments * Fix concept issue with == * Update cudax/include/cuda/experimental/__container/async_buffer.cuh Co-authored-by: Eric Niebler <[email protected]> * Address review comments * Address review comments on `heterogeneous_iterator` * Ensure that we are on the right stream for thrust calls * Make doxygen happy --------- Co-authored-by: Eric Niebler <[email protected]>
NVIDIA · Feb 28, 2025 · b048cb7 · b048cb7
1 parent 52ed31c
commit b048cb7
Show file tree

Hide file tree

Showing 28 changed files with 3,445 additions and 25 deletions.
diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake
@@ -11,9 +11,9 @@
 #   be linked into the developer build targets, as they include both
 #   cudax.compiler_interface and cccl.compiler_interface_cppXX.
 
-find_package(libcudacxx ${cudax_VERSION} EXACT CONFIG REQUIRED
+find_package(Thrust ${cudax_VERSION} EXACT CONFIG REQUIRED
   NO_DEFAULT_PATH # Only check the explicit path in HINTS:
-  HINTS "${CCCL_SOURCE_DIR}/lib/cmake/libcudacxx/"
+  HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/"
 )
 
 function(cudax_build_compiler_targets)
@@ -73,6 +73,8 @@ function(cudax_build_compiler_targets)
       cccl.compiler_interface_cpp${dialect}
       cudax.compiler_interface
       libcudacxx::libcudacxx
+      CUB::CUB
+      Thrust::Thrust
     )
   endforeach()
 

diff --git a/cudax/examples/CMakeLists.txt b/cudax/examples/CMakeLists.txt
@@ -25,6 +25,10 @@ function(cudax_add_example target_name_var example_src cudax_target)
     ${cudax_target}
     cudax.examples.thrust
   )
+  target_compile_options(${example_target} PRIVATE
+    "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE"
+  )
+
   cudax_clone_target_properties(${example_target} ${cudax_target})
   target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")
 

diff --git a/cudax/examples/async_buffer_add.cu b/cudax/examples/async_buffer_add.cu
@@ -0,0 +1,90 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * Vector addition: C = A + B.
+ *
+ * This sample is a very basic sample that implements element by element
+ * vector addition. It is the same as the sample illustrating Chapter 2
+ * of the programming guide with some additions like error checking.
+ */
+
+#include <thrust/generate.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+
+#include <cuda/experimental/container.cuh>
+#include <cuda/experimental/memory_resource.cuh>
+#include <cuda/experimental/stream.cuh>
+
+#include <iostream>
+
+namespace cudax = cuda::experimental;
+
+constexpr int numElements = 50000;
+
+struct generator
+{
+  thrust::default_random_engine gen{};
+  thrust::uniform_real_distribution<float> dist{-10.0f, 10.0f};
+
+  __host__ __device__ generator(const unsigned seed)
+      : gen{seed}
+  {}
+
+  __host__ __device__ float operator()() noexcept
+  {
+    return dist(gen);
+  }
+};
+
+int main()
+{
+  // A CUDA stream on which to execute the vector addition kernel
+  cudax::stream stream{};
+
+  // The execution policy we want to use to run all work on the same stream
+  auto policy = thrust::cuda::par_nosync.on(stream.get());
+
+  // An environment we use to pass all necessary information to the containers
+  cudax::env_t<cuda::mr::device_accessible> env{cudax::device_memory_resource{}, stream};
+
+  // Allocate the two inputs and output, but do not zero initialize via `cudax::uninit`
+  cudax::async_device_buffer<float> A{env, numElements, cudax::uninit};
+  cudax::async_device_buffer<float> B{env, numElements, cudax::uninit};
+  cudax::async_device_buffer<float> C{env, numElements, cudax::uninit};
+
+  // Fill both vectors on stream using a random number generator
+  thrust::generate(policy, A.begin(), A.end(), generator{42});
+  thrust::generate(policy, B.begin(), B.end(), generator{1337});
+
+  // Add the vectors together
+  thrust::transform(policy, A.begin(), A.end(), B.begin(), C.begin(), cuda::std::plus<>{});
+
+  // Verify that the result vector is correct, by copying it to host
+  cudax::env_t<cuda::mr::host_accessible> host_env{cudax::pinned_memory_resource{}, stream};
+  cudax::async_host_buffer<float> h_A{host_env, A};
+  cudax::async_host_buffer<float> h_B{host_env, B};
+  cudax::async_host_buffer<float> h_C{host_env, C};
+
+  // Do not forget to sync afterwards
+  stream.wait();
+
+  for (int i = 0; i < numElements; ++i)
+  {
+    if (cuda::std::abs(h_A.get_unsynchronized(i) + h_B.get_unsynchronized(i) - h_C.get_unsynchronized(i)) > 1e-5)
+    {
+      std::cerr << "Result verification failed at element " << i << "\n";
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  return 0;
+}
diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu
@@ -30,7 +30,6 @@
  * Unified Virtual Address Space (UVA) features.
  */
 
-#define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
 #include <cuda/memory_resource>
 
 #include <cuda/experimental/algorithm.cuh>