
Commit 036fddf

miscco and alliepiper authored
Add some smoke tests that we are not completely breaking NVHPC stdpar (NVIDIA#4131)
Co-authored-by: Allison Piper <[email protected]>
1 parent e5d0a70 commit 036fddf

7 files changed: +144, -26 lines

ci/build_stdpar.sh

+31 lines (new file)

#!/bin/bash

set -euo pipefail

# Ensure the script is being executed in the root cccl directory:
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/..";

# Get the current CCCL info:
readonly cccl_repo="${PWD}"
readonly workdir="${cccl_repo}/test/stdpar"

CXX_STANDARD=17

args=("$@")
while [ "${#args[@]}" -ne 0 ]; do
  case "${args[0]}" in
    -std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");;
    *) echo "Unrecognized option: ${args[0]}"; exit 1 ;;
  esac
done

mkdir -p "${workdir}"
cd "${workdir}"

# Configure and build
rm -rf build
mkdir build
cd build
# Explicitly compile for hopper since the CI machine does not have a gpu:
cmake -G Ninja .. -DCMAKE_CXX_STANDARD="${CXX_STANDARD}" -DCMAKE_CXX_FLAGS="-gpu=cc90"
cmake --build .

ci/inspect_changes.sh

+4 lines

@@ -26,6 +26,7 @@ subprojects=(
   cub
   thrust
   cudax
+  stdpar
   python
   cccl_c_parallel
   c2h
@@ -38,6 +39,7 @@ declare -A dependencies=(
   [cub]="cccl libcudacxx thrust c2h"
   [thrust]="cccl libcudacxx cub"
   [cudax]="cccl libcudacxx"
+  [stdpar]="cccl libcudacxx cub thrust"
   [python]="cccl libcudacxx cub thrust cccl_c_parallel"
   [cccl_c_parallel]="cccl libcudacxx cub thrust"
   [c2h]="cccl libcudacxx cub thrust"
@@ -49,6 +51,7 @@ declare -A project_names=(
   [cub]="CUB"
   [thrust]="Thrust"
   [cudax]="CUDA Experimental"
+  [stdpar]="stdpar"
   [python]="python"
   [cccl_c_parallel]="CCCL C Parallel Library"
   [c2h]="Catch2Helper"
@@ -59,6 +62,7 @@ declare -A project_names=(
 # of any subproject directory.
 declare -A project_dirs=(
   [cccl_c_parallel]="c/parallel"
+  [stdpar]="test/stdpar"
 )

 # Changes to files / directories listed here are ignored when checking if the

ci/matrix.yaml

+5 lines

@@ -58,6 +58,8 @@ workflows:
   # cccl-infra:
   - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'}
   - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang'], gpu: 'rtx2080'}
+  # NVHPC stdpar smoke tests
+  - {jobs: ['build'], project: 'stdpar', std: 'all', ctk: '12.6', cxx: 'nvhpc', cpu: ['amd64', 'arm64']}

 nightly:
   # Edge-case jobs
@@ -239,6 +241,9 @@ projects:
     job_map: { test: ['test_cpu', 'test_gpu'] }
   cudax:
     stds: [17, 20]
+  stdpar:
+    name: 'NVHPC stdpar'
+    stds: [17, 20]
   python:
     name: "cuda (python)"
     job_map: { build: [], test: ['test_nobuild'] }

libcudacxx/include/cuda/std/__cccl/builtin.h

+26, -18 lines

@@ -22,6 +22,8 @@
 #  pragma system_header
 #endif // no system header

+#include <cuda/std/__cccl/extended_data_types.h>
+
 //! This file consolidates all compiler builtin detection for CCCL.
 //!
 //! To work around older compilers not supporting `__has_builtin` we use `_CCCL_CHECK_BUILTIN` that detects more
@@ -427,14 +429,16 @@
 #  define _CCCL_BUILTIN_HUGE_VALL() static_cast<long double>(__builtin_huge_val())
 #endif // _CCCL_CHECK_BUILTIN(builtin_huge_vall)

-#if _CCCL_CHECK_BUILTIN(builtin_huge_valf128) || _CCCL_COMPILER(GCC, >=, 7)
-#  define _CCCL_BUILTIN_HUGE_VALF128() __builtin_huge_valf128()
-#endif // _CCCL_CHECK_BUILTIN(builtin_huge_valf128) || _CCCL_COMPILER(GCC, >=, 7)
+#if _CCCL_HAS_FLOAT128()
+#  if _CCCL_CHECK_BUILTIN(builtin_huge_valf128) || _CCCL_COMPILER(GCC, >=, 7)
+#    define _CCCL_BUILTIN_HUGE_VALF128() __builtin_huge_valf128()
+#  endif // _CCCL_CHECK_BUILTIN(builtin_huge_valf128) || _CCCL_COMPILER(GCC, >=, 7)

 // nvcc does not implement __builtin_huge_valf128
-#if _CCCL_CUDA_COMPILER(NVCC)
-#  undef _CCCL_BUILTIN_HUGE_VALF128
-#endif // _CCCL_CUDA_COMPILER(NVCC)
+#  if _CCCL_CUDA_COMPILER(NVCC)
+#    undef _CCCL_BUILTIN_HUGE_VALF128
+#  endif // _CCCL_CUDA_COMPILER(NVCC)
+#endif // _CCCL_HAS_FLOAT128()

 #if _CCCL_CHECK_BUILTIN(builtin_hypot) || _CCCL_COMPILER(GCC)
 #  define _CCCL_BUILTIN_HYPOTF(...) __builtin_hypotf(__VA_ARGS__)
@@ -575,14 +579,16 @@
 #  define _CCCL_BUILTIN_NANL(...) static_cast<long double>(__builtin_nan(__VA_ARGS__))
 #endif // _CCCL_CHECK_BUILTIN(builtin_nanl)

-#if _CCCL_CHECK_BUILTIN(builtin_nanf128) || _CCCL_COMPILER(GCC, >=, 7)
-#  define _CCCL_BUILTIN_NANF128(...) __builtin_nanf128(__VA_ARGS__)
-#endif // _CCCL_CHECK_BUILTIN(builtin_nanf128) || _CCCL_COMPILER(GCC, >=, 7)
+#if _CCCL_HAS_FLOAT128()
+#  if _CCCL_CHECK_BUILTIN(builtin_nanf128) || _CCCL_COMPILER(GCC, >=, 7)
+#    define _CCCL_BUILTIN_NANF128(...) __builtin_nanf128(__VA_ARGS__)
+#  endif // _CCCL_CHECK_BUILTIN(builtin_nanf128) || _CCCL_COMPILER(GCC, >=, 7)

 // nvcc does not implement __builtin_nanf128
-#if _CCCL_CUDA_COMPILER(NVCC)
-#  undef _CCCL_BUILTIN_NANF128
-#endif // _CCCL_CUDA_COMPILER(NVCC)
+#  if _CCCL_CUDA_COMPILER(NVCC)
+#    undef _CCCL_BUILTIN_NANF128
+#  endif // _CCCL_CUDA_COMPILER(NVCC)
+#endif // _CCCL_HAS_FLOAT128()

 #if _CCCL_CHECK_BUILTIN(builtin_nansf) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10)
 #  define _CCCL_BUILTIN_NANSF(...) __builtin_nansf(__VA_ARGS__)
@@ -598,14 +604,16 @@
 #  define _CCCL_BUILTIN_NANSL(...) static_cast<long double>(__builtin_nans(__VA_ARGS__))
 #endif // _CCCL_CHECK_BUILTIN(builtin_nansl)

-#if _CCCL_CHECK_BUILTIN(builtin_nansf128) || _CCCL_COMPILER(GCC, >=, 7)
-#  define _CCCL_BUILTIN_NANSF128(...) __builtin_nansf128(__VA_ARGS__)
-#endif // _CCCL_CHECK_BUILTIN(builtin_nansf128) || _CCCL_COMPILER(GCC, >=, 7)
+#if _CCCL_HAS_FLOAT128()
+#  if _CCCL_CHECK_BUILTIN(builtin_nansf128) || _CCCL_COMPILER(GCC, >=, 7)
+#    define _CCCL_BUILTIN_NANSF128(...) __builtin_nansf128(__VA_ARGS__)
+#  endif // _CCCL_CHECK_BUILTIN(builtin_nansf128) || _CCCL_COMPILER(GCC, >=, 7)

 // nvcc does not implement __builtin_nansf128
-#if _CCCL_CUDA_COMPILER(NVCC)
-#  undef _CCCL_BUILTIN_NANSF128
-#endif // _CCCL_CUDA_COMPILER(NVCC)
+#  if _CCCL_CUDA_COMPILER(NVCC)
+#    undef _CCCL_BUILTIN_NANSF128
+#  endif // _CCCL_CUDA_COMPILER(NVCC)
+#endif // _CCCL_HAS_FLOAT128()

 #if _CCCL_CHECK_BUILTIN(builtin_nearbyint) || _CCCL_COMPILER(GCC)
 #  define _CCCL_BUILTIN_NEARBYINTF(...) __builtin_nearbyintf(__VA_ARGS__)
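
Note: the f128 builtin detection is now nested inside _CCCL_HAS_FLOAT128(), so the builtin macros are only defined when the __float128 type itself is available. A minimal consumer sketch, not part of this commit; the helper name and the direct include of this internal detail header are illustrative assumptions:

#include <cuda/std/__cccl/builtin.h> // internal CCCL header, included directly only for illustration

#if _CCCL_HAS_FLOAT128() && defined(_CCCL_BUILTIN_NANF128)
// Hypothetical helper: only compiled when both the __float128 type and the
// builtin are detected, which is exactly the invariant this diff establishes.
__float128 quiet_nan_f128()
{
  return _CCCL_BUILTIN_NANF128("");
}
#endif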

libcudacxx/include/cuda/std/__cccl/extended_data_types.h

+10, -8 lines

@@ -22,6 +22,7 @@
 #  pragma system_header
 #endif // no system header

+#include <cuda/std/__cccl/architecture.h>
 #include <cuda/std/__cccl/diagnostic.h>
 #include <cuda/std/__cccl/os.h>
 #include <cuda/std/__cccl/preprocessor.h>
@@ -81,14 +82,15 @@
 #  define _CCCL_HAS_NVFP4() 1
 #endif

-// NVC++ supports float128 only in host code
-#if !defined(CCCL_DISABLE_FLOAT128_SUPPORT) && _CCCL_OS(LINUX) \
-  && ((_CCCL_COMPILER(NVRTC) && defined(__CUDACC_RTC_FLOAT128__)) /*NVRTC*/ \
-      || defined(__SIZEOF_FLOAT128__) || defined(__FLOAT128__)) /*HOST COMPILERS*/ \
-  && (!defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000)) /*DEVICE CODE*/
-#  undef _CCCL_HAS_FLOAT128
-#  define _CCCL_HAS_FLOAT128() 1
-#endif
+#if !defined(CCCL_DISABLE_FLOAT128_SUPPORT) && _CCCL_OS(LINUX) && !_CCCL_ARCH(ARM64)
+#  if (defined(__CUDACC_RTC_FLOAT128__) || defined(__SIZEOF_FLOAT128__) || defined(__FLOAT128__)) /*HOST COMPILERS*/
+#    if _CCCL_CUDA_COMPILER(NVHPC) \
+      || ((_CCCL_CUDA_COMPILER(NVCC) || _CCCL_CUDA_COMPILER(CLANG)) && __CUDA_ARCH__ >= 1000) /*DEVICE CODE*/
+#      undef _CCCL_HAS_FLOAT128
+#      define _CCCL_HAS_FLOAT128() 1
+#    endif // CUDA compiler
+#  endif // Host compiler support
+#endif // !CCCL_DISABLE_FLOAT128_SUPPORT && _CCCL_OS(LINUX)

 // gcc does not allow to use 'operator""q' when __STRICT_ANSI__ is defined, it may be allowed by
 // -fext-numeric-literals, but we have no way to detect it. However, from gcc 13, we can use 'operator""f128' and cast
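
Note: with the platform and compiler conditions consolidated here, downstream code should branch only on _CCCL_HAS_FLOAT128() rather than re-deriving them. A minimal usage sketch, not from this commit; the alias name is hypothetical and the direct include of this internal header is an assumption made for illustration:

#include <cuda/std/__cccl/extended_data_types.h> // internal CCCL header, included directly only for illustration

#if _CCCL_HAS_FLOAT128()
using widest_float = __float128;  // enabled when the guard above detects float128 support
#else
using widest_float = long double; // conservative fallback on other configurations
#endif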

test/stdpar/CMakeLists.txt

+46 lines (new file)

cmake_minimum_required(VERSION 3.21)

# NOTE: this is built outside of the libcu++ test harness
project(CCCL_STDPAR_TESTS LANGUAGES CXX)

if (NOT CMAKE_CXX_COMPILER_ID STREQUAL NVHPC)
  message(FATAL_ERROR "The stdpar tests require nvc++ for CMAKE_CXX_COMPILER.")
endif()

# Enable testing for the project
enable_testing()

find_package(CCCL CONFIG REQUIRED
  NO_DEFAULT_PATH # Only check the explicit HINTS below:
  HINTS "${CMAKE_CURRENT_LIST_DIR}/../../lib/cmake/cccl/"
)

file(GLOB test_files
  LIST_DIRECTORIES false
  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
  CONFIGURE_DEPENDS
  "tests/*.cpp"
)

function(cccl_add_stdpar_test test_file)
  get_filename_component(test_name ${test_file} NAME_WE)

  add_executable(stdpar_test_${test_name} ${test_file})
  target_link_libraries(stdpar_test_${test_name} PUBLIC CCCL::CCCL)

  # Ensure that we are testing with GPU support
  target_compile_options(stdpar_test_${test_name} PUBLIC -stdpar=gpu)
  target_link_options(stdpar_test_${test_name} PUBLIC -stdpar=gpu)

  # Ensure that we are indeed testing the same CCCL version
  target_compile_definitions(stdpar_test_${test_name} PUBLIC CMAKE_CCCL_VERSION_MAJOR=${CCCL_VERSION_MAJOR})
  target_compile_definitions(stdpar_test_${test_name} PUBLIC CMAKE_CCCL_VERSION_MINOR=${CCCL_VERSION_MINOR})
  target_compile_definitions(stdpar_test_${test_name} PUBLIC CMAKE_CCCL_VERSION_PATCH=${CCCL_VERSION_PATCH})

  # Register with ctest
  add_test(NAME stdpar_test_${test_name} COMMAND stdpar_test_${test_name})
endfunction()

foreach(test IN LISTS test_files)
  cccl_add_stdpar_test(${test})
endforeach()

test/stdpar/tests/reduce.cpp

+22 lines (new file)

#include <algorithm>
#include <cassert>
#include <execution>
#include <numeric>
#include <vector>

// Ensure that we are indeed using the correct CCCL version
static_assert(CCCL_MAJOR_VERSION == CMAKE_CCCL_VERSION_MAJOR);
static_assert(CCCL_MINOR_VERSION == CMAKE_CCCL_VERSION_MINOR);
static_assert(CCCL_PATCH_VERSION == CMAKE_CCCL_VERSION_PATCH);

constexpr int N = 1000;

int main()
{
  std::vector<int> v(N);
  std::fill(std::execution::par_unseq, v.begin(), v.end(), 42);
  int sum = std::reduce(std::execution::par_unseq, v.begin(), v.end(), 100, [](int a, int b) {
    return a + b;
  });
  assert(sum == (42 * N) + 100);
}
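
Note: the commit ships a single reduce-based smoke test; any additional tests/*.cpp file would be picked up automatically by the file(GLOB ...) in test/stdpar/CMakeLists.txt above. A hypothetical follow-up test in the same style, not part of this commit, could exercise another offloaded algorithm such as sort:

// Hypothetical additional smoke test (e.g. tests/sort.cpp), for illustration only.
#include <algorithm>
#include <cassert>
#include <execution>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> v(1000);
  // Fill with a descending sequence, then sort it with the offloaded backend.
  std::iota(v.rbegin(), v.rend(), 0);
  std::sort(std::execution::par_unseq, v.begin(), v.end());
  assert(std::is_sorted(std::execution::par_unseq, v.begin(), v.end()));
  assert(v.front() == 0 && v.back() == 999);
}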
