diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 3a980ee93a..90488a9504 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -15,12 +15,13 @@ This release contains ... Notable changes include: * New features / API changes: + * Added `RAJA::atomicOperation` to enable user-defined atomic update + operations implemented using a compare-and-swap loop. * Build changes/improvements: * Bug fixes/improvements: - Version 2025.12.2 -- Release date 2026-03-04 ============================================ diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp index 7979fd2d86..c6f6390a75 100644 --- a/include/RAJA/pattern/atomic.hpp +++ b/include/RAJA/pattern/atomic.hpp @@ -280,7 +280,6 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T* acc, T value) * @param compare Value to compare with *acc * @return Returns value at *acc immediately before this operation completed */ - RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T* acc, T compare, T value) @@ -288,6 +287,19 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T* acc, T compare, T value) return RAJA::atomicCAS(Policy {}, acc, compare, value); } +/*! + * @brief Generic atomic update of *acc, implemented using a CAS loop; forwards to the atomicOperation overload selected by Policy + * @param acc Pointer to the location that is read and updated + * @param operation Functor that computes a new value from the old value; a CAS-loop backend may invoke it again when the update is retried, so it should depend only on its argument + * @return Returns value at *acc immediately before this operation completed + */ +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T atomicOperation(T* acc, Operation&& operation) +{ + return RAJA::atomicOperation(Policy {}, acc, std::forward(operation)); +} + /*! 
* \brief Atomic wrapper object * diff --git a/include/RAJA/policy/atomic_auto.hpp b/include/RAJA/policy/atomic_auto.hpp index ca61f437cf..45e35e3448 100644 --- a/include/RAJA/policy/atomic_auto.hpp +++ b/include/RAJA/policy/atomic_auto.hpp @@ -156,6 +156,13 @@ atomicCAS(auto_atomic, T* acc, T compare, T value) return atomicCAS(RAJA_AUTO_ATOMIC, acc, compare, value); } +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T +atomicOperation(auto_atomic, T* acc, Operation&& operation) +{ + /* Forwards to the overload for the policy named by RAJA_AUTO_ATOMIC. NOTE(review): RAJA_SUPPRESS_HD_WARN added for consistency with the other RAJA_HOST_DEVICE atomicOperation overloads, which all carry it. */ return atomicOperation(RAJA_AUTO_ATOMIC, acc, std::forward(operation)); +} } // namespace RAJA diff --git a/include/RAJA/policy/atomic_builtin.hpp b/include/RAJA/policy/atomic_builtin.hpp index f2a479bc69..8ec38f494e 100644 --- a/include/RAJA/policy/atomic_builtin.hpp +++ b/include/RAJA/policy/atomic_builtin.hpp @@ -23,6 +23,7 @@ #include "RAJA/config.hpp" #include +#include #if defined(RAJA_COMPILER_MSVC) || \ ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER)) @@ -1011,6 +1012,13 @@ atomicCAS(builtin_atomic, T* acc, T compare, T value) return detail::builtin_atomicCAS(acc, compare, value); } +template +RAJA_DEVICE_HIP RAJA_INLINE T +atomicOperation(builtin_atomic, T* acc, Operation&& operation) +{ + return detail::builtin_atomicCAS_loop(acc, std::forward(operation)); /* the retry loop itself lives in detail::builtin_atomicCAS_loop */ +} + } // namespace RAJA diff --git a/include/RAJA/policy/cuda/atomic.hpp b/include/RAJA/policy/cuda/atomic.hpp index 9293c7cb0a..b2a22537c6 100644 --- a/include/RAJA/policy/cuda/atomic.hpp +++ b/include/RAJA/policy/cuda/atomic.hpp @@ -858,6 +858,18 @@ atomicCAS(cuda_atomic_explicit, T* acc, T compare, T value) #endif } +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T +atomicOperation(cuda_atomic_explicit, T* acc, Operation&& operation) +{ +#ifdef __CUDA_ARCH__ + return detail::cuda_atomicCAS_loop(acc, std::forward(operation)); /* __CUDA_ARCH__ defined: CAS loop via detail::cuda_atomicCAS_loop */ +#else + return RAJA::atomicOperation(host_policy {}, acc, std::forward(operation)); /* otherwise: defer to the overload for host_policy */ +#endif +} + } // namespace RAJA diff --git a/include/RAJA/policy/desul/atomic.hpp 
b/include/RAJA/policy/desul/atomic.hpp index 22039283ce..f1dfff43d5 100644 --- a/include/RAJA/policy/desul/atomic.hpp +++ b/include/RAJA/policy/desul/atomic.hpp @@ -14,7 +14,12 @@ #if defined(RAJA_ENABLE_DESUL_ATOMICS) +#include +#include +#include + #include "RAJA/util/macros.hpp" +#include "RAJA/util/TypeConvert.hpp" #include "RAJA/policy/atomic_builtin.hpp" @@ -27,6 +32,32 @@ using raja_default_desul_scope = desul::MemoryScopeDevice; namespace RAJA { +namespace detail +{ + +template<typename T, std::enable_if_t<!std::is_floating_point<T>::value, bool> = true> /* disabled for floating point so that exactly one desul_atomicCAS_equal overload is viable for any T (otherwise float/double calls are ambiguous) */ +RAJA_HOST_DEVICE RAJA_INLINE bool desul_atomicCAS_equal(const T& a, const T& b) +{ + return a == b; /* non-floating-point types: ordinary value comparison */ +} + +template::value, bool> = true> +RAJA_HOST_DEVICE RAJA_INLINE bool desul_atomicCAS_equal(const T& a, const T& b) +{ + using R = std::conditional_t; + static_assert(sizeof(T) == sizeof(std::uint32_t) || + sizeof(T) == sizeof(std::uint64_t), + "desul_atomicCAS_equal only supports 32/64-bit floating point"); + + return RAJA::util::reinterp_A_as_B(a) == + RAJA::util::reinterp_A_as_B(b); /* floating point: compares the reinterpreted representations rather than the values; presumably so NaN payloads and signed zeros are told apart by bits - confirm */ +} + +} // namespace detail + RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(AtomicPolicy, T* acc) @@ -153,6 +184,34 @@ atomicCAS(AtomicPolicy, T* acc, T compare, T value) raja_default_desul_scope {}); } +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE T +atomicOperation(AtomicPolicy, T* acc, Operation&& operation) +{ + /* Generic atomic update as a load + compare-exchange retry loop; returns the value observed at *acc immediately before the update (or the unchanged value when operation is a no-op). */ T expected = desul::atomic_load(acc, + raja_default_desul_order {}, + raja_default_desul_scope {}); + + while (true) { + const T desired = operation(expected); /* re-evaluated on every retry, so operation may run more than once */ + + if (detail::desul_atomicCAS_equal(desired, expected)) { + return expected; // no-op + } + + const T old = desul::atomic_compare_exchange(acc, expected, desired, + raja_default_desul_order {}, + raja_default_desul_scope {}); + + if (detail::desul_atomicCAS_equal(old, expected)) { + return old; // success + } + + expected = old; // CAS failed, old is the latest observed value + } +} + } // namespace RAJA #endif // RAJA_ENABLE_DESUL_ATOMICS diff --git a/include/RAJA/policy/hip/atomic.hpp 
b/include/RAJA/policy/hip/atomic.hpp index 13a708d6eb..a638546bf7 100644 --- a/include/RAJA/policy/hip/atomic.hpp +++ b/include/RAJA/policy/hip/atomic.hpp @@ -901,6 +901,18 @@ atomicCAS(hip_atomic_explicit, T* acc, T compare, T value) #endif } +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T +atomicOperation(hip_atomic_explicit, T* acc, Operation&& operation) +{ +#if defined(__HIP_DEVICE_COMPILE__) + return detail::hip_atomicCAS_loop(acc, std::forward(operation)); /* __HIP_DEVICE_COMPILE__ defined: CAS loop via detail::hip_atomicCAS_loop */ +#else + return RAJA::atomicOperation(host_policy {}, acc, std::forward(operation)); /* otherwise: defer to the overload for host_policy */ +#endif +} + } // namespace RAJA diff --git a/include/RAJA/policy/openmp/atomic.hpp b/include/RAJA/policy/openmp/atomic.hpp index 2043b36342..c56c225e6c 100644 --- a/include/RAJA/policy/openmp/atomic.hpp +++ b/include/RAJA/policy/openmp/atomic.hpp @@ -24,6 +24,8 @@ #if defined(RAJA_ENABLE_OPENMP) +#include + #include "RAJA/policy/openmp/policy.hpp" #include "RAJA/util/macros.hpp" @@ -230,6 +232,17 @@ RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(omp_atomic, T* acc, T compare, T value) return RAJA::atomicCAS(builtin_atomic {}, acc, compare, value); } +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE T +atomicOperation(omp_atomic, T* acc, Operation&& operation) +{ + // OpenMP doesn't define a generic atomic operation, so use builtin atomics + return RAJA::atomicOperation(builtin_atomic {}, + acc, + std::forward(operation)); +} + #endif // not defined RAJA_COMPILER_MSVC diff --git a/include/RAJA/policy/sequential/atomic.hpp b/include/RAJA/policy/sequential/atomic.hpp index 32eda4658b..83b91bd3fe 100644 --- a/include/RAJA/policy/sequential/atomic.hpp +++ b/include/RAJA/policy/sequential/atomic.hpp @@ -158,6 +158,15 @@ RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(seq_atomic, T* acc, T compare, T value) return ret; } +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE T atomicOperation(seq_atomic, T* acc, Operation&& operation) +{ + T ret = *acc; /* value before the update; this is what gets returned */ + *acc = operation(ret); /* plain read-modify-write with no retry loop: operation is called exactly once */ + return ret; +} + } // 
namespace RAJA diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp index cdd10d025c..977c0578a5 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp @@ -63,7 +63,7 @@ template (0); test_array[10] = static_cast(0); test_array[11] = static_cast(0); + test_array[12] = static_cast(1); work_res.memcpy(work_array, test_array, sizeof(T) * len); @@ -109,6 +110,23 @@ void ForallAtomicBasicTestImpl( IdxType seglimit ) RAJA::atomicStore(work_array + 9, static_cast(1)); RAJA::atomicInc(work_array + 10, static_cast(16)); RAJA::atomicDec(work_array + 11, static_cast(16)); + + // Exercise generic atomicOperation with an order-independent update: + // compute factorial(N) by multiplying by (i+1) for i in [0, N). + // + // Choose N small enough that: + // - The result fits in 32-bit signed ints (avoids overflow/UB). + // - The intermediate values are exactly representable in float/double + // (avoids non-associativity issues). 
+ constexpr IdxType factN = static_cast(10); + RAJA::atomicOperation(work_array + 12, + [=] RAJA_HOST_DEVICE(T old) { + if (i < factN) + { + return old * static_cast(i + static_cast(1)); + } + return old; /* iterations with i >= factN leave the slot unchanged */ + }); }); work_res.memcpy( check_array, work_array, sizeof(T) * len ); @@ -128,6 +146,7 @@ void ForallAtomicBasicTestImpl( IdxType seglimit ) EXPECT_EQ(static_cast(1), check_array[9]); EXPECT_EQ(static_cast(4), check_array[10]); EXPECT_EQ(static_cast(13), check_array[11]); + EXPECT_EQ(static_cast(3628800), check_array[12]); /* 3628800 == 10!; NOTE(review): presumes the forall range visits every i in [0, 10) - confirm seglimit >= 10 at all call sites */ deallocateForallTestData(work_res, work_array, diff --git a/test/unit/atomic/CMakeLists.txt b/test/unit/atomic/CMakeLists.txt index 8dc841e1f7..ca947a66eb 100644 --- a/test/unit/atomic/CMakeLists.txt +++ b/test/unit/atomic/CMakeLists.txt @@ -34,3 +34,7 @@ raja_add_test( raja_add_test( NAME test-atomic-ref-bitwise SOURCES test-atomic-ref-bitwise.cpp) + +raja_add_test( + NAME test-atomic-operation + SOURCES test-atomic-operation.cpp)