From 6f2dc1d06ee7d3f0024a9b8480ea8dde0d5fd644 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Wed, 29 Sep 2021 15:37:17 -0700
Subject: [PATCH] Update for oneapi 2021.4.0 (#2368)

There are some namespace changes and deprecated functions in the new release.
---
 .github/workflows/intel.yml    |  3 ++-
 Src/Base/AMReX_FBI.H           | 12 +++++++++++
 Src/Base/AMReX_GpuPrint.H      | 13 ++++++++----
 Src/Base/AMReX_GpuQualifiers.H | 12 +++++++++++
 Src/Base/AMReX_GpuReduce.H     |  6 +++---
 Src/Base/AMReX_ParmParse.H     |  6 ++++++
 Src/Base/AMReX_Scan.H          | 39 +++++++++++++++++++++++++++-------
 7 files changed, 75 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index 3aa21e45a6b..b67d2ea8d3a 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -11,7 +11,8 @@ jobs:
     name: DPCPP GFortran@7.5 C++17 [tests]
     runs-on: ubuntu-20.04
     # mkl/rng/device/detail/mrg32k3a_impl.hpp has a number of sign-compare error
-    env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-sign-compare"}
+    # Since 2021.4.0, AMReX_GpuUtility.H: error: comparison with NaN always evaluates to false in fast floating point modes
+    env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-sign-compare -Wno-tautological-constant-compare"}
     steps:
     - uses: actions/checkout@v2
     - name: Dependencies
diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
index be60ec3b43d..80c120b54fb 100644
--- a/Src/Base/AMReX_FBI.H
+++ b/Src/Base/AMReX_FBI.H
@@ -102,12 +102,24 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in
         int to_try = 1;
         while (true) {
             int msk = (m && to_try) ? Gpu::Atomic::CAS(m, 0, mypriority) : 0;
+#if (__INTEL_LLVM_COMPILER <= 20210300)
             if (sycl::ONEAPI::all_of(item.get_sub_group(), msk == 0)) { // 0 means lock acquired
+#else
+            if (sycl::all_of_group(item.get_sub_group(), msk == 0)) { // 0 means lock acquired
+#endif
                 break; // all threads have acquired.
             } else {
+#if (__INTEL_LLVM_COMPILER <= 20210300)
                 if (sycl::ONEAPI::any_of(item.get_sub_group(), msk > mypriority)) {
+#else
+                if (sycl::any_of_group(item.get_sub_group(), msk > mypriority)) {
+#endif
                     if (m) *m = 0; // yield
+#if (__INTEL_LLVM_COMPILER <= 20210300)
                     item.mem_fence(sycl::access::fence_space::global_and_local);
+#else
+                    sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
+#endif
                     to_try = 1;
                 } else {
                     to_try = (msk > 0); // hold on to my lock
diff --git a/Src/Base/AMReX_GpuPrint.H b/Src/Base/AMReX_GpuPrint.H
index 9e863141a09..badc39ce6f8 100644
--- a/Src/Base/AMReX_GpuPrint.H
+++ b/Src/Base/AMReX_GpuPrint.H
@@ -12,9 +12,15 @@
 #if defined(AMREX_USE_DPCPP)
 
 #if defined(__SYCL_DEVICE_ONLY__)
-# define AMREX_DEVICE_PRINTF(format,...) { \
-    static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
-    sycl::ONEAPI::experimental::printf(amrex_i_format, __VA_ARGS__); }
+# if (__INTEL_LLVM_COMPILER <= 20210300)
+# define AMREX_DEVICE_PRINTF(format,...) { \
+    static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
+    sycl::ONEAPI::experimental::printf(amrex_i_format, __VA_ARGS__); }
+# else
+# define AMREX_DEVICE_PRINTF(format,...) { \
+    static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
+    sycl::ext::oneapi::experimental::printf(amrex_i_format, __VA_ARGS__); }
+# endif
 #else
 # define AMREX_DEVICE_PRINTF(format,...) { \
     std::printf(format, __VA_ARGS__); }
@@ -27,4 +33,3 @@
 
 #endif // !defined(__APPLE__)
 #endif // AMREX_GPU_PRINT_H_
-
diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H
index 4229f8b834d..f627f61096e 100644
--- a/Src/Base/AMReX_GpuQualifiers.H
+++ b/Src/Base/AMReX_GpuQualifiers.H
@@ -43,11 +43,23 @@
 #define AMREX_DEVICE_COMPILE (__CUDA_ARCH__ || __HIP_DEVICE_COMPILE__ || __SYCL_DEVICE_ONLY__)
 
 #ifdef AMREX_USE_DPCPP
+
+# include <CL/sycl.hpp>
+
+namespace amrex {
+# if (__INTEL_LLVM_COMPILER <= 20210300)
+    namespace oneapi = sycl::ONEAPI;
+# else
+    namespace oneapi = sycl::ext::oneapi;
+# endif
+}
+
 # define AMREX_REQUIRE_SUBGROUP_SIZE(x) \
     _Pragma("clang diagnostic push") \
     _Pragma("clang diagnostic ignored \"-Wattributes\"") \
     [[intel::reqd_sub_group_size(x)]] \
     _Pragma("clang diagnostic pop")
+
 #endif // AMREX_USE_DPCPP
 
 #endif
diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H
index ae0c504dea2..9b48138940c 100644
--- a/Src/Base/AMReX_GpuReduce.H
+++ b/Src/Base/AMReX_GpuReduce.H
@@ -54,7 +54,7 @@ template
 struct warpReduce
 {
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    T operator() (T x, sycl::ONEAPI::sub_group const& sg) const noexcept
+    T operator() (T x, amrex::oneapi::sub_group const& sg) const noexcept
     {
         for (int offset = warpSize/2; offset > 0; offset /= 2) {
             T y = sg.shuffle_down(x, offset);
@@ -70,7 +70,7 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
 {
     T* shared = (T*)h.local;
     int tid = h.item->get_local_id(0);
-    sycl::ONEAPI::sub_group const& sg = h.item->get_sub_group();
+    amrex::oneapi::sub_group const& sg = h.item->get_sub_group();
     int lane = sg.get_local_id()[0];
     int wid = sg.get_group_id()[0];
     int numwarps = sg.get_group_range()[0];
@@ -93,7 +93,7 @@ AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                           Gpu::Handler const& handler)
 {
-    sycl::ONEAPI::sub_group const& sg = handler.item->get_sub_group();
+    amrex::oneapi::sub_group const& sg = handler.item->get_sub_group();
     int wid = sg.get_group_id()[0];
     if ((wid+1)*warpSize <= handler.numActiveThreads) {
         x = warp_reduce(x, sg); // full warp
diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H
index bfc87222a89..347e6c8a54a 100644
--- a/Src/Base/AMReX_ParmParse.H
+++ b/Src/Base/AMReX_ParmParse.H
@@ -273,6 +273,12 @@ public:
     * for this particular ParmParse object.
     */
     explicit ParmParse (const std::string& prefix = std::string());
+
+    // To avoid error: definition of implicit copy constructor for
+    // 'ParmParse' is deprecated because it has a user-declared copy
+    // assignment operator.
+    ParmParse (ParmParse const& rhs) = default;
+
     //! Returns true if name is in table.
     bool contains (const char* name) const;
     /**
diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H
index 0ab3f639d1f..e7a6c043b03 100644
--- a/Src/Base/AMReX_Scan.H
+++ b/Src/Base/AMReX_Scan.H
@@ -60,7 +60,7 @@ struct BlockStatus
     Data d;
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-#if defined(AMREX_USE_DPCPP)
+#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
     void write (char a_status, T a_value, sycl::nd_item<1> const& /*item*/) {
 #else
     void write (char a_status, T a_value) {
@@ -102,7 +102,7 @@ struct BlockStatus
     void set_status (char a_status) { d.s.status = a_status; }
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-#if defined(AMREX_USE_DPCPP)
+#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
     STVA wait (sycl::nd_item<1> const& item) volatile {
 #else
     STVA wait () volatile {
@@ -110,7 +110,11 @@ struct BlockStatus
         STVA r;
         do {
 #if defined(AMREX_USE_DPCPP)
+#if (__INTEL_LLVM_COMPILER <= 20210300)
             item.mem_fence();
+#else
+            sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
+#endif
 #else
             __threadfence_block();
 #endif
@@ -128,7 +132,7 @@ struct BlockStatus
     char status;
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-#if defined(AMREX_USE_DPCPP)
+#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
     void write (char a_status, T a_value, sycl::nd_item<1> const& item) {
 #else
     void write (char a_status, T a_value) {
@@ -139,7 +143,11 @@ struct BlockStatus
             inclusive = a_value;
         }
 #if defined(AMREX_USE_DPCPP)
+#if (__INTEL_LLVM_COMPILER <= 20210300)
         item.mem_fence(sycl::access::fence_space::global_and_local);
+#else
+        sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
+#endif
 #else
         __threadfence();
 #endif
@@ -164,7 +172,7 @@ struct BlockStatus
     void set_status (char a_status) { status = a_status; }
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-#if defined(AMREX_USE_DPCPP)
+#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
     STVA wait (sycl::nd_item<1> const& item) volatile {
 #else
     STVA wait () volatile {
@@ -173,7 +181,11 @@ struct BlockStatus
         do {
             r = read();
 #if defined(AMREX_USE_DPCPP)
+#if (__INTEL_LLVM_COMPILER <= 20210300)
             item.mem_fence(sycl::access::fence_space::global_and_local);
+#else
+            sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
+#endif
 #else
             __threadfence();
 #endif
@@ -227,7 +239,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
     amrex::launch(nblocks, nthreads, sm, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
-        sycl::ONEAPI::sub_group const& sg = gh.item->get_sub_group();
+        amrex::oneapi::sub_group const& sg = gh.item->get_sub_group();
         int lane = sg.get_local_id()[0];
         int warp = sg.get_group_id()[0];
         int nwarps = sg.get_group_range()[0];
@@ -323,7 +335,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
         // sum_prev_chunk now holds the sum of the whole block.
         if (threadIdxx == 0 && gridDimx > 1) {
             block_status.write((virtual_block_id == 0) ? 'p' : 'a',
-                               sum_prev_chunk, *gh.item);
+                               sum_prev_chunk
+#if (__INTEL_LLVM_COMPILER <= 20210300)
+                               , *gh.item
+#endif
+                               );
         }
 
         if (virtual_block_id == 0) {
@@ -345,7 +361,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
                 int iblock = iblock0-lane;
                 detail::STVA<T> stva{'p', 0};
                 if (iblock >= 0) {
+#if (__INTEL_LLVM_COMPILER <= 20210300)
                     stva = pbs[iblock].wait(*gh.item);
+#else
+                    stva = pbs[iblock].wait();
+#endif
                 }
 
                 T x = stva.value;
@@ -382,8 +402,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
             }
 
             if (lane == 0) {
-                block_status.write('p', block_status.get_aggregate() + exclusive_prefix,
-                                   *gh.item);
+                block_status.write('p', block_status.get_aggregate() + exclusive_prefix
+#if (__INTEL_LLVM_COMPILER <= 20210300)
+                                   , *gh.item
+#endif
+                                   );
                 shared[0] = exclusive_prefix;
             }
         }
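
Editor's note (not part of the patch): every change above follows one pattern: gate on (__INTEL_LLVM_COMPILER <= 20210300), keep the sycl::ONEAPI spellings and the deprecated nd_item fences for older compilers, and switch to sycl::ext::oneapi, the SYCL 2020 group functions (sycl::all_of_group, sycl::any_of_group), and sycl::atomic_fence from 2021.4.0 on. The sketch below shows how a downstream project might wrap that gate once instead of repeating it at each call site. It is an illustrative, assumption-laden example, not AMReX code: the namespace compat and the helpers sub_group_all and device_fence are invented names, and it assumes __INTEL_LLVM_COMPILER encodes 2021.4.0 as 20210400 and that <CL/sycl.hpp> is the appropriate SYCL header for this compiler generation.

// Illustrative compatibility shim (hypothetical names), using the same
// version gate as the patch above.  Intended to be called from device code.
#include <CL/sycl.hpp>

namespace compat {

#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER <= 20210300)

// Older DPC++ releases: extensions live in sycl::ONEAPI, and the deprecated
// SYCL 1.2.1 member fences are the available spelling.
namespace oneapi = sycl::ONEAPI;

inline bool sub_group_all (sycl::nd_item<1> const& item, bool pred)
{
    return sycl::ONEAPI::all_of(item.get_sub_group(), pred);
}

inline void device_fence (sycl::nd_item<1> const& item)
{
    item.mem_fence(sycl::access::fence_space::global_and_local);
}

#else

// oneAPI 2021.4.0 and later: extensions moved to sycl::ext::oneapi, and the
// SYCL 2020 free functions replace the deprecated member functions.
namespace oneapi = sycl::ext::oneapi;

inline bool sub_group_all (sycl::nd_item<1> const& item, bool pred)
{
    return sycl::all_of_group(item.get_sub_group(), pred);
}

inline void device_fence (sycl::nd_item<1> const&)
{
    sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
}

#endif

} // namespace compat

A kernel then calls compat::sub_group_all(item, pred) and compat::device_fence(item), and only this header needs to change when the compiler spelling changes; that is the same idea as the amrex::oneapi alias the patch introduces in Src/Base/AMReX_GpuQualifiers.H for the sub_group type.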