From 6f2dc1d06ee7d3f0024a9b8480ea8dde0d5fd644 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Wed, 29 Sep 2021 15:37:17 -0700
Subject: [PATCH] Update for oneapi 2021.4.0 (#2368)

There are some namespace changes and deprecated functions in the new release.
---
 .github/workflows/intel.yml    |  3 ++-
 Src/Base/AMReX_FBI.H           | 12 +++++++++++
 Src/Base/AMReX_GpuPrint.H      | 13 ++++++++----
 Src/Base/AMReX_GpuQualifiers.H | 12 +++++++++++
 Src/Base/AMReX_GpuReduce.H     |  6 +++---
 Src/Base/AMReX_ParmParse.H     |  6 ++++++
 Src/Base/AMReX_Scan.H          | 39 +++++++++++++++++++++++++++-------
 7 files changed, 75 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index 3aa21e45a6b..b67d2ea8d3a 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -11,7 +11,8 @@ jobs:
     name: DPCPP GFortran@7.5 C++17 [tests]
     runs-on: ubuntu-20.04
     # mkl/rng/device/detail/mrg32k3a_impl.hpp has a number of sign-compare error
-    env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-sign-compare"}
+    # Since 2021.4.0, AMReX_GpuUtility.H: error: comparison with NaN always evaluates to false in fast floating point modes
+    env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-sign-compare -Wno-tautological-constant-compare"}
     steps:
     - uses: actions/checkout@v2
     - name: Dependencies
diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
index be60ec3b43d..80c120b54fb 100644
--- a/Src/Base/AMReX_FBI.H
+++ b/Src/Base/AMReX_FBI.H
@@ -102,12 +102,24 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in
         int to_try = 1;
         while (true) {
             int msk = (m && to_try) ? Gpu::Atomic::CAS(m, 0, mypriority) : 0;
+#if (__INTEL_LLVM_COMPILER <= 20210300)
             if (sycl::ONEAPI::all_of(item.get_sub_group(), msk == 0)) { // 0 means lock acquired
+#else
+            if (sycl::all_of_group(item.get_sub_group(), msk == 0)) { // 0 means lock acquired
+#endif
                 break; // all threads have acquired.
             } else {
+#if (__INTEL_LLVM_COMPILER <= 20210300)
                 if (sycl::ONEAPI::any_of(item.get_sub_group(), msk > mypriority)) {
+#else
+                if (sycl::any_of_group(item.get_sub_group(), msk > mypriority)) {
+#endif
                     if (m) *m = 0; // yield
+#if (__INTEL_LLVM_COMPILER <= 20210300)
                     item.mem_fence(sycl::access::fence_space::global_and_local);
+#else
+                    sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
+#endif
                     to_try = 1;
                 } else {
                     to_try = (msk > 0); // hold on to my lock
diff --git a/Src/Base/AMReX_GpuPrint.H b/Src/Base/AMReX_GpuPrint.H
index 9e863141a09..badc39ce6f8 100644
--- a/Src/Base/AMReX_GpuPrint.H
+++ b/Src/Base/AMReX_GpuPrint.H
@@ -12,9 +12,15 @@
 #if defined(AMREX_USE_DPCPP)
 
 #if defined(__SYCL_DEVICE_ONLY__)
-# define AMREX_DEVICE_PRINTF(format,...) { \
-    static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
-    sycl::ONEAPI::experimental::printf(amrex_i_format, __VA_ARGS__); }
+# if (__INTEL_LLVM_COMPILER <= 20210300)
+# define AMREX_DEVICE_PRINTF(format,...) { \
+    static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
+    sycl::ONEAPI::experimental::printf(amrex_i_format, __VA_ARGS__); }
+# else
+# define AMREX_DEVICE_PRINTF(format,...) { \
+    static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
+    sycl::ext::oneapi::experimental::printf(amrex_i_format, __VA_ARGS__); }
+# endif
 #else
 # define AMREX_DEVICE_PRINTF(format,...) { \
     std::printf(format, __VA_ARGS__); }
@@ -27,4 +33,3 @@
 
 #endif // !defined(__APPLE__)
 #endif // AMREX_GPU_PRINT_H_
-
diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H
index 4229f8b834d..f627f61096e 100644
--- a/Src/Base/AMReX_GpuQualifiers.H
+++ b/Src/Base/AMReX_GpuQualifiers.H
@@ -43,11 +43,23 @@
 #define AMREX_DEVICE_COMPILE (__CUDA_ARCH__ || __HIP_DEVICE_COMPILE__ || __SYCL_DEVICE_ONLY__)
 
 #ifdef AMREX_USE_DPCPP
+
+# include <CL/sycl.hpp>
+
+namespace amrex {
+# if (__INTEL_LLVM_COMPILER <= 20210300)
+    namespace oneapi = sycl::ONEAPI;
+# else
+    namespace oneapi = sycl::ext::oneapi;
+# endif
+}
+
 # define AMREX_REQUIRE_SUBGROUP_SIZE(x) \
     _Pragma("clang diagnostic push") \
     _Pragma("clang diagnostic ignored \"-Wattributes\"") \
     [[intel::reqd_sub_group_size(x)]] \
     _Pragma("clang diagnostic pop")
+
 #endif // AMREX_USE_DPCPP
 
 #endif
diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H
index ae0c504dea2..9b48138940c 100644
--- a/Src/Base/AMReX_GpuReduce.H
+++ b/Src/Base/AMReX_GpuReduce.H
@@ -54,7 +54,7 @@ template
 struct warpReduce
 {
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    T operator() (T x, sycl::ONEAPI::sub_group const& sg) const noexcept
+    T operator() (T x, amrex::oneapi::sub_group const& sg) const noexcept
     {
         for (int offset = warpSize/2; offset > 0; offset /= 2) {
             T y = sg.shuffle_down(x, offset);
@@ -70,7 +70,7 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
 {
     T* shared = (T*)h.local;
     int tid = h.item->get_local_id(0);
-    sycl::ONEAPI::sub_group const& sg = h.item->get_sub_group();
+    amrex::oneapi::sub_group const& sg = h.item->get_sub_group();
     int lane = sg.get_local_id()[0];
     int wid = sg.get_group_id()[0];
     int numwarps = sg.get_group_range()[0];
@@ -93,7 +93,7 @@ AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                           Gpu::Handler const& handler)
 {
-    sycl::ONEAPI::sub_group const& sg = handler.item->get_sub_group();
+    amrex::oneapi::sub_group const& sg = handler.item->get_sub_group();
     int wid = sg.get_group_id()[0];
     if ((wid+1)*warpSize <= handler.numActiveThreads) {
         x = warp_reduce(x, sg); // full warp
diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H
index bfc87222a89..347e6c8a54a 100644
--- a/Src/Base/AMReX_ParmParse.H
+++ b/Src/Base/AMReX_ParmParse.H
@@ -273,6 +273,12 @@ public:
     * for this particular ParmParse object.
     */
     explicit ParmParse (const std::string& prefix = std::string());
+
+    // To avoid error: definition of implicit copy constructor for
+    // 'ParmParse' is deprecated because it has a user-declared copy
+    // assignment operator.
+    ParmParse (ParmParse const& rhs) = default;
+
     //! Returns true if name is in table.
     bool contains (const char* name) const;
     /**
diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H
index 0ab3f639d1f..e7a6c043b03 100644
--- a/Src/Base/AMReX_Scan.H
+++ b/Src/Base/AMReX_Scan.H
@@ -60,7 +60,7 @@ struct BlockStatus
     Data d;
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-#if defined(AMREX_USE_DPCPP)
+#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
     void write (char a_status, T a_value, sycl::nd_item<1> const& /*item*/) {
 #else
     void write (char a_status, T a_value) {
@@ -102,7 +102,7 @@ struct BlockStatus
     void set_status (char a_status) { d.s.status = a_status; }
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-#if defined(AMREX_USE_DPCPP)
+#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
     STVA wait (sycl::nd_item<1> const& item) volatile {
 #else
     STVA wait () volatile {
@@ -110,7 +110,11 @@ struct BlockStatus
         STVA r;
         do {
 #if defined(AMREX_USE_DPCPP)
+#if (__INTEL_LLVM_COMPILER <= 20210300)
             item.mem_fence();
+#else
+            sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
+#endif
 #else
             __threadfence_block();
 #endif
@@ -128,7 +132,7 @@ struct BlockStatus
     char status;
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-#if defined(AMREX_USE_DPCPP)
+#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
     void write (char a_status, T a_value, sycl::nd_item<1> const& item) {
 #else
     void write (char a_status, T a_value) {
@@ -139,7 +143,11 @@ struct BlockStatus
             inclusive = a_value;
         }
 #if defined(AMREX_USE_DPCPP)
+#if (__INTEL_LLVM_COMPILER <= 20210300)
         item.mem_fence(sycl::access::fence_space::global_and_local);
+#else
+        sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
+#endif
 #else
         __threadfence();
 #endif
@@ -164,7 +172,7 @@ struct BlockStatus
     void set_status (char a_status) { status = a_status; }
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-#if defined(AMREX_USE_DPCPP)
+#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
     STVA wait (sycl::nd_item<1> const& item) volatile {
 #else
     STVA wait () volatile {
@@ -173,7 +181,11 @@ struct BlockStatus
         do {
             r = read();
 #if defined(AMREX_USE_DPCPP)
+#if (__INTEL_LLVM_COMPILER <= 20210300)
             item.mem_fence(sycl::access::fence_space::global_and_local);
+#else
+            sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
+#endif
 #else
             __threadfence();
 #endif
@@ -227,7 +239,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
     amrex::launch(nblocks, nthreads, sm, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
-        sycl::ONEAPI::sub_group const& sg = gh.item->get_sub_group();
+        amrex::oneapi::sub_group const& sg = gh.item->get_sub_group();
         int lane = sg.get_local_id()[0];
         int warp = sg.get_group_id()[0];
         int nwarps = sg.get_group_range()[0];
@@ -323,7 +335,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
         // sum_prev_chunk now holds the sum of the whole block.
         if (threadIdxx == 0 && gridDimx > 1) {
             block_status.write((virtual_block_id == 0) ? 'p' : 'a',
-                               sum_prev_chunk, *gh.item);
+                               sum_prev_chunk
+#if (__INTEL_LLVM_COMPILER <= 20210300)
+                               , *gh.item
+#endif
+                               );
         }
 
         if (virtual_block_id == 0) {
@@ -345,7 +361,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
                 int iblock = iblock0-lane;
                 detail::STVA<T> stva{'p', 0};
                 if (iblock >= 0) {
+#if (__INTEL_LLVM_COMPILER <= 20210300)
                     stva = pbs[iblock].wait(*gh.item);
+#else
+                    stva = pbs[iblock].wait();
+#endif
                 }
 
                 T x = stva.value;
@@ -382,8 +402,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
             }
 
             if (lane == 0) {
-                block_status.write('p', block_status.get_aggregate() + exclusive_prefix,
-                                   *gh.item);
+                block_status.write('p', block_status.get_aggregate() + exclusive_prefix
+#if (__INTEL_LLVM_COMPILER <= 20210300)
+                                   , *gh.item
+#endif
+                                   );
                 shared[0] = exclusive_prefix;
             }
         }
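
Editor's note (not part of the patch): every change above follows one pattern: gate on (__INTEL_LLVM_COMPILER <= 20210300), keep the sycl::ONEAPI spellings and the deprecated nd_item fences for older compilers, and switch to sycl::ext::oneapi, the SYCL 2020 group functions (sycl::all_of_group, sycl::any_of_group), and sycl::atomic_fence from 2021.4.0 on. The sketch below shows how a downstream project might wrap that gate once instead of repeating it at each call site. It is an illustrative, assumption-laden example, not AMReX code: the namespace compat and the helpers sub_group_all and device_fence are invented names, and it assumes __INTEL_LLVM_COMPILER encodes 2021.4.0 as 20210400 and that <CL/sycl.hpp> is the appropriate SYCL header for this compiler generation.

// Illustrative compatibility shim (hypothetical names), using the same
// version gate as the patch above.  Intended to be called from device code.
#include <CL/sycl.hpp>

namespace compat {

#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER <= 20210300)

// Older DPC++ releases: extensions live in sycl::ONEAPI, and the deprecated
// SYCL 1.2.1 member fences are the available spelling.
namespace oneapi = sycl::ONEAPI;

inline bool sub_group_all (sycl::nd_item<1> const& item, bool pred)
{
    return sycl::ONEAPI::all_of(item.get_sub_group(), pred);
}

inline void device_fence (sycl::nd_item<1> const& item)
{
    item.mem_fence(sycl::access::fence_space::global_and_local);
}

#else

// oneAPI 2021.4.0 and later: extensions moved to sycl::ext::oneapi, and the
// SYCL 2020 free functions replace the deprecated member functions.
namespace oneapi = sycl::ext::oneapi;

inline bool sub_group_all (sycl::nd_item<1> const& item, bool pred)
{
    return sycl::all_of_group(item.get_sub_group(), pred);
}

inline void device_fence (sycl::nd_item<1> const&)
{
    sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
}

#endif

} // namespace compat

A kernel then calls compat::sub_group_all(item, pred) and compat::device_fence(item), and only this header needs to change when the compiler spelling changes; that is the same idea as the amrex::oneapi alias the patch introduces in Src/Base/AMReX_GpuQualifiers.H for the sub_group type.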