Skip to content

Commit

Permalink
Update for oneapi 2021.4.0 (#2368)
Browse files Browse the repository at this point in the history
There are some namespace changes and deprecated functions in the new
release.
  • Loading branch information
WeiqunZhang authored Sep 29, 2021
1 parent b11479d commit 6f2dc1d
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 16 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/intel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ jobs:
name: DPCPP [email protected] C++17 [tests]
runs-on: ubuntu-20.04
# mkl/rng/device/detail/mrg32k3a_impl.hpp has a number of sign-compare error
env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-sign-compare"}
# Since 2021.4.0, AMReX_GpuUtility.H: error: comparison with NaN always evaluates to false in fast floating point modes
env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-sign-compare -Wno-tautological-constant-compare"}
steps:
- uses: actions/checkout@v2
- name: Dependencies
Expand Down
12 changes: 12 additions & 0 deletions Src/Base/AMReX_FBI.H
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,24 @@ fab_to_fab (Vector<Array4CopyTag<T> > const& copy_tags, int scomp, int dcomp, in
int to_try = 1;
while (true) {
int msk = (m && to_try) ? Gpu::Atomic::CAS(m, 0, mypriority) : 0;
#if (__INTEL_LLVM_COMPILER <= 20210300)
if (sycl::ONEAPI::all_of(item.get_sub_group(), msk == 0)) { // 0 means lock acquired
#else
if (sycl::all_of_group(item.get_sub_group(), msk == 0)) { // 0 means lock acquired
#endif
break; // all threads have acquired.
} else {
#if (__INTEL_LLVM_COMPILER <= 20210300)
if (sycl::ONEAPI::any_of(item.get_sub_group(), msk > mypriority)) {
#else
if (sycl::any_of_group(item.get_sub_group(), msk > mypriority)) {
#endif
if (m) *m = 0; // yield
#if (__INTEL_LLVM_COMPILER <= 20210300)
item.mem_fence(sycl::access::fence_space::global_and_local);
#else
sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
#endif
to_try = 1;
} else {
to_try = (msk > 0); // hold on to my lock
Expand Down
13 changes: 9 additions & 4 deletions Src/Base/AMReX_GpuPrint.H
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,15 @@

#if defined(AMREX_USE_DPCPP)
#if defined(__SYCL_DEVICE_ONLY__)
# define AMREX_DEVICE_PRINTF(format,...) { \
static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
sycl::ONEAPI::experimental::printf(amrex_i_format, __VA_ARGS__); }
# if (__INTEL_LLVM_COMPILER <= 20210300)
# define AMREX_DEVICE_PRINTF(format,...) { \
static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
sycl::ONEAPI::experimental::printf(amrex_i_format, __VA_ARGS__); }
# else
# define AMREX_DEVICE_PRINTF(format,...) { \
static const __attribute__((opencl_constant)) char amrex_i_format[] = format ; \
sycl::ext::oneapi::experimental::printf(amrex_i_format, __VA_ARGS__); }
# endif
#else
# define AMREX_DEVICE_PRINTF(format,...) { \
std::printf(format, __VA_ARGS__); }
Expand All @@ -27,4 +33,3 @@

#endif // !defined(__APPLE__)
#endif // AMREX_GPU_PRINT_H_

12 changes: 12 additions & 0 deletions Src/Base/AMReX_GpuQualifiers.H
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,23 @@
#define AMREX_DEVICE_COMPILE (__CUDA_ARCH__ || __HIP_DEVICE_COMPILE__ || __SYCL_DEVICE_ONLY__)

#ifdef AMREX_USE_DPCPP

# include <CL/sycl.hpp>

namespace amrex {
# if (__INTEL_LLVM_COMPILER <= 20210300)
namespace oneapi = sycl::ONEAPI;
# else
namespace oneapi = sycl::ext::oneapi;
# endif
}

# define AMREX_REQUIRE_SUBGROUP_SIZE(x) \
_Pragma("clang diagnostic push") \
_Pragma("clang diagnostic ignored \"-Wattributes\"") \
[[intel::reqd_sub_group_size(x)]] \
_Pragma("clang diagnostic pop")

#endif // AMREX_USE_DPCPP

#endif
6 changes: 3 additions & 3 deletions Src/Base/AMReX_GpuReduce.H
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ template <int warpSize, typename T, typename F>
struct warpReduce
{
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T operator() (T x, sycl::ONEAPI::sub_group const& sg) const noexcept
T operator() (T x, amrex::oneapi::sub_group const& sg) const noexcept
{
for (int offset = warpSize/2; offset > 0; offset /= 2) {
T y = sg.shuffle_down(x, offset);
Expand All @@ -70,7 +70,7 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
T* shared = (T*)h.local;
int tid = h.item->get_local_id(0);
sycl::ONEAPI::sub_group const& sg = h.item->get_sub_group();
amrex::oneapi::sub_group const& sg = h.item->get_sub_group();
int lane = sg.get_local_id()[0];
int wid = sg.get_group_id()[0];
int numwarps = sg.get_group_range()[0];
Expand All @@ -93,7 +93,7 @@ AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
Gpu::Handler const& handler)
{
sycl::ONEAPI::sub_group const& sg = handler.item->get_sub_group();
amrex::oneapi::sub_group const& sg = handler.item->get_sub_group();
int wid = sg.get_group_id()[0];
if ((wid+1)*warpSize <= handler.numActiveThreads) {
x = warp_reduce(x, sg); // full warp
Expand Down
6 changes: 6 additions & 0 deletions Src/Base/AMReX_ParmParse.H
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,12 @@ public:
* for this particular ParmParse object.
*/
explicit ParmParse (const std::string& prefix = std::string());

// To avoid error: definition of implicit copy constructor for
// 'ParmParse' is deprecated because it has a user-declared copy
// assignment operator.
ParmParse (ParmParse const& rhs) = default;

//! Returns true if name is in table.
bool contains (const char* name) const;
/**
Expand Down
39 changes: 31 additions & 8 deletions Src/Base/AMReX_Scan.H
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct BlockStatus<T, true>
Data<T> d;

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
#if defined(AMREX_USE_DPCPP)
#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
void write (char a_status, T a_value, sycl::nd_item<1> const& /*item*/) {
#else
void write (char a_status, T a_value) {
Expand Down Expand Up @@ -102,15 +102,19 @@ struct BlockStatus<T, true>
void set_status (char a_status) { d.s.status = a_status; }

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
#if defined(AMREX_USE_DPCPP)
#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
STVA<T> wait (sycl::nd_item<1> const& item) volatile {
#else
STVA<T> wait () volatile {
#endif
STVA<T> r;
do {
#if defined(AMREX_USE_DPCPP)
#if (__INTEL_LLVM_COMPILER <= 20210300)
item.mem_fence();
#else
sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
#endif
#else
__threadfence_block();
#endif
Expand All @@ -128,7 +132,7 @@ struct BlockStatus<T, false>
char status;

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
#if defined(AMREX_USE_DPCPP)
#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
void write (char a_status, T a_value, sycl::nd_item<1> const& item) {
#else
void write (char a_status, T a_value) {
Expand All @@ -139,7 +143,11 @@ struct BlockStatus<T, false>
inclusive = a_value;
}
#if defined(AMREX_USE_DPCPP)
#if (__INTEL_LLVM_COMPILER <= 20210300)
item.mem_fence(sycl::access::fence_space::global_and_local);
#else
sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
#endif
#else
__threadfence();
#endif
Expand All @@ -164,7 +172,7 @@ struct BlockStatus<T, false>
void set_status (char a_status) { status = a_status; }

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
#if defined(AMREX_USE_DPCPP)
#if defined(AMREX_USE_DPCPP) && (__INTEL_LLVM_COMPILER <= 20210300)
STVA<T> wait (sycl::nd_item<1> const& item) volatile {
#else
STVA<T> wait () volatile {
Expand All @@ -173,7 +181,11 @@ struct BlockStatus<T, false>
do {
r = read();
#if defined(AMREX_USE_DPCPP)
#if (__INTEL_LLVM_COMPILER <= 20210300)
item.mem_fence(sycl::access::fence_space::global_and_local);
#else
sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
#endif
#else
__threadfence();
#endif
Expand Down Expand Up @@ -227,7 +239,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
amrex::launch(nblocks, nthreads, sm, stream,
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
{
sycl::ONEAPI::sub_group const& sg = gh.item->get_sub_group();
amrex::oneapi::sub_group const& sg = gh.item->get_sub_group();
int lane = sg.get_local_id()[0];
int warp = sg.get_group_id()[0];
int nwarps = sg.get_group_range()[0];
Expand Down Expand Up @@ -323,7 +335,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
// sum_prev_chunk now holds the sum of the whole block.
if (threadIdxx == 0 && gridDimx > 1) {
block_status.write((virtual_block_id == 0) ? 'p' : 'a',
sum_prev_chunk, *gh.item);
sum_prev_chunk
#if (__INTEL_LLVM_COMPILER <= 20210300)
, *gh.item
#endif
);
}

if (virtual_block_id == 0) {
Expand All @@ -345,7 +361,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
int iblock = iblock0-lane;
detail::STVA<T> stva{'p', 0};
if (iblock >= 0) {
#if (__INTEL_LLVM_COMPILER <= 20210300)
stva = pbs[iblock].wait(*gh.item);
#else
stva = pbs[iblock].wait();
#endif
}

T x = stva.value;
Expand Down Expand Up @@ -382,8 +402,11 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
}

if (lane == 0) {
block_status.write('p', block_status.get_aggregate() + exclusive_prefix,
*gh.item);
block_status.write('p', block_status.get_aggregate() + exclusive_prefix
#if (__INTEL_LLVM_COMPILER <= 20210300)
, *gh.item
#endif
);
shared[0] = exclusive_prefix;
}
}
Expand Down

0 comments on commit 6f2dc1d

Please sign in to comment.