Match the dimensions of reqd_work_group_size to submitted nd_range (AMReX-Codes#4002)

## Summary
According to the SYCL specification, the number of arguments passed to `reqd_work_group_size`
must match the dimensionality of the work-group used to invoke the kernel. These kernels are
submitted with a one-dimensional `sycl::nd_range<1>`, so the attribute now takes a single
argument instead of three.
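
For illustration, a minimal sketch of the corrected usage, assuming a hypothetical standalone
kernel (the names `MT`, `nblocks`, and the kernel body are not from this commit): the kernel is
submitted with a one-dimensional `sycl::nd_range`, so `reqd_work_group_size` takes exactly one
argument.

```c++
// Hypothetical standalone example, not AMReX code: a 1-D nd_range submission,
// so reqd_work_group_size takes one argument that matches the local range.
#include <sycl/sycl.hpp>

int main ()
{
    constexpr int MT = 256;      // assumed work-group (local) size
    constexpr int nblocks = 4;   // assumed number of work-groups
    sycl::queue q;
    int* p = sycl::malloc_device<int>(nblocks*MT, q);

    q.submit([&] (sycl::handler& h) {
        h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nblocks*MT),
                                         sycl::range<1>(MT)),
                       [=] (sycl::nd_item<1> item)
                       [[sycl::reqd_work_group_size(MT)]]  // one argument for a 1-D nd_range
        {
            // trivial body: record each work-item's local id
            p[item.get_global_id(0)] = static_cast<int>(item.get_local_id(0));
        });
    });
    q.wait();

    sycl::free(p, q);
    return 0;
}
```

With the old three-argument form, the attribute describes a 3-D work-group while the kernel is
invoked through a 1-D `nd_item`, a mismatch that a conforming implementation may reject at
submission time.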

## Additional background

https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:kernel.attributes

## Checklist

The proposed changes:
- [x] fix a bug or incorrect behavior in AMReX
- [ ] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX users
- [ ] include documentation in the code and/or rst files, if appropriate
zhaomaosu authored Jun 25, 2024
1 parent 27b399a commit 463bdf4
Showing 3 changed files with 19 additions and 19 deletions.
32 changes: 16 additions & 16 deletions Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -79,7 +79,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(MT)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
@@ -100,7 +100,7 @@ void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(MT)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
f(item);
@@ -124,7 +124,7 @@ void launch (T const& n, L const& f) noexcept
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
@@ -203,7 +203,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
@@ -220,7 +220,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
@@ -252,7 +252,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -270,7 +270,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -303,7 +303,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const&
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -322,7 +322,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const&
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -353,7 +353,7 @@ void ParallelForRNG (T n, L const& f) noexcept
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
+[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
auto const tid = item.get_global_id(0);
@@ -387,7 +387,7 @@ void ParallelForRNG (Box const& box, L const& f) noexcept
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
+[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
auto const tid = item.get_global_id(0);
@@ -423,7 +423,7 @@ void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
+[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
auto const tid = item.get_global_id(0);
@@ -460,7 +460,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
@@ -500,7 +500,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
@@ -545,7 +545,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
@@ -593,7 +593,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
-[[sycl::reqd_work_group_size(1,1,MT)]]
+[[sycl::reqd_work_group_size(MT)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
{
auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
4 changes: 2 additions & 2 deletions Src/Base/AMReX_GpuLaunchMacrosG.nolint.H
@@ -16,7 +16,7 @@
amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
sycl::range<1>(amrex_i_nthreads_per_block)), \
[=] (sycl::nd_item<1> amrex_i_item) \
-[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] \
+[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
{ \
for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
@@ -266,7 +266,7 @@
amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
sycl::range<1>(amrex_i_nthreads_per_block)), \
[=] (sycl::nd_item<1> amrex_i_item) \
-[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] \
+[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
{ \
for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
2 changes: 1 addition & 1 deletion Src/Base/AMReX_TagParallelFor.H
@@ -211,7 +211,7 @@ ParallelFor_doit (Vector<TagType> const& tags, F && f)
amrex::launch(nblocks, nthreads, Gpu::gpuStream(),
#ifdef AMREX_USE_SYCL
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
-[[sycl::reqd_work_group_size(1,1,nthreads)]]
+[[sycl::reqd_work_group_size(nthreads)]]
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
#else
[=] AMREX_GPU_DEVICE () noexcept
