diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H
index 30af9c9cfc0..e7449a11bf1 100644
--- a/Src/Base/AMReX_BaseFabUtility.H
+++ b/Src/Base/AMReX_BaseFabUtility.H
@@ -53,14 +53,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
             std::uint64_t const threadIdxx = handler.threadIdx();
             std::uint64_t const blockIdxx = handler.blockIdx();
             auto const shared = (T*)handler.sharedMemory();
-            if (icell < ntotcells) {
+            if (icell < indexer.numPts()) {
                 auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
                 auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
             handler.sharedBarrier();
             for (std::uint64_t m = threadIdxx,
-                     mend = amrex::min(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE;
+                     mend = amrex::min(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
                  m < mend; m += blockDimx) {
                 p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
             }
@@ -72,14 +72,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
             std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
             Gpu::SharedMemory<T> gsm;
             T* const shared = gsm.dataPtr();
-            if (icell < ntotcells) {
+            if (icell < indexer.numPts()) {
                 auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
                 auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
             __syncthreads();
             for (std::uint64_t m = threadIdx.x,
-                     mend = amrex::min(blockDim.x, ntotcells-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
+                     mend = amrex::min(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
                  m < mend; m += blockDim.x) {
                 p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
             }
diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H
index e7f6f2d5a48..82a2ad9cd13 100644
--- a/Src/Base/AMReX_Box.H
+++ b/Src/Base/AMReX_Box.H
@@ -1842,13 +1842,16 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy
 
 struct BoxIndexer
 {
+    std::uint64_t npts;
+
 #if (AMREX_SPACEDIM == 3)
     Math::FastDivmodU64 fdxy;
     Math::FastDivmodU64 fdx;
     IntVect lo;
 
     BoxIndexer (Box const& box)
-        : fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
+        : npts(box.numPts()),
+          fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
           fdx (std::uint64_t(box.length(0))),
           lo (box.smallEnd())
     {}
@@ -1877,7 +1880,8 @@ struct BoxIndexer
     IntVect lo;
 
     BoxIndexer (Box const& box)
-        : fdx (std::uint64_t(box.length(0))),
+        : npts(box.numPts()),
+          fdx (std::uint64_t(box.length(0))),
           lo (box.smallEnd())
     {}
 
@@ -1902,7 +1906,8 @@ struct BoxIndexer
     int lo;
 
     BoxIndexer (Box const& box)
-        : lo(box.smallEnd(0))
+        : npts(box.numPts()),
+          lo(box.smallEnd(0))
     {}
 
     [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
@@ -1918,6 +1923,9 @@ struct BoxIndexer
     }
 
 #endif
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    std::uint64_t numPts () const { return npts; }
 };
 
 }
diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H
index e2cf0ed9641..21029eff840 100644
--- a/Src/Base/AMReX_FabArrayBase.H
+++ b/Src/Base/AMReX_FabArrayBase.H
@@ -651,7 +651,7 @@ public:
         ~ParForInfo ();
 
         std::pair<int*,int*> const& getBlocks () const { return m_nblocks_x; }
-        Box const* getBoxes () const { return m_boxes; }
+        BoxIndexer const* getBoxes () const { return m_boxes; }
 
         ParForInfo () = delete;
         ParForInfo (ParForInfo const&) = delete;
@@ -663,7 +663,7 @@ public:
         IntVect m_ng;
         int m_nthreads;
         std::pair<int*,int*> m_nblocks_x;
-        Box* m_boxes = nullptr;
+        BoxIndexer* m_boxes = nullptr;
         char* m_hp = nullptr;
         char* m_dp = nullptr;
    };
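Note on the Box.H / FabArrayBase.H changes above: with the cell count cached in npts, BoxIndexer becomes the single object a kernel captures by value. It answers both "how many cells does this box have" (numPts()) and "which (i,j,k) is flat index icell" (operator(), using Math::FastDivmodU64 where a divide is needed). Since numPts() and operator() are AMREX_GPU_HOST_DEVICE, the same two calls can be exercised on the host. The sketch below is an illustration only; the helper name is made up and is not part of this patch:

    #include <cstdint>
    #include <AMReX_Box.H>
    #include <AMReX_IntVect.H>
    #include <AMReX_BLassert.H>

    // Illustration: the cached numPts() and the decoded (i,j,k) stay
    // consistent with the Box the indexer was built from.
    void check_box_indexer (amrex::Box const& bx)
    {
        amrex::BoxIndexer indexer(bx);
        AMREX_ALWAYS_ASSERT(indexer.numPts() == std::uint64_t(bx.numPts()));
        for (std::uint64_t icell = 0; icell < indexer.numPts(); ++icell) {
            auto [i, j, k] = indexer(icell);   // flat index -> cell, FastDivmodU64-based in 3D
            AMREX_ALWAYS_ASSERT(bx.contains(amrex::IntVect(AMREX_D_DECL(i, j, k))));
        }
    }

This is also why the kernels changed below can drop their separately captured ntotcells/ncells values: the indexer they already capture carries the same information.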
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index a280968c773..5e28ba20884 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -239,9 +239,8 @@ template
 void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -257,9 +256,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
-                         icell < ncells; icell += stride) {
+                         icell < indexer.numPts(); icell += stride) {
                         auto [i, j, k] = indexer(icell);
-                        int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)),
+                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                           std::uint64_t(item.get_local_range(0)));
                         detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(),
                                                                 n_active_threads});
@@ -275,7 +274,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
-                         icell < ncells; icell += stride) {
+                         icell < indexer.numPts(); icell += stride) {
                         auto [i, j, k] = indexer(icell);
                         detail::call_f(f,i,j,k,Gpu::Handler{&item});
                     }
@@ -291,9 +290,8 @@ template
 void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -309,9 +307,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
-                         icell < ncells; icell += stride) {
+                         icell < indexer.numPts(); icell += stride) {
                         auto [i, j, k] = indexer(icell);
-                        int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)),
+                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                           std::uint64_t(item.get_local_range(0)));
                         detail::call_f(f, i, j, k, ncomp,
                                        Gpu::Handler{&item, shared_data.get_multi_ptr().get(),
@@ -328,7 +326,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
-                         icell < ncells; icell += stride) {
+                         icell < indexer.numPts(); icell += stride) {
                         auto [i, j, k] = indexer(icell);
                         detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item});
                     }
@@ -377,9 +375,8 @@ template
 void ParallelForRNG (Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::ExecutionConfig(ncells);
+    const auto ec = Gpu::ExecutionConfig(box.numPts());
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
     auto& q = Gpu::Device::streamQueue();
@@ -397,7 +394,7 @@ void ParallelForRNG (Box const& box, L&& f) noexcept
                 auto engine = engine_acc.load(tid);
                 RandomEngine rand_eng{&engine};
                 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
-                     icell < ncells; icell += stride) {
+                     icell < indexer.numPts(); icell += stride) {
                     auto [i, j, k] = indexer(icell);
                     f(i,j,k,rand_eng);
                 }
@@ -414,9 +411,8 @@ template
 void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -470,13 +463,14 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
+                    auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        if (icell < ncells1) {
+                        if (icell < indexer1.numPts()) {
                             auto [i, j, k] = indexer1(icell);
                             f1(i,j,k);
                         }
-                        if (icell < ncells2) {
+                        if (icell < indexer2.numPts()) {
                             auto [i, j, k] = indexer2(icell);
                             f2(i,j,k);
                         }
@@ -494,14 +488,10 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells3 = std::uint64_t(box3.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2, ncells3);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
     const BoxIndexer indexer3(box3);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -513,17 +503,18 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
+                    auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        if (icell < ncells1) {
+                        if (icell < indexer1.numPts()) {
                             auto [i, j, k] = indexer1(icell);
                             f1(i,j,k);
                         }
-                        if (icell < ncells2) {
+                        if (icell < indexer2.numPts()) {
                             auto [i, j, k] = indexer2(icell);
                             f2(i,j,k);
                         }
-                        if (icell < ncells3) {
+                        if (icell < indexer3.numPts()) {
                             auto [i, j, k] = indexer3(icell);
                             f3(i,j,k);
                         }
@@ -543,12 +534,9 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  Box const& box2, T2 ncomp2, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -560,15 +548,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
+                    auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        if (icell < ncells1) {
+                        if (icell < indexer1.numPts()) {
                             auto [i, j, k] = indexer1(icell);
                             for (T1 n = 0; n < ncomp1; ++n) {
                                 f1(i,j,k,n);
                             }
                         }
-                        if (icell < ncells2) {
+                        if (icell < indexer2.numPts()) {
                             auto [i, j, k] = indexer2(icell);
                             for (T2 n = 0; n < ncomp2; ++n) {
                                 f2(i,j,k,n);
@@ -592,14 +581,10 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  Box const& box3, T3 ncomp3, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells3 = std::uint64_t(box3.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2, ncells3);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
     const BoxIndexer indexer3(box3);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -611,21 +596,22 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
+                    auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        if (icell < ncells1) {
+                        if (icell < indexer1.numPts()) {
                             auto [i, j, k] = indexer1(icell);
                             for (T1 n = 0; n < ncomp1; ++n) {
                                 f1(i,j,k,n);
                             }
                         }
-                        if (icell < ncells2) {
+                        if (icell < indexer2.numPts()) {
                             auto [i, j, k] = indexer2(icell);
                             for (T2 n = 0; n < ncomp2; ++n) {
                                 f2(i,j,k,n);
                             }
                         }
-                        if (icell < ncells3) {
+                        if (icell < indexer3.numPts()) {
                             auto [i, j, k] = indexer3(icell);
                             for (T3 n = 0; n < ncomp3; ++n) {
                                 f3(i,j,k,n);
@@ -768,16 +754,15 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < ncells; icell += stride)
+             icell < indexer.numPts(); icell += stride)
         {
             auto [i, j, k] = indexer(icell);
-            detail::call_f(f, i, j, k, (ncells-icell+(std::uint64_t)threadIdx.x));
+            detail::call_f(f, i, j, k, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x));
         }
     });
     AMREX_GPU_ERROR_CHECK();
@@ -788,15 +773,14 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < ncells; icell += stride) {
+             icell < indexer.numPts(); icell += stride) {
             auto [i, j, k] = indexer(icell);
-            detail::call_f(f, i, j, k, ncomp, (ncells-icell+(std::uint64_t)threadIdx.x));
+            detail::call_f(f, i, j, k, ncomp, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x));
         }
     });
     AMREX_GPU_ERROR_CHECK();
@@ -829,16 +813,15 @@ ParallelForRNG (Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     randState_t* rand_state = getRandState();
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::ExecutionConfig(ncells);
+    const auto ec = Gpu::ExecutionConfig(box.numPts());
     AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto [i, j, k] = indexer(icell);
             f(i,j,k,engine);
         }
@@ -853,16 +836,15 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     randState_t* rand_state = getRandState();
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::ExecutionConfig(ncells);
+    const auto ec = Gpu::ExecutionConfig(box.numPts());
     AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto [i, j, k] = indexer(icell);
             for (T n = 0; n < ncomp; ++n) {
                 f(i,j,k,n,engine);
@@ -879,21 +861,19 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
+        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride
                  = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
-            if (icell < ncells1) {
+            if (icell < indexer1.numPts()) {
                 auto [i, j, k] = indexer1(icell);
                 f1(i,j,k);
             }
-            if (icell < ncells2) {
+            if (icell < indexer2.numPts()) {
                 auto [i, j, k] = indexer2(icell);
                 f2(i,j,k);
             }
@@ -909,27 +889,24 @@ ParallelFor (Gpu::KernelInfo const&,
              L1&& f1, L2&& f2, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells3 = std::uint64_t(box3.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2, ncells3);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
     const BoxIndexer indexer3(box3);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
+        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride
                  = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
-            if (icell < ncells1) {
+            if (icell < indexer1.numPts()) {
                 auto [i, j, k] = indexer1(icell);
                 f1(i,j,k);
             }
-            if (icell < ncells2) {
+            if (icell < indexer2.numPts()) {
                 auto [i, j, k] = indexer2(icell);
                 f2(i,j,k);
             }
-            if (icell < ncells3) {
+            if (icell < indexer3.numPts()) {
                 auto [i, j, k] = indexer3(icell);
                 f3(i,j,k);
             }
@@ -947,23 +924,21 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box2, T2 ncomp2, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
+        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride
                  = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
-            if (icell < ncells1) {
+            if (icell < indexer1.numPts()) {
                 auto [i, j, k] = indexer1(icell);
                 for (T1 n = 0; n < ncomp1; ++n) {
                     f1(i,j,k,n);
                 }
             }
-            if (icell < ncells2) {
+            if (icell < indexer2.numPts()) {
                 auto [i, j, k] = indexer2(icell);
                 for (T2 n = 0; n < ncomp2; ++n) {
                     f2(i,j,k,n);
@@ -985,31 +960,28 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box3, T3 ncomp3, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells3 = std::uint64_t(box3.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2, ncells3);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
     const BoxIndexer indexer3(box3);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
+        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride
                  = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
-            if (icell < ncells1) {
+            if (icell < indexer1.numPts()) {
                 auto [i, j, k] = indexer1(icell);
                 for (T1 n = 0; n < ncomp1; ++n) {
                     f1(i,j,k,n);
                 }
             }
-            if (icell < ncells2) {
+            if (icell < indexer2.numPts()) {
                 auto [i, j, k] = indexer2(icell);
                 for (T2 n = 0; n < ncomp2; ++n) {
                     f2(i,j,k,n);
                 }
            }
-            if (icell < ncells3) {
+            if (icell < indexer3.numPts()) {
                 auto [i, j, k] = indexer3(icell);
                 for (T3 n = 0; n < ncomp3; ++n) {
                     f3(i,j,k,n);
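All of the rewritten launch bodies in AMReX_GpuLaunchFunctsG.H follow the same grid-stride shape: the captured BoxIndexer supplies both the loop bound and the flat-index decode, so the kernels no longer carry a separate ncells capture that could disagree with the indexer. A stripped-down standalone analogue is shown below; it is an illustration only (the real kernels go through AMREX_LAUNCH_KERNEL and detail::call_f) and assumes compilation as CUDA/HIP device code:

    #include <cstdint>
    #include <AMReX_Box.H>

    // Minimal grid-stride kernel over one Box, in the style of the hunks above.
    template <typename F>
    __global__ void parallel_for_box (amrex::BoxIndexer indexer, F f)
    {
        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x + threadIdx.x,
                           stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < indexer.numPts(); icell += stride)
        {
            auto [i, j, k] = indexer(icell);   // 64-bit flat index -> cell (i,j,k)
            f(i, j, k);                        // f is any device-callable (i,j,k) functor
        }
    }

The two- and three-box variants only add per-box bounds checks (icell < indexerN.numPts()) inside the same loop, with the loop itself running to the largest of the cell counts.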
diff --git a/Src/Base/AMReX_MFParallelForG.H b/Src/Base/AMReX_MFParallelForG.H
index ba65b18937d..80260ca0eb5 100644
--- a/Src/Base/AMReX_MFParallelForG.H
+++ b/Src/Base/AMReX_MFParallelForG.H
@@ -12,36 +12,33 @@ namespace amrex {
 namespace detail {
 
 inline
-void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, Box*& pboxes,
+void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,
                             Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)
 {
     if (!ncells.empty()) {
         const int nboxes = ncells.size();
-        const std::size_t nbytes_boxes = amrex::aligned_size(16, (nboxes+1) * sizeof(int));
-        const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(Box);
+        const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));
+        const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);
         a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);
         int* hp_blks = (int*)a_hp;
-        Box* hp_boxes = (Box*)(a_hp + nbytes_boxes);
+        auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);
         hp_blks[0] = 0;
-        Long ntot = 0;
         bool same_size = true;
         for (int i = 0; i < nboxes; ++i) {
             Long nblocks = (ncells[i] + nthreads-1) / nthreads;
+            AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));
             hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);
-            ntot += nblocks;
 
             same_size = same_size && (ncells[i] == ncells[0]);
 
-            new (hp_boxes+i) Box(boxes[i]);
+            new (hp_boxes+i) BoxIndexer(boxes[i]);
         }
-        amrex::ignore_unused(ntot);
-        AMREX_ASSERT(static_cast<Long>(hp_blks[nboxes]) == ntot); // no overflow
 
         a_dp = (char*) The_Arena()->alloc(nbytes);
         Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);
 
         blocks_x.first = hp_blks;
         blocks_x.second = (same_size) ? nullptr : (int*)a_dp;
-        pboxes = (Box*)(a_dp + nbytes_boxes);
+        pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);
     }
 }
@@ -94,7 +91,7 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
     const int nblocks = par_for_blocks.first[nboxes];
     const int block_0_size = par_for_blocks.first[1];
     const int* dp_nblocks = par_for_blocks.second;
-    const Box* dp_boxes = parforinfo.getBoxes();
+    const BoxIndexer* dp_boxes = parforinfo.getBoxes();
 
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
 
@@ -102,13 +99,14 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
     amrex::launch_global<MT>
         <<<nblocks, MT, 0, Gpu::gpuStream()>>>
         ([=] AMREX_GPU_DEVICE () noexcept
         {
-            int ibox, icell;
+            int ibox;
+            std::uint64_t icell;
             if (dp_nblocks) {
                 ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));
-                icell = (blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
+                icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
             } else {
                 ibox = blockIdx.x / block_0_size;
-                icell = (blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
+                icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
             }
 #elif defined(AMREX_USE_SYCL)
@@ -116,27 +114,21 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
     amrex::launch(nblocks, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
     {
-        int ibox, icell;
+        int ibox;
+        std::uint64_t icell;
         int blockIdxx = item.get_group_linear_id();
         int threadIdxx = item.get_local_linear_id();
         if (dp_nblocks) {
             ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));
-            icell = (blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
+            icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
         } else {
             ibox = blockIdxx / block_0_size;
-            icell = (blockIdxx-ibox*block_0_size)*MT + threadIdxx;
+            icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;
         }
 #endif
-        Box const& b = dp_boxes[ibox];
-        int ncells = b.numPts();
-        if (icell < ncells) {
-            const auto len = amrex::length(b);
-            int k =  icell /   (len.x*len.y);
-            int j = (icell - k*(len.x*len.y)) /   len.x;
-            int i = (icell - k*(len.x*len.y)) - j*len.x;
-            AMREX_D_TERM(i += b.smallEnd(0);,
-                         j += b.smallEnd(1);,
-                         k += b.smallEnd(2);)
+        BoxIndexer const& indexer = dp_boxes[ibox];
+        if (icell < indexer.numPts()) {
+            auto [i, j, k] = indexer(icell);
             for (int n = 0; n < ncomp; ++n) {
                 parfor_mf_detail::call_f(f, ibox, i, j, k, n);
             }
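In the multi-box ParallelFor above, each GPU block is mapped to one box through the prefix-sum array built by build_par_for_nblocks, and the per-box cell index is now accumulated in 64 bits so that (block offset)*MT cannot overflow an int for very large boxes. A host-side sketch of that mapping follows; it is illustrative only (the helper name is invented, and the include for amrex::bisect is an assumption), mirroring the bisect call used in the kernels:

    #include <cstdint>
    #include <utility>
    #include <AMReX_Algorithm.H>

    // blks is the per-box block prefix sum (blks[0] = 0, blks[nboxes] = total
    // number of blocks), i.e. what ParForInfo::getBlocks() hands to the kernel.
    std::pair<int, std::uint64_t>
    block_to_box_and_cell (int const* blks, int nboxes, int block_id,
                           int threads_per_block, int thread_id)
    {
        int ibox = amrex::bisect(blks, 0, nboxes, block_id);      // which box owns this block
        std::uint64_t icell = std::uint64_t(block_id - blks[ibox]) * threads_per_block
                              + thread_id;                        // cell within that box
        return {ibox, icell};
    }

With the boxes now stored on the device as BoxIndexer objects, the hand-rolled div/mod decode that used to follow this mapping collapses into a single indexer(icell) call.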
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index aa1a4dd0122..bf49c3df0a8 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -376,11 +376,12 @@ public:
         const int nblocks = par_for_blocks.first[nboxes];
         const int block_0_size = par_for_blocks.first[1];
         const int* dp_nblocks = par_for_blocks.second;
-        const Box* dp_boxes = parforinfo.getBoxes();
+        const BoxIndexer* dp_boxes = parforinfo.getBoxes();
 
         auto const& stream = Gpu::gpuStream();
         auto pdst = reduce_data.devicePtr(stream);
         int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
+        AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
         reduce_data.nBlocks(stream) = nblocks_ec;
         reduce_data.updateMaxStreamIndex(stream);
 
@@ -405,25 +406,19 @@ public:
                     dst = r;
                 }
                 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
-                    int ibox, icell;
+                    int ibox;
+                    std::uint64_t icell;
                     if (dp_nblocks) {
                         ibox = amrex::bisect(dp_nblocks, 0, nboxes, iblock);
-                        icell = (iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                        icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
                     } else {
                         ibox = iblock / block_0_size;
-                        icell = (iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                        icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
                     }
-                    Box const& b = dp_boxes[ibox];
-                    int ncells = b.numPts();
-                    if (icell < ncells) {
-                        const auto len = amrex::length(b);
-                        int k =  icell /   (len.x*len.y);
-                        int j = (icell - k*(len.x*len.y)) /   len.x;
-                        int i = (icell - k*(len.x*len.y)) - j*len.x;
-                        AMREX_D_TERM(i += b.smallEnd(0);,
-                                     j += b.smallEnd(1);,
-                                     k += b.smallEnd(2););
+                    BoxIndexer const& indexer = dp_boxes[ibox];
+                    if (icell < indexer.numPts()) {
+                        auto [i, j, k] = indexer(icell);
                         Reduce::detail::mf_call_f (f, ibox, i, j, k, ncomp, r);
                     }
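The switch from int to std::uint64_t for icell in both AMReX_MFParallelForG.H and AMReX_Reduce.H, together with the new asserts on the block prefix sum, guards the case where a single box holds more cells than a 32-bit int can count. A self-contained arithmetic check of that limit (illustration only, not AMReX code):

    #include <cassert>
    #include <cstdint>

    int main ()
    {
        // A 2048^3 box has 8'589'934'592 cells.
        constexpr std::uint64_t ncells = 2048ULL * 2048ULL * 2048ULL;
        constexpr int nthreads = 256;   // threads per block
        constexpr std::uint64_t nblocks = (ncells + nthreads - 1) / nthreads;
        // The flattened index (block offset)*nthreads + tid exceeds INT32_MAX ...
        static_assert(nblocks * nthreads > std::uint64_t(INT32_MAX),
                      "a 32-bit int cannot address every cell of this box");
        // ... but fits comfortably in 64 bits:
        std::uint64_t icell = std::uint64_t(nblocks - 1) * nthreads;   // last block, thread 0
        assert(icell < ncells);
        return 0;
    }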