diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H
index 5e3ce12064d..27c902fc0e9 100644
--- a/Src/Base/AMReX_GpuContainers.H
+++ b/Src/Base/AMReX_GpuContainers.H
@@ -402,7 +402,105 @@ namespace Gpu {
         Gpu::streamSynchronize();
     }
 
-}}
+    /**
+     * \brief Fill the elements in the given range using the given
+     * callable.
+     *
+     * This function is asynchronous for GPU builds.
+     *
+     * \tparam IT the iterator type
+     * \tparam F the callable type
+     *
+     * \param first the inclusive first in the range [first, last)
+     * \param last the exclusive last in the range [first, last)
+     * \param f the callable with the function signature of void(T&, Long),
+     *          where T is the element type and the Long parameter is the
+     *          index for the element to be filled.
+     */
+    template <typename IT, typename F,
+              typename T = typename std::iterator_traits<IT>::value_type,
+              std::enable_if_t<(sizeof(T) <= 36*8) && // so there is enough shared memory
+                               std::is_trivially_copyable_v<T> &&
+                               // std::is_invocable_v<F, T&, Long>, // HIP does not like this.
+                               !std::is_convertible_v<std::decay_t<F>,T>, // So we use this instead.
+                               int> FOO = 0>
+    void fillAsync (IT first, IT last, F&& f) noexcept
+    {
+        auto N = static_cast<Long>(std::distance(first, last));
+        if (N <= 0) return;
+        auto p = &(*first);
+#ifndef AMREX_USE_GPU
+        for (Long i = 0; i < N; ++i) {
+            f(p[i], i);
+        }
+#else
+        // No need to use shared memory if the type is small.
+        // May not have enough shared memory if the type is too big.
+        // Cannot use shared memory if the type is not trivially copyable.
+        if constexpr ((sizeof(T) <= 8)
+                      || (sizeof(T) > 36*8)
+                      || ! std::is_trivially_copyable<T>()) {
+            amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept
+            {
+                f(p[i], i);
+            });
+        } else {
+            static_assert(sizeof(T) % sizeof(unsigned int) == 0);
+            using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,
+                                         unsigned long long, unsigned int>;
+            constexpr Long nU = sizeof(T) / sizeof(U);
+            auto pu = reinterpret_cast<U*>(p);
+            int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
+            int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
+            std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
+#ifdef AMREX_USE_DPCPP
+            amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+                          [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
+            {
+                Long i = handler.globalIdx();
+                Long blockDimx = handler.blockDim();
+                Long threadIdxx = handler.threadIdx();
+                Long blockIdxx = handler.blockIdx();
+                auto const shared_U = (U*)handler.sharedMemory();
+                auto const shared_T = (T*)shared_U;
+                if (i < N) {
+                    auto ga = new(shared_T+threadIdxx) T;
+                    f(*ga, i);
+                }
+                handler.sharedBarrier();
+                for (Long m = threadIdxx,
+                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
+                     m < mend; m += blockDimx) {
+                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
+                }
+            });
+#else
+            amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+                          [=] AMREX_GPU_DEVICE () noexcept
+            {
+                Long blockDimx = blockDim.x;
+                Long threadIdxx = threadIdx.x;
+                Long blockIdxx = blockIdx.x;
+                Long i = blockDimx*blockIdxx + threadIdxx;
+                Gpu::SharedMemory<U> gsm;
+                auto const shared_U = gsm.dataPtr();
+                auto const shared_T = (T*)shared_U;
+                if (i < N) {
+                    auto ga = new(shared_T+threadIdxx) T;
+                    f(*ga, i);
+                }
+                __syncthreads();
+                for (Long m = threadIdxx,
+                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
+                     m < mend; m += blockDimx) {
+                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
+                }
+            });
+#endif
+        }
+#endif
+    }
+}}
 
 #endif
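
Usage sketch (illustrative, not part of the patch): on the shared-memory path, each thread constructs its element in shared memory with placement new and the block then copies the results to global memory in word-sized chunks, so the caller only supplies the void(T&, Long) callable documented above. A minimal, hypothetical call could look like the following; the vector name v, the size n, and the fill formula are made up for illustration, and the final streamSynchronize is there because the fill is asynchronous on GPU builds.

    #include <AMReX.H>
    #include <AMReX_Gpu.H>
    #include <AMReX_GpuContainers.H>

    int main (int argc, char* argv[])
    {
        amrex::Initialize(argc, argv);
        {
            // Hypothetical example: fill a device vector so that v[i] = 2*i.
            // The lambda matches the documented void(T&, Long) signature.
            const amrex::Long n = 1000;
            amrex::Gpu::DeviceVector<amrex::Real> v(n);
            amrex::Gpu::fillAsync(v.begin(), v.end(),
                [=] AMREX_GPU_DEVICE (amrex::Real& x, amrex::Long i) noexcept
                {
                    x = amrex::Real(2.0) * static_cast<amrex::Real>(i);
                });
            amrex::Gpu::streamSynchronize(); // fillAsync is asynchronous on GPU builds
        }
        amrex::Finalize();
    }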