amrex::fillAsync (#3076)
This new function can be used to fill the elements of a vector-type
container (e.g., Gpu::DeviceVector). If the element type is a struct
containing several arithmetic members (e.g., GpuArray<Real,10>), the usual
ParallelFor performs poorly because of its memory access pattern. In that
case the new fillAsync function uses shared memory to improve performance.
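
A minimal usage sketch (not part of this commit) for the GpuArray<Real,10> case mentioned above, assuming the function is called through the Gpu namespace shown in the hunk below and that the element count n is defined elsewhere:

    Gpu::DeviceVector<GpuArray<Real,10>> v(n);
    Gpu::fillAsync(v.begin(), v.end(),
        [=] AMREX_GPU_DEVICE (GpuArray<Real,10>& a, Long i)
        {
            // Fill every component of element i; the whole struct is
            // written by one callable invocation.
            for (int j = 0; j < 10; ++j) { a[j] = Real(i); }
        });
    Gpu::streamSynchronize(); // fillAsync is asynchronous in GPU builds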
WeiqunZhang authored Dec 23, 2022
1 parent 9c76f9a commit ea7e8a5
Showing 1 changed file with 99 additions and 1 deletion.

Src/Base/AMReX_GpuContainers.H
@@ -402,7 +402,105 @@ namespace Gpu {
Gpu::streamSynchronize();
}

/**
* \brief Fill the elements in the given range using the given
* callable.
*
* This function is asynchronous for GPU builds.
*
* \tparam IT the iterator type
* \tparam F the callable type
*
* \param first the inclusive start of the range [first, last)
* \param last the exclusive end of the range [first, last)
* \param f the callable with the function signature of void(T&, Long),
* where T is the element type and the Long parameter is the
* index for the element to be filled.
*/
template <typename IT, typename F,
typename T = typename std::iterator_traits<IT>::value_type,
std::enable_if_t<(sizeof(T) <= 36*8) && // so there is enough shared memory
std::is_trivially_copyable_v<T> &&
// std::is_invocable_v<F, T&, Long>, // HIP does not like this.
!std::is_convertible_v<std::decay_t<F>,T>, // So we use this instead.
int> FOO = 0>
void fillAsync (IT first, IT last, F&& f) noexcept
{
auto N = static_cast<Long>(std::distance(first, last));
if (N <= 0) return;
auto p = &(*first);
#ifndef AMREX_USE_GPU
for (Long i = 0; i < N; ++i) {
f(p[i], i);
}
#else
// No need to use shared memory if the type is small.
// May not have enough shared memory if the type is too big.
// Cannot use shared memory if the type is not trivially copyable.
if constexpr ((sizeof(T) <= 8)
|| (sizeof(T) > 36*8)
|| ! std::is_trivially_copyable<T>()) {
amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept
{
f(p[i], i);
});
} else {
static_assert(sizeof(T) % sizeof(unsigned int) == 0);
using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,
unsigned long long, unsigned int>;
constexpr Long nU = sizeof(T) / sizeof(U);
auto pu = reinterpret_cast<U*>(p);
int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
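// Strategy: each thread placement-constructs one T in shared memory and
// applies f to it. After a block-wide barrier, the block copies the staged
// buffer back to global memory in U-sized words, with consecutive threads
// writing consecutive words, so the stores are coalesced.
// The shared memory request stays modest: at most 128 threads * 288 bytes
// (the 36*8 cap above) = 36 KB, or 256 threads * 64 bytes = 16 KB for small T.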
#ifdef AMREX_USE_DPCPP
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
{
Long i = handler.globalIdx();
Long blockDimx = handler.blockDim();
Long threadIdxx = handler.threadIdx();
Long blockIdxx = handler.blockIdx();
auto const shared_U = (U*)handler.sharedMemory();
auto const shared_T = (T*)shared_U;
if (i < N) {
auto ga = new(shared_T+threadIdxx) T;
f(*ga, i);
}
handler.sharedBarrier();
for (Long m = threadIdxx,
mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
m < mend; m += blockDimx) {
pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
}
});
#else
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
{
Long blockDimx = blockDim.x;
Long threadIdxx = threadIdx.x;
Long blockIdxx = blockIdx.x;
Long i = blockDimx*blockIdxx + threadIdxx;
Gpu::SharedMemory<U> gsm;
auto const shared_U = gsm.dataPtr();
auto const shared_T = (T*)shared_U;
if (i < N) {
auto ga = new(shared_T+threadIdxx) T;
f(*ga, i);
}
__syncthreads();
for (Long m = threadIdxx,
mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
m < mend; m += blockDimx) {
pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
}
});
#endif
}
#endif
}

}}

#endif
