amrex::fillAsync (#3076)
This new function can be used to fill the elements of a vector-type
container (e.g., Gpu::DeviceVector). If the element type is a struct
containing several arithmetic members (e.g., GpuArray<Real,10>), the usual
ParallelFor performs poorly because of its memory access pattern. In that
case the new fillAsync function uses shared memory to improve performance.
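
A minimal usage sketch (not part of this commit) for the GpuArray<Real,10> case mentioned above, assuming the function is called through the Gpu namespace shown in the hunk below and that the element count n is defined elsewhere:

    Gpu::DeviceVector<GpuArray<Real,10>> v(n);
    Gpu::fillAsync(v.begin(), v.end(),
        [=] AMREX_GPU_DEVICE (GpuArray<Real,10>& a, Long i)
        {
            // Fill every component of element i; the whole struct is
            // written by one callable invocation.
            for (int j = 0; j < 10; ++j) { a[j] = Real(i); }
        });
    Gpu::streamSynchronize(); // fillAsync is asynchronous in GPU builds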
WeiqunZhang authored Dec 23, 2022
1 parent 9c76f9a commit ea7e8a5
Showing 1 changed file with 99 additions and 1 deletion.

Src/Base/AMReX_GpuContainers.H
@@ -402,7 +402,105 @@ namespace Gpu {
Gpu::streamSynchronize();
}

/**
* \brief Fill the elements in the given range using the given
* callable.
*
* This function is asynchronous for GPU builds.
*
* \tparam IT the iterator type
* \tparam F the callable type
*
* \param first the inclusive start of the range [first, last)
* \param last the exclusive end of the range [first, last)
* \param f the callable with the function signature of void(T&, Long),
* where T is the element type and the Long parameter is the
* index for the element to be filled.
*/
template <typename IT, typename F,
typename T = typename std::iterator_traits<IT>::value_type,
std::enable_if_t<(sizeof(T) <= 36*8) && // so there is enough shared memory
std::is_trivially_copyable_v<T> &&
// std::is_invocable_v<F, T&, Long>, // HIP does not like this.
!std::is_convertible_v<std::decay_t<F>,T>, // So we use this instead.
int> FOO = 0>
void fillAsync (IT first, IT last, F&& f) noexcept
{
auto N = static_cast<Long>(std::distance(first, last));
if (N <= 0) return;
auto p = &(*first);
#ifndef AMREX_USE_GPU
for (Long i = 0; i < N; ++i) {
f(p[i], i);
}
#else
// No need to use shared memory if the type is small.
// May not have enough shared memory if the type is too big.
// Cannot use shared memory if the type is not trivially copyable.
if constexpr ((sizeof(T) <= 8)
|| (sizeof(T) > 36*8)
|| ! std::is_trivially_copyable<T>()) {
amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept
{
f(p[i], i);
});
} else {
static_assert(sizeof(T) % sizeof(unsigned int) == 0);
using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,
unsigned long long, unsigned int>;
constexpr Long nU = sizeof(T) / sizeof(U);
auto pu = reinterpret_cast<U*>(p);
int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
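// Strategy: each thread placement-constructs one T in shared memory and
// applies f to it. After a block-wide barrier, the block copies the staged
// buffer back to global memory in U-sized words, with consecutive threads
// writing consecutive words, so the stores are coalesced.
// The shared memory request stays modest: at most 128 threads * 288 bytes
// (the 36*8 cap above) = 36 KB, or 256 threads * 64 bytes = 16 KB for small T.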
#ifdef AMREX_USE_DPCPP
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
{
Long i = handler.globalIdx();
Long blockDimx = handler.blockDim();
Long threadIdxx = handler.threadIdx();
Long blockIdxx = handler.blockIdx();
auto const shared_U = (U*)handler.sharedMemory();
auto const shared_T = (T*)shared_U;
if (i < N) {
auto ga = new(shared_T+threadIdxx) T;
f(*ga, i);
}
handler.sharedBarrier();
for (Long m = threadIdxx,
mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
m < mend; m += blockDimx) {
pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
}
});
#else
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
{
Long blockDimx = blockDim.x;
Long threadIdxx = threadIdx.x;
Long blockIdxx = blockIdx.x;
Long i = blockDimx*blockIdxx + threadIdxx;
Gpu::SharedMemory<U> gsm;
auto const shared_U = gsm.dataPtr();
auto const shared_T = (T*)shared_U;
if (i < N) {
auto ga = new(shared_T+threadIdxx) T;
f(*ga, i);
}
__syncthreads();
for (Long m = threadIdxx,
mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
m < mend; m += blockDimx) {
pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
}
});
#endif
}
#endif
}

}}

#endif
