From 4d5972f9e280ca08c5f2ae03a330977d76a6ebe9 Mon Sep 17 00:00:00 2001 From: Xiao-Yong Jin Date: Thu, 3 Dec 2020 15:55:50 +0000 Subject: [PATCH 001/198] git ignore *.d and profile_* --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 58f2516546..e652edd6d9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.o +*.d *.f90 *.mod *.a @@ -10,6 +11,7 @@ milc_interface/* *.pyc tunecache.tsv profile.tsv +profile_*.tsv config.log CMakeCache.txt CMakeFiles From b8d4b2e779814d3be60622268f8b21366fe6c729 Mon Sep 17 00:00:00 2001 From: Xiao-Yong Jin Date: Thu, 3 Dec 2020 15:56:57 +0000 Subject: [PATCH 002/198] omptarget replicates cuda --- include/quda_api.h | 215 +++++ include/targets/omptarget/FFT_Plans.h | 137 +++ .../omptarget/block_reduction_kernel.h | 70 ++ include/targets/omptarget/kernel.h | 61 ++ include/targets/omptarget/quda_fp16.cuh | 21 + include/targets/omptarget/random_helper.h | 81 ++ include/targets/omptarget/reduction_kernel.h | 54 ++ include/targets/omptarget/target_device.h | 139 +++ lib/targets/omptarget/blas_lapack_cublas.cpp | 191 ++++ lib/targets/omptarget/device.cpp | 376 ++++++++ lib/targets/omptarget/malloc.cpp | 707 ++++++++++++++ lib/targets/omptarget/quda_api.cpp | 549 +++++++++++ lib/targets/omptarget/tune.cpp | 876 ++++++++++++++++++ 13 files changed, 3477 insertions(+) create mode 100644 include/targets/omptarget/FFT_Plans.h create mode 100644 include/targets/omptarget/block_reduction_kernel.h create mode 100644 include/targets/omptarget/kernel.h create mode 100644 include/targets/omptarget/quda_fp16.cuh create mode 100644 include/targets/omptarget/random_helper.h create mode 100644 include/targets/omptarget/reduction_kernel.h create mode 100644 include/targets/omptarget/target_device.h create mode 100644 lib/targets/omptarget/blas_lapack_cublas.cpp create mode 100644 lib/targets/omptarget/device.cpp create mode 100644 lib/targets/omptarget/malloc.cpp create mode 100644 lib/targets/omptarget/quda_api.cpp create mode 100644 lib/targets/omptarget/tune.cpp diff --git a/include/quda_api.h b/include/quda_api.h index 3f8b7bc9e5..2bf284386a 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -1,10 +1,225 @@ #pragma once +#ifdef QUDA_BACKEND_OMPTARGET + +#include +#include +#include +#include +#include +#include + +#include + +template +constexpr auto type_name() noexcept { + std::string_view name = "Error: unsupported compiler", prefix, suffix; +#ifdef __clang__ + name = __PRETTY_FUNCTION__; + prefix = "auto type_name() [T = "; + suffix = "]"; +#elif defined(__GNUC__) + name = __PRETTY_FUNCTION__; + prefix = "constexpr auto type_name() [with T = "; + suffix = "]"; +#endif + name.remove_prefix(prefix.size()); + name.remove_suffix(suffix.size()); + return name; +} + +template +struct seq_args_call{ + static constexpr size_t n = sizeof...(Arg); + template + void operator()(F *func, void *args[n], std::index_sequence) + {(*func)(*(Arg*)args[IX]...);} +}; + +template +T * to_device(const T * x, size_t s) { + if(0 +T * to_device(const T& x) { + constexpr size_t s = sizeof(T); + return to_device(&x, s); +} + +#define __host__ +#define __device__ +#define __shared__ +#define __global__ +#define __constant__ static +#define __launch_bounds__(x) +#define __syncthreads() _Pragma("omp barrier") + +using size_t = std::size_t; + +#define __forceinline__ inline __attribute__((always_inline)) + +enum cudaMemcpyKind{cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, cudaMemcpyDefault}; +enum cudaError_t{cudaSuccess,cudaErrorNotReady}; +enum {cudaEventDisableTiming,cudaEventInterprocess}; +enum {cudaHostRegisterDefault}; +enum {cudaIpcMemLazyEnablePeerAccess}; +enum {cudaStreamDefault}; + +static inline cudaError_t +ompwip_(const char * const file, const size_t line, const char * const func, const char * const msg, std::functionf = [](){}) +{ + if(0==omp_get_team_num()&&0==omp_get_thread_num()) std::cerr<<"OMP WIP:"<f = [](){}) +{return ompwip_(file,line,func,"",f);} +#define ompwip(...) ompwip_(__FILE__,__LINE__,__PRETTY_FUNCTION__,##__VA_ARGS__) + +#define __shfl_down_sync(a,b,c) ompwip("__shfl_down_sync") + +#define cuMemcpy(a,b,c) ompwip() +#define cuMemcpyAsync(a,b,c,d) ompwip() +#define cuMemcpyDtoD(a,b,c) ompwip() +#define cuMemcpyDtoDAsync(a,b,c,d) ompwip() +#define cuMemcpyDtoH(a,b,c) ompwip() +#define cuMemcpyDtoHAsync(a,b,c,d) ompwip() +#define cuMemcpyHtoD(a,b,c) ompwip() +#define cuMemcpyHtoDAsync(a,b,c,d) ompwip() +#define cudaMemcpy(a,b,c,d) ompwip([&](){printfQuda("memcpy %p <- %p\n",a,b);ompwipMemcpy(a,(void*)b,c,d);}) +#define cudaMemcpy2D(a,b,c,d,e,f,g) ompwip() +#define cudaMemcpy2DAsync(a,b,c,d,e,f,g,h) ompwip() +#define cudaMemcpyAsync(a,b,c,d,e) ompwip() +#define cudaMemcpyToSymbolAsync(a,b,c,d,e,f) ompwip() +#define cudaMemset(a,b,c) ompwip([&](){printfQuda("memset %p\n",a);ompwipMemset(a,b,c);}) +#define cudaMemset2D(a,b,c,d,e) ompwip() +#define cudaMemset2DAsync(a,b,c,d,e,f) ompwip() +#define cudaMemsetAsync(a,b,c,d) ompwip() + +static inline void +ompwipMemset(void *p, unsigned char b, std::size_t s) +{ +#pragma omp target teams distribute parallel for simd is_device_ptr(p) + for(std::size_t i=0;i #include #endif +using qudaStream_t = cudaStream_t; + +#endif // QUDA_BACKEND_OMPTARGET + #include /** diff --git a/include/targets/omptarget/FFT_Plans.h b/include/targets/omptarget/FFT_Plans.h new file mode 100644 index 0000000000..e53bfe1a5f --- /dev/null +++ b/include/targets/omptarget/FFT_Plans.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include + +using FFTPlanHandle = int; +/* +#include + +using FFTPlanHandle = cufftHandle; +#define FFT_FORWARD CUFFT_FORWARD +#define FFT_INVERSE CUFFT_INVERSE + +#ifndef GPU_GAUGE_ALG +*/ +#ifdef QUDA_BACKEND_OMPTARGET +#define CUFFT_SAFE_CALL(call) + +inline void ApplyFFT(FFTPlanHandle &, float2 *, float2 *, int) +{ + errorQuda("unimplemented"); +} + +inline void ApplyFFT(FFTPlanHandle &, double2 *, double2 *, int) +{ + errorQuda("unimplemented"); +} + +inline void SetPlanFFTMany(FFTPlanHandle &, int4, int, QudaPrecision) +{ + errorQuda("unimplemented"); +} + +inline void SetPlanFFT2DMany(FFTPlanHandle &, int4, int, QudaPrecision) +{ + errorQuda("unimplemented"); +} + +inline void FFTDestroyPlan(FFTPlanHandle &) +{ + errorQuda("unimplemented"); +} +#else + +/*-------------------------------------------------------------------------------*/ +#define CUFFT_SAFE_CALL( call) { \ + cufftResult err = call; \ + if ( CUFFT_SUCCESS != err ) { \ + fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } } +/*-------------------------------------------------------------------------------*/ + +/** + * @brief Call CUFFT to perform a single-precision complex-to-complex + * transform plan in the transform direction as specified by direction + * parameter + * @param[in] CUFFT plan + * @param[in] data_in, pointer to the complex input data (in GPU memory) to transform + * @param[out] data_out, pointer to the complex output data (in GPU memory) + * @param[in] direction, the transform direction: CUFFT_FORWARD or CUFFT_INVERSE + */ +inline void ApplyFFT(FFTPlanHandle &plan, float2 *data_in, float2 *data_out, int direction){ + CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)data_in, (cufftComplex *)data_out, direction)); +} + +/** + * @brief Call CUFFT to perform a double-precision complex-to-complex transform plan in the transform direction +as specified by direction parameter + * @param[in] CUFFT plan + * @param[in] data_in, pointer to the complex input data (in GPU memory) to transform + * @param[out] data_out, pointer to the complex output data (in GPU memory) + * @param[in] direction, the transform direction: CUFFT_FORWARD or CUFFT_INVERSE + */ +inline void ApplyFFT(FFTPlanHandle &plan, double2 *data_in, double2 *data_out, int direction){ + CUFFT_SAFE_CALL(cufftExecZ2Z(plan, (cufftDoubleComplex *)data_in, (cufftDoubleComplex *)data_out, direction)); +} + +/** + * @brief Creates a CUFFT plan supporting 4D (1D+3D) data layouts for complex-to-complex + * @param[out] plan, CUFFT plan + * @param[in] size, int4 with lattice size dimensions, (.x,.y,.z,.w) -> (Nx, Ny, Nz, Nt) + * @param[in] dim, 1 for 1D plan along the temporal direction with batch size Nx*Ny*Nz, 3 for 3D plan along Nx, Ny and Nz with batch size Nt + * @param[in] precision The precision of the computation + */ + +inline void SetPlanFFTMany(FFTPlanHandle &plan, int4 size, int dim, QudaPrecision precision) +{ + auto type = precision == QUDA_DOUBLE_PRECISION ? CUFFT_Z2Z : CUFFT_C2C; + switch (dim) { + case 1: + { + int n[1] = { size.w }; + CUFFT_SAFE_CALL(cufftPlanMany(&plan, 1, n, NULL, 1, 0, NULL, 1, 0, type, size.x * size.y * size.z)); + } + break; + case 3: + { + int n[3] = { size.x, size.y, size.z }; + CUFFT_SAFE_CALL(cufftPlanMany(&plan, 3, n, NULL, 1, 0, NULL, 1, 0, type, size.w)); + } + break; + } +} + +/** + * @brief Creates a CUFFT plan supporting 4D (2D+2D) data layouts for complex-to-complex + * @param[out] plan, CUFFT plan + * @param[in] size, int4 with lattice size dimensions, (.x,.y,.z,.w) -> (Nx, Ny, Nz, Nt) + * @param[in] dim, 0 for 2D plan in Z-T planes with batch size Nx*Ny, 1 for 2D plan in X-Y planes with batch size Nz*Nt + * @param[in] precision The precision of the computation + */ +inline void SetPlanFFT2DMany(cufftHandle &plan, int4 size, int dim, QudaPrecision precision) +{ + auto type = precision == QUDA_DOUBLE_PRECISION ? CUFFT_Z2Z : CUFFT_C2C; + switch (dim) { + case 0: + { + int n[2] = { size.w, size.z }; + CUFFT_SAFE_CALL(cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, type, size.x * size.y)); + } + break; + case 1: + { + int n[2] = { size.x, size.y }; + CUFFT_SAFE_CALL(cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, type, size.z * size.w)); + } + break; + } +} + +inline void FFTDestroyPlan( FFTPlanHandle &plan) { + CUFFT_SAFE_CALL(cufftDestroy(plan)); +} + +#endif diff --git a/include/targets/omptarget/block_reduction_kernel.h b/include/targets/omptarget/block_reduction_kernel.h new file mode 100644 index 0000000000..2a825cae8b --- /dev/null +++ b/include/targets/omptarget/block_reduction_kernel.h @@ -0,0 +1,70 @@ +#pragma once + +#include + +namespace quda { + + /** + @brief This helper function swizzles the block index through + mapping the block index onto a matrix and tranposing it. This is + done to potentially increase the cache utilization. Requires + that the argument class has a member parameter "swizzle" which + determines if we are swizzling and a parameter "swizzle_factor" + which is the effective matrix dimension that we are tranposing in + this mapping. + */ + template __device__ int virtual_block_idx(const Arg &arg) + { + QUDA_RT_CONSTS; + int block_idx = blockIdx.x; + if (arg.swizzle) { + // the portion of the grid that is exactly divisible by the number of SMs + const int gridp = gridDim.x - gridDim.x % arg.swizzle_factor; + + block_idx = blockIdx.x; + if (blockIdx.x < gridp) { + // this is the portion of the block that we are going to transpose + const int i = blockIdx.x % arg.swizzle_factor; + const int j = blockIdx.x / arg.swizzle_factor; + + // transpose the coordinates + block_idx = i * (gridp / arg.swizzle_factor) + j; + } + } + return block_idx; + } + + /** + @brief Generic block reduction kernel. Here, we ensure that each + thread block maps exactly to a logical block to be reduced, with + number of threads equal to the number of sites per block. The y + thread dimension is a trivial vectorizable dimension. + + TODO: add a Reducer class for non summation reductions + */ + template class Transformer, typename Arg> + __global__ void BlockReductionKernel2D(Arg arg) + { + QUDA_RT_CONSTS; + using reduce_t = typename Transformer::reduce_t; + Transformer t(arg); + + const int block = virtual_block_idx(arg); + const int i = threadIdx.x; + const int j = blockDim.y*blockIdx.y + threadIdx.y; + const int j_local = threadIdx.y; + if (j >= arg.threads.y) return; + + reduce_t value; // implicitly we assume here that default constructor zeros reduce_t + // only active threads call the transformer + if (i < arg.threads.x) value = t(block, i, j); +/* + // but all threads take part in the reduction + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage[Arg::n_vector_y]; + value = BlockReduce(temp_storage[j_local]).Sum(value); +*/ + if (i == 0) t.store(value, block, j); + } + +} diff --git a/include/targets/omptarget/kernel.h b/include/targets/omptarget/kernel.h new file mode 100644 index 0000000000..4c7ff3e240 --- /dev/null +++ b/include/targets/omptarget/kernel.h @@ -0,0 +1,61 @@ +#pragma once + +namespace quda { + + template