diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu index 79937d779..f1ce70a55 100644 --- a/Cxx11/dgemm-cublas.cu +++ b/Cxx11/dgemm-cublas.cu @@ -182,13 +182,11 @@ int main(int argc, char * argv[]) /// Read and test input parameters ////////////////////////////////////////////////////////////////////// - int iterations; - int order; - int batches = 0; - int input_copy = 0; + int iterations, order, batches = 0; + bool input_copy = false, random_initialization = false; try { if (argc < 2) { - throw "Usage: <# iterations> [] []"; + throw "Usage: <# iterations> [] [] []"; } iterations = std::atoi(argv[1]); @@ -213,6 +211,13 @@ int main(int argc, char * argv[]) throw "ERROR: input_copy was not 0 or 1"; } } + + if (argc > 5) { + random_initialization = std::atoi(argv[5]); + if (random_initialization != 0 && random_initialization != 1) { + throw "ERROR: random_initialization was not 0 or 1"; + } + } } catch (const char * e) { std::cout << e << std::endl; @@ -229,6 +234,7 @@ int main(int argc, char * argv[]) std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; } std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl; + std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl; cublasHandle_t h; prk::check( cublasCreate(&h) ); @@ -270,10 +276,24 @@ int main(int argc, char * argv[]) prk::CUDA::copyH2Dasync(&(d_a[b*nelems]), h_a, nelems); prk::CUDA::copyH2Dasync(&(d_b[b*nelems]), h_b, nelems); } - prk::CUDA::sync(); init<<>>(order, matrices, d_c); + } else if (random_initialization) { + // Initialize matrices with CURAND uniform distribution [0,1] + curandGenerator_t gen; + prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) ); + prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) ); + + // Generate uniform random numbers in [0,1] for matrices A and B + prk::check( curandGenerateUniformDouble(gen, d_a, matrices * nelems) ); + prk::check( curandGenerateUniformDouble(gen, d_b, matrices * nelems) ); + + prk::check( curandDestroyGenerator(gen) ); + + // Initialize matrix C to zero + init<<>>(order, matrices, d_c); + } else { init<<>>(order, matrices, d_a, d_b, d_c); @@ -346,12 +366,14 @@ int main(int argc, char * argv[]) } residuum /= matrices; - if (residuum < epsilon) { + if (residuum < epsilon || random_initialization) { #if VERBOSE std::cout << "Reference checksum = " << reference << "\n" << "Actual checksum = " << checksum << std::endl; #endif - std::cout << "Solution validates" << std::endl; + if (!random_initialization) { + std::cout << "Solution validates" << std::endl; + } auto avgtime = gemm_time/iterations/matrices; auto nflops = 2.0 * prk::pow(forder,3); prk::print_flop_rate_time("FP64", nflops/avgtime, avgtime); diff --git a/Cxx11/prk_cuda.h b/Cxx11/prk_cuda.h index 584499379..28e0fa5dc 100644 --- a/Cxx11/prk_cuda.h +++ b/Cxx11/prk_cuda.h @@ -15,6 +15,7 @@ #ifdef PRK_USE_CUBLAS #include +#include #endif //#include @@ -58,6 +59,14 @@ namespace prk std::abort(); } } + + void check(curandStatus_t rc) + { + if (rc!=CURAND_STATUS_SUCCESS) { + std::cerr << "PRK CURAND error: " << rc << std::endl; + std::abort(); + } + } #endif namespace CUDA diff --git a/Cxx11/sgemm-cublas.cu b/Cxx11/sgemm-cublas.cu index 7883daf90..902ff727e 100644 --- a/Cxx11/sgemm-cublas.cu +++ b/Cxx11/sgemm-cublas.cu @@ -62,6 +62,7 @@ #include "prk_util.h" #include "prk_cuda.h" +#include #if 0 __global__ void init(unsigned order, float * A, float * B, float * C) @@ -182,14 +183,12 @@ int main(int argc, char * argv[]) /// Read and test input parameters ////////////////////////////////////////////////////////////////////// - int iterations; - int order; - int batches = 0; - bool input_copy{false}; + int iterations, order, batches = 0; + bool input_copy = false, random_initialization = false; bool tf32{false}; try { if (argc < 2) { - throw "Usage: <# iterations> [] [] []"; + throw "Usage: <# iterations> [] [] [] []"; } iterations = std::atoi(argv[1]); @@ -215,6 +214,10 @@ int main(int argc, char * argv[]) if (argc > 5) { tf32 = prk::parse_boolean(std::string(argv[5])); } + + if (argc > 6) { + random_initialization = prk::parse_boolean(std::string(argv[6])); + } } catch (const char * e) { std::cout << e << std::endl; @@ -232,6 +235,7 @@ int main(int argc, char * argv[]) } std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl; std::cout << "TF32 = " << (tf32 ? "yes" : "no") << std::endl; + std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl; cublasHandle_t h; prk::check( cublasCreate(&h) ); @@ -281,6 +285,21 @@ int main(int argc, char * argv[]) init<<>>(order, matrices, d_c); + } else if (random_initialization) { + // Initialize matrices with CURAND uniform distribution [0,1] + curandGenerator_t gen; + prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) ); + prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) ); + + // Generate uniform random numbers in [0,1] for matrices A and B + prk::check( curandGenerateUniform(gen, d_a, matrices * nelems) ); + prk::check( curandGenerateUniform(gen, d_b, matrices * nelems) ); + + prk::check( curandDestroyGenerator(gen) ); + + // Initialize matrix C to zero + init<<>>(order, matrices, d_c); + } else { init<<>>(order, matrices, d_a, d_b, d_c); @@ -358,12 +377,14 @@ int main(int argc, char * argv[]) } residuum /= matrices; - if (residuum < epsilon) { + if (residuum < epsilon || random_initialization) { #if VERBOSE std::cout << "Reference checksum = " << reference << "\n" << "Actual checksum = " << checksum << std::endl; #endif - std::cout << "Solution validates" << std::endl; + if (!random_initialization) { + std::cout << "Solution validates" << std::endl; + } auto avgtime = gemm_time/iterations/matrices; auto nflops = 2.0 * prk::pow(forder,3); std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime diff --git a/Cxx11/xgemm-cublas.cu b/Cxx11/xgemm-cublas.cu index 4f47ad972..e4eee5a3b 100644 --- a/Cxx11/xgemm-cublas.cu +++ b/Cxx11/xgemm-cublas.cu @@ -59,6 +59,7 @@ #include "prk_util.h" #include "prk_cuda.h" +#include "prk_thrust.h" prk::CUDA::info info; @@ -86,6 +87,64 @@ __global__ void init(int order, T * A, T * B, T * C) } } +// CURAND initialization templates +template +void curand_init_matrices(T* d_a, T* d_b, size_t nelems) { + std::cerr << "No valid CURAND template match for type T" << std::endl; + std::abort(); +} + +template <> +void curand_init_matrices(float* d_a, float* d_b, size_t nelems) { + curandGenerator_t gen; + prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) ); + prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) ); + prk::check( curandGenerateUniform(gen, d_a, nelems) ); + prk::check( curandGenerateUniform(gen, d_b, nelems) ); + prk::check( curandDestroyGenerator(gen) ); +} + +template <> +void curand_init_matrices(double* d_a, double* d_b, size_t nelems) { + curandGenerator_t gen; + prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) ); + prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) ); + prk::check( curandGenerateUniformDouble(gen, d_a, nelems) ); + prk::check( curandGenerateUniformDouble(gen, d_b, nelems) ); + prk::check( curandDestroyGenerator(gen) ); +} + +template <> +void curand_init_matrices<__half>(__half* d_a, __half* d_b, size_t nelems) { + // CURAND doesn't directly support __half, so generate float and convert + auto temp_a = prk::CUDA::malloc_device(nelems); + auto temp_b = prk::CUDA::malloc_device(nelems); + + curandGenerator_t gen; + prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) ); + prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) ); + prk::check( curandGenerateUniform(gen, temp_a, nelems) ); + prk::check( curandGenerateUniform(gen, temp_b, nelems) ); + prk::check( curandDestroyGenerator(gen) ); + + // Convert float to __half + const int block_size = 256; + const int grid_size = (nelems + block_size - 1) / block_size; + + auto convert_kernel = [] __device__ (int idx, float* src, __half* dst, size_t n) { + if (idx < n) dst[idx] = __float2half(src[idx]); + }; + + // Launch conversion kernels + thrust::for_each_n(thrust::device, thrust::counting_iterator(0), nelems, + [=] __device__ (int idx) { if (idx < nelems) d_a[idx] = __float2half(temp_a[idx]); }); + thrust::for_each_n(thrust::device, thrust::counting_iterator(0), nelems, + [=] __device__ (int idx) { if (idx < nelems) d_b[idx] = __float2half(temp_b[idx]); }); + + prk::CUDA::free(temp_a); + prk::CUDA::free(temp_b); +} + template void prk_gemm(const cublasHandle_t & h, const int order, const TC alpha, const TC beta, @@ -141,7 +200,7 @@ void prk_gemm(const cublasHandle_t & h, } template -void run(const cublasHandle_t & h, int iterations, int order) +void run(const cublasHandle_t & h, int iterations, int order, bool random_initialization) { double gemm_time{0}; @@ -156,7 +215,15 @@ void run(const cublasHandle_t & h, int iterations, int order) auto d_a = prk::CUDA::malloc_device(nelems); auto d_b = prk::CUDA::malloc_device(nelems); auto d_c = prk::CUDA::malloc_device(nelems); - init<<>>(order, d_a, d_b, d_c); + + if (random_initialization) { + // Initialize matrices with CURAND uniform distribution [0,1] + curand_init_matrices(d_a, d_b, nelems); + // Initialize matrix C to zero + init<<>>(order, d_c); + } else { + init<<>>(order, d_a, d_b, d_c); + } prk::CUDA::sync(); { for (int iter = 0; iter<=iterations; iter++) { @@ -190,12 +257,14 @@ void run(const cublasHandle_t & h, int iterations, int order) } const double residuum = std::abs(checksum - reference) / reference; const double epsilon{1.0e-8}; - if ((residuum < epsilon) || (sizeof(T) < 4)) { + if ((residuum < epsilon) || (sizeof(T) < 4) || random_initialization) { #if VERBOSE std::cout << "Reference checksum = " << reference << "\n" << "Actual checksum = " << checksum << std::endl; #endif - std::cout << "Solution validates" << std::endl; + if (residuum < epsilon) { + std::cout << "Solution validates" << std::endl; + } auto avgtime = gemm_time/iterations; auto nflops = 2.0 * prk::pow(forder,3); auto is_fp64 = (typeid(T) == typeid(double)); @@ -222,11 +291,11 @@ int main(int argc, char * argv[]) /// Read and test input parameters ////////////////////////////////////////////////////////////////////// - int iterations; - int order; + int iterations, order; + bool random_initialization = false; try { if (argc < 2) { - throw "Usage: <# iterations> "; + throw "Usage: <# iterations> []"; } iterations = std::atoi(argv[1]); @@ -240,6 +309,10 @@ int main(int argc, char * argv[]) } else if (order > prk::get_max_matrix_size()) { throw "ERROR: matrix dimension too large - overflow risk"; } + + if (argc > 3) { + random_initialization = prk::parse_boolean(std::string(argv[3])); + } } catch (const char * e) { std::cout << e << std::endl; @@ -250,6 +323,7 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; + std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl; ////////////////////////////////////////////////////////////////////// /// Setup CUBLAS environment @@ -257,9 +331,9 @@ int main(int argc, char * argv[]) cublasHandle_t h; prk::check( cublasCreate(&h) ); - run<__half>(h, iterations, order); - run(h, iterations, order); - run(h, iterations, order); + run<__half>(h, iterations, order, random_initialization); + run(h, iterations, order, random_initialization); + run(h, iterations, order, random_initialization); prk::check( cublasDestroy(h) ); return 0; diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 0fd828d7c..4419c03ed 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -148,6 +148,7 @@ RAJADIR=/opt/raja/gcc RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11/compilers/include-stdpar THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +THRUSTFLAG+=--extended-lambda # # CBLAS for C++ DGEMM #