Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 30 additions & 8 deletions Cxx11/dgemm-cublas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,11 @@ int main(int argc, char * argv[])
/// Read and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
int order;
int batches = 0;
int input_copy = 0;
int iterations, order, batches = 0;
bool input_copy = false, random_initialization = false;
try {
if (argc < 2) {
throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>]";
throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<random initializatoin [0/1]>]";
}

iterations = std::atoi(argv[1]);
Expand All @@ -213,6 +211,13 @@ int main(int argc, char * argv[])
throw "ERROR: input_copy was not 0 or 1";
}
}

if (argc > 5) {
random_initialization = std::atoi(argv[5]);
if (random_initialization != 0 && random_initialization != 1) {
throw "ERROR: random_initialization was not 0 or 1";
}
}
}
catch (const char * e) {
std::cout << e << std::endl;
Expand All @@ -229,6 +234,7 @@ int main(int argc, char * argv[])
std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl;
}
std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl;
std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl;

cublasHandle_t h;
prk::check( cublasCreate(&h) );
Expand Down Expand Up @@ -270,10 +276,24 @@ int main(int argc, char * argv[])
prk::CUDA::copyH2Dasync(&(d_a[b*nelems]), h_a, nelems);
prk::CUDA::copyH2Dasync(&(d_b[b*nelems]), h_b, nelems);
}
prk::CUDA::sync();

init<<<dimGrid, dimBlock>>>(order, matrices, d_c);

} else if (random_initialization) {
// Initialize matrices with CURAND uniform distribution [0,1]
curandGenerator_t gen;
prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );

// Generate uniform random numbers in [0,1] for matrices A and B
prk::check( curandGenerateUniformDouble(gen, d_a, matrices * nelems) );
prk::check( curandGenerateUniformDouble(gen, d_b, matrices * nelems) );

prk::check( curandDestroyGenerator(gen) );

// Initialize matrix C to zero
init<<<dimGrid, dimBlock>>>(order, matrices, d_c);

} else {

init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
Expand Down Expand Up @@ -346,12 +366,14 @@ int main(int argc, char * argv[])
}
residuum /= matrices;

if (residuum < epsilon) {
if (residuum < epsilon || random_initialization) {
#if VERBOSE
std::cout << "Reference checksum = " << reference << "\n"
<< "Actual checksum = " << checksum << std::endl;
#endif
std::cout << "Solution validates" << std::endl;
if (!random_initialization) {
std::cout << "Solution validates" << std::endl;
}
auto avgtime = gemm_time/iterations/matrices;
auto nflops = 2.0 * prk::pow(forder,3);
prk::print_flop_rate_time("FP64", nflops/avgtime, avgtime);
Expand Down
9 changes: 9 additions & 0 deletions Cxx11/prk_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#ifdef PRK_USE_CUBLAS
#include <cublas_v2.h>
#include <curand.h>
#endif

//#include <nvtx3.hpp>
Expand Down Expand Up @@ -58,6 +59,14 @@ namespace prk
std::abort();
}
}

void check(curandStatus_t rc)
{
if (rc!=CURAND_STATUS_SUCCESS) {
std::cerr << "PRK CURAND error: " << rc << std::endl;
std::abort();
}
}
#endif

namespace CUDA
Expand Down
35 changes: 28 additions & 7 deletions Cxx11/sgemm-cublas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@

#include "prk_util.h"
#include "prk_cuda.h"
#include <curand.h>

#if 0
__global__ void init(unsigned order, float * A, float * B, float * C)
Expand Down Expand Up @@ -182,14 +183,12 @@ int main(int argc, char * argv[])
/// Read and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
int order;
int batches = 0;
bool input_copy{false};
int iterations, order, batches = 0;
bool input_copy = false, random_initialization = false;
bool tf32{false};
try {
if (argc < 2) {
throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<use TF32 [0/1]>]";
throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<use TF32 [0/1]>] [<random initialization [0/1]>]";
}

iterations = std::atoi(argv[1]);
Expand All @@ -215,6 +214,10 @@ int main(int argc, char * argv[])
if (argc > 5) {
tf32 = prk::parse_boolean(std::string(argv[5]));
}

if (argc > 6) {
random_initialization = prk::parse_boolean(std::string(argv[6]));
}
}
catch (const char * e) {
std::cout << e << std::endl;
Expand All @@ -232,6 +235,7 @@ int main(int argc, char * argv[])
}
std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl;
std::cout << "TF32 = " << (tf32 ? "yes" : "no") << std::endl;
std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl;

cublasHandle_t h;
prk::check( cublasCreate(&h) );
Expand Down Expand Up @@ -281,6 +285,21 @@ int main(int argc, char * argv[])

init<<<dimGrid, dimBlock>>>(order, matrices, d_c);

} else if (random_initialization) {
// Initialize matrices with CURAND uniform distribution [0,1]
curandGenerator_t gen;
prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );

// Generate uniform random numbers in [0,1] for matrices A and B
prk::check( curandGenerateUniform(gen, d_a, matrices * nelems) );
prk::check( curandGenerateUniform(gen, d_b, matrices * nelems) );

prk::check( curandDestroyGenerator(gen) );

// Initialize matrix C to zero
init<<<dimGrid, dimBlock>>>(order, matrices, d_c);

} else {

init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
Expand Down Expand Up @@ -358,12 +377,14 @@ int main(int argc, char * argv[])
}
residuum /= matrices;

if (residuum < epsilon) {
if (residuum < epsilon || random_initialization) {
#if VERBOSE
std::cout << "Reference checksum = " << reference << "\n"
<< "Actual checksum = " << checksum << std::endl;
#endif
std::cout << "Solution validates" << std::endl;
if (!random_initialization) {
std::cout << "Solution validates" << std::endl;
}
auto avgtime = gemm_time/iterations/matrices;
auto nflops = 2.0 * prk::pow(forder,3);
std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
Expand Down
94 changes: 84 additions & 10 deletions Cxx11/xgemm-cublas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@

#include "prk_util.h"
#include "prk_cuda.h"
#include "prk_thrust.h"

prk::CUDA::info info;

Expand Down Expand Up @@ -86,6 +87,64 @@ __global__ void init(int order, T * A, T * B, T * C)
}
}

// CURAND initialization templates
template <typename T>
void curand_init_matrices(T* d_a, T* d_b, size_t nelems) {
std::cerr << "No valid CURAND template match for type T" << std::endl;
std::abort();
}

template <>
void curand_init_matrices<float>(float* d_a, float* d_b, size_t nelems) {
curandGenerator_t gen;
prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
prk::check( curandGenerateUniform(gen, d_a, nelems) );
prk::check( curandGenerateUniform(gen, d_b, nelems) );
prk::check( curandDestroyGenerator(gen) );
}

template <>
void curand_init_matrices<double>(double* d_a, double* d_b, size_t nelems) {
curandGenerator_t gen;
prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
prk::check( curandGenerateUniformDouble(gen, d_a, nelems) );
prk::check( curandGenerateUniformDouble(gen, d_b, nelems) );
prk::check( curandDestroyGenerator(gen) );
}

template <>
void curand_init_matrices<__half>(__half* d_a, __half* d_b, size_t nelems) {
// CURAND doesn't directly support __half, so generate float and convert
auto temp_a = prk::CUDA::malloc_device<float>(nelems);
auto temp_b = prk::CUDA::malloc_device<float>(nelems);

curandGenerator_t gen;
prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
prk::check( curandGenerateUniform(gen, temp_a, nelems) );
prk::check( curandGenerateUniform(gen, temp_b, nelems) );
prk::check( curandDestroyGenerator(gen) );

// Convert float to __half
const int block_size = 256;
const int grid_size = (nelems + block_size - 1) / block_size;

auto convert_kernel = [] __device__ (int idx, float* src, __half* dst, size_t n) {
if (idx < n) dst[idx] = __float2half(src[idx]);
};

// Launch conversion kernels
thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), nelems,
[=] __device__ (int idx) { if (idx < nelems) d_a[idx] = __float2half(temp_a[idx]); });
thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), nelems,
[=] __device__ (int idx) { if (idx < nelems) d_b[idx] = __float2half(temp_b[idx]); });

prk::CUDA::free(temp_a);
prk::CUDA::free(temp_b);
}

template <typename TAB, typename TC>
void prk_gemm(const cublasHandle_t & h,
const int order, const TC alpha, const TC beta,
Expand Down Expand Up @@ -141,7 +200,7 @@ void prk_gemm(const cublasHandle_t & h,
}

template <typename T>
void run(const cublasHandle_t & h, int iterations, int order)
void run(const cublasHandle_t & h, int iterations, int order, bool random_initialization)
{
double gemm_time{0};

Expand All @@ -156,7 +215,15 @@ void run(const cublasHandle_t & h, int iterations, int order)
auto d_a = prk::CUDA::malloc_device<T>(nelems);
auto d_b = prk::CUDA::malloc_device<T>(nelems);
auto d_c = prk::CUDA::malloc_device<T>(nelems);
init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);

if (random_initialization) {
// Initialize matrices with CURAND uniform distribution [0,1]
curand_init_matrices(d_a, d_b, nelems);
// Initialize matrix C to zero
init<<<dimGrid, dimBlock>>>(order, d_c);
} else {
init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
}
prk::CUDA::sync();
{
for (int iter = 0; iter<=iterations; iter++) {
Expand Down Expand Up @@ -190,12 +257,14 @@ void run(const cublasHandle_t & h, int iterations, int order)
}
const double residuum = std::abs(checksum - reference) / reference;
const double epsilon{1.0e-8};
if ((residuum < epsilon) || (sizeof(T) < 4)) {
if ((residuum < epsilon) || (sizeof(T) < 4) || random_initialization) {
#if VERBOSE
std::cout << "Reference checksum = " << reference << "\n"
<< "Actual checksum = " << checksum << std::endl;
#endif
std::cout << "Solution validates" << std::endl;
if (residuum < epsilon) {
std::cout << "Solution validates" << std::endl;
}
auto avgtime = gemm_time/iterations;
auto nflops = 2.0 * prk::pow(forder,3);
auto is_fp64 = (typeid(T) == typeid(double));
Expand All @@ -222,11 +291,11 @@ int main(int argc, char * argv[])
/// Read and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
int order;
int iterations, order;
bool random_initialization = false;
try {
if (argc < 2) {
throw "Usage: <# iterations> <matrix order>";
throw "Usage: <# iterations> <matrix order> [<random initialization [0/1]>]";
}

iterations = std::atoi(argv[1]);
Expand All @@ -240,6 +309,10 @@ int main(int argc, char * argv[])
} else if (order > prk::get_max_matrix_size()) {
throw "ERROR: matrix dimension too large - overflow risk";
}

if (argc > 3) {
random_initialization = prk::parse_boolean(std::string(argv[3]));
}
}
catch (const char * e) {
std::cout << e << std::endl;
Expand All @@ -250,16 +323,17 @@ int main(int argc, char * argv[])

std::cout << "Number of iterations = " << iterations << std::endl;
std::cout << "Matrix order = " << order << std::endl;
std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl;

//////////////////////////////////////////////////////////////////////
/// Setup CUBLAS environment
//////////////////////////////////////////////////////////////////////

cublasHandle_t h;
prk::check( cublasCreate(&h) );
run<__half>(h, iterations, order);
run<float>(h, iterations, order);
run<double>(h, iterations, order);
run<__half>(h, iterations, order, random_initialization);
run<float>(h, iterations, order, random_initialization);
run<double>(h, iterations, order, random_initialization);
prk::check( cublasDestroy(h) );

return 0;
Expand Down
1 change: 1 addition & 0 deletions common/make.defs.cuda
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ RAJADIR=/opt/raja/gcc
RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11/compilers/include-stdpar
THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
THRUSTFLAG+=--extended-lambda
#
# CBLAS for C++ DGEMM
#
Expand Down