ParRes · jeffhammond · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
@@ -182,13 +182,11 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
-  int batches = 0;
-  int input_copy = 0;
+  int iterations, order, batches = 0;
+  bool input_copy = false, random_initialization = false;
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>]";
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<random initializatoin [0/1]>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -213,6 +211,13 @@ int main(int argc, char * argv[])
           throw "ERROR: input_copy was not 0 or 1";
         }
       }
+
+      if (argc > 5) {
+        random_initialization = std::atoi(argv[5]);
+        if (random_initialization != 0 && random_initialization != 1) {
+          throw "ERROR: random_initialization was not 0 or 1";
+        }
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -229,6 +234,7 @@ int main(int argc, char * argv[])
       std::cout << "Batch size           = " <<  batches << " (batched BLAS)" << std::endl;
   }
   std::cout << "Input copy           = " << (input_copy ? "yes" : "no") << std::endl;
+  std::cout << "Randomized data      = " << (random_initialization ? "yes" : "no") << std::endl;
 
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
@@ -270,10 +276,24 @@ int main(int argc, char * argv[])
       prk::CUDA::copyH2Dasync(&(d_a[b*nelems]), h_a, nelems);
       prk::CUDA::copyH2Dasync(&(d_b[b*nelems]), h_b, nelems);
     }
-    prk::CUDA::sync();
 
     init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
 
+  } else if (random_initialization) {
+    // Initialize matrices with CURAND uniform distribution [0,1]
+    curandGenerator_t gen;
+    prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+    prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+
+    // Generate uniform random numbers in [0,1] for matrices A and B
+    prk::check( curandGenerateUniformDouble(gen, d_a, matrices * nelems) );
+    prk::check( curandGenerateUniformDouble(gen, d_b, matrices * nelems) );
+
+    prk::check( curandDestroyGenerator(gen) );
+
+    // Initialize matrix C to zero
+    init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+
   } else {
 
     init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
@@ -346,12 +366,14 @@ int main(int argc, char * argv[])
   }
   residuum /= matrices;
 
-  if (residuum < epsilon) {
+  if (residuum < epsilon || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (!random_initialization) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations/matrices;
     auto nflops = 2.0 * prk::pow(forder,3);
     prk::print_flop_rate_time("FP64", nflops/avgtime, avgtime);

@@ -15,6 +15,7 @@
 
 #ifdef PRK_USE_CUBLAS
 #include <cublas_v2.h>
+#include <curand.h>
 #endif
 
 //#include <nvtx3.hpp>
@@ -58,6 +59,14 @@ namespace prk
             std::abort();
         }
     }
+
+    void check(curandStatus_t rc)
+    {
+        if (rc!=CURAND_STATUS_SUCCESS) {
+            std::cerr << "PRK CURAND error: " << rc << std::endl;
+            std::abort();
+        }
+    }
 #endif
 
     namespace CUDA

@@ -62,6 +62,7 @@
 
 #include "prk_util.h"
 #include "prk_cuda.h"
+#include <curand.h>
 
 #if 0
 __global__ void init(unsigned order, float * A, float * B, float * C)
@@ -182,14 +183,12 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
-  int batches = 0;
-  bool input_copy{false};
+  int iterations, order, batches = 0;
+  bool input_copy = false, random_initialization = false;
   bool tf32{false};
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<use TF32 [0/1]>]";
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<use TF32 [0/1]>] [<random initialization [0/1]>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -215,6 +214,10 @@ int main(int argc, char * argv[])
       if (argc > 5) {
         tf32 = prk::parse_boolean(std::string(argv[5]));
       }
+
+      if (argc > 6) {
+        random_initialization = prk::parse_boolean(std::string(argv[6]));
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -232,6 +235,7 @@ int main(int argc, char * argv[])
   }
   std::cout << "Input copy           = " << (input_copy ? "yes" : "no") << std::endl;
   std::cout << "TF32                 = " << (tf32 ? "yes" : "no") << std::endl;
+  std::cout << "Randomized data      = " << (random_initialization ? "yes" : "no") << std::endl;
 
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
@@ -281,6 +285,21 @@ int main(int argc, char * argv[])
 
     init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
 
+  } else if (random_initialization) {
+    // Initialize matrices with CURAND uniform distribution [0,1]
+    curandGenerator_t gen;
+    prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+    prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+
+    // Generate uniform random numbers in [0,1] for matrices A and B
+    prk::check( curandGenerateUniform(gen, d_a, matrices * nelems) );
+    prk::check( curandGenerateUniform(gen, d_b, matrices * nelems) );
+
+    prk::check( curandDestroyGenerator(gen) );
+
+    // Initialize matrix C to zero
+    init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+
   } else {
 
     init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
@@ -358,12 +377,14 @@ int main(int argc, char * argv[])
   }
   residuum /= matrices;
 
-  if (residuum < epsilon) {
+  if (residuum < epsilon || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (!random_initialization) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations/matrices;
     auto nflops = 2.0 * prk::pow(forder,3);
     std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime

@@ -59,6 +59,7 @@
 
 #include "prk_util.h"
 #include "prk_cuda.h"
+#include "prk_thrust.h"
 
 prk::CUDA::info info;
 
@@ -86,6 +87,64 @@ __global__ void init(int order, T * A, T * B, T * C)
     }
 }
 
+// CURAND initialization templates
+template <typename T>
+void curand_init_matrices(T* d_a, T* d_b, size_t nelems) {
+    std::cerr << "No valid CURAND template match for type T" << std::endl;
+    std::abort();
+}
+
+template <>
+void curand_init_matrices<float>(float* d_a, float* d_b, size_t nelems) {
+    curandGenerator_t gen;
+    prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+    prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+    prk::check( curandGenerateUniform(gen, d_a, nelems) );
+    prk::check( curandGenerateUniform(gen, d_b, nelems) );
+    prk::check( curandDestroyGenerator(gen) );
+}
+
+template <>
+void curand_init_matrices<double>(double* d_a, double* d_b, size_t nelems) {
+    curandGenerator_t gen;
+    prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+    prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+    prk::check( curandGenerateUniformDouble(gen, d_a, nelems) );
+    prk::check( curandGenerateUniformDouble(gen, d_b, nelems) );
+    prk::check( curandDestroyGenerator(gen) );
+}
+
+template <>
+void curand_init_matrices<__half>(__half* d_a, __half* d_b, size_t nelems) {
+    // CURAND doesn't directly support __half, so generate float and convert
+    auto temp_a = prk::CUDA::malloc_device<float>(nelems);
+    auto temp_b = prk::CUDA::malloc_device<float>(nelems);
+
+    curandGenerator_t gen;
+    prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+    prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+    prk::check( curandGenerateUniform(gen, temp_a, nelems) );
+    prk::check( curandGenerateUniform(gen, temp_b, nelems) );
+    prk::check( curandDestroyGenerator(gen) );
+
+    // Convert float to __half
+    const int block_size = 256;
+    const int grid_size = (nelems + block_size - 1) / block_size;
+
+    auto convert_kernel = [] __device__ (int idx, float* src, __half* dst, size_t n) {
+        if (idx < n) dst[idx] = __float2half(src[idx]);
+    };
+
+    // Launch conversion kernels
+    thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), nelems,
+        [=] __device__ (int idx) { if (idx < nelems) d_a[idx] = __float2half(temp_a[idx]); });
+    thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), nelems,
+        [=] __device__ (int idx) { if (idx < nelems) d_b[idx] = __float2half(temp_b[idx]); });
+
+    prk::CUDA::free(temp_a);
+    prk::CUDA::free(temp_b);
+}
+
 template <typename TAB, typename TC>
 void prk_gemm(const cublasHandle_t & h,
               const int order, const TC alpha, const TC beta,
@@ -141,7 +200,7 @@ void prk_gemm(const cublasHandle_t & h,
 }
 
 template <typename T>
-void run(const cublasHandle_t & h, int iterations, int order)
+void run(const cublasHandle_t & h, int iterations, int order, bool random_initialization)
 {
   double gemm_time{0};
 
@@ -156,7 +215,15 @@ void run(const cublasHandle_t & h, int iterations, int order)
   auto d_a = prk::CUDA::malloc_device<T>(nelems);
   auto d_b = prk::CUDA::malloc_device<T>(nelems);
   auto d_c = prk::CUDA::malloc_device<T>(nelems);
-  init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+
+  if (random_initialization) {
+    // Initialize matrices with CURAND uniform distribution [0,1]
+    curand_init_matrices(d_a, d_b, nelems);
+    // Initialize matrix C to zero
+    init<<<dimGrid, dimBlock>>>(order, d_c);
+  } else {
+    init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+  }
   prk::CUDA::sync();
   {
     for (int iter = 0; iter<=iterations; iter++) {
@@ -190,12 +257,14 @@ void run(const cublasHandle_t & h, int iterations, int order)
   }
   const double residuum = std::abs(checksum - reference) / reference;
   const double epsilon{1.0e-8};
-  if ((residuum < epsilon) || (sizeof(T) < 4)) {
+  if ((residuum < epsilon) || (sizeof(T) < 4) || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (residuum < epsilon) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations;
     auto nflops = 2.0 * prk::pow(forder,3);
     auto is_fp64 = (typeid(T) == typeid(double));
@@ -222,11 +291,11 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
+  int iterations, order;
+  bool random_initialization = false;
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order>";
+        throw "Usage: <# iterations> <matrix order> [<random initialization [0/1]>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -240,6 +309,10 @@ int main(int argc, char * argv[])
       } else if (order > prk::get_max_matrix_size()) {
         throw "ERROR: matrix dimension too large - overflow risk";
       }
+
+      if (argc > 3) {
+        random_initialization = prk::parse_boolean(std::string(argv[3]));
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -250,16 +323,17 @@ int main(int argc, char * argv[])
 
   std::cout << "Number of iterations = " << iterations << std::endl;
   std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Randomized data      = " << (random_initialization ? "yes" : "no") << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   /// Setup CUBLAS environment
   //////////////////////////////////////////////////////////////////////
 
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
-  run<__half>(h, iterations, order);
-  run<float>(h, iterations, order);
-  run<double>(h, iterations, order);
+  run<__half>(h, iterations, order, random_initialization);
+  run<float>(h, iterations, order, random_initialization);
+  run<double>(h, iterations, order, random_initialization);
   prk::check( cublasDestroy(h) );
 
   return 0;

@@ -148,6 +148,7 @@ RAJADIR=/opt/raja/gcc
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11/compilers/include-stdpar
 THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
+THRUSTFLAG+=--extended-lambda
 #
 # CBLAS for C++ DGEMM
 #