diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp index 1573aac64..ac9505760 100644 --- a/clients/common/cblas_interface.cpp +++ b/clients/common/cblas_interface.cpp @@ -5,9 +5,11 @@ #include +#include #include "hipblas.h" #include "cblas_interface.h" #include "cblas.h" +#include "utility.h" /*!\file * \brief provide template functions interfaces to CBLAS C89 interfaces, it is only used for testing not part of the GPU library @@ -385,6 +387,70 @@ extern "C" { //gemm + template <> + void cblas_gemm(hipblasOperation_t transA, + hipblasOperation_t transB, + int m, + int n, + int k, + hipblasHalf alpha, + hipblasHalf* A, + int lda, + hipblasHalf* B, + int ldb, + hipblasHalf beta, + hipblasHalf* C, + int ldc) + { + // cblas does not support hipblasHalf, so convert to higher precision float + // This will give more precise result which is acceptable for testing + float alpha_float = half_to_float(alpha); + float beta_float = half_to_float(beta); + + int sizeA = transA == HIPBLAS_OP_N ? k * lda : m * lda; + int sizeB = transB == HIPBLAS_OP_N ? n * ldb : k * ldb; + int sizeC = n * ldc; + + std::unique_ptr A_float(new float[sizeA]()); + std::unique_ptr B_float(new float[sizeB]()); + std::unique_ptr C_float(new float[sizeC]()); + + for(int i = 0; i < sizeA; i++) + { + A_float[i] = half_to_float(A[i]); + } + for(int i = 0; i < sizeB; i++) + { + B_float[i] = half_to_float(B[i]); + } + for(int i = 0; i < sizeC; i++) + { + C_float[i] = half_to_float(C[i]); + } + + // just directly cast, since transA, transB are integers in the enum + // printf("transA: rocblas =%d, cblas=%d\n", transA, (CBLAS_TRANSPOSE)transA ); + cblas_sgemm(CblasColMajor, + (CBLAS_TRANSPOSE)transA, + (CBLAS_TRANSPOSE)transB, + m, + n, + k, + alpha_float, + const_cast(A_float.get()), + lda, + const_cast(B_float.get()), + ldb, + beta_float, + static_cast(C_float.get()), + ldc); + + for(int i = 0; i < sizeC; i++) + { + C[i] = float_to_half(C_float[i]); + } + } + template<> void cblas_gemm( hipblasOperation_t transA, hipblasOperation_t transB, int m, int n, int k, diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index e99114e76..862fad36e 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -5,6 +5,7 @@ #include "hipblas.h" #include "unit.h" +#include "utility.h" /* ========================================Gtest Unit Check ==================================================== */ @@ -13,6 +14,25 @@ //Do not put a wrapper over ASSERT_FLOAT_EQ, sincer assert exit the current function NOT the test case // a wrapper will cause the loop keep going +template <> +void unit_check_general( + int M, int N, int lda, hipblasHalf* hCPU, hipblasHalf* hGPU) +{ +#pragma unroll + for(int j = 0; j < N; j++) + { +#pragma unroll + for(int i = 0; i < M; i++) + { +#ifdef GOOGLE_TEST + float cpu_float = half_to_float(hCPU[i + j * lda]); + float gpu_float = half_to_float(hGPU[i + j * lda]); + ASSERT_FLOAT_EQ(cpu_float, gpu_float); +#endif + } + } +} + template<> diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index 8193ec989..830a3ad9f 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -45,6 +45,7 @@ set(hipblas_test_source gemv_gtest.cpp ger_gtest.cpp gemm_gtest.cpp + gemm_ex_gtest.cpp gemm_strided_batched_gtest.cpp gemm_batched_gtest.cpp geam_gtest.cpp @@ -92,11 +93,18 @@ if( NOT CUDA_FOUND ) get_target_property( HCC_AM_LOCATION hcc::hc_am IMPORTED_LOCATION_RELEASE ) target_link_libraries( hipblas-test PRIVATE ${HIP_HCC_LOCATION} ${HCC_AM_LOCATION} ) + if( CMAKE_CXX_COMPILER MATCHES ".*/hcc$|.*/hipcc$" ) # Remove following when hcc is fixed; hcc emits following spurious warning # "clang-5.0: warning: argument unused during compilation: '-isystem /opt/rocm/include'" - target_compile_options( hipblas-test PRIVATE -Wno-unused-command-line-argument ) + target_compile_options( hipblas-test PRIVATE -Wno-unused-command-line-argument -mf16c) + + elseif( CMAKE_COMPILER_IS_GNUCXX ) + # GCC needs specific flag to turn on f16c intrinsics + target_compile_options( hipblas-test PRIVATE -mf16c ) + endif( ) + if( CMAKE_CXX_COMPILER MATCHES ".*/hcc$|.*/hipcc$" ) # hip-clang needs specific flag to turn on pthread and m target_link_libraries( hipblas-test PRIVATE -lpthread -lm ) diff --git a/clients/gtest/gemm_ex_gtest.cpp b/clients/gtest/gemm_ex_gtest.cpp new file mode 100644 index 000000000..9308e9001 --- /dev/null +++ b/clients/gtest/gemm_ex_gtest.cpp @@ -0,0 +1,415 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include +#include +#include +#include +#include "testing_gemm_ex.hpp" +#include "utility.h" + +using ::testing::TestWithParam; +using ::testing::Values; +using ::testing::ValuesIn; +using ::testing::Combine; +using namespace std; + +/* ===================================================================== +README: This file contains testers to verify the correctness of + BLAS routines with google test + + It is supposed to be played/used by advance / expert users + Normal users only need to get the library routines without testers + =================================================================== */ + +// only GCC/VS 2010 comes with std::tr1::tuple, but it is unnecessary, std::tuple is good enough; + +typedef std::tuple, vector, vector, vector> + gemm_ex_tuple; + +// clang-format off +// vector of vector, each vector is a {M, N, K, lda, ldb, ldc}; +// add/delete as a group +const vector> small_matrix_size_range = { + { 1, 1, 1, 1, 1, 1}, + { 1, 2, 3, 4, 5, 6}, + { 7, 9, 15, 17, 18, 19}, + { 8, 1, 1, 8, 8, 8}, + { 2, 2, 2, 2, 2, 2}, + { 3, 3, 3, 3, 3, 3}, + { 4, 4, 4, 4, 4, 4}, + { 5, 5, 5, 5, 5, 5}, + { 6, 6, 6, 6, 6, 6}, + { 7, 7, 7, 7, 7, 7}, + { 8, 8, 8, 8, 8, 8}, + { 9, 9, 9, 9, 9, 9}, + {10, 10, 10, 10, 10, 10}, + {11, 11, 11, 11, 11, 11}, + {12, 12, 12, 12, 12, 12}, + {13, 13, 13, 13, 13, 13}, + {14, 14, 14, 14, 14, 14}, + {15, 15, 15, 15, 15, 15}, + {16, 16, 16, 16, 16, 16}, + {17, 17, 17, 17, 17, 17}, + {18, 18, 18, 18, 18, 18}, + {19, 19, 19, 19, 19, 19}, + {20, 20, 20, 20, 20, 20}, + { 2, 3, 4, 5, 6, 7}, + { 3, 4, 5, 6, 7, 8}, + { 4, 5, 6, 6, 6, 6}, + { 5, 6, 7, 7, 8, 9}, + { 6, 7, 8, 10, 9, 8}, + { 7, 8, 9, 11, 9, 10}, + { 8, 9, 10, 10, 11, 12}, + { 9, 10, 11, 12, 11, 13}, + {13, 12, 11, 15, 14, 13}, + {15, 16, 17, 17, 18, 19}, + {18, 17, 16, 18, 18, 18}, + {16, 17, 18, 20, 19, 18}, + { 8, 2, 2, 8, 8, 8}, + { 8, 3, 3, 8, 8, 8}, + { 8, 4, 4, 8, 8, 8}, + { 8, 5, 5, 8, 8, 8}, + { 8, 6, 6, 8, 8, 8}, + { 8, 7, 7, 8, 8, 8}, + { 8, 9, 9, 9, 9, 9}, + { 8, 10, 10, 10, 10, 10}, + { 8, 11, 11, 11, 11, 11}, + { 8, 12, 12, 12, 12, 12}, + { 8, 13, 13, 13, 13, 13}, + { 8, 14, 14, 14, 14, 14}, + { 8, 15, 15, 15, 15, 15}, +// {16, 15, 15, 16, 16, 16}, +// {16, 17, 17, 17, 17, 17}, +// {17, 16, 16, 17, 17, 17}, +// {16, 18, 18, 18, 18, 18}, +// {24, 24, 24, 24, 24, 24}, +// {32, 32, 32, 32, 32, 32}, +// {40, 40, 40, 40, 40, 40}, +// {48, 48, 48, 48, 48, 48}, +// {56, 56, 56, 56, 56, 56}, +// {64, 64, 64, 64, 64, 64}, +// {72, 72, 72, 72, 72, 72}, +}; +const vector> medium_matrix_size_range = { + {127, 127, 63, 127, 127, 127}, + {128, 127, 63, 128, 128, 128}, + {129, 127, 63, 129, 129, 129}, +// {127, 128, 63, 128, 127, 127}, +// {128, 128, 63, 128, 127, 127}, +// {129, 128, 63, 129, 129, 129}, +// {127, 129, 63, 129, 129, 129}, +// {128, 129, 63, 129, 129, 129}, +// {129, 129, 63, 129, 129, 129}, +// {127, 127, 64, 127, 127, 127}, +// {128, 127, 64, 128, 128, 128}, +// {129, 127, 64, 129, 129, 129}, +// {127, 128, 64, 128, 127, 127}, +// {128, 128, 64, 128, 127, 127}, +// {129, 128, 64, 129, 129, 129}, +// {127, 129, 64, 129, 129, 129}, +// {128, 129, 64, 129, 129, 129}, +// {129, 129, 64, 129, 129, 129}, +// {127, 127, 65, 127, 127, 127}, +// {128, 127, 65, 128, 128, 128}, +// {129, 127, 65, 129, 129, 129}, +// {127, 128, 65, 128, 127, 127}, +// {128, 128, 65, 128, 127, 127}, +// {129, 128, 65, 129, 129, 129}, +// {127, 129, 65, 129, 129, 129}, +// {128, 129, 65, 129, 129, 129}, +// {129, 129, 65, 129, 129, 129}, +// {191, 193, 194, 195, 196, 197}, +// {500, 501, 502, 503, 604, 505}, +// {639, 640, 347, 960, 961,1062}, +}; + +// vector of vector, each vector is a {M, N, K, lda, ldb, ldc}; +const vector> large_matrix_size_range = { + {1000, 1001, 101, 2002, 1003, 1004}, + { 925, 1026, 1027, 1028, 2029, 1031}, + {4011, 4012, 103, 4014, 4015, 4016}, +}; + +// vector of vector, each vector is a {M, N, K, lda, ldb, ldc}; +const vector> chunk_matrix_size_range = { + {24000, 256, 256, 24010, 256, 24000}, + {24000, 256, 256, 24000, 256, 24020}, + { 256, 24001, 256, 256, 24030, 24000}, + { 256, 24001, 256, 256, 24000, 24040}, +}; + +// vector of vector, each vector is a {M, N, K, lda, ldb, ldc}; +const vector> NaN_matrix_size_range = { + { 5, 6, 7, 8, 9, 10}, + {4011, 4012, 111, 4013, 4014, 4015}, +}; + +// vector of vector, each pair is a {alpha, beta}; +// add/delete this list in pairs, like {2.0, 4.0} +const vector> alpha_beta_2_3_range = { + {2.0, 3.0}, +}; + +const vector> NaN_alpha_beta_range = { + {1.0, 2.0}, +}; + +const vector> alpha_beta_range = { + {5.0, 0.0}, {0.0, 3.0}, {1.0, 3.0}, +}; + +const vector> small_alpha_beta_range = { + {1.0, 2.0}, +}; + +const vector> full_alpha_beta_range = { + {1.0, 0.0}, {-1.0, -1.0}, {2.0, 1.0}, {0.0, 1.0}}; + +// vector of vector, each pair is a {transA, transB}; +// add/delete this list in pairs, like {'N', 'T'} +// for single/double precision, 'C'(conjTranspose) will downgraded to 'T' (transpose) internally in +// sgemm/dgemm, +const vector> small_transA_transB_range = {{'N', 'N'}}; +const vector> transA_transB_range = {{'N', 'N'}, {'N', 'T'}, {'C', 'N'}, {'T', 'C'}}; + +// a_type, b_type, c_type, d_type, compute_type +const vector> precision_half = {{ HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F }}; + +const vector> precision_hpa_half = {{ HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_32F }}; + +const vector> precision_single = {{ HIPBLAS_R_32F, + HIPBLAS_R_32F, + HIPBLAS_R_32F, + HIPBLAS_R_32F, + HIPBLAS_R_32F }}; + +const vector> precision_double = {{ HIPBLAS_R_64F, + HIPBLAS_R_64F, + HIPBLAS_R_64F, + HIPBLAS_R_64F, + HIPBLAS_R_64F }}; + +const vector> precision_type_range = {{HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F}, + {HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_16F, + HIPBLAS_R_32F}, + {HIPBLAS_R_32F, + HIPBLAS_R_32F, + HIPBLAS_R_32F, + HIPBLAS_R_32F, + HIPBLAS_R_32F}, + {HIPBLAS_R_64F, + HIPBLAS_R_64F, + HIPBLAS_R_64F, + HIPBLAS_R_64F, + HIPBLAS_R_64F}}; +// clang-format on + +/* ===============Google Unit Test==================================================== */ + +/* ===================================================================== + BLAS-3 GEMM: +=================================================================== */ +/* ============================Setup Arguments======================================= */ + +// Please use "class Arguments" (see utility.hpp) to pass parameters to templated testers; +// Some routines may not touch/use certain "members" of objects "argus". +// like BLAS-1 Scal does not have lda, BLAS-2 GEMV does not have ldb, ldc; +// That is fine. These testers & routines will leave untouched members alone. +// Do not use std::tuple to directly pass parameters to testers +// by std:tuple, you have unpack it with extreme care for each one by like "std::get<0>" which is +// not intuitive and error-prone + +Arguments setup_gemm_ex_arguments(gemm_ex_tuple tup) +{ + vector matrix_size = std::get<0>(tup); + vector alpha_beta = std::get<1>(tup); + vector transA_transB = std::get<2>(tup); + vector precision_types = std::get<3>(tup); + + Arguments arg; + + // see the comments about matrix_size_range above + arg.M = matrix_size[0]; + arg.N = matrix_size[1]; + arg.K = matrix_size[2]; + arg.lda = matrix_size[3]; + arg.ldb = matrix_size[4]; + arg.ldc = matrix_size[5]; + + // the first element of alpha_beta_range is always alpha, and the second is always beta + arg.alpha = alpha_beta[0]; + arg.beta = alpha_beta[1]; + + arg.transA_option = transA_transB[0]; + arg.transB_option = transA_transB[1]; + + arg.timing = 0; + + arg.a_type = precision_types[0]; + arg.b_type = precision_types[1]; + arg.c_type = precision_types[2]; + arg.compute_type = precision_types[4]; + + return arg; +} + +class parameterized_gemm_ex : public ::TestWithParam +{ + protected: + parameterized_gemm_ex() {} + virtual ~parameterized_gemm_ex() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +TEST_P(parameterized_gemm_ex, standard) +{ + // GetParam return a tuple. Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. + + Arguments arg = setup_gemm_ex_arguments(GetParam()); + + hipblasStatus_t status = testing_gemm_ex(arg); + + // if not success, then the input argument is problematic, so detect the error message + if(status != HIPBLAS_STATUS_SUCCESS) + { + if(arg.M < 0 || arg.N < 0 || arg.K < 0) + { + EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status); + } + else if(arg.transA_option == 'N' ? arg.lda < arg.M : arg.lda < arg.K) + { + EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status); + } + else if(arg.transB_option == 'N' ? arg.ldb < arg.K : arg.ldb < arg.N) + { + EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status); + } + } +} + +class parameterized_chunk_gemm_ex : public ::TestWithParam +{ + protected: + parameterized_chunk_gemm_ex() {} + virtual ~parameterized_chunk_gemm_ex() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +TEST_P(parameterized_chunk_gemm_ex, float) +{ + // GetParam return a tuple. Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. + + Arguments arg = setup_gemm_ex_arguments(GetParam()); + + hipblasStatus_t status = testing_gemm_ex(arg); + + // if not success, then the input argument is problematic, so detect the error message + if(status != HIPBLAS_STATUS_SUCCESS) + { + if(arg.M < 0 || arg.N < 0 || arg.K < 0) + { + EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status); + } + else if(arg.transA_option == 'N' ? arg.lda < arg.M : arg.lda < arg.K) + { + EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status); + } + else if(arg.transB_option == 'N' ? arg.ldb < arg.K : arg.ldb < arg.N) + { + EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status); + } + } +} + +class parameterized_half_gemm_ex : public ::TestWithParam +{ + protected: + parameterized_half_gemm_ex() {} + virtual ~parameterized_half_gemm_ex() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +//TEST(pre_checkin_blas_ex_bad_arg, float) { testing_gemm_ex_bad_arg(); } + +//----small +INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_hpa_half, + parameterized_gemm_ex, + Combine(ValuesIn(small_matrix_size_range), + ValuesIn(alpha_beta_range), + ValuesIn(transA_transB_range), + ValuesIn(precision_hpa_half))); + +INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_half, + parameterized_gemm_ex, + Combine(ValuesIn(small_matrix_size_range), + ValuesIn(alpha_beta_range), + ValuesIn(transA_transB_range), + ValuesIn(precision_half))); + +INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_single, + parameterized_gemm_ex, + Combine(ValuesIn(small_matrix_size_range), + ValuesIn(alpha_beta_range), + ValuesIn(transA_transB_range), + ValuesIn(precision_single))); + +INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_double, + parameterized_gemm_ex, + Combine(ValuesIn(small_matrix_size_range), + ValuesIn(alpha_beta_range), + ValuesIn(transA_transB_range), + ValuesIn(precision_double))); +//----medium +INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_hpa_half, + parameterized_gemm_ex, + Combine(ValuesIn(medium_matrix_size_range), + ValuesIn(alpha_beta_range), + ValuesIn(transA_transB_range), + ValuesIn(precision_hpa_half))); + +INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_half, + parameterized_gemm_ex, + Combine(ValuesIn(medium_matrix_size_range), + ValuesIn(alpha_beta_range), + ValuesIn(transA_transB_range), + ValuesIn(precision_half))); + +INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_float, + parameterized_gemm_ex, + Combine(ValuesIn(medium_matrix_size_range), + ValuesIn(alpha_beta_range), + ValuesIn(transA_transB_range), + ValuesIn(precision_single))); + +INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_double, + parameterized_gemm_ex, + Combine(ValuesIn(medium_matrix_size_range), + ValuesIn(alpha_beta_range), + ValuesIn(transA_transB_range), + ValuesIn(precision_double))); diff --git a/clients/include/testing_gemm_ex.hpp b/clients/include/testing_gemm_ex.hpp new file mode 100644 index 000000000..c407394f4 --- /dev/null +++ b/clients/include/testing_gemm_ex.hpp @@ -0,0 +1,397 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include +#include +#include +#include +#include +#include + +#include "hipblas.hpp" +#include "arg_check.h" +#include "hipblas_unique_ptr.hpp" +#include "utility.h" +#include "cblas_interface.h" +#include "norm.h" +#include "unit.h" +#include "flops.h" +#include + +using namespace std; + +/* ============================================================================================ */ + +template +hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA, + hipblasOperation_t transB, + int M, + int N, + int K, + float alpha_float, + int lda, + int ldb, + float beta_float, + int ldc, + int norm_check, + int unit_check, + hipblasDatatype_t a_type, + hipblasDatatype_t b_type, + hipblasDatatype_t c_type, + hipblasDatatype_t compute_type) +{ + hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; + uint32_t solution_index = 0; + uint32_t flags = 0; + size_t* workspace_size = 0; + void* workspace = 0; + + Td h_alpha_Td; + Td h_beta_Td; + + if(is_same::value) + { + h_alpha_Td = float_to_half(alpha_float); + h_beta_Td = float_to_half(beta_float); + } + else if(is_same::value) + { + h_alpha_Td = static_cast(alpha_float); + h_beta_Td = static_cast(beta_float); + } + else if(is_same::value) + { + h_alpha_Td = static_cast(alpha_float); + h_beta_Td = static_cast(beta_float); + } + else + { + return HIPBLAS_STATUS_NOT_SUPPORTED; + } + + Tc h_alpha_Tc; + Tc h_beta_Tc; + + if(is_same::value) + { + h_alpha_Tc = float_to_half(alpha_float); + h_beta_Tc = float_to_half(beta_float); + } + else if(is_same::value) + { + h_alpha_Tc = static_cast(alpha_float); + h_beta_Tc = static_cast(beta_float); + } + else if(is_same::value) + { + h_alpha_Tc = static_cast(alpha_float); + h_beta_Tc = static_cast(beta_float); + } + else + { + return HIPBLAS_STATUS_NOT_SUPPORTED; + } + + hipblasHandle_t handle; + hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + hipblasCreate(&handle); + + int A_row = transA == HIPBLAS_OP_N ? M : K; + int A_col = transA == HIPBLAS_OP_N ? K : M; + int B_row = transB == HIPBLAS_OP_N ? K : N; + int B_col = transB == HIPBLAS_OP_N ? N : K; + + // check here to prevent undefined memory allocation error + if(M < 0 || N < 0 || K < 0 || lda < A_row || ldb < B_row || ldc < M) + { + return HIPBLAS_STATUS_INVALID_VALUE; + } + + const size_t size_A = static_cast(lda) * static_cast(A_col); + const size_t size_B = static_cast(ldb) * static_cast(B_col); + const size_t size_C = static_cast(ldc) * static_cast(N); + + // allocate memory on device +// auto dA_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Td) * size_A), +// rocblas_test::device_free}; +// auto dB_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Td) * size_B), +// rocblas_test::device_free}; +// auto dC_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Td) * size_C), +// rocblas_test::device_free}; +// auto dD_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Td) * size_D), +// rocblas_test::device_free}; +// auto d_alpha_Tc_managed = +// rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Tc)), rocblas_test::device_free}; +// auto d_beta_Tc_managed = +// rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Tc)), rocblas_test::device_free}; +// Td* dA = (Td*)dA_managed.get(); +// Td* dB = (Td*)dB_managed.get(); +// Td* dC = (Td*)dC_managed.get(); +// Td* dD = (Td*)dD_managed.get(); +// Tc* d_alpha_Tc = (Tc*)d_alpha_Tc_managed.get(); +// Tc* d_beta_Tc = (Tc*)d_beta_Tc_managed.get(); + + Td *dA, *dB, *dC; + Tc *d_alpha_Tc, *d_beta_Tc; + + CHECK_HIP_ERROR(hipMalloc(&dA, size_A * sizeof(Td))); + CHECK_HIP_ERROR(hipMalloc(&dB, size_B * sizeof(Td))); + CHECK_HIP_ERROR(hipMalloc(&dC, size_C * sizeof(Td))); + + CHECK_HIP_ERROR(hipMalloc(&d_alpha_Tc, sizeof(Td))); + CHECK_HIP_ERROR(hipMalloc(&d_beta_Tc, sizeof(Td))); + + if(!dA || !dB || !dC || !d_alpha_Tc || !d_beta_Tc) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return HIPBLAS_STATUS_ALLOC_FAILED; + } + + // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory + vector hA(size_A); + vector hB(size_B); + vector hC(size_C); + vector hC_gold(size_C); + + // Initial Data on CPU + srand(1); + hipblas_init(hA, A_row, A_col, lda); + hipblas_init_alternating_sign(hB, B_row, B_col, ldb); + hipblas_init(hC, M, N, ldc); + + // if(is_same::value) + // { + // std::cout << "----A-----------------" << std::endl; + // for(int i = 0; i < size_A; i++){ cout << half_to_float(hA[i]) << " "; } + // std::cout << std::endl << "-----B-----------------" << std::endl; + // for(int i = 0; i < size_B; i++){ cout << half_to_float(hB[i]) << " "; } + // std::cout << std::endl << "-----C-----------------" << std::endl; + // for(int i = 0; i < size_C; i++){ cout << half_to_float(hC[i]) << " "; } + // std::cout << std::endl << "-----D-----------------" << std::endl; + // for(int i = 0; i < size_D; i++){ cout << half_to_float(hD_1[i]) << " "; } + // std::cout << std::endl << "-----------------------" << std::endl; + // } + // else + // { + // std::cout << "----A-----------------" << std::endl; + // for(int i = 0; i < size_A; i++){ cout << hA[i] << " "; } + // std::cout << std::endl << "-----B-----------------" << std::endl; + // for(int i = 0; i < size_B; i++){ cout << hB[i] << " "; } + // std::cout << std::endl << "-----C-----------------" << std::endl; + // for(int i = 0; i < size_C; i++){ cout << hC[i] << " "; } + // std::cout << std::endl << "-----D-----------------" << std::endl; + // for(int i = 0; i < size_D; i++){ cout << hD_1[i] << " "; } + // std::cout << std::endl << "-----------------------" << std::endl; + // } + + hC_gold = hC; + + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(Td) * size_A, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(Td) * size_B, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dC, hC.data(), sizeof(Td) * size_C, hipMemcpyHostToDevice)); + + status = hipblasGemmEx(handle, + transA, + transB, + M, + N, + K, + &h_alpha_Tc, + dA, + a_type, + lda, + dB, + b_type, + ldb, + &h_beta_Tc, + dC, + c_type, + ldc, + compute_type, + algo); + + CHECK_HIP_ERROR(hipMemcpy(hC.data(), dC, sizeof(Td) * size_C, hipMemcpyDeviceToHost)); + + // std::cout << std::endl << "-----hD_1---------------------------------------" << + // std::endl; + // if(is_same::value) + // { + // for(int i = 0; i < size_D; i++){ cout << half_to_float(hD_1[i]) << " "; + // } + // } + // else + // { + // for(int i = 0; i < size_D; i++){ cout << hD_1[i] << " "; } + // } + // std::cout << std::endl << "------------------------------------------------" << + // std::endl; + + + // CPU BLAS + + cblas_gemm(transA, + transB, + M, + N, + K, + h_alpha_Td, + hA.data(), + lda, + hB.data(), + ldb, + h_beta_Td, + hC_gold.data(), + ldc); + +// std::cout << std::endl << "---gold---gold---gold---------------------" << std::endl; +// if(is_same::value) +// { +// for(int i = 0; i < size_D; i++){ std::cout << half_to_float(hD_gold[i]) << " "; } +// } +// else +// { +// for(int i = 0; i < size_D; i++){ std::cout << hD_gold[i] << " "; } +// } +// std::cout << std::endl << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl; + +#ifndef NDEBUG +// print_matrix(hC_gold, hC, min(M, 3), min(N, 3), ldc); +#endif + + // enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their order + if(unit_check) + { + unit_check_general(M, N, ldc, hC_gold.data(), hC.data()); + } + + hipblasDestroy(handle); + CHECK_HIP_ERROR(hipFree(dA)); + CHECK_HIP_ERROR(hipFree(dB)); + CHECK_HIP_ERROR(hipFree(dC)); + + CHECK_HIP_ERROR(hipFree(d_alpha_Tc)); + CHECK_HIP_ERROR(hipFree(d_beta_Tc)); + + return status; +} + +hipblasStatus_t testing_gemm_ex(Arguments argus) +{ + hipblasOperation_t transA = char2hipblas_operation(argus.transA_option); + hipblasOperation_t transB = char2hipblas_operation(argus.transB_option); + + int M = argus.M; + int N = argus.N; + int K = argus.K; + + int lda = argus.lda; + int ldb = argus.ldb; + int ldc = argus.ldc; + + hipblasDatatype_t a_type = argus.a_type; + hipblasDatatype_t b_type = argus.b_type; + hipblasDatatype_t c_type = argus.c_type; + hipblasDatatype_t compute_type = argus.compute_type; + + float alpha = argus.alpha; + float beta = argus.beta; + + hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + + int norm_check = argus.norm_check; + int unit_check = argus.unit_check; + + if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F && + c_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F && + compute_type == HIPBLAS_R_16F) + { + status = testing_gemm_ex_template(transA, + transB, + M, + N, + K, + alpha, + lda, + ldb, + beta, + ldc, + norm_check, + unit_check, + a_type, + b_type, + c_type, + compute_type); + } + else if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F && + c_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F && + compute_type == HIPBLAS_R_32F) + { + status = testing_gemm_ex_template(transA, + transB, + M, + N, + K, + alpha, + lda, + ldb, + beta, + ldc, + norm_check, + unit_check, + a_type, + b_type, + c_type, + compute_type); + } + else if(a_type == HIPBLAS_R_32F && b_type == HIPBLAS_R_32F && + c_type == HIPBLAS_R_32F && c_type == HIPBLAS_R_32F && + compute_type == HIPBLAS_R_32F) + { + status = testing_gemm_ex_template(transA, + transB, + M, + N, + K, + alpha, + lda, + ldb, + beta, + ldc, + norm_check, + unit_check, + a_type, + b_type, + c_type, + compute_type); + } + else if(a_type == HIPBLAS_R_64F && b_type == HIPBLAS_R_64F && + c_type == HIPBLAS_R_64F && c_type == HIPBLAS_R_64F && + compute_type == HIPBLAS_R_64F) + { + status = testing_gemm_ex_template(transA, + transB, + M, + N, + K, + alpha, + lda, + ldb, + beta, + ldc, + norm_check, + unit_check, + a_type, + b_type, + c_type, + compute_type); + } + else + { + status = HIPBLAS_STATUS_NOT_SUPPORTED; + } + + return status; +} diff --git a/clients/include/utility.h b/clients/include/utility.h index 2fdc764cb..51dd9d3f8 100644 --- a/clients/include/utility.h +++ b/clients/include/utility.h @@ -1,6 +1,5 @@ /* ************************************************************************ * Copyright 2016 Advanced Micro Devices, Inc. - * * ************************************************************************ */ #pragma once @@ -12,6 +11,8 @@ #include #include "hipblas.h" #include +#include +#include using namespace std; @@ -40,16 +41,62 @@ using namespace std; cout << endl; \ } - /* ============================================================================================ */ - /* generate random number :*/ +// Helper routine to convert floats into their half equivalent; uses F16C instructions +inline hipblasHalf float_to_half(float val) +{ + // return static_cast( _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( val ), 0 ) ) + // ); + const int zero = 0; + short unsigned int a; + a = _cvtss_sh(val, zero); +// return _cvtss_sh(val, zero); + return a; +} - /*! \brief generate a random number between [0, 0.999...] . */ - template - T random_generator(){ - //return rand()/( (T)RAND_MAX + 1); - return (T)(rand() % 10 + 1); //generate a integer number between [1, 10] - }; +// Helper routine to convert halfs into their floats equivalent; uses F16C instructions +inline float half_to_float(hipblasHalf val) +{ + // return static_cast(_mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(val), 0))); + return _cvtsh_ss(val); +} + +/* ============================================================================================ */ +/* generate random number :*/ + +/*! \brief generate a random number in range [1,2,3,4,5,6,7,8,9,10] */ +template +T random_generator() +{ + // return rand()/( (T)RAND_MAX + 1); + return (T)(rand() % 10 + 1); +}; + +// for hipblasHalf, generate float, and convert to hipblasHalf +/*! \brief generate a random number in range [1,2,3] */ +template <> +inline hipblasHalf random_generator() +{ + return float_to_half( + static_cast((rand() % 3 + 1))); // generate a integer number in range [1,2,3] +}; + +/*! \brief generate a random number in range [-1,-2,-3,-4,-5,-6,-7,-8,-9,-10] */ +template +T random_generator_negative() +{ + // return rand()/( (T)RAND_MAX + 1); + return -(T)(rand() % 10 + 1); +}; + +// for hipblasHalf, generate float, and convert to hipblasHalf +/*! \brief generate a random number in range [-1,-2,-3] */ +template <> +inline hipblasHalf random_generator_negative() +{ + return float_to_half(-static_cast((rand() % 3 + 1))); +}; +/* ============================================================================================ */ /* ============================================================================================ */ /*! \brief matrix/vector initialization: */ @@ -64,6 +111,66 @@ using namespace std; } }; + template + void hipblas_init_alternating_sign(vector& A, int M, int N, int lda) + { + // Initialize matrix so adjacent entries have alternating sign. + // In gemm if either A or B are initialized with alernating + // sign the reduction sum will be summing positive + // and negative numbers, so it should not get too large. + // This helps reduce floating point inaccuracies for 16bit + // arithmetic where the exponent has only 5 bits, and the + // mantissa 10 bits. + for(int i = 0; i < M; ++i) + { + for(int j = 0; j < N; ++j) + { + if(j % 2 ^ i % 2) + { + A[i + j * lda] = random_generator(); + } + else + { + A[i + j * lda] = random_generator_negative(); + } + } + } + }; + + template + void hipblas_init_alternating_sign(vector& A, + int M, + int N, + int lda, + int stride, + int batch_count) + { + // Initialize matrix so adjacent entries have alternating sign. + // In gemm if either A or B are initialized with alernating + // sign the reduction sum will be summing positive + // and negative numbers, so it should not get too large. + // This helps reduce floating point inaccuracies for 16bit + // arithmetic where the exponent has only 5 bits, and the + // mantissa 10 bits. + for(int i_batch = 0; i_batch < batch_count; i_batch++) + { + for(int i = 0; i < M; ++i) + { + for(int j = 0; j < N; ++j) + { + if(j % 2 ^ i % 2) + { + A[i + j * lda + i_batch * stride] = random_generator(); + } + else + { + A[i + j * lda + i_batch * stride] = random_generator_negative(); + } + } + } + } + }; + template void hipblas_init(T* A, int M, int N, int lda){ for (int i = 0; i < M; ++i){ @@ -192,6 +299,11 @@ class Arguments { int ldb = 128; int ldc = 128; + hipblasDatatype_t a_type = HIPBLAS_R_32F; + hipblasDatatype_t b_type = HIPBLAS_R_32F; + hipblasDatatype_t c_type = HIPBLAS_R_32F; + hipblasDatatype_t compute_type = HIPBLAS_R_32F; + int incx = 1 ; int incy = 1 ; int incd = 1 ; diff --git a/library/include/hipblas.h b/library/include/hipblas.h index 22dccc160..d4f8c7e5f 100644 --- a/library/include/hipblas.h +++ b/library/include/hipblas.h @@ -62,6 +62,19 @@ enum hipblasSideMode_t { HIPBLAS_SIDE_BOTH = 143 }; +enum hipblasDatatype_t { + HIPBLAS_R_16F = 150, + HIPBLAS_R_32F = 151, + HIPBLAS_R_64F = 152, + HIPBLAS_C_16F = 153, + HIPBLAS_C_32F = 154, + HIPBLAS_C_64F = 155 +}; + +enum hipblasGemmAlgo_t { + HIPBLAS_GEMM_DEFAULT = 160 +}; + #ifdef __cplusplus extern "C" { #endif @@ -215,8 +228,21 @@ HIPBLAS_EXPORT hipblasStatus_t hipblasDgemmBatched(hipblasHandle_t handle, const double *beta, double * C[], int ldc, int batchCount); +HIPBLAS_EXPORT hipblasStatus_t hipblasGemmEx( + hipblasHandle_t handle, + hipblasOperation_t trans_a, + hipblasOperation_t trans_b, + int m, + int n, + int k, + const void* alpha, + const void* a, hipblasDatatype_t a_type, int lda, + const void* b, hipblasDatatype_t b_type, int ldb, const void* beta, + void* c, hipblasDatatype_t c_type, int ldc, + hipblasDatatype_t compute_type, + hipblasGemmAlgo_t algo); -/* not implementes, requires complex support +/* not implemented, requires complex support hipblasStatus_t hipblasCgemm(hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const hipComplex *alpha, const hipComplex *A, int lda, const hipComplex *B, int ldb, const hipComplex *beta, hipComplex *C, int ldc); diff --git a/library/src/hcc_detail/hipblas.cpp b/library/src/hcc_detail/hipblas.cpp index 4289a676a..a9c7fc123 100644 --- a/library/src/hcc_detail/hipblas.cpp +++ b/library/src/hcc_detail/hipblas.cpp @@ -125,7 +125,6 @@ hipblasSideMode_t HCCSideToHIPSide( rocblas_side_ side) } } - rocblas_pointer_mode HIPPointerModeToRocblasPointerMode( hipblasPointerMode_t mode) { switch (mode) @@ -141,7 +140,6 @@ rocblas_pointer_mode HIPPointerModeToRocblasPointerMode( hipblasPointerMode_t mo } } - hipblasPointerMode_t RocblasPointerModeToHIPPointerMode( rocblas_pointer_mode mode) { switch (mode) @@ -157,7 +155,83 @@ hipblasPointerMode_t RocblasPointerModeToHIPPointerMode( rocblas_pointer_mode mo } } +rocblas_datatype HIPDatatypeToRocblasDatatype ( hipblasDatatype_t type) +{ + switch (type) + { + case HIPBLAS_R_16F: + return rocblas_datatype_f16_r ; + + case HIPBLAS_R_32F: + return rocblas_datatype_f32_r ; + + case HIPBLAS_R_64F: + return rocblas_datatype_f64_r ; + + case HIPBLAS_C_16F: + return rocblas_datatype_f16_c ; + + case HIPBLAS_C_32F: + return rocblas_datatype_f32_c ; + + case HIPBLAS_C_64F: + return rocblas_datatype_f64_c ; + + default: + throw "Non existant DataType"; + } +} + +hipblasDatatype_t RocblasDatatypeToHIPDatatype( rocblas_datatype type) +{ + switch (type) + { + case rocblas_datatype_f16_r : + return HIPBLAS_R_16F; + + case rocblas_datatype_f32_r : + return HIPBLAS_R_32F; + + case rocblas_datatype_f64_r : + return HIPBLAS_R_64F; + + case rocblas_datatype_f16_c : + return HIPBLAS_C_16F; + + case rocblas_datatype_f32_c : + return HIPBLAS_C_32F; + + case rocblas_datatype_f64_c : + return HIPBLAS_C_64F; + + default: + throw "Non existant DataType"; + } +} + +rocblas_gemm_algo HIPGemmAlgoToRocblasGemmAlgo( hipblasGemmAlgo_t algo) +{ + switch (algo) + { + case HIPBLAS_GEMM_DEFAULT: + return rocblas_gemm_algo_standard; + + default: + throw "Non existant GemmAlgo"; + } +} +hipblasGemmAlgo_t RocblasGemmAlgoToHIPGemmAlgo ( rocblas_gemm_algo algo) +{ + switch (algo) + { + case rocblas_gemm_algo_standard: + return HIPBLAS_GEMM_DEFAULT; + + default: + throw "Non existant GemmAlgo"; + } +} hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status_ error) { @@ -709,3 +783,33 @@ hipblasDgemmBatched(hipblasHandle_t handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchCount); } +extern "C" +hipblasStatus_t +hipblasGemmEx(hipblasHandle_t handle, + hipblasOperation_t transa, hipblasOperation_t transb, + int m, int n, int k, const void *alpha, + const void * A, hipblasDatatype_t a_type, int lda, + const void * B, hipblasDatatype_t b_type, int ldb, const void * beta, + void * C, hipblasDatatype_t c_type, int ldc, + hipblasDatatype_t compute_type, + hipblasGemmAlgo_t algo) +{ + uint32_t solution_index = 0; + + uint32_t flags = 0; + + size_t* workspace_size = 0; + + void* workspace = 0; + + return rocBLASStatusToHIPStatus(rocblas_gemm_ex((rocblas_handle)handle, + hipOperationToHCCOperation(transa), hipOperationToHCCOperation(transb), + m, n, k, alpha, + A, HIPDatatypeToRocblasDatatype(a_type), lda, + B, HIPDatatypeToRocblasDatatype(b_type), ldb, beta, + C, HIPDatatypeToRocblasDatatype(c_type), ldc, + C, HIPDatatypeToRocblasDatatype(c_type), ldc, + HIPDatatypeToRocblasDatatype(compute_type), + HIPGemmAlgoToRocblasGemmAlgo(algo), + solution_index, flags, workspace_size, workspace)); +}