diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp
index 1573aac64..ac9505760 100644
--- a/clients/common/cblas_interface.cpp
+++ b/clients/common/cblas_interface.cpp
@@ -5,9 +5,11 @@
 
 
 #include <typeinfo>
+#include <memory>
 #include "hipblas.h"
 #include "cblas_interface.h"
 #include "cblas.h"
+#include "utility.h"
 
 /*!\file
  * \brief provide template functions interfaces to CBLAS C89 interfaces, it is only used for testing not part of the GPU library
@@ -385,6 +387,70 @@ extern "C" {
 
     //gemm
 
+    template <>
+    void cblas_gemm<hipblasHalf>(hipblasOperation_t transA,
+                                  hipblasOperation_t transB,
+                                  int m,
+                                  int n,
+                                  int k,
+                                  hipblasHalf alpha,
+                                  hipblasHalf* A,
+                                  int lda,
+                                  hipblasHalf* B,
+                                  int ldb,
+                                  hipblasHalf beta,
+                                  hipblasHalf* C,
+                                  int ldc)
+    {
+        // cblas does not support hipblasHalf, so convert to higher precision float
+        // This will give more precise result which is acceptable for testing
+        float alpha_float = half_to_float(alpha);
+        float beta_float  = half_to_float(beta);
+
+        int sizeA = transA == HIPBLAS_OP_N ? k * lda : m * lda;
+        int sizeB = transB == HIPBLAS_OP_N ? n * ldb : k * ldb;
+        int sizeC = n * ldc;
+
+        std::unique_ptr<float[]> A_float(new float[sizeA]());
+        std::unique_ptr<float[]> B_float(new float[sizeB]());
+        std::unique_ptr<float[]> C_float(new float[sizeC]());
+
+        for(int i = 0; i < sizeA; i++)
+        {
+            A_float[i] = half_to_float(A[i]);
+        }
+        for(int i = 0; i < sizeB; i++)
+        {
+            B_float[i] = half_to_float(B[i]);
+        }
+        for(int i = 0; i < sizeC; i++)
+        {
+            C_float[i] = half_to_float(C[i]);
+        }
+
+        // just directly cast, since transA, transB are integers in the enum
+        // printf("transA: rocblas =%d, cblas=%d\n", transA, (CBLAS_TRANSPOSE)transA );
+        cblas_sgemm(CblasColMajor,
+                    (CBLAS_TRANSPOSE)transA,
+                    (CBLAS_TRANSPOSE)transB,
+                    m,
+                    n,
+                    k,
+                    alpha_float,
+                    const_cast<const float*>(A_float.get()),
+                    lda,
+                    const_cast<const float*>(B_float.get()),
+                    ldb,
+                    beta_float,
+                    static_cast<float*>(C_float.get()),
+                    ldc);
+
+        for(int i = 0; i < sizeC; i++)
+        {
+            C[i] = float_to_half(C_float[i]);
+        }
+    }
+
     template<>
     void cblas_gemm<float>( hipblasOperation_t transA, hipblasOperation_t transB,
                             int m, int n, int k,
diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp
index e99114e76..862fad36e 100644
--- a/clients/common/unit.cpp
+++ b/clients/common/unit.cpp
@@ -5,6 +5,7 @@
 
 #include "hipblas.h"
 #include "unit.h"
+#include "utility.h"
 
 /* ========================================Gtest Unit Check ==================================================== */
 
@@ -13,6 +14,25 @@
     //Do not put a wrapper over ASSERT_FLOAT_EQ, sincer assert exit the current function NOT the test case
     // a wrapper will cause the loop keep going
 
+template <>
+void unit_check_general(
+    int M, int N, int lda, hipblasHalf* hCPU, hipblasHalf* hGPU)
+{
+#pragma unroll
+    for(int j = 0; j < N; j++)
+    {
+#pragma unroll
+        for(int i = 0; i < M; i++)
+        {
+#ifdef GOOGLE_TEST
+            float cpu_float = half_to_float(hCPU[i + j * lda]);
+            float gpu_float = half_to_float(hGPU[i + j * lda]);
+            ASSERT_FLOAT_EQ(cpu_float, gpu_float);
+#endif
+        }
+    }
+}
+
 
 
     template<>
diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt
index 8193ec989..830a3ad9f 100644
--- a/clients/gtest/CMakeLists.txt
+++ b/clients/gtest/CMakeLists.txt
@@ -45,6 +45,7 @@ set(hipblas_test_source
   gemv_gtest.cpp
   ger_gtest.cpp
   gemm_gtest.cpp
+  gemm_ex_gtest.cpp
   gemm_strided_batched_gtest.cpp
   gemm_batched_gtest.cpp
   geam_gtest.cpp
@@ -92,11 +93,18 @@ if( NOT CUDA_FOUND )
   get_target_property( HCC_AM_LOCATION hcc::hc_am IMPORTED_LOCATION_RELEASE )
   target_link_libraries( hipblas-test PRIVATE ${HIP_HCC_LOCATION} ${HCC_AM_LOCATION} )
 
+
   if( CMAKE_CXX_COMPILER MATCHES ".*/hcc$|.*/hipcc$" )
     # Remove following when hcc is fixed; hcc emits following spurious warning
     # "clang-5.0: warning: argument unused during compilation: '-isystem /opt/rocm/include'"
-    target_compile_options( hipblas-test PRIVATE -Wno-unused-command-line-argument )
+    target_compile_options( hipblas-test PRIVATE -Wno-unused-command-line-argument -mf16c)
+
+  elseif( CMAKE_COMPILER_IS_GNUCXX )
+    # GCC needs specific flag to turn on f16c intrinsics
+    target_compile_options( hipblas-test PRIVATE -mf16c )
+
   endif( )
+
   if( CMAKE_CXX_COMPILER MATCHES ".*/hcc$|.*/hipcc$" )
     # hip-clang needs specific flag to turn on pthread and m
     target_link_libraries( hipblas-test PRIVATE -lpthread -lm )
diff --git a/clients/gtest/gemm_ex_gtest.cpp b/clients/gtest/gemm_ex_gtest.cpp
new file mode 100644
index 000000000..9308e9001
--- /dev/null
+++ b/clients/gtest/gemm_ex_gtest.cpp
@@ -0,0 +1,415 @@
+/* ************************************************************************
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * ************************************************************************ */
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdexcept>
+#include <vector>
+#include "testing_gemm_ex.hpp"
+#include "utility.h"
+
+using ::testing::TestWithParam;
+using ::testing::Values;
+using ::testing::ValuesIn;
+using ::testing::Combine;
+using namespace std;
+
+/* =====================================================================
+README: This file contains testers to verify the correctness of
+        BLAS routines with google test
+
+        It is supposed to be played/used by advance / expert users
+        Normal users only need to get the library routines without testers
+     =================================================================== */
+
+// only GCC/VS 2010 comes with std::tr1::tuple, but it is unnecessary,  std::tuple is good enough;
+
+typedef std::tuple<vector<int>, vector<double>, vector<char>, vector<hipblasDatatype_t>>
+    gemm_ex_tuple;
+
+// clang-format off
+// vector of vector, each vector is a {M, N, K, lda, ldb, ldc};
+// add/delete as a group
+const vector<vector<int>> small_matrix_size_range = {
+    { 1,  1,  1,  1,  1,  1}, 
+    { 1,  2,  3,  4,  5,  6}, 
+    { 7,  9, 15, 17, 18, 19}, 
+    { 8,  1,  1,  8,  8,  8},
+    { 2,  2,  2,  2,  2,  2},
+    { 3,  3,  3,  3,  3,  3},
+    { 4,  4,  4,  4,  4,  4},
+    { 5,  5,  5,  5,  5,  5},
+    { 6,  6,  6,  6,  6,  6},
+    { 7,  7,  7,  7,  7,  7},
+    { 8,  8,  8,  8,  8,  8},
+    { 9,  9,  9,  9,  9,  9},
+    {10, 10, 10, 10, 10, 10},
+    {11, 11, 11, 11, 11, 11},
+    {12, 12, 12, 12, 12, 12},
+    {13, 13, 13, 13, 13, 13},
+    {14, 14, 14, 14, 14, 14},
+    {15, 15, 15, 15, 15, 15},
+    {16, 16, 16, 16, 16, 16},
+    {17, 17, 17, 17, 17, 17},
+    {18, 18, 18, 18, 18, 18},
+    {19, 19, 19, 19, 19, 19},
+    {20, 20, 20, 20, 20, 20},
+    { 2,  3,  4,  5,  6,  7},
+    { 3,  4,  5,  6,  7,  8},
+    { 4,  5,  6,  6,  6,  6},
+    { 5,  6,  7,  7,  8,  9},
+    { 6,  7,  8, 10,  9,  8},
+    { 7,  8,  9, 11,  9, 10},
+    { 8,  9, 10, 10, 11, 12},
+    { 9, 10, 11, 12, 11, 13},
+    {13, 12, 11, 15, 14, 13},
+    {15, 16, 17, 17, 18, 19},
+    {18, 17, 16, 18, 18, 18},
+    {16, 17, 18, 20, 19, 18},
+    { 8,  2,  2,  8,  8,  8},
+    { 8,  3,  3,  8,  8,  8},
+    { 8,  4,  4,  8,  8,  8},
+    { 8,  5,  5,  8,  8,  8},
+    { 8,  6,  6,  8,  8,  8},
+    { 8,  7,  7,  8,  8,  8},
+    { 8,  9,  9,  9,  9,  9},
+    { 8, 10, 10, 10, 10, 10},
+    { 8, 11, 11, 11, 11, 11},
+    { 8, 12, 12, 12, 12, 12},
+    { 8, 13, 13, 13, 13, 13},
+    { 8, 14, 14, 14, 14, 14},
+    { 8, 15, 15, 15, 15, 15},
+//  {16, 15, 15, 16, 16, 16},
+//  {16, 17, 17, 17, 17, 17},
+//  {17, 16, 16, 17, 17, 17},
+//  {16, 18, 18, 18, 18, 18},
+//  {24, 24, 24, 24, 24, 24},
+//  {32, 32, 32, 32, 32, 32},
+//  {40, 40, 40, 40, 40, 40},
+//  {48, 48, 48, 48, 48, 48},
+//  {56, 56, 56, 56, 56, 56},
+//  {64, 64, 64, 64, 64, 64},
+//  {72, 72, 72, 72, 72, 72},
+};
+const vector<vector<int>> medium_matrix_size_range = {
+    {127, 127,  63, 127, 127, 127},
+    {128, 127,  63, 128, 128, 128},
+    {129, 127,  63, 129, 129, 129},
+//  {127, 128,  63, 128, 127, 127},
+//  {128, 128,  63, 128, 127, 127},
+//  {129, 128,  63, 129, 129, 129},
+//  {127, 129,  63, 129, 129, 129},
+//  {128, 129,  63, 129, 129, 129},
+//  {129, 129,  63, 129, 129, 129},
+//  {127, 127,  64, 127, 127, 127},
+//  {128, 127,  64, 128, 128, 128},
+//  {129, 127,  64, 129, 129, 129},
+//  {127, 128,  64, 128, 127, 127},
+//  {128, 128,  64, 128, 127, 127},
+//  {129, 128,  64, 129, 129, 129},
+//  {127, 129,  64, 129, 129, 129},
+//  {128, 129,  64, 129, 129, 129},
+//  {129, 129,  64, 129, 129, 129},
+//  {127, 127,  65, 127, 127, 127},
+//  {128, 127,  65, 128, 128, 128},
+//  {129, 127,  65, 129, 129, 129},
+//  {127, 128,  65, 128, 127, 127},
+//  {128, 128,  65, 128, 127, 127},
+//  {129, 128,  65, 129, 129, 129},
+//  {127, 129,  65, 129, 129, 129},
+//  {128, 129,  65, 129, 129, 129},
+//  {129, 129,  65, 129, 129, 129},
+//  {191, 193, 194, 195, 196, 197},
+//  {500, 501, 502, 503, 604, 505},
+//  {639, 640, 347, 960, 961,1062},
+};
+
+// vector of vector, each vector is a {M, N, K, lda, ldb, ldc};
+const vector<vector<int>> large_matrix_size_range = {
+    {1000, 1001,  101, 2002, 1003, 1004},
+    { 925, 1026, 1027, 1028, 2029, 1031},
+    {4011, 4012,  103, 4014, 4015, 4016},
+};
+
+// vector of vector, each vector is a {M, N, K, lda, ldb, ldc};
+const vector<vector<int>> chunk_matrix_size_range = {
+    {24000,   256, 256, 24010,   256, 24000},
+    {24000,   256, 256, 24000,   256, 24020},
+    {  256, 24001, 256,   256, 24030, 24000},
+    {  256, 24001, 256,   256, 24000, 24040},
+};
+
+// vector of vector, each vector is a {M, N, K, lda, ldb, ldc};
+const vector<vector<int>> NaN_matrix_size_range = {
+    {   5,    6,   7,    8,    9,   10}, 
+    {4011, 4012, 111, 4013, 4014, 4015},
+};
+
+// vector of vector, each pair is a {alpha, beta};
+// add/delete this list in pairs, like {2.0, 4.0}
+const vector<vector<double>> alpha_beta_2_3_range = {
+    {2.0, 3.0},
+};
+
+const vector<vector<double>> NaN_alpha_beta_range = {
+    {1.0, 2.0},
+};
+
+const vector<vector<double>> alpha_beta_range = {
+    {5.0, 0.0}, {0.0, 3.0}, {1.0, 3.0},
+};
+
+const vector<vector<double>> small_alpha_beta_range = {
+    {1.0, 2.0},
+};
+
+const vector<vector<double>> full_alpha_beta_range = {
+    {1.0, 0.0}, {-1.0, -1.0}, {2.0, 1.0}, {0.0, 1.0}};
+
+// vector of vector, each pair is a {transA, transB};
+// add/delete this list in pairs, like {'N', 'T'}
+// for single/double precision, 'C'(conjTranspose) will downgraded to 'T' (transpose) internally in
+// sgemm/dgemm,
+const vector<vector<char>> small_transA_transB_range = {{'N', 'N'}};
+const vector<vector<char>> transA_transB_range = {{'N', 'N'}, {'N', 'T'}, {'C', 'N'}, {'T', 'C'}};
+
+// a_type, b_type, c_type, d_type, compute_type
+const vector<vector<hipblasDatatype_t>> precision_half = {{ HIPBLAS_R_16F,
+                                                            HIPBLAS_R_16F,
+                                                            HIPBLAS_R_16F,
+                                                            HIPBLAS_R_16F,
+                                                            HIPBLAS_R_16F  }};
+
+const vector<vector<hipblasDatatype_t>> precision_hpa_half = {{ HIPBLAS_R_16F,
+                                                                HIPBLAS_R_16F,
+                                                                HIPBLAS_R_16F,
+                                                                HIPBLAS_R_16F,
+                                                                HIPBLAS_R_32F  }};
+
+const vector<vector<hipblasDatatype_t>> precision_single = {{ HIPBLAS_R_32F,
+                                                              HIPBLAS_R_32F,
+                                                              HIPBLAS_R_32F,
+                                                              HIPBLAS_R_32F,
+                                                              HIPBLAS_R_32F  }};
+
+const vector<vector<hipblasDatatype_t>> precision_double = {{ HIPBLAS_R_64F,
+                                                              HIPBLAS_R_64F,
+                                                              HIPBLAS_R_64F,
+                                                              HIPBLAS_R_64F,
+                                                              HIPBLAS_R_64F  }};
+
+const vector<vector<hipblasDatatype_t>> precision_type_range = {{HIPBLAS_R_16F,
+                                                                 HIPBLAS_R_16F,
+                                                                 HIPBLAS_R_16F,
+                                                                 HIPBLAS_R_16F,
+                                                                 HIPBLAS_R_16F},
+                                                                {HIPBLAS_R_16F,
+                                                                 HIPBLAS_R_16F,
+                                                                 HIPBLAS_R_16F,
+                                                                 HIPBLAS_R_16F,
+                                                                 HIPBLAS_R_32F},
+                                                                {HIPBLAS_R_32F,
+                                                                 HIPBLAS_R_32F,
+                                                                 HIPBLAS_R_32F,
+                                                                 HIPBLAS_R_32F,
+                                                                 HIPBLAS_R_32F},
+                                                                {HIPBLAS_R_64F,
+                                                                 HIPBLAS_R_64F,
+                                                                 HIPBLAS_R_64F,
+                                                                 HIPBLAS_R_64F,
+                                                                 HIPBLAS_R_64F}};
+// clang-format on
+
+/* ===============Google Unit Test==================================================== */
+
+/* =====================================================================
+     BLAS-3 GEMM:
+=================================================================== */
+/* ============================Setup Arguments======================================= */
+
+// Please use "class Arguments" (see utility.hpp) to pass parameters to templated testers;
+// Some routines may not touch/use certain "members" of objects "argus".
+// like BLAS-1 Scal does not have lda, BLAS-2 GEMV does not have ldb, ldc;
+// That is fine. These testers & routines will leave untouched members alone.
+// Do not use std::tuple to directly pass parameters to testers
+// by std:tuple, you have unpack it with extreme care for each one by like "std::get<0>" which is
+// not intuitive and error-prone
+
+Arguments setup_gemm_ex_arguments(gemm_ex_tuple tup)
+{
+    vector<int> matrix_size                  = std::get<0>(tup);
+    vector<double> alpha_beta                = std::get<1>(tup);
+    vector<char> transA_transB               = std::get<2>(tup);
+    vector<hipblasDatatype_t> precision_types = std::get<3>(tup);
+
+    Arguments arg;
+
+    // see the comments about matrix_size_range above
+    arg.M   = matrix_size[0];
+    arg.N   = matrix_size[1];
+    arg.K   = matrix_size[2];
+    arg.lda = matrix_size[3];
+    arg.ldb = matrix_size[4];
+    arg.ldc = matrix_size[5];
+
+    // the first element of alpha_beta_range is always alpha, and the second is always beta
+    arg.alpha = alpha_beta[0];
+    arg.beta  = alpha_beta[1];
+
+    arg.transA_option = transA_transB[0];
+    arg.transB_option = transA_transB[1];
+
+    arg.timing = 0;
+
+    arg.a_type       = precision_types[0];
+    arg.b_type       = precision_types[1];
+    arg.c_type       = precision_types[2];
+    arg.compute_type = precision_types[4];
+
+    return arg;
+}
+
+class parameterized_gemm_ex : public ::TestWithParam<gemm_ex_tuple>
+{
+    protected:
+    parameterized_gemm_ex() {}
+    virtual ~parameterized_gemm_ex() {}
+    virtual void SetUp() {}
+    virtual void TearDown() {}
+};
+
+TEST_P(parameterized_gemm_ex, standard)
+{
+    // GetParam return a tuple. Tee setup routine unpack the tuple
+    // and initializes arg(Arguments) which will be passed to testing routine
+    // The Arguments data struture have physical meaning associated.
+    // while the tuple is non-intuitive.
+
+    Arguments arg = setup_gemm_ex_arguments(GetParam());
+
+    hipblasStatus_t status = testing_gemm_ex(arg);
+
+    // if not success, then the input argument is problematic, so detect the error message
+    if(status != HIPBLAS_STATUS_SUCCESS)
+    {
+        if(arg.M < 0 || arg.N < 0 || arg.K < 0)
+        {
+            EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
+        }
+        else if(arg.transA_option == 'N' ? arg.lda < arg.M : arg.lda < arg.K)
+        {
+            EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
+        }
+        else if(arg.transB_option == 'N' ? arg.ldb < arg.K : arg.ldb < arg.N)
+        {
+            EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
+        }
+    }
+}
+
+class parameterized_chunk_gemm_ex : public ::TestWithParam<gemm_ex_tuple>
+{
+    protected:
+    parameterized_chunk_gemm_ex() {}
+    virtual ~parameterized_chunk_gemm_ex() {}
+    virtual void SetUp() {}
+    virtual void TearDown() {}
+};
+
+TEST_P(parameterized_chunk_gemm_ex, float)
+{
+    // GetParam return a tuple. Tee setup routine unpack the tuple
+    // and initializes arg(Arguments) which will be passed to testing routine
+    // The Arguments data struture have physical meaning associated.
+    // while the tuple is non-intuitive.
+
+    Arguments arg = setup_gemm_ex_arguments(GetParam());
+
+    hipblasStatus_t status = testing_gemm_ex(arg);
+
+    // if not success, then the input argument is problematic, so detect the error message
+    if(status != HIPBLAS_STATUS_SUCCESS)
+    {
+        if(arg.M < 0 || arg.N < 0 || arg.K < 0)
+        {
+            EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
+        }
+        else if(arg.transA_option == 'N' ? arg.lda < arg.M : arg.lda < arg.K)
+        {
+            EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
+        }
+        else if(arg.transB_option == 'N' ? arg.ldb < arg.K : arg.ldb < arg.N)
+        {
+            EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
+        }
+    }
+}
+
+class parameterized_half_gemm_ex : public ::TestWithParam<gemm_ex_tuple>
+{
+    protected:
+    parameterized_half_gemm_ex() {}
+    virtual ~parameterized_half_gemm_ex() {}
+    virtual void SetUp() {}
+    virtual void TearDown() {}
+};
+
+//TEST(pre_checkin_blas_ex_bad_arg, float) { testing_gemm_ex_bad_arg(); }
+
+//----small
+INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_hpa_half,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(small_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_hpa_half)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_half,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(small_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_half)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_single,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(small_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_single)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_double,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(small_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_double)));
+//----medium
+INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_hpa_half,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(medium_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_hpa_half)));
+
+INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_half,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(medium_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_half)));
+
+INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_float,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(medium_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_single)));
+
+INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_double,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(medium_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_double)));
diff --git a/clients/include/testing_gemm_ex.hpp b/clients/include/testing_gemm_ex.hpp
new file mode 100644
index 000000000..c407394f4
--- /dev/null
+++ b/clients/include/testing_gemm_ex.hpp
@@ -0,0 +1,397 @@
+/* ************************************************************************
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * ************************************************************************ */
+
+#include <sys/time.h>
+#include <stdlib.h>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <limits>
+
+#include "hipblas.hpp"
+#include "arg_check.h"
+#include "hipblas_unique_ptr.hpp"
+#include "utility.h"
+#include "cblas_interface.h"
+#include "norm.h"
+#include "unit.h"
+#include "flops.h"
+#include <typeinfo>
+
+using namespace std;
+
+/* ============================================================================================ */
+
+template <typename Td, typename Tc>
+hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
+                                        hipblasOperation_t transB,
+                                        int M,
+                                        int N,
+                                        int K,
+                                        float alpha_float,
+                                        int lda,
+                                        int ldb,
+                                        float beta_float,
+                                        int ldc,
+                                        int norm_check,
+                                        int unit_check,
+                                        hipblasDatatype_t a_type,
+                                        hipblasDatatype_t b_type,
+                                        hipblasDatatype_t c_type,
+                                        hipblasDatatype_t compute_type)
+{
+    hipblasGemmAlgo_t algo  = HIPBLAS_GEMM_DEFAULT;
+    uint32_t solution_index = 0;
+    uint32_t flags          = 0;
+    size_t* workspace_size  = 0;
+    void* workspace         = 0;
+
+    Td h_alpha_Td;
+    Td h_beta_Td;
+
+    if(is_same<Td, hipblasHalf>::value)
+    {
+        h_alpha_Td = float_to_half(alpha_float);
+        h_beta_Td  = float_to_half(beta_float);
+    }
+    else if(is_same<Td, float>::value)
+    {
+        h_alpha_Td = static_cast<Td>(alpha_float);
+        h_beta_Td  = static_cast<Td>(beta_float);
+    }
+    else if(is_same<Td, double>::value)
+    {
+        h_alpha_Td = static_cast<Td>(alpha_float);
+        h_beta_Td  = static_cast<Td>(beta_float);
+    }
+    else
+    {
+        return HIPBLAS_STATUS_NOT_SUPPORTED;
+    }
+
+    Tc h_alpha_Tc;
+    Tc h_beta_Tc;
+
+    if(is_same<Tc, hipblasHalf>::value)
+    {
+        h_alpha_Tc = float_to_half(alpha_float);
+        h_beta_Tc  = float_to_half(beta_float);
+    }
+    else if(is_same<Tc, float>::value)
+    {
+        h_alpha_Tc = static_cast<Tc>(alpha_float);
+        h_beta_Tc  = static_cast<Tc>(beta_float);
+    }
+    else if(is_same<Tc, double>::value)
+    {
+        h_alpha_Tc = static_cast<Tc>(alpha_float);
+        h_beta_Tc  = static_cast<Tc>(beta_float);
+    }
+    else
+    {
+        return HIPBLAS_STATUS_NOT_SUPPORTED;
+    }
+
+    hipblasHandle_t handle;
+    hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS;
+    hipblasCreate(&handle);
+
+    int A_row = transA == HIPBLAS_OP_N ? M : K;
+    int A_col = transA == HIPBLAS_OP_N ? K : M;
+    int B_row = transB == HIPBLAS_OP_N ? K : N;
+    int B_col = transB == HIPBLAS_OP_N ? N : K;
+
+    // check here to prevent undefined memory allocation error
+    if(M < 0 || N < 0 || K < 0 || lda < A_row || ldb < B_row || ldc < M)
+    {
+        return HIPBLAS_STATUS_INVALID_VALUE;
+    }
+
+    const size_t size_A = static_cast<size_t>(lda) * static_cast<size_t>(A_col);
+    const size_t size_B = static_cast<size_t>(ldb) * static_cast<size_t>(B_col);
+    const size_t size_C = static_cast<size_t>(ldc) * static_cast<size_t>(N);
+
+    // allocate memory on device
+//  auto dA_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Td) * size_A),
+//                                       rocblas_test::device_free};
+//  auto dB_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Td) * size_B),
+//                                       rocblas_test::device_free};
+//  auto dC_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Td) * size_C),
+//                                       rocblas_test::device_free};
+//  auto dD_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Td) * size_D),
+//                                       rocblas_test::device_free};
+//  auto d_alpha_Tc_managed =
+//      rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Tc)), rocblas_test::device_free};
+//  auto d_beta_Tc_managed =
+//      rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(Tc)), rocblas_test::device_free};
+//  Td* dA         = (Td*)dA_managed.get();
+//  Td* dB         = (Td*)dB_managed.get();
+//  Td* dC         = (Td*)dC_managed.get();
+//  Td* dD         = (Td*)dD_managed.get();
+//  Tc* d_alpha_Tc = (Tc*)d_alpha_Tc_managed.get();
+//  Tc* d_beta_Tc  = (Tc*)d_beta_Tc_managed.get();
+
+    Td *dA, *dB, *dC;
+    Tc *d_alpha_Tc, *d_beta_Tc;
+
+    CHECK_HIP_ERROR(hipMalloc(&dA, size_A * sizeof(Td)));
+    CHECK_HIP_ERROR(hipMalloc(&dB, size_B * sizeof(Td)));
+    CHECK_HIP_ERROR(hipMalloc(&dC, size_C * sizeof(Td)));
+
+    CHECK_HIP_ERROR(hipMalloc(&d_alpha_Tc, sizeof(Td)));
+    CHECK_HIP_ERROR(hipMalloc(&d_beta_Tc, sizeof(Td)));
+
+    if(!dA || !dB || !dC || !d_alpha_Tc || !d_beta_Tc)
+    {
+        PRINT_IF_HIP_ERROR(hipErrorOutOfMemory);
+        return HIPBLAS_STATUS_ALLOC_FAILED;
+    }
+
+    // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory
+    vector<Td> hA(size_A);
+    vector<Td> hB(size_B);
+    vector<Td> hC(size_C);
+    vector<Td> hC_gold(size_C);
+
+    // Initial Data on CPU
+    srand(1);
+    hipblas_init<Td>(hA, A_row, A_col, lda);
+    hipblas_init_alternating_sign<Td>(hB, B_row, B_col, ldb);
+    hipblas_init<Td>(hC, M, N, ldc);
+
+    //  if(is_same<Td, hipblasHalf>::value)
+    //  {
+    //      std::cout << "----A-----------------" << std::endl;
+    //      for(int i = 0; i < size_A; i++){ cout << half_to_float(hA[i]) << "  "; }
+    //      std::cout << std::endl << "-----B-----------------" << std::endl;
+    //      for(int i = 0; i < size_B; i++){ cout << half_to_float(hB[i]) << "  "; }
+    //      std::cout << std::endl << "-----C-----------------" << std::endl;
+    //      for(int i = 0; i < size_C; i++){ cout << half_to_float(hC[i]) << "  "; }
+    //      std::cout << std::endl << "-----D-----------------" << std::endl;
+    //      for(int i = 0; i < size_D; i++){ cout << half_to_float(hD_1[i]) << "  "; }
+    //      std::cout << std::endl << "-----------------------" << std::endl;
+    //  }
+    //  else
+    //  {
+    //      std::cout << "----A-----------------" << std::endl;
+    //      for(int i = 0; i < size_A; i++){ cout << hA[i] << "  "; }
+    //      std::cout << std::endl << "-----B-----------------" << std::endl;
+    //      for(int i = 0; i < size_B; i++){ cout << hB[i] << "  "; }
+    //      std::cout << std::endl << "-----C-----------------" << std::endl;
+    //      for(int i = 0; i < size_C; i++){ cout << hC[i] << "  "; }
+    //      std::cout << std::endl << "-----D-----------------" << std::endl;
+    //      for(int i = 0; i < size_D; i++){ cout << hD_1[i] << "  "; }
+    //      std::cout << std::endl << "-----------------------" << std::endl;
+    //  }
+
+    hC_gold = hC;
+
+    // copy data from CPU to device
+    CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(Td) * size_A, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(Td) * size_B, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dC, hC.data(), sizeof(Td) * size_C, hipMemcpyHostToDevice));
+
+    status = hipblasGemmEx(handle,
+                           transA,
+                           transB,
+                           M,
+                           N,
+                           K,
+                           &h_alpha_Tc,
+                           dA,
+                           a_type,
+                           lda,
+                           dB,
+                           b_type,
+                           ldb,
+                           &h_beta_Tc,
+                           dC,
+                           c_type,
+                           ldc,
+                           compute_type,
+                           algo);
+
+    CHECK_HIP_ERROR(hipMemcpy(hC.data(), dC, sizeof(Td) * size_C, hipMemcpyDeviceToHost));
+
+    //      std::cout << std::endl << "-----hD_1---------------------------------------" <<
+    //      std::endl;
+    //      if(is_same<Td, hipblasHalf>::value)
+    //      {
+    //                  for(int i = 0; i < size_D; i++){ cout << half_to_float(hD_1[i]) << "  ";
+    //                  }
+    //      }
+    //      else
+    //      {
+    //                  for(int i = 0; i < size_D; i++){ cout << hD_1[i] << "  "; }
+    //      }
+    //      std::cout << std::endl << "------------------------------------------------" <<
+    //      std::endl;
+
+
+    // CPU BLAS
+
+    cblas_gemm<Td>(transA,
+                   transB,
+                   M,
+                   N,
+                   K,
+                   h_alpha_Td,
+                   hA.data(),
+                   lda,
+                   hB.data(),
+                   ldb,
+                   h_beta_Td,
+                   hC_gold.data(),
+                   ldc);
+
+//      std::cout << std::endl << "---gold---gold---gold---------------------" << std::endl;
+//      if(is_same<Td, hipblasHalf>::value)
+//      {
+//          for(int i = 0; i < size_D; i++){ std::cout << half_to_float(hD_gold[i]) << "  "; }
+//      }
+//      else
+//      {
+//          for(int i = 0; i < size_D; i++){ std::cout << hD_gold[i] << "  "; }
+//      }
+//      std::cout << std::endl << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl;
+
+#ifndef NDEBUG
+// print_matrix(hC_gold, hC, min(M, 3), min(N, 3), ldc);
+#endif
+
+    // enable unit check, notice unit check is not invasive, but norm check is,
+    // unit check and norm check can not be interchanged their order
+    if(unit_check)
+    {
+        unit_check_general<Td>(M, N, ldc, hC_gold.data(), hC.data());
+    }
+
+    hipblasDestroy(handle);
+    CHECK_HIP_ERROR(hipFree(dA));
+    CHECK_HIP_ERROR(hipFree(dB));
+    CHECK_HIP_ERROR(hipFree(dC));
+
+    CHECK_HIP_ERROR(hipFree(d_alpha_Tc));
+    CHECK_HIP_ERROR(hipFree(d_beta_Tc));
+
+    return status;
+}
+
+hipblasStatus_t testing_gemm_ex(Arguments argus)
+{
+    hipblasOperation_t transA = char2hipblas_operation(argus.transA_option);
+    hipblasOperation_t transB = char2hipblas_operation(argus.transB_option);
+
+    int M = argus.M;
+    int N = argus.N;
+    int K = argus.K;
+
+    int lda = argus.lda;
+    int ldb = argus.ldb;
+    int ldc = argus.ldc;
+
+    hipblasDatatype_t a_type       = argus.a_type;
+    hipblasDatatype_t b_type       = argus.b_type;
+    hipblasDatatype_t c_type       = argus.c_type;
+    hipblasDatatype_t compute_type = argus.compute_type;
+
+    float alpha = argus.alpha;
+    float beta  = argus.beta;
+
+    hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS;
+
+    int norm_check = argus.norm_check;
+    int unit_check = argus.unit_check;
+
+    if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F &&
+       c_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F &&
+       compute_type == HIPBLAS_R_16F)
+    {
+        status = testing_gemm_ex_template<hipblasHalf, hipblasHalf>(transA,
+                                                                    transB,
+                                                                    M,
+                                                                    N,
+                                                                    K,
+                                                                    alpha,
+                                                                    lda,
+                                                                    ldb,
+                                                                    beta,
+                                                                    ldc,
+                                                                    norm_check,
+                                                                    unit_check,
+                                                                    a_type,
+                                                                    b_type,
+                                                                    c_type,
+                                                                    compute_type);
+    }
+    else if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F &&
+            c_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F &&
+            compute_type == HIPBLAS_R_32F)
+    {
+        status = testing_gemm_ex_template<hipblasHalf, float>(transA,
+                                                             transB,
+                                                             M,
+                                                             N,
+                                                             K,
+                                                             alpha,
+                                                             lda,
+                                                             ldb,
+                                                             beta,
+                                                             ldc,
+                                                             norm_check,
+                                                             unit_check,
+                                                             a_type,
+                                                             b_type,
+                                                             c_type,
+                                                             compute_type);
+    }
+    else if(a_type == HIPBLAS_R_32F && b_type == HIPBLAS_R_32F &&
+            c_type == HIPBLAS_R_32F && c_type == HIPBLAS_R_32F &&
+            compute_type == HIPBLAS_R_32F)
+    {
+        status = testing_gemm_ex_template<float, float>(transA,
+                                                      transB,
+                                                      M,
+                                                      N,
+                                                      K,
+                                                      alpha,
+                                                      lda,
+                                                      ldb,
+                                                      beta,
+                                                      ldc,
+                                                      norm_check,
+                                                      unit_check,
+                                                      a_type,
+                                                      b_type,
+                                                      c_type,
+                                                      compute_type);
+    }
+    else if(a_type == HIPBLAS_R_64F && b_type == HIPBLAS_R_64F &&
+            c_type == HIPBLAS_R_64F && c_type == HIPBLAS_R_64F &&
+            compute_type == HIPBLAS_R_64F)
+    {
+        status = testing_gemm_ex_template<double, double>(transA,
+                                                        transB,
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        alpha,
+                                                        lda,
+                                                        ldb,
+                                                        beta,
+                                                        ldc,
+                                                        norm_check,
+                                                        unit_check,
+                                                        a_type,
+                                                        b_type,
+                                                        c_type,
+                                                        compute_type);
+    }
+    else
+    {
+        status = HIPBLAS_STATUS_NOT_SUPPORTED;
+    }
+
+    return status;
+}
diff --git a/clients/include/utility.h b/clients/include/utility.h
index 2fdc764cb..51dd9d3f8 100644
--- a/clients/include/utility.h
+++ b/clients/include/utility.h
@@ -1,6 +1,5 @@
 /* ************************************************************************
  * Copyright 2016 Advanced Micro Devices, Inc.
- *
  * ************************************************************************ */
 
 #pragma once
@@ -12,6 +11,8 @@
 #include <vector>
 #include "hipblas.h"
 #include <sys/time.h>
+#include <immintrin.h>
+#include <typeinfo>
 
 using namespace std;
 
@@ -40,16 +41,62 @@ using namespace std;
         cout << endl;                                                            \
     }
 
-    /* ============================================================================================ */
-    /* generate random number :*/
+// Helper routine to convert floats into their half equivalent; uses F16C instructions
+inline hipblasHalf float_to_half(float val)
+{
+    // return static_cast<hipblasHalf>( _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( val ), 0 ) )
+    // );
+    const int zero = 0;
+    short unsigned int a;
+    a = _cvtss_sh(val, zero);
+//  return _cvtss_sh(val, zero);
+    return a;
+}
 
-    /*! \brief  generate a random number between [0, 0.999...] . */
-    template<typename T>
-    T random_generator(){
-        //return rand()/( (T)RAND_MAX + 1);
-        return (T)(rand() % 10 + 1); //generate a integer number between [1, 10]
-    };
+// Helper routine to convert halfs into their floats equivalent; uses F16C instructions
+inline float half_to_float(hipblasHalf val)
+{
+    // return static_cast<hipblasHalf>(_mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(val), 0)));
+    return _cvtsh_ss(val);
+}
+
+/* ============================================================================================ */
+/* generate random number :*/
+
+/*! \brief  generate a random number in range [1,2,3,4,5,6,7,8,9,10] */
+template <typename T>
+T random_generator()
+{
+    // return rand()/( (T)RAND_MAX + 1);
+    return (T)(rand() % 10 + 1);
+};
+
+// for hipblasHalf, generate float, and convert to hipblasHalf
+/*! \brief  generate a random number in range [1,2,3] */
+template <>
+inline hipblasHalf random_generator<hipblasHalf>()
+{
+    return float_to_half(
+        static_cast<float>((rand() % 3 + 1))); // generate a integer number in range [1,2,3]
+};
+
+/*! \brief  generate a random number in range [-1,-2,-3,-4,-5,-6,-7,-8,-9,-10] */
+template <typename T>
+T random_generator_negative()
+{
+    // return rand()/( (T)RAND_MAX + 1);
+    return -(T)(rand() % 10 + 1);
+};
+
+// for hipblasHalf, generate float, and convert to hipblasHalf
+/*! \brief  generate a random number in range [-1,-2,-3] */
+template <>
+inline hipblasHalf random_generator_negative<hipblasHalf>()
+{
+    return float_to_half(-static_cast<float>((rand() % 3 + 1)));
+};
 
+/* ============================================================================================ */
 
     /* ============================================================================================ */
     /*! \brief  matrix/vector initialization: */
@@ -64,6 +111,66 @@ using namespace std;
         }
     };
 
+    template <typename T>
+    void hipblas_init_alternating_sign(vector<T>& A, int M, int N, int lda)
+    {
+        // Initialize matrix so adjacent entries have alternating sign.
+        // In gemm if either A or B are initialized with alernating
+        // sign the reduction sum will be summing positive
+        // and negative numbers, so it should not get too large.
+        // This helps reduce floating point inaccuracies for 16bit
+        // arithmetic where the exponent has only 5 bits, and the
+        // mantissa 10 bits.
+        for(int i = 0; i < M; ++i)
+        {
+            for(int j = 0; j < N; ++j)
+            {
+                if(j % 2 ^ i % 2)
+                {
+                    A[i + j * lda] = random_generator<T>();
+                }
+                else
+                {
+                    A[i + j * lda] = random_generator_negative<T>();
+                }
+            }
+        }
+    };
+
+    template <typename T>
+    void hipblas_init_alternating_sign(vector<T>& A,
+                                       int M,
+                                       int N,
+                                       int lda,
+                                       int stride,
+                                       int batch_count)
+    {
+        // Initialize matrix so adjacent entries have alternating sign.
+        // In gemm if either A or B are initialized with alernating
+        // sign the reduction sum will be summing positive
+        // and negative numbers, so it should not get too large.
+        // This helps reduce floating point inaccuracies for 16bit
+        // arithmetic where the exponent has only 5 bits, and the
+        // mantissa 10 bits.
+        for(int i_batch = 0; i_batch < batch_count; i_batch++)
+        {
+            for(int i = 0; i < M; ++i)
+            {
+                for(int j = 0; j < N; ++j)
+                {
+                    if(j % 2 ^ i % 2)
+                    {
+                        A[i + j * lda + i_batch * stride] = random_generator<T>();
+                    }
+                    else
+                    {
+                        A[i + j * lda + i_batch * stride] = random_generator_negative<T>();
+                    }
+                }
+            }
+        }
+    };
+
     template<typename T>
     void hipblas_init(T* A, int M, int N, int lda){
         for (int i = 0; i < M; ++i){
@@ -192,6 +299,11 @@ class Arguments {
     int ldb = 128;
     int ldc = 128;
 
+    hipblasDatatype_t a_type = HIPBLAS_R_32F;
+    hipblasDatatype_t b_type = HIPBLAS_R_32F;
+    hipblasDatatype_t c_type = HIPBLAS_R_32F;
+    hipblasDatatype_t compute_type = HIPBLAS_R_32F;
+
     int incx = 1 ;
     int incy = 1 ;
     int incd = 1 ;
diff --git a/library/include/hipblas.h b/library/include/hipblas.h
index 22dccc160..d4f8c7e5f 100644
--- a/library/include/hipblas.h
+++ b/library/include/hipblas.h
@@ -62,6 +62,19 @@ enum hipblasSideMode_t {
     HIPBLAS_SIDE_BOTH   = 143
 };
 
+enum hipblasDatatype_t {
+    HIPBLAS_R_16F = 150,
+    HIPBLAS_R_32F = 151,
+    HIPBLAS_R_64F = 152,
+    HIPBLAS_C_16F = 153,
+    HIPBLAS_C_32F = 154,
+    HIPBLAS_C_64F = 155
+};
+
+enum hipblasGemmAlgo_t {
+    HIPBLAS_GEMM_DEFAULT = 160
+};
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -215,8 +228,21 @@ HIPBLAS_EXPORT hipblasStatus_t hipblasDgemmBatched(hipblasHandle_t handle,
         const double *beta,
         double * C[], int ldc, int batchCount);
 
+HIPBLAS_EXPORT hipblasStatus_t hipblasGemmEx(
+        hipblasHandle_t handle,
+        hipblasOperation_t trans_a, 
+        hipblasOperation_t trans_b,
+        int m, 
+        int n, 
+        int k, 
+        const void* alpha,
+        const void* a, hipblasDatatype_t a_type, int lda,
+        const void* b, hipblasDatatype_t b_type, int ldb, const void* beta,
+              void* c, hipblasDatatype_t c_type, int ldc,
+        hipblasDatatype_t compute_type,
+        hipblasGemmAlgo_t algo);
 
-/* not implementes, requires complex support
+/* not implemented, requires complex support
 hipblasStatus_t hipblasCgemm(hipblasHandle_t handle,  hipblasOperation_t transa, hipblasOperation_t transb,
                            int m, int n, int k,  const hipComplex *alpha, const hipComplex *A, int lda, const hipComplex *B, int ldb, const hipComplex *beta, hipComplex *C, int ldc);
 
diff --git a/library/src/hcc_detail/hipblas.cpp b/library/src/hcc_detail/hipblas.cpp
index 4289a676a..a9c7fc123 100644
--- a/library/src/hcc_detail/hipblas.cpp
+++ b/library/src/hcc_detail/hipblas.cpp
@@ -125,7 +125,6 @@ hipblasSideMode_t HCCSideToHIPSide( rocblas_side_ side)
 	}
 }
 
-
 rocblas_pointer_mode HIPPointerModeToRocblasPointerMode( hipblasPointerMode_t mode)
 {
     switch (mode)
@@ -141,7 +140,6 @@ rocblas_pointer_mode HIPPointerModeToRocblasPointerMode( hipblasPointerMode_t mo
     }
 }
 
-
 hipblasPointerMode_t RocblasPointerModeToHIPPointerMode( rocblas_pointer_mode mode)
 {
     switch (mode)
@@ -157,7 +155,83 @@ hipblasPointerMode_t RocblasPointerModeToHIPPointerMode( rocblas_pointer_mode mo
     }
 }
 
+rocblas_datatype HIPDatatypeToRocblasDatatype ( hipblasDatatype_t type)
+{
+    switch (type)
+    {
+        case HIPBLAS_R_16F:
+            return rocblas_datatype_f16_r ;
+
+        case HIPBLAS_R_32F:
+            return rocblas_datatype_f32_r ;
+
+        case HIPBLAS_R_64F:
+            return rocblas_datatype_f64_r ;
+
+        case HIPBLAS_C_16F:
+            return rocblas_datatype_f16_c ;
+
+        case HIPBLAS_C_32F:
+            return rocblas_datatype_f32_c ;
+
+        case HIPBLAS_C_64F:
+            return rocblas_datatype_f64_c ;
+
+        default:
+            throw "Non existant DataType";
+    }
+}
+
+hipblasDatatype_t RocblasDatatypeToHIPDatatype( rocblas_datatype type)
+{
+    switch (type)
+    {
+        case rocblas_datatype_f16_r :
+            return HIPBLAS_R_16F;
+
+        case rocblas_datatype_f32_r :
+            return HIPBLAS_R_32F;
+
+        case rocblas_datatype_f64_r :
+            return HIPBLAS_R_64F;
+
+        case rocblas_datatype_f16_c :
+            return HIPBLAS_C_16F;
+
+        case rocblas_datatype_f32_c :
+            return HIPBLAS_C_32F;
+
+        case rocblas_datatype_f64_c :
+            return HIPBLAS_C_64F;
+
+        default:
+            throw "Non existant DataType";
+    }
+}
+
+rocblas_gemm_algo HIPGemmAlgoToRocblasGemmAlgo( hipblasGemmAlgo_t algo)
+{
+    switch (algo)
+    {
+        case HIPBLAS_GEMM_DEFAULT:
+            return rocblas_gemm_algo_standard;
+
+        default:
+            throw "Non existant GemmAlgo";
+    }
+}
 
+hipblasGemmAlgo_t RocblasGemmAlgoToHIPGemmAlgo ( rocblas_gemm_algo algo)
+{
+    switch (algo)
+    {
+        case rocblas_gemm_algo_standard:
+            return HIPBLAS_GEMM_DEFAULT;
+
+        default:
+            throw "Non existant GemmAlgo";
+    }
+}
 
 hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status_ error)
 {
@@ -709,3 +783,33 @@ hipblasDgemmBatched(hipblasHandle_t handle,
         transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchCount);
 }
 
+extern "C"
+hipblasStatus_t
+hipblasGemmEx(hipblasHandle_t handle,
+        hipblasOperation_t transa, hipblasOperation_t transb,
+        int m, int n, int k,  const void *alpha,
+        const void * A, hipblasDatatype_t a_type, int lda,
+        const void * B, hipblasDatatype_t b_type, int ldb, const void * beta,
+              void * C, hipblasDatatype_t c_type, int ldc,
+                          hipblasDatatype_t compute_type,
+        hipblasGemmAlgo_t algo)
+{
+    uint32_t solution_index = 0;
+
+    uint32_t flags = 0;
+
+    size_t* workspace_size = 0;
+
+    void* workspace = 0;
+
+	return rocBLASStatusToHIPStatus(rocblas_gemm_ex((rocblas_handle)handle,
+                hipOperationToHCCOperation(transa),  hipOperationToHCCOperation(transb),
+                m, n, k, alpha,
+                A, HIPDatatypeToRocblasDatatype(a_type), lda,
+                B, HIPDatatypeToRocblasDatatype(b_type), ldb, beta,
+                C, HIPDatatypeToRocblasDatatype(c_type), ldc,
+                C, HIPDatatypeToRocblasDatatype(c_type), ldc,
+                   HIPDatatypeToRocblasDatatype(compute_type),
+                   HIPGemmAlgoToRocblasGemmAlgo(algo),
+                solution_index, flags, workspace_size, workspace));
+}