From 8b2299f7a6880ad71272a6cd6f012d1a631368ac Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 30 Jan 2023 09:30:46 -0800 Subject: [PATCH 1/7] Use the unified interface --- sycl/include/CL/__spirv/spirv_ops.hpp | 15 ++++---- .../sycl/ext/oneapi/matrix/matrix-intel.hpp | 36 ++++++++++++++++--- .../sycl/ext/oneapi/matrix/matrix-unified.hpp | 14 +++++--- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index aa07bda9f4282..85de5a2774130 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -24,10 +24,11 @@ #ifdef __SYCL_DEVICE_ONLY__ #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1) -template -extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * +extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * __spirv_JointMatrixLoadINTEL(T *Ptr, std::size_t Stride, __spv::MatrixLayout Layout = L, __spv::Scope::Flag Sc = S, int MemOperand = 0); @@ -107,18 +108,20 @@ template *); -template -extern SYCL_EXTERNAL T __spirv_VectorExtractDynamic( +extern SYCL_EXTERNAL Ts __spirv_VectorExtractDynamic( __spv::__spirv_JointMatrixINTEL *, size_t i); -template extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL *, - T val, size_t i); + Ts val, size_t i); #else template struct joint_matrix; +// Differentiating between the "element type" and the "storage element type" +template struct helper_traits { + using element_type = T; + using storage_element_type = T; + using fill_argument_type = T; +}; + +template <> struct helper_traits { + using element_type = precision::tf32; + using storage_element_type = float; + using fill_argument_type = float; +}; + template class wi_element { @@ -72,12 +85,19 @@ class wi_element { std::size_t idx; public: + using storage_element_type = typename helper_traits::storage_element_type; wi_element(joint_matrix &Mat, std::size_t i) : M(Mat), idx(i) {} - operator T() { + operator storage_element_type() { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx); + storage_element_type elem = + __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(M.spvm, + idx); + return elem; #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -95,7 +115,8 @@ class wi_element { template wi_element &operator=(const T2 &rhs) { #ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic(M.spvm, static_cast(rhs), idx); + M.spvm = __spirv_VectorInsertDynamic( + M.spvm, static_cast(rhs), idx); return *this; #else (void)rhs; @@ -122,8 +143,13 @@ class wi_element { template wi_element &operator op##=(const T2 &rhs) { \ M.spvm = __spirv_VectorInsertDynamic( \ M.spvm, \ - static_cast(__spirv_VectorExtractDynamic(M.spvm, idx) \ - op static_cast(rhs)), \ + static_cast( \ + __spirv_VectorExtractDynamic< \ + storage_element_type, T, NumRows, NumCols, \ + spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(M.spvm, idx) \ + op static_cast(rhs)), \ idx); \ return *this; \ } diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index 1303ac46df7b9..bae5a7af5abc4 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -68,6 +68,12 @@ class wi_data { return (jm.cuda_impl.wi_marray[i]); #else return wi_element(jm, i); + 
/*using storage_element_type = typename + helper_traits::storage_element_type; storage_element_type elems = + __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(jm.spvm, i); return elems;*/ #endif }; };
@@ -157,21 +163,21 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load( assert(false && "Invalid Memory Layout!"); case layout::row_major: res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, S, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, stride, __spv::MatrixLayout::RowMajor, spv_scope_traits::value); break; case layout::col_major: res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, S, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, stride, __spv::MatrixLayout::ColumnMajor, spv_scope_traits::value); break; case sycl::ext::intel::experimental::matrix::layout::packed: res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, S, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, stride, __spv::MatrixLayout::Packed, spv_scope_traits::value);
@@ -210,7 +216,7 @@ joint_matrix_load(Group sg, #else T *Ptr = src.get(); res.spvm = - __spirv_JointMatrixLoadINTEL::value, spv_matrix_layout_traits::value>( Ptr, stride, spv_matrix_layout_traits::value,
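[Editorial note, not part of the patch series: PATCH 1 splits the user-visible "element type" from the "storage element type" so that precision::tf32, a tag type whose values are never materialized, can be stored, extracted, and filled as float. The trait pattern can be exercised standalone; the sketch below mirrors the names used in matrix-intel.hpp, but the static_assert scaffolding is illustrative only.]

#include <type_traits>

namespace precision {
class tf32 {
  tf32() = delete; // tag type: a tf32 value is never constructed
};
} // namespace precision

// Default case: the element type and the storage element type coincide.
template <typename T> struct helper_traits {
  using element_type = T;
  using storage_element_type = T;
  using fill_argument_type = T;
};

// tf32 is the exception: declared as tf32, but backed by float everywhere
// a work-item actually reads or writes an element.
template <> struct helper_traits<precision::tf32> {
  using element_type = precision::tf32;
  using storage_element_type = float;
  using fill_argument_type = float;
};

static_assert(std::is_same_v<
              helper_traits<precision::tf32>::storage_element_type, float>);
static_assert(std::is_same_v<helper_traits<int>::storage_element_type, int>);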
From 7663f28424bd55e225ffb783b3617ef2cb9cae36 Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 30 Jan 2023 09:33:53 -0800 Subject: [PATCH 2/7] add test --- sycl/test/matrix/matrix-tf32-test.cpp | 170 ++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 sycl/test/matrix/matrix-tf32-test.cpp
diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp new file mode 100644 index 0000000000000..5a4e91a36a4c9 --- /dev/null +++ b/sycl/test/matrix/matrix-tf32-test.cpp
@@ -0,0 +1,170 @@ +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out +#include <iostream> +#include <sycl/sycl.hpp> + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +auto constexpr SG_SZ = 16; + +#define TM 8 +#define TN SG_SZ +#define TK 16 + +template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +/*// This should be replaced with DPC++ and SPIR-V functions +float round_to_tf32(float a) { + uint32_t tmp_uint = reinterpret_cast<uint32_t &>(a); + tmp_uint += 0x1000u; // Round up the 13th last bit + tmp_uint &= 0xFFFFE000u; // Zero out the bottom 13 bits + float ret = reinterpret_cast<float &>(tmp_uint); + return ret; + }*/ + +template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A, + size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C, + size_t NUM_COLS_C> +void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C, + big_matrix<T2, NUM_ROWS_A, NUM_COLS_A> &A, + big_matrix<T2, NUM_ROWS_B, NUM_COLS_B> &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + // buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + // The matrix API has to be accessed by all the workitems in a + // subgroup; these functions will be called once by the subgroup, + // with no code divergence between the workitems. + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K; k += TK) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k, K); + // Assume we are already in VNNI format. + joint_matrix_load( + sg, sub_b, + accB.get_pointer() + (k) * (N) + sg_starty / SG_SZ * TN, N); + // If no rounding-to-tf32 function is called, the joint_matrix_mad + // function will work on truncated floats. + for (int i = 0; i < get_wi_data(sg, sub_a).length(); i++) { + get_wi_data(sg, sub_a)[i] = + round_to_tf32(get_wi_data(sg, sub_a)[i]); + } + for (int i = 0; i < get_wi_data(sg, sub_b).length(); i++) { + get_wi_data(sg, sub_b)[i] = + round_to_tf32(static_cast(get_wi_data(sg, sub_b)[i])); + } + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + auto wi_slice_a = get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + float elem = wi_slice_a[i]; + wi_slice_a[i] *= 2; + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +float A[MATRIX_M][MATRIX_K]; +float B[MATRIX_K][MATRIX_N]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(float *A_mem, float *B_mem, float *C_mem, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + float va = A_mem[m * K + k]; + float vb = B_mem[k * N + n]; + float acc = C_mem[m * N + n]; + C_mem[m * N + n] = va * vb; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = 1.0f * (i + j); + } + } + for (int i = 0; i < MATRIX_K / 2; i++) { + for (int j = 0; j < MATRIX_N * 2; j++) { + B[i][j] = 2.0f * i + 3.0f * j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1.0; + D[i][j] = 1.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((float *)&A); + big_matrix MB((float *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((float *)A, (float *)B, (float *)D, MATRIX_M, MATRIX_N, + MATRIX_K / 2); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; +}
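[Editorial note, not part of the patch series: the test above depends on TF32's layout — 1 sign bit, 8 exponent bits, 10 mantissa bits — so rounding a float to TF32 means deciding the 13th-from-last mantissa bit and clearing the low 13 bits. The sketch below reproduces the bit trick from the commented-out round_to_tf32 in PATCH 2, using std::memcpy instead of reinterpret_cast to stay clear of strict-aliasing rules; round_to_tf32_ref and the main() checks are my illustration, not the test's code, and this add-then-truncate scheme rounds ties upward rather than to-nearest-even as the CUDA __nvvm_f2tf32_rna path does.]

#include <cassert>
#include <cstdint>
#include <cstring>

float round_to_tf32_ref(float a) {
  uint32_t bits;
  std::memcpy(&bits, &a, sizeof(bits));
  bits += 0x1000u;     // round at the 13th-to-last mantissa bit
  bits &= 0xFFFFE000u; // zero the bottom 13 bits, leaving a 10-bit mantissa
  float r;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}

int main() {
  // 2^-11 lies past TF32's last mantissa bit (2^-10), so the value rounds up.
  float r = round_to_tf32_ref(1.0f + 0x1p-11f);
  assert(r == 1.0f + 0x1p-10f);
  uint32_t bits;
  std::memcpy(&bits, &r, sizeof(bits));
  assert((bits & 0x1FFFu) == 0); // low 13 bits are always zero after rounding
  return 0;
}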
From afd662bddb91323f1314f9527c026b4056a24bba Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 30 Jan 2023 09:36:52 -0800 Subject: [PATCH 3/7] update test --- sycl/test/matrix/matrix-tf32-test.cpp | 9 --------- 1 file changed, 9 deletions(-)
diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index 5a4e91a36a4c9..ad5ffe905065a 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp
@@ -21,15 +21,6 @@ template struct big_matrix { big_matrix(T *data) : mat(data) {} }; -/*// This should be replaced with DPC++ and SPIR-V functions -float round_to_tf32(float a) { - uint32_t tmp_uint = reinterpret_cast<uint32_t &>(a); - tmp_uint += 0x1000u; // Round up the 13th last bit - tmp_uint &= 0xFFFFE000u; // Zero out the bottom 13 bits - float ret = reinterpret_cast<float &>(tmp_uint); - return ret; - }*/ - template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A, size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C, size_t NUM_COLS_C>
From f8841b4abf21be24f49c63ef1fd2de37c8447f70 Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 30 Jan 2023 09:40:42 -0800 Subject: [PATCH 4/7] update test --- sycl/test/matrix/matrix-tf32-test.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index ad5ffe905065a..18af9610962ed 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp
@@ -34,7 +34,6 @@ void matrix_multiply(big_matrix &C, assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - // buffer bufA(A.get_data(), range<2>(M, K)); buffer bufA(A.get_data(), range<2>(M, K)); buffer bufB(B.get_data(), range<2>(K, N)); buffer bufC((float *)C.get_data(), range<2>(M, N));
@@ -59,9 +58,11 @@ void matrix_multiply(big_matrix &C, const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; + joint_matrix + sub_a; joint_matrix + layout::row_major> sub_b; joint_matrix sub_c; joint_matrix_load(sg, sub_c,
@@ -71,7 +72,6 @@ void matrix_multiply(big_matrix &C, for (int k = 0; k < K; k += TK) { joint_matrix_load( sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k, K); - // Assume we are already in VNNI format. joint_matrix_load( sg, sub_b, accB.get_pointer() + (k) * (N) + sg_starty / SG_SZ * TN, N);
@@ -115,7 +115,6 @@ void matrix_multiply_ref(float *A_mem, float *B_mem, float *C_mem, int M, int N, for (int k = 0; k < K; k++) { float va = A_mem[m * K + k]; float vb = B_mem[k * N + n]; - float acc = C_mem[m * N + n]; C_mem[m * N + n] = va * vb; } }
From 2f06dcafcad3b58a2b7a6cd2e2f60c49aa5cb2bb Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 31 Jan 2023 19:40:29 -0800 Subject: [PATCH 5/7] Address Bing's comments --- sycl/include/CL/__spirv/spirv_ops.hpp | 10 ++- .../sycl/ext/oneapi/matrix/matrix-intel.hpp | 83 +++++++++++++++---- .../ext/oneapi/matrix/matrix-tensorcores.hpp | 6 -- .../oneapi/matrix/matrix-unified-utils.hpp | 6 ++ .../sycl/ext/oneapi/matrix/matrix-unified.hpp | 21 ++--- sycl/test/matrix/matrix-tf32-test.cpp | 2 +- 6 files changed, 88 insertions(+), 40 deletions(-)
diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 85de5a2774130..bc7688c7874bf 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -33,11 +33,12 @@ __spirv_JointMatrixLoadINTEL(T *Ptr, std::size_t Stride, __spv::MatrixLayout Layout = L, __spv::Scope::Flag Sc = S, int MemOperand = 0); -template extern SYCL_EXTERNAL void __spirv_JointMatrixStoreINTEL( - T *Ptr, __spv::__spirv_JointMatrixINTEL *Object, + T *Ptr, __spv::__spirv_JointMatrixINTEL *Object, std::size_t Stride, __spv::MatrixLayout Layout = L, __spv::Scope::Flag Sc = S, int MemOperand = 0);
@@ -96,10 +97,11 @@ __spirv_JointMatrixSUMadINTEL( __spv::__spirv_JointMatrixINTEL *C, __spv::Scope::Flag Sc = __spv::Scope::Flag::Subgroup); -template -extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * +extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL *
__spirv_CompositeConstruct(const T v); template (0); + return __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>( + M.spvm, idx) != static_cast(0); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -129,7 +134,13 @@ class wi_element { operator=(const wi_element &rhs) { #ifdef __SYCL_DEVICE_ONLY__ M.spvm = __spirv_VectorInsertDynamic( - M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); + M.spvm, + __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(rhs.M.spvm, + rhs.idx), + idx); return *this; #else (void)rhs; @@ -183,7 +194,11 @@ class wi_element::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(M.spvm, idx); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -192,8 +207,13 @@ class wi_element(__spirv_VectorExtractDynamic( - M.spvm, idx))) >= std::numeric_limits::epsilon(); + return std::fabs(static_cast( + __spirv_VectorExtractDynamic< + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, + NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(M.spvm, idx))) >= + std::numeric_limits::epsilon(); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -215,7 +235,14 @@ class wi_element &rhs) { #ifdef __SYCL_DEVICE_ONLY__ M.spvm = __spirv_VectorInsertDynamic( - M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); + M.spvm, + __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(rhs.M.spvm, + rhs.idx), + idx); return *this; #else (void)rhs; @@ -228,7 +255,13 @@ class wi_element::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(M.spvm, idx) op rhs, \ + idx); \ return *this; \ } #else // __SYCL_DEVICE_ONLY__ @@ -251,13 +284,21 @@ class wi_element &lhs, \ const sycl::ext::oneapi::bfloat16 &rhs) { \ - return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs; \ + return __spirv_VectorExtractDynamic< \ + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, NumRows, \ + NumCols, spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(lhs.M.spvm, lhs.idx) op rhs; \ } \ friend type operator op( \ const sycl::ext::oneapi::bfloat16 &lhs, \ const wi_element &rhs) { \ - return __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx) op lhs; \ + return __spirv_VectorExtractDynamic< \ + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, NumRows, \ + NumCols, spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(rhs.M.spvm, rhs.idx) op lhs; \ } OP(sycl::ext::oneapi::bfloat16, +) OP(sycl::ext::oneapi::bfloat16, -) @@ -269,15 +310,25 @@ class wi_element &lhs, \ const sycl::ext::oneapi::bfloat16 &rhs) { \ - return type{static_cast(__spirv_VectorExtractDynamic( \ - lhs.M.spvm, lhs.idx)) op static_cast(rhs)}; \ + return type{static_cast( \ + __spirv_VectorExtractDynamic< \ + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, NumRows, \ + NumCols, spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(lhs.M.spvm, lhs.idx)) \ + op static_cast(rhs)}; \ } \ friend type operator op( \ const sycl::ext::oneapi::bfloat16 &lhs, \ const wi_element &rhs) { \ - return type{static_cast(__spirv_VectorExtractDynamic( \ - rhs.M.spvm, rhs.idx)) op static_cast(lhs)}; \ + 
return type{static_cast( \ + __spirv_VectorExtractDynamic< \ + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, NumRows, \ + NumCols, spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(rhs.M.spvm, rhs.idx)) \ + op static_cast(lhs)}; \ } OP(bool, ==) OP(bool, !=) @@ -322,7 +373,7 @@ class wi_element &src, + Group, Tp, Use, NumRows, NumCols, Layout> &src, multi_ptr dst, size_t stride) { #if defined(__SYCL_DEVICE_ONLY__) #if defined(__NVPTX__) @@ -347,7 +398,7 @@ joint_matrix_store(Group sg, #else // intel's impl T *Ptr = dst.get(); - __spirv_JointMatrixStoreINTEL::value, sycl::ext::oneapi::experimental::matrix:: diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp index a871b9709ae66..018c66cf5213d 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp @@ -18,12 +18,6 @@ namespace oneapi { namespace experimental { namespace matrix { -namespace precision { -class tf32 { - tf32() = delete; -}; -} // namespace precision - template struct joint_matrix; diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp index 718411b22ddbb..6f820fb82575a 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp @@ -18,6 +18,12 @@ enum class use { a, b, accumulator }; enum class layout { row_major = 0, col_major = 1, dynamic = 3 }; +namespace precision { +class tf32 { + tf32() = delete; +}; +} // namespace precision + } // namespace matrix } // namespace experimental } // namespace oneapi diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index bae5a7af5abc4..7e1c651e3eb04 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -68,12 +68,6 @@ class wi_data { return (jm.cuda_impl.wi_marray[i]); #else return wi_element(jm, i); - /*using storage_element_type = typename - helper_traits::storage_element_type; storage_element_type elems = - __spirv_VectorExtractDynamic::value, - spv_matrix_layout_traits::value, - spv_scope_traits::value>(jm.spvm, i); return elems;*/ #endif }; }; @@ -125,11 +119,12 @@ joint_matrix_fill(Group sg, std::ignore = sg; res.cuda_impl.wi_marray = v; #else + using storage_element_type = typename helper_traits::storage_element_type; res.spvm = - __spirv_CompositeConstruct::value, spv_matrix_layout_traits::value>( - static_cast(v)); + static_cast(v)); #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -253,21 +248,21 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( assert(false && "Invalid Memory Layout!"); case layout::row_major: __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, T, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, src.spvm, stride, __spv::MatrixLayout::RowMajor, spv_scope_traits::value); break; case layout::col_major: __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, T, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, src.spvm, stride, __spv::MatrixLayout::ColumnMajor, spv_scope_traits::value); break; case sycl::ext::intel::experimental::matrix::layout::packed: __spirv_JointMatrixStoreINTEL< - 
T, NumRows, NumCols, spv_matrix_use_traits::value, + T, T, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, src.spvm, stride, __spv::MatrixLayout::Packed, spv_scope_traits::value); @@ -339,12 +334,12 @@ inline __SYCL_ALWAYS_INLINE // This function rounds the bottom 13 bits up or down, and then zeros out the // bottom bits -inline __SYCL_ALWAYS_INLINE float round_to_tf32(float &a) { +inline __SYCL_ALWAYS_INLINE float round_to_tf32(const float &a) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) int32_t tmp_int = __nvvm_f2tf32_rna(a); return __nvvm_bitcast_i2f(tmp_int); #else - uint32_t tmp_uint = reinterpret_cast(a); + uint32_t tmp_uint = reinterpret_cast(a); tmp_uint += 0x1000u; tmp_uint &= 0xFFFFE000u; float ret = 0; diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index 18af9610962ed..5858038c40e80 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -83,7 +83,7 @@ void matrix_multiply(big_matrix &C, } for (int i = 0; i < get_wi_data(sg, sub_b).length(); i++) { get_wi_data(sg, sub_b)[i] = - round_to_tf32(static_cast(get_wi_data(sg, sub_b)[i])); + round_to_tf32(get_wi_data(sg, sub_b)[i]); } sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); } From 0201945ce4ddc48a4f2d7ef0096cc76398fcb3c9 Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 6 Feb 2023 11:11:11 -0800 Subject: [PATCH 6/7] add the spirv function for round_to_tf32 --- sycl/include/CL/__spirv/spirv_ops.hpp | 2 ++ sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index bc7688c7874bf..8518b37c4a43f 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -24,6 +24,8 @@ #ifdef __SYCL_DEVICE_ONLY__ #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1) +extern SYCL_EXTERNAL float __spirv_ConvertFToTF32INTEL(float a); + template (a); tmp_uint += 0x1000u; @@ -345,7 +349,7 @@ inline __SYCL_ALWAYS_INLINE float round_to_tf32(const float &a) { float ret = 0; std::memcpy(&ret, &tmp_uint, sizeof(float)); return ret; -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) } } // namespace matrix From 0538cac5c86a00b6ac594f041326f48a5c93545f Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 28 Feb 2023 09:43:21 -0800 Subject: [PATCH 7/7] correct the test --- sycl/test/matrix/matrix-tf32-test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index 5858038c40e80..e4c6daddf1624 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -115,7 +115,7 @@ void matrix_multiply_ref(float *A_mem, float *B_mem, float *C_mem, int M, int N, for (int k = 0; k < K; k++) { float va = A_mem[m * K + k]; float vb = B_mem[k * N + n]; - C_mem[m * N + n] = va * vb; + C_mem[m * N + n] += va * vb; } } } @@ -126,8 +126,8 @@ int main() { A[i][j] = 1.0f * (i + j); } } - for (int i = 0; i < MATRIX_K / 2; i++) { - for (int j = 0; j < MATRIX_N * 2; j++) { + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { B[i][j] = 2.0f * i + 3.0f * j; } } @@ -144,7 +144,7 @@ int main() { big_matrix MB((float *)&B); matrix_multiply(MC, MA, MB); matrix_multiply_ref((float *)A, (float *)B, (float *)D, MATRIX_M, MATRIX_N, - MATRIX_K / 2); + MATRIX_K); bool res = 
true; for (int i = 0; i < MATRIX_M; i++) {
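[Editorial note, not part of the patch series: PATCH 7 is a semantic fix, not a cleanup. The reference multiply must accumulate into C (+=) to match joint_matrix_mad, which computes D = A*B + C on top of the accumulator the test initializes to 1.0, and B must be initialized and consumed over the same K x N bounds the device kernel sees. A host-only sketch of the corrected reference loop; the const qualifiers are mine, not the test's exact signature.]

#include <cstddef>

// Reference GEMM with accumulate semantics, mirroring joint_matrix_mad
// (D = A * B + C).
void matrix_multiply_ref(const float *A_mem, const float *B_mem, float *C_mem,
                         int M, int N, int K) {
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++)
      for (int k = 0; k < K; k++)
        // accumulate, do not overwrite
        C_mem[m * N + n] += A_mem[m * K + k] * B_mem[k * N + n];
}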