|
| 1 | += sycl_ext_intel_matrix |
| 2 | + |
| 3 | +:source-highlighter: coderay |
| 4 | +:coderay-linenums-mode: table |
| 5 | + |
| 6 | +// This section needs to be after the document title. |
| 7 | +:doctype: book |
| 8 | +:toc2: |
| 9 | +:toc: left |
| 10 | +:encoding: utf-8 |
| 11 | +:lang: en |
| 12 | +:dpcpp: pass:[DPC++] |
| 13 | + |
| 14 | +// Set the default source code type in this document to C++, |
| 15 | +// for syntax highlighting purposes. This is needed because |
| 16 | +// docbook uses c++ and html5 uses cpp. |
| 17 | +:language: {basebackend@docbook:c++:cpp} |
| 18 | + |
| 19 | + |
| 20 | +== Notice |
| 21 | + |
| 22 | +Copyright (c) 2022-2023 Intel Corporation. All rights reserved. |
| 23 | + |
| 24 | +NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are |
| 25 | +trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. |
| 26 | +used by permission by Khronos. |
| 27 | + |
| 28 | +== Contact |
| 29 | + |
| 30 | +To report problems with this extension, please open a new issue at: |
| 31 | + |
| 32 | +https://github.com/intel/llvm/issues |
| 33 | + |
| 34 | +== Dependencies |
| 35 | + |
| 36 | +This extension is written against the SYCL 2020 revision 6 specification. All |
| 37 | +references below to the "core SYCL specification" or to section numbers in the |
| 38 | +SYCL specification refer to that revision. |
| 39 | + |
| 40 | +This extension also depends on the following other SYCL extensions: |
| 41 | + |
| 42 | +* link:sycl_ext_oneapi_matrix.asciidoc[sycl_ext_oneapi_matrix] |
| 43 | + |
| 44 | +== Status |
| 45 | +This is an experimental extension specification, intended to provide early |
| 46 | +access to features and gather community feedback. Interfaces defined in this |
| 47 | +specification are implemented in {dpcpp}, but they are not finalized and may |
| 48 | +change incompatibly in future versions of {dpcpp} without prior notice. |
| 49 | +*Shipping software products should not rely on APIs defined in this |
| 50 | +specification.* |
| 51 | + |
| 52 | +== Backend support status |
| 53 | +This document describes the extra features and details for the |
| 54 | +implementation of `joint_matrix` extension on Intel AMX and Intel |
| 55 | +XMX. |
| 56 | + |
| 57 | +The APIs in this extension may be used only on a device that has |
| 58 | +`aspect::ext_intel_matrix`. The application must check that the device |
| 59 | +has this aspect before submitting a kernel using any of the APIs in |
| 60 | +this extension. If the application fails to do this, the |
| 61 | +implementation throws a synchronous exception with the |
| 62 | +`errc::kernel_not_supported` error code when the kernel is submitted to |
| 63 | +the queue. |
| 64 | + |
| 65 | +== Overview |
| 66 | +This extension provides additional APIs related to the `joint_matrix` |
| 67 | +type that can be used only on Intel devices that have Intel AMX or |
| 68 | +Intel XMX technology. These Intel devices also support all of the |
| 69 | +generic matrix APIs specified in `sycl_ext_oneapi_matrix`, but |
| 70 | +applications can make use of the extended Intel specific APIs in this |
| 71 | +extension to gain additional performance and capabilities. |
| 72 | + |
| 73 | +== Specification |
| 74 | + |
| 75 | +=== Feature test macro |
| 76 | + |
| 77 | +This extension provides a feature-test macro as described in the core SYCL |
| 78 | +specification. An implementation supporting this extension must |
| 79 | +predefine the macro `SYCL_EXT_INTEL_MATRIX` to one of the values |
| 80 | +defined in the table below. Applications can test for the existence of |
| 81 | +this macro to determine if the implementation supports this feature, |
| 82 | +or applications can test the macro's value to determine which of the |
| 83 | +extension's APIs the implementation supports. |
| 84 | + |
| 85 | +[%header,cols="1,5"] |
| 86 | +|=== |
| 87 | +|Value |
| 88 | +|Description |
| 89 | + |
| 90 | +|1 |
| 91 | +|The APIs of this experimental extension are not versioned, so the |
| 92 | + feature-test macro always has this value. |
| 93 | +|=== |
| 94 | + |
| 95 | +=== New Aspect for Intel-Specific Matrix APIs |
| 96 | +This extension adds a new device aspect: |
| 97 | +```c++ |
| 98 | +namespace sycl { |
| 99 | + |
| 100 | +enum class aspect : /*unspecified*/ { |
| 101 | + ext_intel_matrix |
| 102 | +}; |
| 103 | + |
| 104 | +} // namespace sycl |
| 105 | +``` |
| 106 | +The `ext_intel_matrix` aspect indicates that the device is capable of |
| 107 | +using the extended joint matrix APIs that are defined in the sections |
| 108 | +that follow. |
| 109 | + |
| 110 | +=== New Layout Type |
| 111 | +This extension adds a new layout type named `ext_intel_packed` which |
| 112 | +an application can use to indicate that the matrix data is loaded or |
| 113 | +stored in VNNI "packed" format. |
| 114 | + |
| 115 | +```c++ |
| 116 | +namespace sycl::ext::oneapi::experimental::matrix { 
| 117 | + |
| 118 | +enum class layout { |
| 119 | + ext_intel_packed |
| 120 | +}; |
| 121 | + |
| 122 | +} // namespace sycl::ext::oneapi::experimental::matrix |
| 123 | +``` |
| 124 | + |
| 125 | +Consequently, the layout argument `layout` in `joint_matrix_load` can |
| 126 | +take `ext_intel_packed` as argument to specify that the data has |
| 127 | +already been transformed into VNNI format. In this case, the `stride` |
| 128 | +argument of `joint_matrix_load` describes the number of elements |
| 129 | +between consecutive rows for packed layouts. |
| 130 | + |
| 131 | +In order to get maximum performance on Intel AMX and Intel XMX, 
| 132 | +prepacking data in the memory is necessary. If the user does not 
| 133 | +specify the packed layout, transforms done by the implementation may 
| 134 | +be slow due to extra scatter/gather operations. Hence, we expose the 
| 135 | +`ext_intel_packed` layout to the user to specify that A or B have |
| 136 | +already been VNNIed. The packed or VNNI layout is introduced in the |
| 137 | +`VNNI layout` section below. |
| 138 | + |
| 139 | +=== Additional Store Operations |
| 140 | +Besides store of matrix `accumulator`, the Intel implementation allows |
| 141 | +store on matrix `a` and `b` as well. |
| 142 | + |
| 143 | +```c++ |
| 144 | +namespace sycl::ext::intel::experimental::matrix { |
| 145 | + |
| 146 | +template <typename Group, typename T, size_t Rows, size_t Cols, |
| 147 | + layout Layout, access::address_space Space, |
| 148 | + access::decorated IsDecorated> |
| 149 | +void joint_matrix_store(Group g, |
| 150 | + const joint_matrix<Group, T, use::a, Rows, Cols, Layout> &res, |
| 151 | +    multi_ptr<T, Space, IsDecorated> dst, size_t stride); 
| 152 | + |
| 153 | +template <typename Group, typename T, size_t Rows, size_t Cols, |
| 154 | + layout Layout, access::address_space Space, |
| 155 | + access::decorated IsDecorated> |
| 156 | +void joint_matrix_store(Group g, |
| 157 | + const joint_matrix<Group, T, use::b, Rows, Cols, Layout> &res, |
| 158 | +    multi_ptr<T, Space, IsDecorated> dst, size_t stride); 
| 159 | + |
| 160 | +} // namespace sycl::ext::intel::experimental::matrix |
| 161 | +``` |
| 162 | + |
| 163 | +=== Per-element Access with Coordinates |
| 164 | +The function `joint_matrix_apply` in `sycl_ext_oneapi_matrix` provides |
| 165 | +a way for the application to apply the same operation on every element |
| 166 | +of the matrix. However, some algorithms require the application to |
| 167 | +know the coordinates of each element as it operates on them. In this |
| 168 | +case, the joint matrix index must be known in order to reason about |
| 169 | +the matrix view and extract the relevant piece such as a sum of all |
| 170 | +elements in a row for example. For instance, quantization that is |
| 171 | +needed for conversion between low precision types like `int8_t` and `fp32` |
| 172 | +uses such logic. |
| 173 | + |
| 174 | +This extension adds a new form of the `joint_matrix_apply` function in |
| 175 | +the `sycl::ext::intel::experimental::matrix` namespace that allows the application 
| 176 | +to perform an operation on each element of the matrix. This function |
| 177 | +is similar to the form in `sycl_ext_oneapi_matrix`, but it also 
| 178 | +provides the matrix coordinates of each element to the callback |
| 179 | +function: |
| 180 | + |
| 181 | +```c++ |
| 182 | +namespace sycl::ext::intel::experimental::matrix { |
| 183 | + |
| 184 | +template<typename Group, typename T, use Use, size_t Rows, size_t |
| 185 | + Cols, layout Layout, typename F> |
| 186 | +void joint_matrix_apply(Group g, joint_matrix<Group, T, Use, Rows, |
| 187 | + Cols, Layout>& C, F&& func); |
| 188 | + |
| 189 | +} // namespace sycl::ext::intel::experimental::matrix |
| 190 | +``` |
| 191 | +The `func` callback is invoked with three parameters `(T& element, |
| 192 | +size_t row, size_t col)`, where `row` and `col` tell the coordinates |
| 193 | +of element in the joint matrix. To illustrate, the following example |
| 194 | +shows how you can use this API to sum the rows of a matrix: |
| 195 | + |
| 196 | +```c++ |
| 197 | +joint_matrix_apply(sg, A, [=](T &val, size_t row, size_t col) { |
| 198 | + sum_local_rows[row] += val; |
| 199 | +}); |
| 200 | +``` |
| 201 | +=== New Device Information Descriptor |
| 202 | +Besides the query we provide in |
| 203 | +link:sycl_ext_oneapi_matrix.asciidoc[sycl_ext_oneapi_matrix], |
| 204 | +some device descriptors are Intel hardware specific. These are |
| 205 | +provided as part of `ext::intel::experimental::info::device::matrix` |
| 206 | +namespace: |
| 207 | + |
| 208 | +[frame="none",options="header"] |
| 209 | +|====================== |
| 210 | +| Device descriptors | Return type| Description |
| 211 | +|`ext::intel::experimental::info::device::matrix::numtiles`| `int` |
| 212 | +|If the matrix hardware in the device has separate storage (register |
| 213 | +files or tiles) from the rest of the processing units (e.g. Intel |
| 214 | +AMX), returns the number of tiles. For other devices, returns 0. |
| 215 | +|====================== |
| 216 | + |
| 217 | +=== Packed Layout Format |
| 218 | +The `ext_intel_packed` layout (aka VNNI) is a special layout for |
| 219 | +matrix data that allows Intel AMX and Intel XMX devices to load |
| 220 | +matrices more efficiently (packing in 32 bits). This layout applies |
| 221 | +only to the A and B matrices, and may not be used with the accumulator |
| 222 | +matrix. The layout is different depending on whether the matrix |
| 223 | +element type is 8 bits or 16 bits, which are the only two element |
| 224 | +sizes supported for the A and B matrices on Intel AMX and Intel XMX |
| 225 | +devices. |
| 226 | + |
| 227 | +For an 8-bit element, the first four elements of column 0 are stored |
| 228 | +contiguously in memory, followed by the first four elements of column |
| 229 | +1, etc. This continues until the end of the row. After all the |
| 230 | +elements for rows 0 - 3 have been stored this way, the process |
| 231 | +repeats, starting with the next four elements of column 0. The diagram |
| 232 | +below illustrates this layout for an 8 x 4 matrix. 
| 233 | + |
| 234 | +==== Example 1: 8-bit elements |
| 235 | + |
| 236 | +    // Example of an 8 row x 4 column matrix using an 8-bit data 
| 237 | + // element, in row-major layout, rows are shown horizontally. |
| 238 | + // Element a1 is contiguous in memory with element b1, etc. |
| 239 | + // --------------------------------- |
| 240 | + // a1, b1, c1, d1 |
| 241 | + // a2, b2, c2, d2 |
| 242 | + // a3, b3, c3, d3 |
| 243 | + // a4, b4, c4, d4 |
| 244 | + // a5, b5, c5, d5 |
| 245 | + // a6, b6, c6, d6 |
| 246 | + // a7, b7, c7, d7 |
| 247 | + // a8, b8, c8, d8 |
| 248 | + // --------------------------------- |
| 249 | + // The same matrix reformatted in packed layout. |
| 250 | + // Here, packing of 4 elements is needed to form 32 bits. |
| 251 | + // Elements a1, a2, a3, a4 are contiguous in memory, etc. |
| 252 | + // --------------------------------- |
| 253 | + // a1, a2, a3, a4, b1, b2, b3, b4, c1, c2, c3, c4, d1, d2, d3, d4 |
| 254 | + // a5, a6, a7, a8, b5, b6, b7, b8, c5, c6, c7, c8, d5, d6, d7, d8 |
| 255 | + |
| 256 | +For a 16-bit element, the first two elements of column 0 are stored |
| 257 | +contiguously in memory, followed by the first two elements of column |
| 258 | +1, etc. This continues until the end of the row. After all the |
| 259 | +elements for rows 0 - 1 have been stored this way, the process |
| 260 | +repeats, starting with the next two elements of column 0. The diagram |
| 261 | +below illustrates this layout for a 4 x 4 matrix. |
| 262 | + |
| 263 | +==== Example 2: 16-bit elements |
| 264 | + // Example of a 4 row x 4 column matrix using a 16-bit data |
| 265 | + // element, in row-major layout. |
| 266 | + // Element a1 is contiguous in memory with element b1, etc. |
| 267 | + // --------------------------------- |
| 268 | + // a1, b1, c1, d1 |
| 269 | + // a2, b2, c2, d2 |
| 270 | + // a3, b3, c3, d3 |
| 271 | + // a4, b4, c4, d4 |
| 272 | + // --------------------------------- |
| 273 | + // The same matrix reformatted in packed layout. |
| 274 | + // Here, packing of 2 elements is needed to form 32 bits. |
| 275 | + // Element a1 is contiguous in memory with element a2, etc. |
| 276 | + // --------------------------------- |
| 277 | + // a1, a2, b1, b2, c1, c2, d1, d2 |
| 278 | + // a3, a4, b3, b4, c3, c4, d3, d4 |
| 279 | + |
| 280 | +=== Example using int8_t type |
| 281 | +```c++ |
| 282 | +using namespace sycl::ext::oneapi::experimental::matrix; |
| 283 | + |
| 284 | +queue q; |
| 285 | +range<2> G = {M/tM, N}; |
| 286 | +range<2> L = {1, SG_SIZE}; |
| 287 | +auto bufA = sycl::buffer{memA, sycl::range{M*K}}; |
| 288 | +auto bufB = sycl::buffer{memB, sycl::range{K*N}}; |
| 289 | +auto bufC = sycl::buffer{memC, sycl::range{M*N}}; |
| 290 | +q.submit([&](sycl::handler& cgh) { |
| 291 | + auto accA = sycl::accessor{bufA, cgh, sycl::read_only}; |
| 292 | + auto accB = sycl::accessor{bufB, cgh, sycl::read_only}; |
| 293 | + auto accC = sycl::accessor{bufC, cgh, sycl::read_write}; |
| 294 | + cgh.parallel_for(nd_range<2>(G, L), [=](nd_item<2> item) |
| 295 | + [[sycl::reqd_sub_group_size(SG_SIZE)]] { |
| 296 | + const auto global_idx = item.get_global_id(0); |
| 297 | + const auto global_idy = item.get_global_id(1); |
| 298 | + const auto sg_startx = global_idx - item.get_local_id(0); |
| 299 | + const auto sg_starty = global_idy - item.get_local_id(1); |
| 300 | + sub_group sg = item.get_sub_group(); |
| 301 | + joint_matrix<sub_group, int8_t, use::a, tM, tK, layout::row_major> tA; |
| 302 | + joint_matrix<sub_group, int8_t, use::b, tK, tN, |
| 303 | + layout::ext_intel_packed> tB; |
| 304 | + joint_matrix<sub_group, int32_t, use::accumulator, tM, tN> tC; |
| 305 | + joint_matrix_fill(sg, tC, 0); |
| 306 | + for (int k = 0; k < K; k += tK) { |
| 307 | + joint_matrix_load(sg, tA, accA + sg_startx * tM * K + k, K); |
| 308 | + joint_matrix_load(sg, tB, accB + k * N*4 + sg_starty/SG_SIZE*tN*4, N*4); |
| 309 | + tC = joint_matrix_mad(sg, tA, tB, tC); |
| 310 | + } |
| 311 | + auto wi_data_c = ext::intel::experimental::matrix::get_wi_data(sg, tC); |
| 312 | + for (int i = 0; i < wi_data_c.length(); i++) |
| 313 | + wi_data_c[i] *= alpha; |
| 314 | + joint_matrix_store(sg, tC, |
| 315 | + accC + sg_startx * tM * N + sg_starty/SG_SIZE*tN, N, layout::row_major); |
| 316 | + }); |
| 317 | +}); |
| 318 | +q.wait(); |
| 319 | +``` |
| 320 | +== Revision History |
| 321 | + |
| 322 | +[frame="none",options="header"] |
| 323 | +|====================== |
| 324 | +|Rev |Date |Author |Changes |
| 325 | +|1 |2022-11-07 |Dounia Khaldi |Add Intel-specific store API, |
| 326 | +layout information, and `joint_matrix_apply` with coordinates API |
| 327 | +|====================== |
0 commit comments