Skip to content

Commit ffb0d54

Browse files
authored
Unit tests for LOAD_2D and STORE_2D (#582)
| **API Function** | **Template Signature** | **Function** | |------------------|------------------------|--------------| | `XE_LOAD_2D` | `<Bits, Height, Width>` | High-speed 2D block loading | | `XE_STORE_2D` | `<Bits, Height, Width>` | Efficient 2D block storage | 11 test cases added for diff datatypes with 7 variations of height so total 77 variations used in this PR ## Data Type Coverage ### 8-bit Types (64 elements per row) | Data Type | Description | Test Function | |-----------|-------------|---------------| | `uint8_t` | 8-bit unsigned integer | `XE_COPY_2D_uint8` | | `int8_t` | 8-bit signed integer | `XE_COPY_2D_int8` | | `char` | 8-bit character | `XE_COPY_2D_char` | ### 16-bit Types (32 elements per row) | Data Type | Description | Test Function | |-----------|-------------|---------------| | `uint16_t` | 16-bit unsigned integer | `XE_COPY_2D_uint16` | | `int16_t` | 16-bit signed integer | `XE_COPY_2D_int16` | | `half_t` | 16-bit half precision float | `XE_COPY_2D_half` | | `bfloat16_t` | 16-bit brain float | `XE_COPY_2D_bfloat16` | ### 32-bit Types (16 elements per row) | Data Type | Description | Test Function | |-----------|-------------|---------------| | `uint32_t` | 32-bit unsigned integer | `XE_COPY_2D_uint32` | | `int32_t` | 32-bit signed integer | `XE_COPY_2D_int32` | | `float` | 32-bit single precision float | `XE_COPY_2D_float` | | `tfloat32_t` | 32-bit tensor float | `XE_COPY_2D_tfloat32` | ## Test Configuration Matrix | Bit Width | Data Types Count | Heights Tested | Elements/Row | Memory/Row | Test Cases | |-----------|------------------|----------------|--------------|------------|------------| | **8-bit** | 3 | 2, 3, 4, 5, 6, 7, 8 | 64 | 64 bytes | 21 | | **16-bit** | 4 | 2, 3, 4, 5, 6, 7, 8 | 32 | 64 bytes | 28 | | **32-bit** | 4 | 2, 3, 4, 5, 6, 7, 8 | 16 | 64 bytes | 28 |
2 parents 5a0b7a8 + b113448 commit ffb0d54

File tree

3 files changed

+302
-1
lines changed

3 files changed

+302
-1
lines changed

test/unit/cute/intel_xe/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ cutlass_test_unit_add_executable(
4747
copy_scatter.cpp
4848
mma.cpp
4949
tiled_mma.cpp
50+
xe_copy_2d_test.cpp
5051
)
5152
else()
5253
cutlass_test_unit_add_executable(

test/unit/cute/intel_xe/mma.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ TEST(PVC_CuTe_Xe, MMA_XE_8x16x8_F32TF32TF32F32_TT) {
312312
tfloat32_t, float>(512, 512, 256);
313313
}
314314

315-
#if (IGC_VERSION_MAJOR >= 2 && IGC_VERSION_MINOR >= 18)
315+
#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18)
316316

317317
TEST(PVC_CuTe_Xe, MMA_DPAS_S8_8x16) {
318318
MMA_Test<XE_DPAS_TT<8, int32_t, int8_t>, 64, 64, 8, 16, 32, int8_t, int8_t,
@@ -414,4 +414,13 @@ TEST(PVC_CuTe_Xe, MMA_DPAS_TF32_1x16) {
414414
tfloat32_t, float>(512, 512, 256);
415415
}
416416

417+
#else
418+
419+
// For the fallback case
420+
#include "cutlass_unit_test.h"
421+
422+
TEST(PVC_CuTe_Xe, MMA_DPAS_TESTS) {
423+
GTEST_SKIP() << "MMA DPAS tests require IGC version 2.18 or higher. skipped";
424+
}
425+
417426
#endif
Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
/***************************************************************************************************
2+
* Copyright (C) 2025 Intel Corporation, All rights reserved.
3+
* SPDX-License-Identifier: BSD-3-Clause
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions are met:
7+
*
8+
* 1. Redistributions of source code must retain the above copyright notice, this
9+
* list of conditions and the disclaimer.
10+
*
11+
* 2. Redistributions in binary form must reproduce the above copyright notice,
12+
* this list of conditions and the following disclaimer in the documentation
13+
* and/or other materials provided with the distribution.
14+
*
15+
* 3. Neither the name of the copyright holder nor the names of its
16+
* contributors may be used to endorse or promote products derived from
17+
* this software without specific prior written permission.
18+
*
19+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28+
* OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
29+
*
30+
**************************************************************************************************/
31+
32+
#include "cutlass/detail/layout.hpp"
33+
34+
#include <cute/tensor.hpp>
35+
#include <cute/atom/copy_atom.hpp>
36+
#include <cute/atom/copy_traits_xe_2d.hpp>
37+
#include <cute/arch/copy_xe_2d.hpp>
38+
#include <sycl/sycl.hpp>
39+
#include <cute/util/compat.hpp>
40+
41+
#include "cutlass_unit_test.h"
42+
#include "utils.hpp"
43+
44+
using namespace cute;
45+
using namespace cutlass;
46+
using namespace compat::experimental;
47+
48+
#define SUBGROUP_SIZE (16)
49+
50+
#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18)
51+
52+
// Kernel name for unique identification
53+
template<class...> class XECopy2DKernelName;
54+
55+
// Device kernel for XE_LOAD_2D testing
56+
template <class SrcTensor, class DstTensor, int Bits, int Height, int Width>
57+
void xe_copy_2d_kernel(SrcTensor src, DstTensor dst) {
58+
using namespace cute;
59+
using Element = typename SrcTensor::value_type;
60+
61+
// Only execute with the first subgroup to avoid race conditions
62+
if (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group(0) == 0) {
63+
// Get thread/subgroup information
64+
auto local_id = int(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_id(0));
65+
66+
// Create block 2D copy inside kernel (device-only operation)
67+
using CopyOp = XE_LOAD_2D<Bits, Height, Width>;
68+
auto tiled_copy = make_block_2d_copy(CopyOp{}, src);
69+
70+
// Get thread slice of the tiled copy
71+
auto thr_copy = tiled_copy.get_slice(local_id);
72+
73+
// Create coordinate tensor for a single tile
74+
auto coord_shape = make_shape(Int<Height>{}, Int<Width * Bits / sizeof_bits_v<Element>>{});
75+
Tensor coord_tile = make_identity_tensor(coord_shape);
76+
77+
// Partition source coordinates and create destination fragment
78+
auto thr_src_coord = thr_copy.partition_S(coord_tile);
79+
auto thr_dst_frag = thr_copy.partition_fragment_D(coord_tile);
80+
81+
// Perform the copy operation from global memory to registers
82+
copy(tiled_copy, thr_src_coord, thr_dst_frag);
83+
84+
// For verification, create a 2D store operation to write registers back to destination
85+
using StoreOp = XE_STORE_2D<Bits, Height, Width>;
86+
auto tiled_store = make_block_2d_copy(StoreOp{}, dst);
87+
auto thr_store = tiled_store.get_slice(local_id);
88+
89+
// Create destination coordinates for the store operation
90+
auto thr_dst_coord = thr_store.partition_D(coord_tile);
91+
auto thr_src_frag = thr_store.partition_fragment_S(coord_tile);
92+
93+
// Copy the loaded data from registers to the fragment for storing
94+
copy(thr_dst_frag, thr_src_frag);
95+
96+
// Perform the store operation from registers to global memory
97+
copy(tiled_store, thr_src_frag, thr_dst_coord);
98+
99+
// Synchronize to ensure all threads complete their operations
100+
sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group());
101+
}
102+
}
103+
104+
// Host test function template
105+
template <typename Element, int Bits, int Height, int Width, int BlockWidth = Width>
106+
void test_xe_copy_2d() {
107+
using namespace cute;
108+
109+
// Matrix dimensions - must be compatible with block 2D constraints
110+
constexpr int M = Height;
111+
constexpr int N = Width * sizeof_bits_v<Element> / Bits;
112+
113+
// Ensure proper alignment (required for block 2D operations)
114+
constexpr int elem_alignment = 16 / sizeof(Element);
115+
constexpr int aligned_N = ((N + elem_alignment - 1) / elem_alignment) * elem_alignment;
116+
117+
// Allocate and initialize host data
118+
cutlass::host_vector<Element> host_src(M * aligned_N);
119+
cutlass::host_vector<Element> host_dst(M * aligned_N);
120+
121+
122+
// Initialize source with test pattern
123+
for (size_t i = 0; i < host_src.size(); ++i) {
124+
// Use a safe conversion that works for all numeric types
125+
if constexpr (std::is_floating_point_v<Element> ||
126+
std::is_same_v<Element, half_t> ||
127+
std::is_same_v<Element, bfloat16_t> ||
128+
std::is_same_v<Element, tfloat32_t>) {
129+
130+
// For floating-point types, convert through float
131+
float val = static_cast<float>(i % 256) / 255.0f; // Normalize to [0,1]
132+
host_src[i] = Element(val);
133+
} else {
134+
// For integer types (including uint64_t) and char, direct conversion is safe
135+
host_src[i] = static_cast<Element>(i % 256);
136+
}
137+
}
138+
139+
// Copy to device
140+
cutlass::device_vector<Element> device_src = host_src;
141+
cutlass::device_vector<Element> device_dst(M * aligned_N);
142+
143+
// Create tensors with proper layout
144+
Tensor tensor_src =
145+
make_tensor(make_gmem_ptr(device_src.data()),
146+
make_layout(Shape<Int<M>, Int<aligned_N>>{}, Stride<Int<aligned_N>, _1>{}));
147+
148+
Tensor tensor_dst =
149+
make_tensor(make_gmem_ptr(device_dst.data()),
150+
make_layout(Shape<Int<M>, Int<aligned_N>>{}, Stride<Int<aligned_N>, _1>{}));
151+
152+
// Launch kernel - copy creation happens on device
153+
auto blockDim = compat::dim3(SUBGROUP_SIZE);
154+
auto gridDim = compat::dim3(1);
155+
156+
launch<xe_copy_2d_kernel<decltype(tensor_src), decltype(tensor_dst), Bits, Height, Width>,
157+
XECopy2DKernelName<decltype(tensor_src), decltype(tensor_dst)>>(
158+
launch_policy{
159+
gridDim, blockDim,
160+
kernel_properties{sycl_exp::sub_group_size<SUBGROUP_SIZE>}
161+
},
162+
tensor_src, tensor_dst);
163+
164+
compat::wait_and_throw();
165+
host_dst = device_dst;
166+
for (int i = 0; i < M * N; ++i) {
167+
// printf("%d %d\n", int(h_in[i]), int(h_out[i]));
168+
EXPECT_EQ(host_dst[i], host_src[i]);
169+
}
170+
}
171+
172+
TEST(PVC_CuTe_Xe, XE_COPY_2D_uint8) {
173+
test_xe_copy_2d<uint8_t, 8, 2, 64>();
174+
test_xe_copy_2d<uint8_t, 8, 3, 64>();
175+
test_xe_copy_2d<uint8_t, 8, 4, 64>();
176+
test_xe_copy_2d<uint8_t, 8, 5, 64>();
177+
test_xe_copy_2d<uint8_t, 8, 6, 64>();
178+
test_xe_copy_2d<uint8_t, 8, 7, 64>();
179+
test_xe_copy_2d<uint8_t, 8, 8, 64>();
180+
}
181+
182+
TEST(PVC_CuTe_Xe, XE_COPY_2D_int8) {
183+
test_xe_copy_2d<int8_t, 8, 2, 64>();
184+
test_xe_copy_2d<int8_t, 8, 3, 64>();
185+
test_xe_copy_2d<int8_t, 8, 4, 64>();
186+
test_xe_copy_2d<int8_t, 8, 5, 64>();
187+
test_xe_copy_2d<int8_t, 8, 6, 64>();
188+
test_xe_copy_2d<int8_t, 8, 7, 64>();
189+
test_xe_copy_2d<int8_t, 8, 8, 64>();
190+
}
191+
192+
TEST(PVC_CuTe_Xe, XE_COPY_2D_uint16) {
193+
test_xe_copy_2d<uint16_t, 16, 2, 32>();
194+
test_xe_copy_2d<uint16_t, 16, 3, 32>();
195+
test_xe_copy_2d<uint16_t, 16, 4, 32>();
196+
test_xe_copy_2d<uint16_t, 16, 5, 32>();
197+
test_xe_copy_2d<uint16_t, 16, 6, 32>();
198+
test_xe_copy_2d<uint16_t, 16, 7, 32>();
199+
test_xe_copy_2d<uint16_t, 16, 8, 32>();
200+
}
201+
202+
TEST(PVC_CuTe_Xe, XE_COPY_2D_int16) {
203+
test_xe_copy_2d<int16_t, 16, 2, 32>();
204+
test_xe_copy_2d<int16_t, 16, 3, 32>();
205+
test_xe_copy_2d<int16_t, 16, 4, 32>();
206+
test_xe_copy_2d<int16_t, 16, 5, 32>();
207+
test_xe_copy_2d<int16_t, 16, 6, 32>();
208+
test_xe_copy_2d<int16_t, 16, 7, 32>();
209+
test_xe_copy_2d<int16_t, 16, 8, 32>();
210+
}
211+
212+
TEST(PVC_CuTe_Xe, XE_COPY_2D_half) {
213+
test_xe_copy_2d<half_t, 16, 2, 32>();
214+
test_xe_copy_2d<half_t, 16, 3, 32>();
215+
test_xe_copy_2d<half_t, 16, 4, 32>();
216+
test_xe_copy_2d<half_t, 16, 5, 32>();
217+
test_xe_copy_2d<half_t, 16, 6, 32>();
218+
test_xe_copy_2d<half_t, 16, 7, 32>();
219+
test_xe_copy_2d<half_t, 16, 8, 32>();
220+
}
221+
222+
TEST(PVC_CuTe_Xe, XE_COPY_2D_bfloat16) {
223+
test_xe_copy_2d<bfloat16_t, 16, 2, 32>();
224+
test_xe_copy_2d<bfloat16_t, 16, 3, 32>();
225+
test_xe_copy_2d<bfloat16_t, 16, 4, 32>();
226+
test_xe_copy_2d<bfloat16_t, 16, 5, 32>();
227+
test_xe_copy_2d<bfloat16_t, 16, 6, 32>();
228+
test_xe_copy_2d<bfloat16_t, 16, 7, 32>();
229+
test_xe_copy_2d<bfloat16_t, 16, 8, 32>();
230+
}
231+
232+
TEST(PVC_CuTe_Xe, XE_COPY_2D_uint32) {
233+
test_xe_copy_2d<uint32_t, 32, 2, 16>();
234+
test_xe_copy_2d<uint32_t, 32, 3, 16>();
235+
test_xe_copy_2d<uint32_t, 32, 4, 16>();
236+
test_xe_copy_2d<uint32_t, 32, 5, 16>();
237+
test_xe_copy_2d<uint32_t, 32, 6, 16>();
238+
test_xe_copy_2d<uint32_t, 32, 7, 16>();
239+
test_xe_copy_2d<uint32_t, 32, 8, 16>();
240+
}
241+
242+
TEST(PVC_CuTe_Xe, XE_COPY_2D_int32) {
243+
test_xe_copy_2d<int32_t, 32, 2, 16>();
244+
test_xe_copy_2d<int32_t, 32, 3, 16>();
245+
test_xe_copy_2d<int32_t, 32, 4, 16>();
246+
test_xe_copy_2d<int32_t, 32, 5, 16>();
247+
test_xe_copy_2d<int32_t, 32, 6, 16>();
248+
test_xe_copy_2d<int32_t, 32, 7, 16>();
249+
test_xe_copy_2d<int32_t, 32, 8, 16>();
250+
}
251+
252+
TEST(PVC_CuTe_Xe, XE_COPY_2D_float) {
253+
test_xe_copy_2d<float, 32, 2, 16>();
254+
test_xe_copy_2d<float, 32, 3, 16>();
255+
test_xe_copy_2d<float, 32, 4, 16>();
256+
test_xe_copy_2d<float, 32, 5, 16>();
257+
test_xe_copy_2d<float, 32, 6, 16>();
258+
test_xe_copy_2d<float, 32, 7, 16>();
259+
test_xe_copy_2d<float, 32, 8, 16>();
260+
}
261+
262+
TEST(PVC_CuTe_Xe, XE_COPY_2D_tfloat32) {
263+
test_xe_copy_2d<tfloat32_t, 32, 2, 16>();
264+
test_xe_copy_2d<tfloat32_t, 32, 3, 16>();
265+
test_xe_copy_2d<tfloat32_t, 32, 4, 16>();
266+
test_xe_copy_2d<tfloat32_t, 32, 5, 16>();
267+
test_xe_copy_2d<tfloat32_t, 32, 6, 16>();
268+
test_xe_copy_2d<tfloat32_t, 32, 7, 16>();
269+
test_xe_copy_2d<tfloat32_t, 32, 8, 16>();
270+
}
271+
272+
TEST(PVC_CuTe_Xe, XE_COPY_2D_char) {
273+
test_xe_copy_2d<char, 8, 2, 64>();
274+
test_xe_copy_2d<char, 8, 3, 64>();
275+
test_xe_copy_2d<char, 8, 4, 64>();
276+
test_xe_copy_2d<char, 8, 5, 64>();
277+
test_xe_copy_2d<char, 8, 6, 64>();
278+
test_xe_copy_2d<char, 8, 7, 64>();
279+
test_xe_copy_2d<char, 8, 8, 64>();
280+
}
281+
282+
#else
283+
284+
// For the fallback case
285+
#include "cutlass_unit_test.h"
286+
287+
TEST(PVC_CuTe_Xe, XE_COPY_2D_SKIPPED) {
288+
GTEST_SKIP() << "XE_COPY_2D tests require IGC version 2.18 or higher. skipped";
289+
}
290+
291+
#endif

0 commit comments

Comments
 (0)