1+ /* **************************************************************************************************
2+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
3+ * SPDX-License-Identifier: BSD-3-Clause
4+ *
5+ * Redistribution and use in source and binary forms, with or without
6+ * modification, are permitted provided that the following conditions are met:
7+ *
8+ * 1. Redistributions of source code must retain the above copyright notice, this
9+ * list of conditions and the disclaimer.
10+ *
11+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12+ * this list of conditions and the following disclaimer in the documentation
13+ * and/or other materials provided with the distribution.
14+ *
15+ * 3. Neither the name of the copyright holder nor the names of its
16+ * contributors may be used to endorse or promote products derived from
17+ * this software without specific prior written permission.
18+ *
19+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28+ * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
29+ *
30+ **************************************************************************************************/
31+
#include "cutlass/detail/layout.hpp"

#include <cute/tensor.hpp>
#include <cute/atom/copy_atom.hpp>
#include <cute/atom/copy_traits_xe_2d.hpp>
#include <cute/arch/copy_xe_2d.hpp>
#include <cute/util/compat.hpp>
#include <sycl/sycl.hpp>

#include "cutlass_unit_test.h"
#include "utils.hpp"
43+
using namespace cute;
using namespace cutlass;
using namespace compat::experimental;

#define SUBGROUP_SIZE (16)

#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18)

// Kernel name tag type for unique kernel identification at launch time
template <class...> class XECopy2DKernelName;
54+
55+ // Device kernel for XE_LOAD_2D testing
// Device kernel exercising an XE_LOAD_2D / XE_STORE_2D round trip.
//
// Loads one Height x Width block (Width counted in Bits-wide units) from
// `src` into registers via a 2D block load, then writes the register
// contents back out to `dst` via a matching 2D block store.  The host side
// later compares `dst` against `src` element-wise.
//
// NOTE(review): assumes launch with a single subgroup of SUBGROUP_SIZE
// work-items — the host driver launches gridDim=1, blockDim=SUBGROUP_SIZE.
template <class SrcTensor, class DstTensor, int Bits, int Height, int Width>
void xe_copy_2d_kernel(SrcTensor src, DstTensor dst) {
  using namespace cute;
  using Element = typename SrcTensor::value_type;

  // Only the first work-group performs the copy, so no two groups ever
  // write the same destination tile.
  if (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group(0) == 0) {
    // Linear id of this work-item within its work-group.
    auto local_id = int(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_id(0));

    // Create the block 2D load copy inside the kernel (device-only operation).
    using CopyOp = XE_LOAD_2D<Bits, Height, Width>;
    auto tiled_copy = make_block_2d_copy(CopyOp{}, src);

    // Slice of the tiled copy owned by this work-item.
    auto thr_copy = tiled_copy.get_slice(local_id);

    // Identity (coordinate) tensor covering a single tile.  Width is given
    // in Bits-wide units, so convert it to element counts here.
    auto coord_shape = make_shape(Int<Height>{}, Int<Width * Bits / sizeof_bits_v<Element>>{});
    Tensor coord_tile = make_identity_tensor(coord_shape);

    // Partition source coordinates and allocate the register fragment.
    auto thr_src_coord = thr_copy.partition_S(coord_tile);
    auto thr_dst_frag = thr_copy.partition_fragment_D(coord_tile);

    // Global memory -> registers.
    copy(tiled_copy, thr_src_coord, thr_dst_frag);

    // Mirror 2D block store used to write the loaded data back for
    // verification on the host.
    using StoreOp = XE_STORE_2D<Bits, Height, Width>;
    auto tiled_store = make_block_2d_copy(StoreOp{}, dst);
    auto thr_store = tiled_store.get_slice(local_id);

    // Destination coordinates and the store's source fragment.
    auto thr_dst_coord = thr_store.partition_D(coord_tile);
    auto thr_src_frag = thr_store.partition_fragment_S(coord_tile);

    // Stage the loaded registers into the store's source fragment.
    copy(thr_dst_frag, thr_src_frag);

    // Registers -> global memory.
    copy(tiled_store, thr_src_frag, thr_dst_coord);

    // Group-wide barrier; uniform within group 0 because the enclosing
    // branch condition depends only on the group id.
    sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group());
  }
}
103+
104+ // Host test function template
105+ template <typename Element, int Bits, int Height, int Width, int BlockWidth = Width>
106+ void test_xe_copy_2d () {
107+ using namespace cute ;
108+
109+ // Matrix dimensions - must be compatible with block 2D constraints
110+ constexpr int M = Height;
111+ constexpr int N = Width * sizeof_bits_v<Element> / Bits;
112+
113+ // Ensure proper alignment (required for block 2D operations)
114+ constexpr int elem_alignment = 16 / sizeof (Element);
115+ constexpr int aligned_N = ((N + elem_alignment - 1 ) / elem_alignment) * elem_alignment;
116+
117+ // Allocate and initialize host data
118+ cutlass::host_vector<Element> host_src (M * aligned_N);
119+ cutlass::host_vector<Element> host_dst (M * aligned_N);
120+
121+
122+ // Initialize source with test pattern
123+ for (size_t i = 0 ; i < host_src.size (); ++i) {
124+ // Use a safe conversion that works for all numeric types
125+ if constexpr (std::is_floating_point_v<Element> ||
126+ std::is_same_v<Element, half_t > ||
127+ std::is_same_v<Element, bfloat16_t > ||
128+ std::is_same_v<Element, tfloat32_t >) {
129+
130+ // For floating-point types, convert through float
131+ float val = static_cast <float >(i % 256 ) / 255 .0f ; // Normalize to [0,1]
132+ host_src[i] = Element (val);
133+ } else {
134+ // For integer types (including uint64_t) and char, direct conversion is safe
135+ host_src[i] = static_cast <Element>(i % 256 );
136+ }
137+ }
138+
139+ // Copy to device
140+ cutlass::device_vector<Element> device_src = host_src;
141+ cutlass::device_vector<Element> device_dst (M * aligned_N);
142+
143+ // Create tensors with proper layout
144+ Tensor tensor_src =
145+ make_tensor (make_gmem_ptr (device_src.data ()),
146+ make_layout (Shape<Int<M>, Int<aligned_N>>{}, Stride<Int<aligned_N>, _1>{}));
147+
148+ Tensor tensor_dst =
149+ make_tensor (make_gmem_ptr (device_dst.data ()),
150+ make_layout (Shape<Int<M>, Int<aligned_N>>{}, Stride<Int<aligned_N>, _1>{}));
151+
152+ // Launch kernel - copy creation happens on device
153+ auto blockDim = compat::dim3 (SUBGROUP_SIZE);
154+ auto gridDim = compat::dim3 (1 );
155+
156+ launch<xe_copy_2d_kernel<decltype (tensor_src), decltype (tensor_dst), Bits, Height, Width>,
157+ XECopy2DKernelName<decltype (tensor_src), decltype (tensor_dst)>>(
158+ launch_policy{
159+ gridDim, blockDim,
160+ kernel_properties{sycl_exp::sub_group_size<SUBGROUP_SIZE>}
161+ },
162+ tensor_src, tensor_dst);
163+
164+ compat::wait_and_throw ();
165+ host_dst = device_dst;
166+ for (int i = 0 ; i < M * N; ++i) {
167+ // printf("%d %d\n", int(h_in[i]), int(h_out[i]));
168+ EXPECT_EQ (host_dst[i], host_src[i]);
169+ }
170+ }
171+
// Round-trip 8-bit unsigned blocks: heights 2..8, width 64 (8-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_uint8) {
  test_xe_copy_2d<uint8_t, 8, 2, 64>();
  test_xe_copy_2d<uint8_t, 8, 3, 64>();
  test_xe_copy_2d<uint8_t, 8, 4, 64>();
  test_xe_copy_2d<uint8_t, 8, 5, 64>();
  test_xe_copy_2d<uint8_t, 8, 6, 64>();
  test_xe_copy_2d<uint8_t, 8, 7, 64>();
  test_xe_copy_2d<uint8_t, 8, 8, 64>();
}
181+
// Round-trip 8-bit signed blocks: heights 2..8, width 64 (8-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_int8) {
  test_xe_copy_2d<int8_t, 8, 2, 64>();
  test_xe_copy_2d<int8_t, 8, 3, 64>();
  test_xe_copy_2d<int8_t, 8, 4, 64>();
  test_xe_copy_2d<int8_t, 8, 5, 64>();
  test_xe_copy_2d<int8_t, 8, 6, 64>();
  test_xe_copy_2d<int8_t, 8, 7, 64>();
  test_xe_copy_2d<int8_t, 8, 8, 64>();
}
191+
// Round-trip 16-bit unsigned blocks: heights 2..8, width 32 (16-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_uint16) {
  test_xe_copy_2d<uint16_t, 16, 2, 32>();
  test_xe_copy_2d<uint16_t, 16, 3, 32>();
  test_xe_copy_2d<uint16_t, 16, 4, 32>();
  test_xe_copy_2d<uint16_t, 16, 5, 32>();
  test_xe_copy_2d<uint16_t, 16, 6, 32>();
  test_xe_copy_2d<uint16_t, 16, 7, 32>();
  test_xe_copy_2d<uint16_t, 16, 8, 32>();
}
201+
// Round-trip 16-bit signed blocks: heights 2..8, width 32 (16-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_int16) {
  test_xe_copy_2d<int16_t, 16, 2, 32>();
  test_xe_copy_2d<int16_t, 16, 3, 32>();
  test_xe_copy_2d<int16_t, 16, 4, 32>();
  test_xe_copy_2d<int16_t, 16, 5, 32>();
  test_xe_copy_2d<int16_t, 16, 6, 32>();
  test_xe_copy_2d<int16_t, 16, 7, 32>();
  test_xe_copy_2d<int16_t, 16, 8, 32>();
}
211+
// Round-trip fp16 blocks: heights 2..8, width 32 (16-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_half) {
  test_xe_copy_2d<half_t, 16, 2, 32>();
  test_xe_copy_2d<half_t, 16, 3, 32>();
  test_xe_copy_2d<half_t, 16, 4, 32>();
  test_xe_copy_2d<half_t, 16, 5, 32>();
  test_xe_copy_2d<half_t, 16, 6, 32>();
  test_xe_copy_2d<half_t, 16, 7, 32>();
  test_xe_copy_2d<half_t, 16, 8, 32>();
}
221+
// Round-trip bf16 blocks: heights 2..8, width 32 (16-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_bfloat16) {
  test_xe_copy_2d<bfloat16_t, 16, 2, 32>();
  test_xe_copy_2d<bfloat16_t, 16, 3, 32>();
  test_xe_copy_2d<bfloat16_t, 16, 4, 32>();
  test_xe_copy_2d<bfloat16_t, 16, 5, 32>();
  test_xe_copy_2d<bfloat16_t, 16, 6, 32>();
  test_xe_copy_2d<bfloat16_t, 16, 7, 32>();
  test_xe_copy_2d<bfloat16_t, 16, 8, 32>();
}
231+
// Round-trip 32-bit unsigned blocks: heights 2..8, width 16 (32-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_uint32) {
  test_xe_copy_2d<uint32_t, 32, 2, 16>();
  test_xe_copy_2d<uint32_t, 32, 3, 16>();
  test_xe_copy_2d<uint32_t, 32, 4, 16>();
  test_xe_copy_2d<uint32_t, 32, 5, 16>();
  test_xe_copy_2d<uint32_t, 32, 6, 16>();
  test_xe_copy_2d<uint32_t, 32, 7, 16>();
  test_xe_copy_2d<uint32_t, 32, 8, 16>();
}
241+
// Round-trip 32-bit signed blocks: heights 2..8, width 16 (32-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_int32) {
  test_xe_copy_2d<int32_t, 32, 2, 16>();
  test_xe_copy_2d<int32_t, 32, 3, 16>();
  test_xe_copy_2d<int32_t, 32, 4, 16>();
  test_xe_copy_2d<int32_t, 32, 5, 16>();
  test_xe_copy_2d<int32_t, 32, 6, 16>();
  test_xe_copy_2d<int32_t, 32, 7, 16>();
  test_xe_copy_2d<int32_t, 32, 8, 16>();
}
251+
// Round-trip fp32 blocks: heights 2..8, width 16 (32-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_float) {
  test_xe_copy_2d<float, 32, 2, 16>();
  test_xe_copy_2d<float, 32, 3, 16>();
  test_xe_copy_2d<float, 32, 4, 16>();
  test_xe_copy_2d<float, 32, 5, 16>();
  test_xe_copy_2d<float, 32, 6, 16>();
  test_xe_copy_2d<float, 32, 7, 16>();
  test_xe_copy_2d<float, 32, 8, 16>();
}
261+
// Round-trip tf32 blocks: heights 2..8, width 16 (32-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_tfloat32) {
  test_xe_copy_2d<tfloat32_t, 32, 2, 16>();
  test_xe_copy_2d<tfloat32_t, 32, 3, 16>();
  test_xe_copy_2d<tfloat32_t, 32, 4, 16>();
  test_xe_copy_2d<tfloat32_t, 32, 5, 16>();
  test_xe_copy_2d<tfloat32_t, 32, 6, 16>();
  test_xe_copy_2d<tfloat32_t, 32, 7, 16>();
  test_xe_copy_2d<tfloat32_t, 32, 8, 16>();
}
271+
// Round-trip plain char blocks (signedness is implementation-defined):
// heights 2..8, width 64 (8-bit units).
TEST(PVC_CuTe_Xe, XE_COPY_2D_char) {
  test_xe_copy_2d<char, 8, 2, 64>();
  test_xe_copy_2d<char, 8, 3, 64>();
  test_xe_copy_2d<char, 8, 4, 64>();
  test_xe_copy_2d<char, 8, 5, 64>();
  test_xe_copy_2d<char, 8, 6, 64>();
  test_xe_copy_2d<char, 8, 7, 64>();
  test_xe_copy_2d<char, 8, 8, 64>();
}
281+
282+ #else
283+
// For the fallback case (include guards make this harmless if already included)
#include "cutlass_unit_test.h"
286+
// Fallback: the XE 2D block copy instructions require IGC >= 2.18, so
// report the whole suite as skipped on older compilers.
TEST(PVC_CuTe_Xe, XE_COPY_2D_SKIPPED) {
  GTEST_SKIP() << "XE_COPY_2D tests require IGC version 2.18 or higher. skipped";
}
290+
291+ #endif