Commit 3578f8e

Add transposed load kernel through the new Xe Copy Atoms
1 parent 1f53721 commit 3578f8e

7 files changed (+254, -83 lines)

examples/cute/tutorial/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ if (CUTLASS_ENABLE_SYCL)

   cutlass_example_add_executable(
     cute_tutorial_tiled_transpose
-    transpose/tiled_transpose_sycl.cpp
+    transpose/main.cpp
   )

   cutlass_example_add_executable(
examples/cute/tutorial/transpose/block_2d_transposed_copy.h

Lines changed: 148 additions & 0 deletions (new file)

#pragma once
/***************************************************************************************************
 * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
 * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#include <cute/tensor.hpp>
#include <cute/util/compat.hpp>
#include <sycl/ext/intel/experimental/grf_size_properties.hpp>
#include <sycl/sycl.hpp>

#include "cutlass/util/print_error.hpp"
#include "util.h"

template <class TensorS, class TensorD, class BlockShape, class BlockShapeTrans,
          class ThreadLayout>
void block2DTransposedLoadKernel(TensorS const S, TensorD const DT,
                                 BlockShape const block_shape,
                                 BlockShapeTrans const block_shape_transposed,
                                 ThreadLayout const thread_layout) {
  using namespace cute;
  using Element = typename TensorS::value_type;

  /* Get workgroup and local IDs */
  auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>();
  auto wg_m = int(item.get_group(0));
  auto wg_n = int(item.get_group(1));
  auto local_id = int(item.get_local_id(0));

  /* Proxy coordinate tensors */
  Tensor cS = make_identity_tensor(S.shape());   // (M,N)
  Tensor cDT = make_identity_tensor(DT.shape()); // (N,M)

  auto wg_coord = make_coord(wg_m, wg_n);
  auto wg_coord_transposed = make_coord(wg_n, wg_m);

  // Tensor data = ...                                         // (M, N)
  // Tensor cta_data = local_tile(data, Shape<16, 16>{},
  //                              make_coord(blockIdx.x, blockIdx.y)); // (_32,_64)
  Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N)
  Tensor gDT = local_tile(cDT, block_shape_transposed,
                          wg_coord_transposed); // (BLK_N,BLK_M)

  constexpr int CopyBits = sizeof_bits_v<Element>;
  auto transposed_load_op = XE_LOAD_2D_TRANSPOSE<CopyBits, 8, 8>{};
  auto store_op = XE_STORE_2D<CopyBits, 8, 8>{};

  /* Slice TiledCopy operations to thread (work-item) level */
  auto transpose_S = make_block_2d_copy(transposed_load_op, S);
  auto thr_transpose_S = transpose_S.get_slice(local_id);

  auto store_DT = make_block_2d_copy(store_op, DT);
  auto thr_copy_DT = store_DT.get_slice(local_id);

  /* Register fragments for the transposed copy */
  auto tSrS = thr_transpose_S.partition_sg_fragment_D(gS);
  auto tDrD = thr_copy_DT.partition_sg_fragment_D(gDT);

  /* Partition global tensor (proxies) for copies */
  Tensor tSgS = thr_transpose_S.partition_S(gS);
  Tensor tDgD = thr_copy_DT.partition_D(gDT);

  // if (cute::thread(0, 0)) {
  //   print(tSgS); print("\n");
  //   print(tSrS); print("\n");
  //   print(tDgD); print("\n");
  // }

  copy(transpose_S, tSgS, tSrS);
  // copy(tSrS, tDrD);
  copy(store_DT, tSrS, tDgD);
}

class TransposeCuteName;
template <typename Element>
void block_2d_transposed_copy(TransposeParams<Element> params) {

  using namespace cute;
  //
  // Make Tensors
  //
  auto tensor_shape = make_shape(params.M, params.N);
  auto tensor_shape_trans = make_shape(params.N, params.M);
  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
  auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
  Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

  // Make a transposed view of the output
  // auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{});
  // Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutDT);

  sycl::queue Q;

  //
  // Tile tensors
  //

  using bM = Int<32>;
  using bN = Int<8>;

  auto block_shape = make_shape(bM{}, bN{});       // (bM, bN)
  auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)

  sycl::range<2> local = {bM{}, 1};
  sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}),
                           local[1] * ceil_div(shape<1>(tensor_S), bN{})};

  auto threadLayout = make_layout(make_shape(bM{}, Int<1>{}), LayoutRight{});

  namespace syclex = sycl::ext::oneapi::experimental;
  namespace intelex = sycl::ext::intel::experimental;

  syclex::properties kernel_props{syclex::sub_group_size<16>,
                                  intelex::grf_size<256>};

  auto event = Q.parallel_for<TransposeCuteName>(
      sycl::nd_range<2>(global, local), kernel_props, [=](auto) {
        block2DTransposedLoadKernel(tensor_S, tensor_DT, block_shape,
                                    block_shape_trans, threadLayout);
      });
}

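The kernel above tiles coordinate proxies built with make_identity_tensor rather than slicing the data tensors directly; the block 2D copy atoms take the real tensors and resolve addresses themselves. A standalone host-side sketch of that proxy-and-tile step (a minimal sketch with illustrative sizes, not the kernel's 32x8 blocks):

#include <cute/tensor.hpp>

using namespace cute;

int main() {
  // Coordinate proxy for an 8x4 tensor: element (i,j) is the coordinate
  // (i,j) itself, so tiling the proxy shows which coordinates a tile covers.
  Tensor c = make_identity_tensor(make_shape(Int<8>{}, Int<4>{}));

  // Select the (1,0) tile of shape (4,2): rows 4..7, columns 0..1.
  Tensor t = local_tile(c, make_shape(Int<4>{}, Int<2>{}), make_coord(1, 0));

  print(t(0, 0)); // (4,0) -- the first coordinate covered by this tile
  print("\n");
  return 0;
}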
examples/cute/tutorial/transpose/copy_direct.h

Lines changed: 87 additions & 71 deletions

@@ -51,6 +51,22 @@ void copy_kernel(TensorS S, TensorD D, ThreadLayout) {
   using namespace cute;

   // Slice the tiled tensors
+  // This line slices the tiled tensor S to get the tile for the current work
+  // group. S is a 3D tensor with layout ((M, N), m', n') where:
+  // - (M, N) is the block/tile shape (first mode)
+  // - m' is the number of tiles in the M dimension (second mode)
+  // - n' is the number of tiles in the N dimension (third mode)
+  //
+  // The indexing S(make_coord(_, _), x, y) selects:
+  // - make_coord(_, _): Takes all elements from the first mode (M, N), i.e.,
+  //   the entire tile
+  // - compat::work_group_id::x(): Selects the x-th tile along the m'
+  //   dimension
+  // - compat::work_group_id::y(): Selects the y-th tile along the n'
+  //   dimension
+  //
+  // Result: A 2D tensor of shape (BlockShape_M, BlockShape_N) corresponding to
+  // the tile assigned to the current work group.
   Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(),
                     compat::work_group_id::y()); // (BlockShape_M, BlockShape_N)
   Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(),
@@ -64,78 +80,78 @@ void copy_kernel(TensorS S, TensorD D, ThreadLayout) {
       tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN)
   Tensor thr_tile_D = local_partition(
       tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN)
-  //
-
-  // Construct a register-backed Tensor with the same shape as each thread's
-  // partition Use make_tensor to try to match the layout of thr_tile_S
-  Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN)
-
-  // Copy from GMEM to RMEM and from RMEM to GMEM
-  copy(thr_tile_S, fragment);
-  copy(fragment, thr_tile_D);
-}
-
-template <typename Element> void copy_direct(TransposeParams<Element> params) {
-  //
-  // Given a 2D shape, perform an efficient copy
-  //
-
-  using namespace cute;
-
-  //
-  // Make tensors
-  //
-  auto tensor_shape = make_shape(params.M, params.N);
-  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
-  auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
-  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
-  Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);
-
-  //
-  // Tile tensors
-  //

-  // Define a statically sized block (M, N).
-  // Note, by convention, capital letters are used to represent static modes.
-  auto block_shape = make_shape(Int<1>{}, Int<16384>{});
+  // Construct a register-backed Tensor with the same shape as each thread's
+  // partition Use make_tensor to try to match the layout of thr_tile_S
+  Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN)

-  if ((size<0>(tensor_shape) % size<0>(block_shape)) ||
-      (size<1>(tensor_shape) % size<1>(block_shape))) {
-    std::cerr << "The tensor shape must be divisible by the block shape."
-              << std::endl;
-  }
-  // Equivalent check to the above
-  if (not evenly_divides(tensor_shape, block_shape)) {
-    std::cerr << "Expected the block_shape to evenly divide the tensor shape."
-              << std::endl;
+  // Copy from GMEM to RMEM and from RMEM to GMEM
+  copy(thr_tile_S, fragment);
+  copy(fragment, thr_tile_D);
 }

-  // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile
-  // shape, and modes (m', n') correspond to the number of tiles.
-  //
-  // These will be used to determine the CUDA kernel grid dimensions.
-  Tensor tiled_tensor_S =
-      tiled_divide(tensor_S, block_shape); // ((M, N), m', n')
-  Tensor tiled_tensor_D =
-      tiled_divide(tensor_D, block_shape); // ((M, N), m', n')
-
-  // Thread arrangement
-  Layout thr_layout =
-      make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});
-
-  //
-  // Determine grid and block dimensions
-  //
-
-  auto gridDim = compat::dim3(
-      size<1>(tiled_tensor_S),
-      size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
-  auto blockDim = compat::dim3(size(thr_layout));
-
-  //
-  // Launch the kernel
-  //
-  compat::launch<copy_kernel<decltype(tiled_tensor_S), decltype(tiled_tensor_D),
-                             decltype(thr_layout)>>(
-      gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout);
-}
+template <typename Element>
+void copy_direct(TransposeParams<Element> params) {
+  //
+  // Given a 2D shape, perform an efficient copy
+  //
+
+  using namespace cute;
+
+  //
+  // Make tensors
+  //
+  auto tensor_shape = make_shape(params.M, params.N);
+  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
+  auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
+  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
+  Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);
+
+  //
+  // Tile tensors
+  //
+
+  // Define a statically sized block (M, N).
+  // Note, by convention, capital letters are used to represent static modes.
+  auto block_shape = make_shape(Int<1>{}, Int<16384>{});
+
+  if ((size<0>(tensor_shape) % size<0>(block_shape)) ||
+      (size<1>(tensor_shape) % size<1>(block_shape))) {
+    std::cerr << "The tensor shape must be divisible by the block shape."
+              << std::endl;
+  }
+  // Equivalent check to the above
+  if (not evenly_divides(tensor_shape, block_shape)) {
+    std::cerr << "Expected the block_shape to evenly divide the tensor shape."
+              << std::endl;
+  }
+
+  // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static
+  // tile shape, and modes (m', n') correspond to the number of tiles.
+  //
+  // These will be used to determine the CUDA kernel grid dimensions.
+  Tensor tiled_tensor_S =
+      tiled_divide(tensor_S, block_shape); // ((M, N), m', n')
+  Tensor tiled_tensor_D =
+      tiled_divide(tensor_D, block_shape); // ((M, N), m', n')
+
+  // Thread arrangement
+  Layout thr_layout =
+      make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});
+
+  //
+  // Determine grid and block dimensions
+  //
+
+  auto gridDim = compat::dim3(
+      size<1>(tiled_tensor_S),
+      size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
+  auto blockDim = compat::dim3(size(thr_layout));
+
+  //
+  // Launch the kernel
+  //
+  compat::launch<copy_kernel<decltype(tiled_tensor_S),
+                             decltype(tiled_tensor_D), decltype(thr_layout)>>(
+      gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout);
+}

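The ((M, N), m', n') tiling described in the comments above is easy to verify numerically. A minimal host-only sketch, assuming nothing beyond the public CuTe calls already used in this file (shapes are illustrative):

#include <cute/tensor.hpp>

using namespace cute;

int main() {
  float buf[4 * 8];
  Tensor A = make_tensor(
      &buf[0], make_layout(make_shape(Int<4>{}, Int<8>{}), LayoutRight{}));

  // (4, 8) ==> ((2, 4), 2, 2): mode 0 is one (2, 4) tile, modes 1 and 2
  // count the 2x2 grid of tiles.
  Tensor tiled = tiled_divide(A, make_shape(Int<2>{}, Int<4>{}));
  print(shape(tiled)); print("\n"); // ((_2,_4),_2,_2)

  // Slice out the tile for "work group" (1, 0), as copy_kernel does.
  Tensor tile = tiled(make_coord(_, _), 1, 0); // (2, 4)

  // Partition the tile across 8 "threads"; each owns a (1, 1) fragment.
  auto thr_layout = make_layout(make_shape(Int<2>{}, Int<4>{}), LayoutRight{});
  Tensor frag = local_partition(tile, thr_layout, 3);
  print(shape(frag)); print("\n"); // (_1,_1)
  return 0;
}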
examples/cute/tutorial/transpose/copy_smem.h

Lines changed: 4 additions & 4 deletions

@@ -62,10 +62,10 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   Tensor gS = S(make_coord(_, _), compat::work_group_id::x(),
                 compat::work_group_id::y()); // (bM, bN)
   Tensor gD = D(make_coord(_, _), compat::work_group_id::x(),
-                compat::work_group_id::y()); // (bN, bM)
+                compat::work_group_id::y()); // (bM, bN)

   Tensor sS = make_tensor(make_smem_ptr(shared_storage.smem.data()),
-                          SmemLayout{}); // (bN, bM)
+                          SmemLayout{}); // (bM, bN)

   auto tiled_copy_load = make_tiled_copy(
       Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
@@ -83,7 +83,7 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   Tensor tSgS = thr_copy_load.partition_S(gS);
   Tensor tSsS = thr_copy_load.partition_D(sS);
   //
-  Tensor tDsS = thr_copy_store.partition_D(sS);
+  Tensor tDsD = thr_copy_store.partition_S(sS);
   Tensor tDgD = thr_copy_store.partition_D(gD);

   copy(tiled_copy_load, tSgS, tSsS);
@@ -92,7 +92,7 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   cp_async_wait<0>();
   syncthreads();
   //
-  copy(tiled_copy_store, tDsS, tDgD);
+  copy(tiled_copy_store, tDsD, tDgD);
 }

 template <typename Element> void copy_smem(TransposeParams<Element> params) {

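The rename above follows the convention that partition_S shapes a tensor as the source of a tiled copy and partition_D as its destination; the shared-memory tile feeds the store, so it goes through partition_S. A minimal sketch of the pairing, assuming a generic vectorizing atom and a hypothetical 4x8 thread arrangement (thread_copy and its shapes are illustrative, not part of this example):

#include <cute/tensor.hpp>

using namespace cute;

template <class TensorSrc, class TensorDst>
void thread_copy(TensorSrc gS, TensorDst gD, int tid) {
  using Element = typename TensorSrc::value_type;

  // Tiled copy: 4x8 threads, one value per thread, plain vectorizing atom.
  auto tiled_copy = make_tiled_copy(
      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
      make_layout(make_shape(Int<4>{}, Int<8>{}), LayoutRight{}),
      make_layout(make_shape(Int<1>{}, Int<1>{})));
  auto thr_copy = tiled_copy.get_thread_slice(tid);

  // Sources are partitioned with partition_S, destinations with partition_D;
  // swapping them applies the wrong mode of the copy's layout.
  Tensor tXgS = thr_copy.partition_S(gS);
  Tensor tXgD = thr_copy.partition_D(gD);
  copy(tiled_copy, tXgS, tXgD);
}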
examples/cute/tutorial/transpose/main.cpp

Lines changed: 5 additions & 0 deletions

@@ -1,3 +1,4 @@
+#include "block_2d_transposed_copy.h"
 #include "copy_direct.h"
 #include "copy_smem.h"
 #include "transpose_naive.h"
@@ -25,5 +26,9 @@ int main(int argc, char const **argv) {
   printf("\nTranspose through SMEM.:\n");
   benchmark<Element>(transpose_smem<Element>, M, N, iterations);

+  printf("Block 2d Transposed load\n");
+  benchmark<Element, true, false, false>(block_2d_transposed_copy<Element>, M,
+                                         N, iterations);
+
   return 0;
 }

examples/cute/tutorial/transpose/transpose_naive.h

Lines changed: 0 additions & 4 deletions

@@ -69,11 +69,8 @@ void transpose_naive(TransposeParams<Element> params) {
   // Make Tensors
   //
   auto tensor_shape = make_shape(params.M, params.N);
-  auto tensor_shape_trans = make_shape(params.N, params.M);
   auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
-  auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
   Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
-  Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

   // Make a transposed view of the output
   auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{});
@@ -87,7 +84,6 @@ void transpose_naive(TransposeParams<Element> params) {
   using bN = Int<512>;

   auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)
-  auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)

   Tensor tiled_tensor_S =
       tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n')

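The deletions above are safe because transpose_naive already writes through a transposed view: an (M, N) column-major layout over the output buffer addresses exactly the same memory as the (N, M) row-major transpose, so the explicit tensor_D and block_shape_trans were never used. A tiny standalone check of that equivalence (a sketch with an illustrative 2x3 size):

#include <cassert>
#include <cute/tensor.hpp>

using namespace cute;

int main() {
  int buf[2 * 3] = {0, 1, 2, 3, 4, 5};

  // (2,3) column-major view and (3,2) row-major view of the same buffer.
  Tensor DT_view = make_tensor(
      &buf[0], make_layout(make_shape(Int<2>{}, Int<3>{}), GenColMajor{}));
  Tensor D = make_tensor(
      &buf[0], make_layout(make_shape(Int<3>{}, Int<2>{}), LayoutRight{}));

  // DT_view(i, j) and D(j, i) name the same element.
  assert(&DT_view(1, 2) == &D(2, 1));
  return 0;
}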