From 8b37f942c2b9f1921e3bb85829c1dd81d45fd0f5 Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Tue, 25 Mar 2025 11:34:42 +0000 Subject: [PATCH 01/25] Basic docs for universal gemm & ck-tile gemm. --- .../impl/device_gemm_xdl_cshuffle_v3.hpp | 115 +++++++++++++++++- .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 102 ++++++++++++++++ .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 60 +++++++++ 3 files changed, 275 insertions(+), 2 deletions(-) mode change 100755 => 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index a8cf681995e..640ee19a81e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -21,6 +21,104 @@ namespace ck { namespace tensor_operation { namespace device { +/// @brief \"Universal\" GEMM operation with SplitK support. +/// +/// @par Overview +/// This GEMM operation is carrying out following mathematical equation: +/// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) +/// Where A, B are input tensors and C is the output tensor. The A/B/C_op are +/// elementwise operations that could be applied on each tensor respectively. +/// This operation is called \"universal\" since it provides multiple blockwise GEMM +/// pipelines optimized for memory-bound, latency and compute-bound cases. +/// +/// @note This Kernel implementation supports SplitK algorithm - that is it may be configured +/// to split the work of K dimension dot product accumulation into multiple work groups. 
+/// The partial products of different workgroups are then reduced using AtomicAdd +/// operation. +/// +/// @tparam ALayout A tensor data layout. +/// @tparam BLayout B tensor data layout. +/// @tparam CLayout C tensor data layout. +/// @tparam ADataType A tensor data type. +/// @tparam BDataType B tensor data type. +/// @tparam CDataType C tensor data type. +/// @tparam GemmAccDataType The so called "accumulation" data type related to hardware +/// matrix-multiplication instrucion. +/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into +/// LDS memory during \"CShuffle\" data layout optimization. +/// @tparam AElementwiseOperation An elementwise operation which could by applied on A input tensor +/// elements. +/// @tparam BElementwiseOperation An elementwise operation which could by applied on B input tensor +/// elements. +/// @tparam CElementwiseOperation An elementwise operation which could by applied on C output +/// tensor (after GEMM). +/// @tparam GemmSpec Determines used "padding" version. +/// @tparam BlockSize The number of threads within workgroup. +/// @tparam MPerBlock The input/output data tile size in M dimension. +/// @tparam NPerBlock The input/output data tile size in N dimension. +/// @tparam KPerBlock The input data tile size in K dimension. +/// @tparam AK1 The vector load size from global memory for A tensor. +/// @tparam BK1 The vector load size from global memory for B tensor. +/// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. +/// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. +/// @tparam MXdlPerWave The number of iterations in M dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. +/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input +/// data. 
You could think of it as an answer +/// to question: "How many threads to arrange +/// on each input data axis?" +/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. You could think of +/// it as an answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// to read first? And which next?" etc. +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// access - the one with contiguous memory. +/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory. +/// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input +/// data. You could think of it as an answer +/// to question: "How many threads to arrange +/// on each input data axis?" +/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. You could think of +/// it as an answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// to read first? And which next?" etc. +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// access - the one with contiguous memory. +/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. 
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. +/// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// results to process per wave per iteration of CShuffle +/// in M dimension. +/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// results to process per wave per iteration of CShuffle +/// in N dimension. +/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial +/// thread distribution used for storing data into output +/// tensor across output data layout dimensions. +/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access. +/// Used when storing data to output tensor. +/// @tparam BlkGemmPipeSched The version of blockwise-gemm pipeline scheduler (interwave or +/// intrawave). +/// @tparam BlkGemmPipelineVer The version of blockwise-gemm pipeline. +/// @tparam ComputeTypeA Data type used for A input of hardware matrix-multiplication +/// instructions. +/// @tparam ComputeTypeB Data type used for B input of hardware matrix-multiplication +/// instructions. +/// @tparam PermuteA Whether the A input tensor has gridwise-gemm friendly data layout +/// in global memory. Currently not supported! +/// @tparam PermuteB Whether the B input tensor has gridwise-gemm friendly data layout +/// in global memory (pre-shuffled). 
template 0) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp old mode 100755 new mode 100644 index 55639f4aee4..a5335adfb96 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -82,6 +82,108 @@ __global__ void #endif // end of if (defined(__gfx9__)) } +/// @brief \"Universal\" GEMM kernel with SplitK support. +/// +/// @par Overview +/// This GEMM kernel is carrying out following mathematical equation: +/// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) +/// Where A, B are input tensors and C is the output tensor. The A/B/C_op are +/// elementwise operations that could be applied on each tensor respectively. +/// This operation is called \"universal\" since it provides multiple blockwise GEMM +/// pipelines optimized for memory-bound, latency and compute-bound cases. +/// +/// @note This Kernel implementation supports SplitK algorithm - that is it may be configured +/// to split the work of K dimension dot product accumulation into multiple work groups. +/// The partial products of different workgroups are then reduced using AtomicAdd +/// operation. +/// +/// @tparam ALayout A tensor data layout. +/// @tparam BLayout B tensor data layout. +/// @tparam CLayout C tensor data layout. +/// @tparam ADataType A tensor data type. +/// @tparam BDataType B tensor data type. +/// @tparam AccDataType The so called "accumulation" data type related to hardware +/// matrix-multiplication instrucion. +/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into +/// LDS memory during \"CShuffle\" data layout optimization. +/// @tparam CDataType C tensor data type. +/// @tparam AElementwiseOperation An elementwise operation which could by applied on A input tensor +/// elements. 
+/// @tparam BElementwiseOperation An elementwise operation which could by applied on B input tensor +/// elements. +/// @tparam CElementwiseOperation An elementwise operation which could by applied on C output +/// tensor (after GEMM). +/// @tparam GemmSpec Determines used "padding" version. +/// @tparam BlockSize The number of threads within workgroup. +/// @tparam MPerBlock The input/output data tile size in M dimension. +/// @tparam NPerBlock The input/output data tile size in N dimension. +/// @tparam KPerBlock The input data tile size in K dimension. +/// @tparam AK1 The vector load size from global memory for A tensor. +/// @tparam BK1 The vector load size from global memory for B tensor. +/// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. +/// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. +/// @tparam MXdlPerWave The number of iterations in M dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. +/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input +/// data. You could think of it as an answer +/// to question: "How many threads to arrange +/// on each input data axis?" +/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. You could think of +/// it as an answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// to read first? And which next?" etc. +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// access - the one with contiguous memory. +/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. 
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory. +/// @tparam AThreadTransferSrcResetCoordinateAfterRun Decides whether we reset thread coordinate +/// (return back to the window origin) after all thread finish data copy. +/// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input +/// data. You could think of it as an answer +/// to question: "How many threads to arrange +/// on each input data axis?" +/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. You could think of +/// it as an answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// to read first? And which next?" etc. +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// access - the one with contiguous memory. +/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. +/// @tparam BThreadTransferSrcResetCoordinateAfterRun Decides whether we reset thread coordinate +/// (return back to the window origin) after all thread finish data copy. +/// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// results to process per wave per iteration of CShuffle +/// in M dimension. 
+/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// results to process per wave per iteration of CShuffle +/// in N dimension. +/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial +/// thread distribution used for storing data into output +/// tensor across output data layout dimensions. +/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access. +/// Used when storing data to output tensor. +/// @tparam BlkGemmPipeSched The version of blockwise-gemm pipeline scheduler (interwave or +/// intrawave). +/// @tparam BlkGemmPipelineVer The version of blockwise-gemm pipeline. +/// @tparam ComputeTypeA Data type used for A input of hardware matrix-multiplication +/// instructions. +/// @tparam ComputeTypeB Data type used for B input of hardware matrix-multiplication +/// instructions. +/// @tparam PermuteA Whether the A input tensor has gridwise-gemm friendly data layout +/// in global memory. Currently not supported! +/// @tparam PermuteB Whether the B input tensor has gridwise-gemm friendly data layout +/// in global memory (pre-shuffled). 
template struct GemmKernel { From a12fa13fe0056512a15dcab567a6e3c7a0c6e8c1 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:41:55 +0100 Subject: [PATCH 02/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 640ee19a81e..354ef4d3c49 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -46,9 +46,9 @@ namespace device { /// matrix-multiplication instrucion. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. -/// @tparam AElementwiseOperation An elementwise operation which could by applied on A input tensor +/// @tparam AElementwiseOperation An elementwise operation which could be applied on A input tensor /// elements. -/// @tparam BElementwiseOperation An elementwise operation which could by applied on B input tensor +/// @tparam BElementwiseOperation An elementwise operation which could be applied on B input tensor /// elements. /// @tparam CElementwiseOperation An elementwise operation which could by applied on C output /// tensor (after GEMM). 
From b985b09023704964b47336cfc73916720d5c3c2f Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:42:07 +0100 Subject: [PATCH 03/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index a5335adfb96..a34aebdb58e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -103,7 +103,7 @@ __global__ void /// @tparam ADataType A tensor data type. /// @tparam BDataType B tensor data type. /// @tparam AccDataType The so called "accumulation" data type related to hardware -/// matrix-multiplication instrucion. +/// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. /// @tparam CDataType C tensor data type. 
From 9cc34319079d44e6618fbf5c6431f32f3572f23d Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:42:18 +0100 Subject: [PATCH 04/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 354ef4d3c49..397e6e07f80 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -43,7 +43,7 @@ namespace device { /// @tparam BDataType B tensor data type. /// @tparam CDataType C tensor data type. /// @tparam GemmAccDataType The so called "accumulation" data type related to hardware -/// matrix-multiplication instrucion. +/// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. 
/// @tparam AElementwiseOperation An elementwise operation which could be applied on A input tensor From b945cd92907f5fdebfc8fc53b5bf503f3dbeaa46 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:42:30 +0100 Subject: [PATCH 05/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index a34aebdb58e..ea09fc920f6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -107,11 +107,11 @@ __global__ void /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. /// @tparam CDataType C tensor data type. -/// @tparam AElementwiseOperation An elementwise operation which could by applied on A input tensor +/// @tparam AElementwiseOperation An elementwise operation which can be applied on A input tensor /// elements. -/// @tparam BElementwiseOperation An elementwise operation which could by applied on B input tensor +/// @tparam BElementwiseOperation An elementwise operation which can be applied on B input tensor /// elements. -/// @tparam CElementwiseOperation An elementwise operation which could by applied on C output +/// @tparam CElementwiseOperation An elementwise operation which can be applied on C output /// tensor (after GEMM). /// @tparam GemmSpec Determines used "padding" version. /// @tparam BlockSize The number of threads within workgroup. 
From 010aad128ab1e594d3a1f638eb76264a14e348d9 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:42:42 +0100 Subject: [PATCH 06/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 397e6e07f80..b5f66becd2b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -90,7 +90,7 @@ namespace device { /// order to spread threads through tensor axes?". /// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension /// to read first? And which next?" etc. -/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. /// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of /// elements accessed per thread per instruction. 
From 3d6aea2c1d304071c75d8d9f1bffd2f6f02a013d Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:43:03 +0100 Subject: [PATCH 07/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index ea09fc920f6..520489ab24d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -126,7 +126,7 @@ __global__ void /// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input /// data. You could think of it as an answer -/// to question: "How many threads to arrange +/// to the question: "How many threads to arrange /// on each input data axis?" /// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over /// the input tensor dimension. 
You could think of From 5efbe866a556b4412914419b5611af370f89f402 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:43:18 +0100 Subject: [PATCH 08/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index b5f66becd2b..dc16ec18302 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -73,7 +73,7 @@ namespace device { /// order to spread threads through tensor axes?". /// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension /// to read first? And which next?" etc. -/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. /// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of /// elements accessed per thread per instruction. 
From ad1b09f6a1c6e00eb2c8c70a8e86f9023615d8f0 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:45:58 +0100 Subject: [PATCH 09/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index dc16ec18302..f7b97d0a511 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -24,7 +24,7 @@ namespace device { /// @brief \"Universal\" GEMM operation with SplitK support. /// /// @par Overview -/// This GEMM operation is carrying out following mathematical equation: +/// This GEMM operation implements the following mathematical equation: /// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) /// Where A, B are input tensors and C is the output tensor. The A/B/C_op are /// elementwise operations that could be applied on each tensor respectively. 
From c671eaecd013c2c70937943e75560a3952290947 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:46:08 +0100 Subject: [PATCH 10/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 520489ab24d..704b5630251 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -134,7 +134,7 @@ __global__ void /// order to spread threads through tensor axes?". /// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension /// to read first? And which next?" etc. -/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. /// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of /// elements accessed per thread per instruction. 
From 2534f77999814640ff0c760267205711dec1488e Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:46:21 +0100 Subject: [PATCH 11/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 704b5630251..28c50e64175 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -153,7 +153,7 @@ __global__ void /// order to spread threads through tensor axes?". /// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension /// to read first? And which next?" etc. -/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. /// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of /// elements accessed per thread per instruction. 
From e8a237cc096c26bccb32f24879ceb45bef4ddd70 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:47:04 +0100 Subject: [PATCH 12/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 28c50e64175..726e111534c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -162,10 +162,10 @@ __global__ void /// (return back to the window origin) after all thread finish data copy. /// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With /// universal GEMM there's no need for padding. -/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication instructions /// results to process per wave per iteration of CShuffle /// in M dimension. -/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication instructions /// results to process per wave per iteration of CShuffle /// in N dimension. 
/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial From 5b4d44317ce0cbf4ae9c821bf89d9199970329b1 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:47:13 +0100 Subject: [PATCH 13/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index f7b97d0a511..71e3a2dd485 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -97,10 +97,10 @@ namespace device { /// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. /// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With /// universal GEMM there's no need for padding. -/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication instructions /// results to process per wave per iteration of CShuffle /// in M dimension. -/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication instructions /// results to process per wave per iteration of CShuffle /// in N dimension. 
/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial From 5832414d17ae4674d79ff4a54ee52223a7ce896e Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:06:38 +0100 Subject: [PATCH 14/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 71e3a2dd485..000764ae0fe 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -31,7 +31,7 @@ namespace device { /// This operation is called \"universal\" since it provides multiple blockwise GEMM /// pipelines optimized for memory-bound, latency and compute-bound cases. /// -/// @note This Kernel implementation supports SplitK algorithm - that is it may be configured +/// @note This Kernel implementation supports SplitK algorithm. It can be configured /// to split the work of K dimension dot product accumulation into multiple work groups. /// The partial products of different workgroups are then reduced using AtomicAdd /// operation. 
From da89b401c2ed6a527f5fcb2126a2ced5c2a03976 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:07:39 +0100 Subject: [PATCH 15/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 000764ae0fe..85189882f2e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -32,7 +32,7 @@ namespace device { /// pipelines optimized for memory-bound, latency and compute-bound cases. /// /// @note This Kernel implementation supports SplitK algorithm. It can be configured -/// to split the work of K dimension dot product accumulation into multiple work groups. +/// to split the work of K dimension dot product accumulation into multiple workgroups. /// The partial products of different workgroups are then reduced using AtomicAdd /// operation. 
/// From bb951194573b53e37fa189f3e7fba5b3197a256e Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:20:42 +0100 Subject: [PATCH 16/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 85189882f2e..fc2cea579d1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -33,7 +33,7 @@ namespace device { /// /// @note This Kernel implementation supports SplitK algorithm. It can be configured /// to split the work of K dimension dot product accumulation into multiple workgroups. -/// The partial products of different workgroups are then reduced using AtomicAdd +/// The partial products of different workgroups are then reduced using the AtomicAdd /// operation. /// /// @tparam ALayout A tensor data layout. 
From 782cba4d2f49462c5709140f5f5b6b6582ce3595 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:22:28 +0100 Subject: [PATCH 17/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index fc2cea579d1..123caebf630 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -54,7 +54,7 @@ namespace device { /// tensor (after GEMM). /// @tparam GemmSpec Determines used "padding" version. /// @tparam BlockSize The number of threads within workgroup. -/// @tparam MPerBlock The input/output data tile size in M dimension. +/// @tparam MPerBlock The input/output data tile size in the M dimension. /// @tparam NPerBlock The input/output data tile size in N dimension. /// @tparam KPerBlock The input data tile size in K dimension. /// @tparam AK1 The vector load size from global memory for A tensor. 
From b57ebf6753b152cdd1c81529dad8b00ff43a9c28 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:22:37 +0100 Subject: [PATCH 18/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 123caebf630..cccad5c4966 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -61,7 +61,7 @@ namespace device { /// @tparam BK1 The vector load size from global memory for B tensor. /// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. /// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. -/// @tparam MXdlPerWave The number of iterations in M dimension over output tile per wavefront. +/// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. /// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input /// data. 
You could think of it as an answer From 5eef1e9ab9fded6ba524381e223a6d4c10b089ff Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:22:47 +0100 Subject: [PATCH 19/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index cccad5c4966..46b17bba9de 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -56,7 +56,7 @@ namespace device { /// @tparam BlockSize The number of threads within workgroup. /// @tparam MPerBlock The input/output data tile size in the M dimension. /// @tparam NPerBlock The input/output data tile size in N dimension. -/// @tparam KPerBlock The input data tile size in K dimension. +/// @tparam KPerBlock The input data tile size in the K dimension. /// @tparam AK1 The vector load size from global memory for A tensor. /// @tparam BK1 The vector load size from global memory for B tensor. /// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. 
From 1b66dc30784afa72dea8ca576dc8ab10914dc070 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:23:10 +0100 Subject: [PATCH 20/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 46b17bba9de..97a3702c951 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -64,7 +64,7 @@ namespace device { /// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. /// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input -/// data. You could think of it as an answer +/// data. Can be interpreted as the answer to the question, /// to question: "How many threads to arrange /// on each input data axis?" 
/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over From eca5168fb8c1cf7e3c0016688da5bb3ae886fcb3 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:23:26 +0100 Subject: [PATCH 21/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 97a3702c951..cf50e5c44b5 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -65,7 +65,7 @@ namespace device { /// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input /// data. Can be interpreted as the answer to the question, -/// to question: "How many threads to arrange +/// to the question, "How many threads can be arranged /// on each input data axis?" /// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over /// the input tensor dimension. 
You could think of From faff8003ce8bca8b80d67ae5c7eb185865fed5de Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:26:56 +0100 Subject: [PATCH 22/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index cf50e5c44b5..a266b99c2fb 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -27,7 +27,7 @@ namespace device { /// This GEMM operation implements the following mathematical equation: /// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) /// Where A, B are input tensors and C is the output tensor. The A/B/C_op are -/// elementwise operations that could be applied on each tensor respectively. +/// elementwise operations applied to the A, B, and C tensors, respectively. /// This operation is called \"universal\" since it provides multiple blockwise GEMM /// pipelines optimized for memory-bound, latency and compute-bound cases. /// From 66821a26fe6e4eabe587448336499b1fc58c938b Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Wed, 26 Mar 2025 12:41:23 +0000 Subject: [PATCH 23/25] Reviewers suggestions. 
--- .../impl/device_gemm_xdl_cshuffle_v3.hpp | 42 +++++++------- .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 56 +++++++++---------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index a266b99c2fb..136302a7058 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -42,36 +42,35 @@ namespace device { /// @tparam ADataType A tensor data type. /// @tparam BDataType B tensor data type. /// @tparam CDataType C tensor data type. -/// @tparam GemmAccDataType The so called "accumulation" data type related to hardware +/// @tparam GemmAccDataType The accumulation data type related to the hardware /// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. -/// @tparam AElementwiseOperation An elementwise operation which could be applied on A input tensor -/// elements. -/// @tparam BElementwiseOperation An elementwise operation which could be applied on B input tensor -/// elements. -/// @tparam CElementwiseOperation An elementwise operation which could by applied on C output -/// tensor (after GEMM). +/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements. +/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements. +/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor +/// (after GEMM). /// @tparam GemmSpec Determines used "padding" version. /// @tparam BlockSize The number of threads within workgroup. /// @tparam MPerBlock The input/output data tile size in the M dimension. -/// @tparam NPerBlock The input/output data tile size in N dimension. 
+/// @tparam NPerBlock The input/output data tile size in the N dimension. /// @tparam KPerBlock The input data tile size in the K dimension. /// @tparam AK1 The vector load size from global memory for A tensor. /// @tparam BK1 The vector load size from global memory for B tensor. /// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. /// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. /// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. -/// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input -/// data. Can be interpreted as the answer to the question, -/// to the question, "How many threads can be arranged -/// on each input data axis?" +/// data. Can be interpreted as the answer +/// to the question, "How many threads can be +/// arranged on each input data axis?" /// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over -/// the input tensor dimension. You could think of -/// it as an answer to the question: "In which +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which /// order to spread threads through tensor axes?". -/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension /// to read first? And which next?" etc. /// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. @@ -81,14 +80,15 @@ namespace device { /// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. 
With /// universal GEMM there's no need for padding. /// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input -/// data. You could think of it as an answer -/// to question: "How many threads to arrange -/// on each input data axis?" +/// data. Can be interpreted as the answer +/// to the question: "How many threads to +/// arrange on each input data axis?" /// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over -/// the input tensor dimension. You could think of -/// it as an answer to the question: "In which +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which /// order to spread threads through tensor axes?". -/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension /// to read first? And which next?" etc. /// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 726e111534c..bb743eeea42 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -92,9 +92,9 @@ __global__ void /// This operation is called \"universal\" since it provides multiple blockwise GEMM /// pipelines optimized for memory-bound, latency and compute-bound cases. /// -/// @note This Kernel implementation supports SplitK algorithm - that is it may be configured -/// to split the work of K dimension dot product accumulation into multiple work groups. 
-/// The partial products of different workgroups are then reduced using AtomicAdd +/// @note This Kernel implementation supports SplitK algorithm. It can be configured +/// to split the work of K dimension dot product accumulation into multiple workgroups. +/// The partial products of different workgroups are then reduced using the AtomicAdd /// operation. /// /// @tparam ALayout A tensor data layout. @@ -102,37 +102,36 @@ __global__ void /// @tparam CLayout C tensor data layout. /// @tparam ADataType A tensor data type. /// @tparam BDataType B tensor data type. -/// @tparam AccDataType The so called "accumulation" data type related to hardware -/// matrix-multiplication instruction. +/// @tparam GemmAccDataType The accumulation data type related to the hardware +/// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. /// @tparam CDataType C tensor data type. -/// @tparam AElementwiseOperation An elementwise operation which can be applied on A input tensor -/// elements. -/// @tparam BElementwiseOperation An elementwise operation which can be applied on B input tensor -/// elements. -/// @tparam CElementwiseOperation An elementwise operation which can be applied on C output -/// tensor (after GEMM). +/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements. +/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements. +/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor +/// (after GEMM). /// @tparam GemmSpec Determines used "padding" version. /// @tparam BlockSize The number of threads within workgroup. -/// @tparam MPerBlock The input/output data tile size in M dimension. -/// @tparam NPerBlock The input/output data tile size in N dimension. -/// @tparam KPerBlock The input data tile size in K dimension. 
+/// @tparam MPerBlock The input/output data tile size in the M dimension. +/// @tparam NPerBlock The input/output data tile size in the N dimension. +/// @tparam KPerBlock The input data tile size in the K dimension. /// @tparam AK1 The vector load size from global memory for A tensor. /// @tparam BK1 The vector load size from global memory for B tensor. /// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. /// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. -/// @tparam MXdlPerWave The number of iterations in M dimension over output tile per wavefront. -/// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. +/// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input -/// data. You could think of it as an answer -/// to the question: "How many threads to arrange -/// on each input data axis?" +/// data. Can be interpreted as the answer +/// to the question, "How many threads can be +/// arranged on each input data axis?" /// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over -/// the input tensor dimension. You could think of -/// it as an answer to the question: "In which +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which /// order to spread threads through tensor axes?". -/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension /// to read first? And which next?" etc. 
/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. @@ -144,14 +143,15 @@ __global__ void /// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With /// universal GEMM there's no need for padding. /// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input -/// data. You could think of it as an answer -/// to question: "How many threads to arrange -/// on each input data axis?" +/// data. Can be interpreted as the answer +/// to the question: "How many threads to +/// arrange on each input data axis?" /// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over -/// the input tensor dimension. You could think of -/// it as an answer to the question: "In which +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which /// order to spread threads through tensor axes?". -/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension /// to read first? And which next?" etc. /// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. From cbff8c2690cc29273f4a87214b1dff48a8264fda Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Wed, 26 Mar 2025 13:34:08 +0000 Subject: [PATCH 24/25] Align tparam names in doc with class tparams. 
--- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index bb743eeea42..a3a87d26e52 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -102,7 +102,7 @@ __global__ void /// @tparam CLayout C tensor data layout. /// @tparam ADataType A tensor data type. /// @tparam BDataType B tensor data type. -/// @tparam GemmAccDataType The accumulation data type related to the hardware +/// @tparam AccDataType The accumulation data type related to the hardware /// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. @@ -116,10 +116,10 @@ __global__ void /// @tparam MPerBlock The input/output data tile size in the M dimension. /// @tparam NPerBlock The input/output data tile size in the N dimension. /// @tparam KPerBlock The input data tile size in the K dimension. -/// @tparam AK1 The vector load size from global memory for A tensor. -/// @tparam BK1 The vector load size from global memory for B tensor. -/// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. -/// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. +/// @tparam AK1Value The vector load size from global memory for A tensor. +/// @tparam BK1Value The vector load size from global memory for B tensor. +/// @tparam MPerXdl M size of matrix-fused-multiply-add instruction. +/// @tparam NPerXdl N size of matrix-fused-multiply-add instruction. /// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. 
/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input From dc01b651a85189305ab319a50bdeba3c59cbd3de Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Fri, 28 Mar 2025 09:53:47 +0000 Subject: [PATCH 25/25] More reviewers fine tuning ;) --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 7 ++++--- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 136302a7058..51c223efd2e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -28,11 +28,12 @@ namespace device { /// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) /// Where A, B are input tensors and C is the output tensor. The A/B/C_op are /// elementwise operations applied to the A, B, and C tensors, respectively. -/// This operation is called \"universal\" since it provides multiple blockwise GEMM -/// pipelines optimized for memory-bound, latency and compute-bound cases. +/// The \"universal\" gemm comes with multiple pipelines optimized for different usage +/// scenarios. That's why it's called \"universal\". It's universal through it's design +/// and versatilty. /// /// @note This Kernel implementation supports SplitK algorithm. It can be configured -/// to split the work of K dimension dot product accumulation into multiple workgroups. +/// to split the dot product accumulated over the K dimension into multiple working groups. /// The partial products of different workgroups are then reduced using the AtomicAdd /// operation. 
/// diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index a3a87d26e52..9f6d85dd781 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -89,11 +89,12 @@ __global__ void /// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) /// Where A, B are input tensors and C is the output tensor. The A/B/C_op are /// elementwise operations that could be applied on each tensor respectively. -/// This operation is called \"universal\" since it provides multiple blockwise GEMM -/// pipelines optimized for memory-bound, latency and compute-bound cases. +/// The \"universal\" gemm comes with multiple pipelines optimized for different usage +/// scenarios. That's why it's called \"universal\". It's universal through its design +/// and versatility. /// /// @note This Kernel implementation supports SplitK algorithm. It can be configured -/// to split the work of K dimension dot product accumulation into multiple workgroups. +/// to split the dot product accumulated over the K dimension into multiple workgroups. /// The partial products of different workgroups are then reduced using the AtomicAdd /// operation. ///