From 8b37f942c2b9f1921e3bb85829c1dd81d45fd0f5 Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Tue, 25 Mar 2025 11:34:42 +0000 Subject: [PATCH 01/25] Basic docs for universal gemm & ck-tile gemm. --- .../impl/device_gemm_xdl_cshuffle_v3.hpp | 115 +++++++++++++++++- .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 102 ++++++++++++++++ .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 60 +++++++++ 3 files changed, 275 insertions(+), 2 deletions(-) mode change 100755 => 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index a8cf681995e..640ee19a81e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -21,6 +21,104 @@ namespace ck { namespace tensor_operation { namespace device { +/// @brief \"Universal\" GEMM operation with SplitK support. +/// +/// @par Overview +/// This GEMM operation is carrying out following mathematical equation: +/// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) +/// Where A, B are input tensors and C is the output tensor. The A/B/C_op are +/// elementwise operations that could be applied on each tensor respectively. +/// This operation is called \"universal\" since it provides multiple blockwise GEMM +/// pipelines optimized for memory-bound, latency and compute-bound cases. +/// +/// @note This Kernel implementation supports SplitK algorithm - that is it may be configured +/// to split the work of K dimension dot product accumulation into multiple work groups. 
+/// The partial products of different workgroups are then reduced using AtomicAdd +/// operation. +/// +/// @tparam ALayout A tensor data layout. +/// @tparam BLayout B tensor data layout. +/// @tparam CLayout C tensor data layout. +/// @tparam ADataType A tensor data type. +/// @tparam BDataType B tensor data type. +/// @tparam CDataType C tensor data type. +/// @tparam GemmAccDataType The so called "accumulation" data type related to hardware +/// matrix-multiplication instrucion. +/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into +/// LDS memory during \"CShuffle\" data layout optimization. +/// @tparam AElementwiseOperation An elementwise operation which could by applied on A input tensor +/// elements. +/// @tparam BElementwiseOperation An elementwise operation which could by applied on B input tensor +/// elements. +/// @tparam CElementwiseOperation An elementwise operation which could by applied on C output +/// tensor (after GEMM). +/// @tparam GemmSpec Determines used "padding" version. +/// @tparam BlockSize The number of threads within workgroup. +/// @tparam MPerBlock The input/output data tile size in M dimension. +/// @tparam NPerBlock The input/output data tile size in N dimension. +/// @tparam KPerBlock The input data tile size in K dimension. +/// @tparam AK1 The vector load size from global memory for A tensor. +/// @tparam BK1 The vector load size from global memory for B tensor. +/// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. +/// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. +/// @tparam MXdlPerWave The number of iterations in M dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. +/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input +/// data. 
You could think of it as an answer +/// to question: "How many threads to arrange +/// on each input data axis?" +/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. You could think of +/// it as an answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// to read first? And which next?" etc. +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// access - the one with contiguous memory. +/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory. +/// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input +/// data. You could think of it as an answer +/// to question: "How many threads to arrange +/// on each input data axis?" +/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. You could think of +/// it as an answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// to read first? And which next?" etc. +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// access - the one with contiguous memory. +/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. 
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. +/// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// results to process per wave per iteration of CShuffle +/// in M dimension. +/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// results to process per wave per iteration of CShuffle +/// in N dimension. +/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial +/// thread distribution used for storing data into output +/// tensor across output data layout dimensions. +/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access. +/// Used when storing data to output tensor. +/// @tparam BlkGemmPipeSched The version of blockwise-gemm pipeline scheduler (interwave or +/// intrawave). +/// @tparam BlkGemmPipelineVer The version of blockwise-gemm pipeline. +/// @tparam ComputeTypeA Data type used for A input of hardware matrix-multiplication +/// instructions. +/// @tparam ComputeTypeB Data type used for B input of hardware matrix-multiplication +/// instructions. +/// @tparam PermuteA Whether the A input tensor has gridwise-gemm friendly data layout +/// in global memory. Currently not supported! +/// @tparam PermuteB Whether the B input tensor has gridwise-gemm friendly data layout +/// in global memory (pre-shuffled). 
template 0) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp old mode 100755 new mode 100644 index 55639f4aee4..a5335adfb96 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -82,6 +82,108 @@ __global__ void #endif // end of if (defined(__gfx9__)) } +/// @brief \"Universal\" GEMM kernel with SplitK support. +/// +/// @par Overview +/// This GEMM kernel is carrying out following mathematical equation: +/// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) +/// Where A, B are input tensors and C is the output tensor. The A/B/C_op are +/// elementwise operations that could be applied on each tensor respectively. +/// This operation is called \"universal\" since it provides multiple blockwise GEMM +/// pipelines optimized for memory-bound, latency and compute-bound cases. +/// +/// @note This Kernel implementation supports SplitK algorithm - that is it may be configured +/// to split the work of K dimension dot product accumulation into multiple work groups. +/// The partial products of different workgroups are then reduced using AtomicAdd +/// operation. +/// +/// @tparam ALayout A tensor data layout. +/// @tparam BLayout B tensor data layout. +/// @tparam CLayout C tensor data layout. +/// @tparam ADataType A tensor data type. +/// @tparam BDataType B tensor data type. +/// @tparam AccDataType The so called "accumulation" data type related to hardware +/// matrix-multiplication instrucion. +/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into +/// LDS memory during \"CShuffle\" data layout optimization. +/// @tparam CDataType C tensor data type. +/// @tparam AElementwiseOperation An elementwise operation which could by applied on A input tensor +/// elements. 
+/// @tparam BElementwiseOperation An elementwise operation which could by applied on B input tensor +/// elements. +/// @tparam CElementwiseOperation An elementwise operation which could by applied on C output +/// tensor (after GEMM). +/// @tparam GemmSpec Determines used "padding" version. +/// @tparam BlockSize The number of threads within workgroup. +/// @tparam MPerBlock The input/output data tile size in M dimension. +/// @tparam NPerBlock The input/output data tile size in N dimension. +/// @tparam KPerBlock The input data tile size in K dimension. +/// @tparam AK1 The vector load size from global memory for A tensor. +/// @tparam BK1 The vector load size from global memory for B tensor. +/// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. +/// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. +/// @tparam MXdlPerWave The number of iterations in M dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. +/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input +/// data. You could think of it as an answer +/// to question: "How many threads to arrange +/// on each input data axis?" +/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. You could think of +/// it as an answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// to read first? And which next?" etc. +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// access - the one with contiguous memory. +/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. 
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory. +/// @tparam AThreadTransferSrcResetCoordinateAfterRun Decides whether we reset thread coordinate +/// (return back to the window origin) after all thread finish data copy. +/// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input +/// data. You could think of it as an answer +/// to question: "How many threads to arrange +/// on each input data axis?" +/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. You could think of +/// it as an answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// to read first? And which next?" etc. +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// access - the one with contiguous memory. +/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. +/// @tparam BThreadTransferSrcResetCoordinateAfterRun Decides whether we reset thread coordinate +/// (return back to the window origin) after all thread finish data copy. +/// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// results to process per wave per iteration of CShuffle +/// in M dimension. 
+/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// results to process per wave per iteration of CShuffle +/// in N dimension. +/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial +/// thread distribution used for storing data into output +/// tensor across output data layout dimensions. +/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access. +/// Used when storing data to output tensor. +/// @tparam BlkGemmPipeSched The version of blockwise-gemm pipeline scheduler (interwave or +/// intrawave). +/// @tparam BlkGemmPipelineVer The version of blockwise-gemm pipeline. +/// @tparam ComputeTypeA Data type used for A input of hardware matrix-multiplication +/// instructions. +/// @tparam ComputeTypeB Data type used for B input of hardware matrix-multiplication +/// instructions. +/// @tparam PermuteA Whether the A input tensor has gridwise-gemm friendly data layout +/// in global memory. Currently not supported! +/// @tparam PermuteB Whether the B input tensor has gridwise-gemm friendly data layout +/// in global memory (pre-shuffled). 
template struct GemmKernel { From a12fa13fe0056512a15dcab567a6e3c7a0c6e8c1 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:41:55 +0100 Subject: [PATCH 02/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 640ee19a81e..354ef4d3c49 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -46,9 +46,9 @@ namespace device { /// matrix-multiplication instrucion. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. -/// @tparam AElementwiseOperation An elementwise operation which could by applied on A input tensor +/// @tparam AElementwiseOperation An elementwise operation which could be applied on A input tensor /// elements. -/// @tparam BElementwiseOperation An elementwise operation which could by applied on B input tensor +/// @tparam BElementwiseOperation An elementwise operation which could be applied on B input tensor /// elements. /// @tparam CElementwiseOperation An elementwise operation which could by applied on C output /// tensor (after GEMM). 
From b985b09023704964b47336cfc73916720d5c3c2f Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:42:07 +0100 Subject: [PATCH 03/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index a5335adfb96..a34aebdb58e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -103,7 +103,7 @@ __global__ void /// @tparam ADataType A tensor data type. /// @tparam BDataType B tensor data type. /// @tparam AccDataType The so called "accumulation" data type related to hardware -/// matrix-multiplication instrucion. +/// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. /// @tparam CDataType C tensor data type. 
From 9cc34319079d44e6618fbf5c6431f32f3572f23d Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:42:18 +0100 Subject: [PATCH 04/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 354ef4d3c49..397e6e07f80 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -43,7 +43,7 @@ namespace device { /// @tparam BDataType B tensor data type. /// @tparam CDataType C tensor data type. /// @tparam GemmAccDataType The so called "accumulation" data type related to hardware -/// matrix-multiplication instrucion. +/// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. 
/// @tparam AElementwiseOperation An elementwise operation which could be applied on A input tensor From b945cd92907f5fdebfc8fc53b5bf503f3dbeaa46 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:42:30 +0100 Subject: [PATCH 05/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index a34aebdb58e..ea09fc920f6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -107,11 +107,11 @@ __global__ void /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. /// @tparam CDataType C tensor data type. -/// @tparam AElementwiseOperation An elementwise operation which could by applied on A input tensor +/// @tparam AElementwiseOperation An elementwise operation which can be applied on A input tensor /// elements. -/// @tparam BElementwiseOperation An elementwise operation which could by applied on B input tensor +/// @tparam BElementwiseOperation An elementwise operation which can be applied on B input tensor /// elements. -/// @tparam CElementwiseOperation An elementwise operation which could by applied on C output +/// @tparam CElementwiseOperation An elementwise operation which can be applied on C output /// tensor (after GEMM). /// @tparam GemmSpec Determines used "padding" version. /// @tparam BlockSize The number of threads within workgroup. 
From 010aad128ab1e594d3a1f638eb76264a14e348d9 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:42:42 +0100 Subject: [PATCH 06/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 397e6e07f80..b5f66becd2b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -90,7 +90,7 @@ namespace device { /// order to spread threads through tensor axes?". /// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension /// to read first? And which next?" etc. -/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. /// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of /// elements accessed per thread per instruction. 
From 3d6aea2c1d304071c75d8d9f1bffd2f6f02a013d Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:43:03 +0100 Subject: [PATCH 07/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index ea09fc920f6..520489ab24d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -126,7 +126,7 @@ __global__ void /// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input /// data. You could think of it as an answer -/// to question: "How many threads to arrange +/// to the question: "How many threads to arrange /// on each input data axis?" /// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over /// the input tensor dimension. 
You could think of From 5efbe866a556b4412914419b5611af370f89f402 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:43:18 +0100 Subject: [PATCH 08/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index b5f66becd2b..dc16ec18302 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -73,7 +73,7 @@ namespace device { /// order to spread threads through tensor axes?". /// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension /// to read first? And which next?" etc. -/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. /// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of /// elements accessed per thread per instruction. 
From ad1b09f6a1c6e00eb2c8c70a8e86f9023615d8f0 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:45:58 +0100 Subject: [PATCH 09/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index dc16ec18302..f7b97d0a511 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -24,7 +24,7 @@ namespace device { /// @brief \"Universal\" GEMM operation with SplitK support. /// /// @par Overview -/// This GEMM operation is carrying out following mathematical equation: +/// This GEMM operation implements the following mathematical equation: /// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) /// Where A, B are input tensors and C is the output tensor. The A/B/C_op are /// elementwise operations that could be applied on each tensor respectively. 
From c671eaecd013c2c70937943e75560a3952290947 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:46:08 +0100 Subject: [PATCH 10/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 520489ab24d..704b5630251 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -134,7 +134,7 @@ __global__ void /// order to spread threads through tensor axes?". /// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension /// to read first? And which next?" etc. -/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. /// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of /// elements accessed per thread per instruction. 
From 2534f77999814640ff0c760267205711dec1488e Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:46:21 +0100 Subject: [PATCH 11/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 704b5630251..28c50e64175 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -153,7 +153,7 @@ __global__ void /// order to spread threads through tensor axes?". /// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension /// to read first? And which next?" etc. -/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do verctorized memory +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. /// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of /// elements accessed per thread per instruction. 
From e8a237cc096c26bccb32f24879ceb45bef4ddd70 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:47:04 +0100 Subject: [PATCH 12/25] Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 28c50e64175..726e111534c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -162,10 +162,10 @@ __global__ void /// (return back to the window origin) after all thread finish data copy. /// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With /// universal GEMM there's no need for padding. -/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication instructions /// results to process per wave per iteration of CShuffle /// in M dimension. -/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication instructions /// results to process per wave per iteration of CShuffle /// in N dimension. 
/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial From 5b4d44317ce0cbf4ae9c821bf89d9199970329b1 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 09:47:13 +0100 Subject: [PATCH 13/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index f7b97d0a511..71e3a2dd485 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -97,10 +97,10 @@ namespace device { /// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. /// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With /// universal GEMM there's no need for padding. -/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication instructions /// results to process per wave per iteration of CShuffle /// in M dimension. -/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication's instructions +/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication instructions /// results to process per wave per iteration of CShuffle /// in N dimension. 
/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial From 5832414d17ae4674d79ff4a54ee52223a7ce896e Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:06:38 +0100 Subject: [PATCH 14/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 71e3a2dd485..000764ae0fe 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -31,7 +31,7 @@ namespace device { /// This operation is called \"universal\" since it provides multiple blockwise GEMM /// pipelines optimized for memory-bound, latency and compute-bound cases. /// -/// @note This Kernel implementation supports SplitK algorithm - that is it may be configured +/// @note This Kernel implementation supports SplitK algorithm. It can be configured /// to split the work of K dimension dot product accumulation into multiple work groups. /// The partial products of different workgroups are then reduced using AtomicAdd /// operation. 
From da89b401c2ed6a527f5fcb2126a2ced5c2a03976 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:07:39 +0100 Subject: [PATCH 15/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartłomiej Kocot --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 000764ae0fe..85189882f2e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -32,7 +32,7 @@ namespace device { /// pipelines optimized for memory-bound, latency and compute-bound cases. /// /// @note This Kernel implementation supports SplitK algorithm. It can be configured -/// to split the work of K dimension dot product accumulation into multiple work groups. +/// to split the work of K dimension dot product accumulation into multiple workgroups. /// The partial products of different workgroups are then reduced using AtomicAdd /// operation. 
/// From bb951194573b53e37fa189f3e7fba5b3197a256e Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:20:42 +0100 Subject: [PATCH 16/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 85189882f2e..fc2cea579d1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -33,7 +33,7 @@ namespace device { /// /// @note This Kernel implementation supports SplitK algorithm. It can be configured /// to split the work of K dimension dot product accumulation into multiple workgroups. -/// The partial products of different workgroups are then reduced using AtomicAdd +/// The partial products of different workgroups are then reduced using the AtomicAdd /// operation. /// /// @tparam ALayout A tensor data layout. 
From 782cba4d2f49462c5709140f5f5b6b6582ce3595 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:22:28 +0100 Subject: [PATCH 17/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index fc2cea579d1..123caebf630 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -54,7 +54,7 @@ namespace device { /// tensor (after GEMM). /// @tparam GemmSpec Determines used "padding" version. /// @tparam BlockSize The number of threads within workgroup. -/// @tparam MPerBlock The input/output data tile size in M dimension. +/// @tparam MPerBlock The input/output data tile size in the M dimension. /// @tparam NPerBlock The input/output data tile size in N dimension. /// @tparam KPerBlock The input data tile size in K dimension. /// @tparam AK1 The vector load size from global memory for A tensor. 
From b57ebf6753b152cdd1c81529dad8b00ff43a9c28 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:22:37 +0100 Subject: [PATCH 18/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 123caebf630..cccad5c4966 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -61,7 +61,7 @@ namespace device { /// @tparam BK1 The vector load size from global memory for B tensor. /// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. /// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. -/// @tparam MXdlPerWave The number of iterations in M dimension over output tile per wavefront. +/// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. /// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input /// data. 
You could think of it as an answer From 5eef1e9ab9fded6ba524381e223a6d4c10b089ff Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:22:47 +0100 Subject: [PATCH 19/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index cccad5c4966..46b17bba9de 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -56,7 +56,7 @@ namespace device { /// @tparam BlockSize The number of threads within workgroup. /// @tparam MPerBlock The input/output data tile size in the M dimension. /// @tparam NPerBlock The input/output data tile size in N dimension. -/// @tparam KPerBlock The input data tile size in K dimension. +/// @tparam KPerBlock The input data tile size in the K dimension. /// @tparam AK1 The vector load size from global memory for A tensor. /// @tparam BK1 The vector load size from global memory for B tensor. /// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. 
From 1b66dc30784afa72dea8ca576dc8ab10914dc070 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:23:10 +0100 Subject: [PATCH 20/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 46b17bba9de..97a3702c951 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -64,7 +64,7 @@ namespace device { /// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. /// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input -/// data. You could think of it as an answer +/// data. Can be interpreted as the answer to the question, /// to question: "How many threads to arrange /// on each input data axis?" 
/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over From eca5168fb8c1cf7e3c0016688da5bb3ae886fcb3 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:23:26 +0100 Subject: [PATCH 21/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 97a3702c951..cf50e5c44b5 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -65,7 +65,7 @@ namespace device { /// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input /// data. Can be interpreted as the answer to the question, -/// to question: "How many threads to arrange +/// to the question, "How many threads can be arranged /// on each input data axis?" /// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over /// the input tensor dimension. 
You could think of From faff8003ce8bca8b80d67ae5c7eb185865fed5de Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:26:56 +0100 Subject: [PATCH 22/25] Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index cf50e5c44b5..a266b99c2fb 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -27,7 +27,7 @@ namespace device { /// This GEMM operation implements the following mathematical equation: /// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) /// Where A, B are input tensors and C is the output tensor. The A/B/C_op are -/// elementwise operations that could be applied on each tensor respectively. +/// elementwise operations applied to the A, B, and C tensors, respectively. /// This operation is called \"universal\" since it provides multiple blockwise GEMM /// pipelines optimized for memory-bound, latency and compute-bound cases. /// From 66821a26fe6e4eabe587448336499b1fc58c938b Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Wed, 26 Mar 2025 12:41:23 +0000 Subject: [PATCH 23/25] Reviewers suggestions. 
--- .../impl/device_gemm_xdl_cshuffle_v3.hpp | 42 +++++++------- .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 56 +++++++++---------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index a266b99c2fb..136302a7058 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -42,36 +42,35 @@ namespace device { /// @tparam ADataType A tensor data type. /// @tparam BDataType B tensor data type. /// @tparam CDataType C tensor data type. -/// @tparam GemmAccDataType The so called "accumulation" data type related to hardware +/// @tparam GemmAccDataType The accumulation data type related to the hardware /// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. -/// @tparam AElementwiseOperation An elementwise operation which could be applied on A input tensor -/// elements. -/// @tparam BElementwiseOperation An elementwise operation which could be applied on B input tensor -/// elements. -/// @tparam CElementwiseOperation An elementwise operation which could by applied on C output -/// tensor (after GEMM). +/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements. +/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements. +/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor +/// (after GEMM). /// @tparam GemmSpec Determines used "padding" version. /// @tparam BlockSize The number of threads within workgroup. /// @tparam MPerBlock The input/output data tile size in the M dimension. -/// @tparam NPerBlock The input/output data tile size in N dimension. 
+/// @tparam NPerBlock The input/output data tile size in the N dimension. /// @tparam KPerBlock The input data tile size in the K dimension. /// @tparam AK1 The vector load size from global memory for A tensor. /// @tparam BK1 The vector load size from global memory for B tensor. /// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. /// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. /// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. -/// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input -/// data. Can be interpreted as the answer to the question, -/// to the question, "How many threads can be arranged -/// on each input data axis?" +/// data. Can be interpreted as the answer +/// to the question, "How many threads can be +/// arranged on each input data axis?" /// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over -/// the input tensor dimension. You could think of -/// it as an answer to the question: "In which +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which /// order to spread threads through tensor axes?". -/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension /// to read first? And which next?" etc. /// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. @@ -81,14 +80,15 @@ namespace device { /// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. 
With /// universal GEMM there's no need for padding. /// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input -/// data. You could think of it as an answer -/// to question: "How many threads to arrange -/// on each input data axis?" +/// data. Can be interpreted as the answer +/// to the question: "How many threads to +/// arrange on each input data axis?" /// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over -/// the input tensor dimension. You could think of -/// it as an answer to the question: "In which +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which /// order to spread threads through tensor axes?". -/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension /// to read first? And which next?" etc. /// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 726e111534c..bb743eeea42 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -92,9 +92,9 @@ __global__ void /// This operation is called \"universal\" since it provides multiple blockwise GEMM /// pipelines optimized for memory-bound, latency and compute-bound cases. /// -/// @note This Kernel implementation supports SplitK algorithm - that is it may be configured -/// to split the work of K dimension dot product accumulation into multiple work groups. 
-/// The partial products of different workgroups are then reduced using AtomicAdd +/// @note This Kernel implementation supports SplitK algorithm. It can be configured +/// to split the work of K dimension dot product accumulation into multiple workgroups. +/// The partial products of different workgroups are then reduced using the AtomicAdd /// operation. /// /// @tparam ALayout A tensor data layout. @@ -102,37 +102,36 @@ __global__ void /// @tparam CLayout C tensor data layout. /// @tparam ADataType A tensor data type. /// @tparam BDataType B tensor data type. -/// @tparam AccDataType The so called "accumulation" data type related to hardware -/// matrix-multiplication instruction. +/// @tparam GemmAccDataType The accumulation data type related to the hardware +/// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. /// @tparam CDataType C tensor data type. -/// @tparam AElementwiseOperation An elementwise operation which can be applied on A input tensor -/// elements. -/// @tparam BElementwiseOperation An elementwise operation which can be applied on B input tensor -/// elements. -/// @tparam CElementwiseOperation An elementwise operation which can be applied on C output -/// tensor (after GEMM). +/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements. +/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements. +/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor +/// (after GEMM). /// @tparam GemmSpec Determines used "padding" version. /// @tparam BlockSize The number of threads within workgroup. -/// @tparam MPerBlock The input/output data tile size in M dimension. -/// @tparam NPerBlock The input/output data tile size in N dimension. -/// @tparam KPerBlock The input data tile size in K dimension. 
+/// @tparam MPerBlock The input/output data tile size in the M dimension. +/// @tparam NPerBlock The input/output data tile size in the N dimension. +/// @tparam KPerBlock The input data tile size in the K dimension. /// @tparam AK1 The vector load size from global memory for A tensor. /// @tparam BK1 The vector load size from global memory for B tensor. /// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. /// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. -/// @tparam MXdlPerWave The number of iterations in M dimension over output tile per wavefront. -/// @tparam NXdlPerWave The number of iterations in N dimension over output tile per wavefront. +/// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input -/// data. You could think of it as an answer -/// to the question: "How many threads to arrange -/// on each input data axis?" +/// data. Can be interpreted as the answer +/// to the question, "How many threads can be +/// arranged on each input data axis?" /// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over -/// the input tensor dimension. You could think of -/// it as an answer to the question: "In which +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which /// order to spread threads through tensor axes?". -/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension /// to read first? And which next?" etc. 
/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. @@ -144,14 +143,15 @@ __global__ void /// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With /// universal GEMM there's no need for padding. /// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input -/// data. You could think of it as an answer -/// to question: "How many threads to arrange -/// on each input data axis?" +/// data. Can be interpreted as the answer +/// to the question: "How many threads to +/// arrange on each input data axis?" /// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over -/// the input tensor dimension. You could think of -/// it as an answer to the question: "In which +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which /// order to spread threads through tensor axes?". -/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. "Which dimension +/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension /// to read first? And which next?" etc. /// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory /// access - the one with contiguous memory. From cbff8c2690cc29273f4a87214b1dff48a8264fda Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Wed, 26 Mar 2025 13:34:08 +0000 Subject: [PATCH 24/25] Align tparam names in doc with class tparams. 
--- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index bb743eeea42..a3a87d26e52 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -102,7 +102,7 @@ __global__ void /// @tparam CLayout C tensor data layout. /// @tparam ADataType A tensor data type. /// @tparam BDataType B tensor data type. -/// @tparam GemmAccDataType The accumulation data type related to the hardware +/// @tparam AccDataType The accumulation data type related to the hardware /// matrix-multiplication instruction. /// @tparam CShuffleDataType The data type used to store matrix-multiplication results into /// LDS memory during \"CShuffle\" data layout optimization. @@ -116,10 +116,10 @@ __global__ void /// @tparam MPerBlock The input/output data tile size in the M dimension. /// @tparam NPerBlock The input/output data tile size in the N dimension. /// @tparam KPerBlock The input data tile size in the K dimension. -/// @tparam AK1 The vector load size from global memory for A tensor. -/// @tparam BK1 The vector load size from global memory for B tensor. -/// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. -/// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. +/// @tparam AK1Value The vector load size from global memory for A tensor. +/// @tparam BK1Value The vector load size from global memory for B tensor. +/// @tparam MPerXdl M size of matrix-fused-multiply-add instruction. +/// @tparam NPerXdl N size of matrix-fused-multiply-add instruction. /// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. 
/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront. /// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input From dc01b651a85189305ab319a50bdeba3c59cbd3de Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Fri, 28 Mar 2025 09:53:47 +0000 Subject: [PATCH 25/25] More reviewers fine tuning ;) --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp | 7 ++++--- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 136302a7058..51c223efd2e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -28,11 +28,12 @@ namespace device { /// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) /// Where A, B are input tensors and C is the output tensor. The A/B/C_op are /// elementwise operations applied to the A, B, and C tensors, respectively. -/// This operation is called \"universal\" since it provides multiple blockwise GEMM -/// pipelines optimized for memory-bound, latency and compute-bound cases. +/// The \"universal\" gemm comes with multiple pipelines optimized for different usage +/// scenarios. That's why it's called \"universal\". It's universal through it's design +/// and versatilty. /// /// @note This Kernel implementation supports SplitK algorithm. It can be configured -/// to split the work of K dimension dot product accumulation into multiple workgroups. +/// to split the dot product accumulated over the K dimension into multiple working groups. /// The partial products of different workgroups are then reduced using the AtomicAdd /// operation. 
/// diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index a3a87d26e52..9f6d85dd781 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -89,11 +89,12 @@ __global__ void /// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) /// Where A, B are input tensors and C is the output tensor. The A/B/C_op are /// elementwise operations that could be applied on each tensor respectively. -/// This operation is called \"universal\" since it provides multiple blockwise GEMM -/// pipelines optimized for memory-bound, latency and compute-bound cases. +/// The \"universal\" gemm comes with multiple pipelines optimized for different usage +/// scenarios. That's why it's called \"universal\". It's universal through its design +/// and versatility. /// /// @note This Kernel implementation supports SplitK algorithm. It can be configured -/// to split the work of K dimension dot product accumulation into multiple workgroups. +/// to split the dot product accumulated over the K dimension into multiple workgroups. /// The partial products of different workgroups are then reduced using the AtomicAdd /// operation. ///