diff --git a/CMakeLists.txt b/CMakeLists.txt index acae1f5ece..6b4d140501 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ include(CTest) option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON) option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF) option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF) +option(CK_EXPERIMENTAL_PROFILER "Enable experimental profiler" OFF) option(BUILD_MHA_LIB "Build the static library for flash attention" OFF) option(FORCE_DISABLE_XDL "Skip compiling XDL specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF) option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF) @@ -50,6 +51,12 @@ if(CK_EXPERIMENTAL_BUILDER) include_directories(${PROJECT_SOURCE_DIR}/experimental/builder/include) endif() +if(CK_EXPERIMENTAL_PROFILER) + add_definitions(-DCK_EXPERIMENTAL_PROFILER) + include_directories(${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include) +endif() + + # Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8" # CK Codegen requires dataclass which is added in Python 3.7 # Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04 @@ -729,6 +736,10 @@ if (CK_EXPERIMENTAL_BUILDER) add_subdirectory(experimental/builder) endif() +if (CK_EXPERIMENTAL_PROFILER) + add_subdirectory(experimental/ck_tile_profiler) +endif() + if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) add_subdirectory(codegen) endif() @@ -768,6 +779,13 @@ if(CK_EXPERIMENTAL_BUILDER) ) endif() +if(CK_EXPERIMENTAL_PROFILER) + rocm_install(DIRECTORY + ${PROJECT_SOURCE_DIR}/ck_tile_profiler + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile + ) +endif() + set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") set(CPACK_RPM_PACKAGE_LICENSE "MIT") diff --git a/experimental/ck_tile_profiler/CMakeLists.txt b/experimental/ck_tile_profiler/CMakeLists.txt new file mode 100644 index 0000000000..be4dca3435 --- /dev/null +++ b/experimental/ck_tile_profiler/CMakeLists.txt @@ -0,0 +1,5 @@ +include_directories(BEFORE + ${CMAKE_CURRENT_LIST_DIR}/include/ck_tile_profiler +) +add_subdirectory(src) + diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/gemm_configs.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/gemm_configs.hpp new file mode 100644 index 0000000000..f8db4c0464 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/gemm_configs.hpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
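+//
+// Shared compile-time GEMM configuration used by the ck_tile profiler invokers:
+//  - The CK_TILE_PIPELINE_* macros give each GEMM pipeline variant a numeric id,
+//    selected per instance through the "GEMM pipeline version" column.
+//  - MemoryOpSet / MemoryOpAtomicAdd tag the epilogue memory operation; the
+//    invokers switch to atomic_add when k_batch > 1 (split-K).
+//  - GemmConfigBase collects defaults (padding, scheduler, wave groups, ...)
+//    shared by all convolution invokers in this profiler.
+//  - PipelineTypeTraits maps a pipeline id to the matching ck_tile pipeline and
+//    base-pipeline templates.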
+ +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" + +#define CK_TILE_PIPELINE_COMPUTE_V3 1 +#define CK_TILE_PIPELINE_MEMORY 2 +#define CK_TILE_PIPELINE_COMPUTE_V4 3 +#define CK_TILE_PIPELINE_COMPUTE_V5 4 + +namespace ck_tile { +namespace ops { + +using MemoryOpSet = + std::integral_constant; + +using MemoryOpAtomicAdd = std::integral_constant; + +struct GemmConfigBase +{ + static constexpr bool kPadM = true; + static constexpr bool kPadN = true; + static constexpr bool kPadK = true; + + static constexpr bool PermuteA = false; + static constexpr bool PermuteB = false; + + static constexpr bool TransposeC = false; + static constexpr bool UseStructuredSparsity = false; + + static constexpr int kBlockPerCu = 1; + static constexpr ck_tile::index_t TileParitionerGroupNum = 8; + static constexpr ck_tile::index_t TileParitionerM01 = 4; + static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave; + static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3; + static constexpr ck_tile::index_t NumWaveGroups = 1; + static constexpr bool Preshuffle = false; + static constexpr bool TiledMMAPermuteN = false; +}; + +template +struct PipelineTypeTraits; + +template <> +struct PipelineTypeTraits +{ + template + using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem; + template + using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem; +}; + +template <> +struct PipelineTypeTraits +{ + template + using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3; + template + using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3; +}; + +template <> +struct PipelineTypeTraits +{ + template + using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4; + template + using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4; +}; + +template <> +struct PipelineTypeTraits +{ + template + using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5; + template + using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5; +}; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances.hpp new file mode 100644 index 0000000000..1880be57f5 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances.hpp @@ -0,0 +1,39 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
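+//
+// First batch of 2D NHWGC/GKYXC/NHWGK bf16 backward-data instances.  Each entry
+// is a GroupedConvolutionBackwardDataInvoker specialization whose template
+// arguments follow the column legend below (tile sizes, warp layout, vector
+// sizes, pipeline version); commented-out entries are candidate configurations
+// that are currently disabled.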
+ +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_data_bf16_instances = std::tuple< + // clang-format off + //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker + // clang-format on + >; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_2.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_2.hpp new file mode 100644 index 0000000000..5eeb4851e8 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_2.hpp @@ -0,0 +1,65 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_data_bf16_instances_2 = std::tuple< + // clang-format off + //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // prob this +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// 
GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_3.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_3.hpp new file mode 100644 index 0000000000..3754b39999 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_3.hpp @@ -0,0 +1,68 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_data_bf16_instances_3 = std::tuple< + // clang-format off + //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // ta +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// 
GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_4.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_4.hpp new file mode 100644 index 0000000000..2d251413f4 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_4.hpp @@ -0,0 +1,65 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_data_bf16_instances_4 = std::tuple< + // clang-format off + //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_5.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_5.hpp new file mode 100644 index 0000000000..2cc4f70712 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_5.hpp @@ -0,0 +1,69 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_data_bf16_instances_5 = std::tuple< + // clang-format off + //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // ta +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_6.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_6.hpp new file mode 100644 index 0000000000..f5ed1e13ef --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_6.hpp @@ -0,0 +1,67 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_data_bf16_instances_6 = std::tuple< + // clang-format off + //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_7.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_7.hpp new file mode 100644 index 0000000000..c7f0f60f1c --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_7.hpp @@ -0,0 +1,71 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_data_bf16_instances_7 = std::tuple< + // clang-format off + //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // prob this +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_factory.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_factory.hpp new file mode 100644 index 0000000000..e4de0a35ca --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_factory.hpp @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
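+//
+// Instance factory for the grouped convolution backward-data invokers.  The
+// DeviceOperationInstanceFactory specialization below collects every enabled
+// instance list, type-erased behind GroupedConvolutionBackwardDataBaseInvoker,
+// into a vector of unique_ptrs.  A minimal usage sketch (the profiler-side call
+// sites live outside this header); args is a ck_tile::GroupedConvBwdDataHostArgs:
+//
+//   auto op_ptrs = DeviceOperationInstanceFactory<DeviceOp2DBF16>::GetInstances();
+//   for(auto& op : op_ptrs)
+//       if(op->IsSupportedArgument(args))
+//           op->Run(args, /*time_kernel=*/true, /*n_warmup=*/5, /*n_repeat=*/50);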
+ +#pragma once + +#include +#include +#include +#include + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using DeviceOp2DF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + ck_tile::half_t, + ck_tile::half_t, + ck_tile::half_t, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::half_t, + ck_tile::half_t>; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t>; + +using DeviceOp2DF32 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + float, + float, + float, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + float, + float>; + +// Forward declarations for instance factory functions +// void add_grouped_conv2d_bwd_weight_f16_instances(std::vector>& instances); +void add_grouped_conv2d_bwd_data_bf16_instances(std::vector>& instances); +void add_grouped_conv2d_bwd_data_bf16_instances_2(std::vector>& instances); +void add_grouped_conv2d_bwd_data_bf16_instances_3(std::vector>& instances); +void add_grouped_conv2d_bwd_data_bf16_instances_4(std::vector>& instances); +void add_grouped_conv2d_bwd_data_bf16_instances_5(std::vector>& instances); +void add_grouped_conv2d_bwd_data_bf16_instances_6(std::vector>& instances); +void add_grouped_conv2d_bwd_data_bf16_instances_7(std::vector>& instances); + +// void add_grouped_conv2d_bwd_weight_bf16_instances_opt(std::vector>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = GroupedConvolutionBackwardDataBaseInvoker; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2) + { + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + // add_grouped_conv2d_bwd_weight_f16_instances(op_ptrs); + } + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_bwd_data_bf16_instances(op_ptrs); + add_grouped_conv2d_bwd_data_bf16_instances_2(op_ptrs); + add_grouped_conv2d_bwd_data_bf16_instances_3(op_ptrs); + add_grouped_conv2d_bwd_data_bf16_instances_4(op_ptrs); + add_grouped_conv2d_bwd_data_bf16_instances_5(op_ptrs); + add_grouped_conv2d_bwd_data_bf16_instances_6(op_ptrs); + add_grouped_conv2d_bwd_data_bf16_instances_7(op_ptrs); + // add_grouped_conv2d_bwd_weight_bf16_instances_opt(op_ptrs); + } + } + } + + return op_ptrs; + } +}; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp new file mode 100644 index 0000000000..f2f712f419 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_data_invoker.hpp @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
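+//
+// Invoker layer for grouped convolution backward data.  The abstract base class
+// exposes IsSupportedArgument / Run / GetName so that instances with different
+// tile shapes can be stored behind one pointer type; the derived
+// GroupedConvolutionBackwardDataInvoker assembles the corresponding
+// ck_tile::GroupedConvolutionBackwardDataKernel (tile partitioner, pipeline,
+// CShuffle epilogue) from its template parameters and launches it.  Run()
+// selects the epilogue memory operation at runtime: plain stores for
+// k_batch == 1 and atomic_add for split-K (k_batch > 1).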
+ +#pragma once + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile_profiler/gemm_configs.hpp" + +namespace ck_tile { +namespace ops { + +template +struct GroupedConvolutionBackwardDataBaseInvoker +{ + virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0; + virtual float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, + bool time_kernel, + int n_warmup, + int n_repeat) const = 0; + virtual std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0; + GroupedConvolutionBackwardDataBaseInvoker() = default; + GroupedConvolutionBackwardDataBaseInvoker(const GroupedConvolutionBackwardDataBaseInvoker&) = + default; + GroupedConvolutionBackwardDataBaseInvoker& + operator=(const GroupedConvolutionBackwardDataBaseInvoker&) = default; + GroupedConvolutionBackwardDataBaseInvoker(GroupedConvolutionBackwardDataBaseInvoker&&) = + default; + GroupedConvolutionBackwardDataBaseInvoker& + operator=(GroupedConvolutionBackwardDataBaseInvoker&&) = default; + virtual ~GroupedConvolutionBackwardDataBaseInvoker() = default; +}; + +template +struct GroupedConvolutionBackwardDataInvoker + : public GroupedConvolutionBackwardDataBaseInvoker +{ + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence, + GemmConfigBase::PermuteA, + GemmConfigBase::PermuteB>; + + // static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using GroupedConvTraitsType = ck_tile::GroupedConvTraits, // = DsLayout + OutLayout, + VectorSizeA, + VectorSizeB, + VectorSizeC>; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits< + GemmConfigBase::kPadM, + GemmConfigBase::kPadN, + GemmConfigBase::kPadK, + DoubleSmemBuffer, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::AsLayout, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::BsLayout, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::CLayout, + GemmConfigBase::TransposeC, + GemmConfigBase::UseStructuredSparsity, + false, // Persistent, + GemmConfigBase::NumWaveGroups>; + + using AccDataType = float; + using GemmPipelineProblem = ck_tile::GemmPipelineProblem< + OutDataType, + WeiDataType, + AccDataType, + GemmShape, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + VectorSizeA, + VectorSizeB>; + + using BaseGemmPipeline = typename PipelineTypeTraits< + PipelineVersion>::template UniversalGemmPipeline; + + template + auto CreateKernel() const + { + constexpr auto scheduler = GemmConfigBase::Scheduler; + + using UniversalGemmProblem = + ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = typename PipelineTypeTraits::template GemmPipeline< + UniversalGemmProblem>; + + using CDEElementWise = ck_tile::element_wise::PassThrough; + + using ConvEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem, // = DsDataType + AccDataType, + InDataType, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + ck_tile::tensor_layout::gemm::RowMajor, + CDEElementWise, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + M_Warp, + N_Warp, + M_Warp_Tile, 
+ N_Warp_Tile, + K_Warp_Tile, + GemmConfigBase::TransposeC, + MemOp, + 1, + true, + GroupedConvTraitsType::VectorSizeC>>; + + return ck_tile::GroupedConvolutionBackwardDataKernel{}; + } + + bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const override + { + if(args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + } + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + }; + + float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, + bool time_kernel, + int n_warmup = 5, + int n_repeat = 50) const override + { + [[maybe_unused]] ck_tile::index_t KGroups = 1; + for(int i = 0; i < args.num_dim_spatial_; i++) + { + KGroups *= args.filter_spatial_lengths_[i]; // std::min(args.filter_spatial_lengths_[i], + // args.conv_filter_strides_[i]); + } + + const index_t ConvStrideH = args.conv_filter_strides_[0]; + const index_t ConvStrideW = args.conv_filter_strides_[1]; + const index_t ConvDilationH = args.conv_filter_dilations_[0]; + const index_t ConvDilationW = args.conv_filter_dilations_[1]; + const auto GcdStrideDilationH = gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW); + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + const auto Y = args.filter_spatial_lengths_[0]; + const auto X = args.filter_spatial_lengths_[1]; + [[maybe_unused]] const auto YDot = integer_divide_ceil(Y, YTilde); + [[maybe_unused]] const auto XDot = integer_divide_ceil(X, XTilde); + + const ck_tile::index_t gemm_k = args.K_ * XDot * YDot; + + const ck_tile::index_t k_grain = args.k_batch * K_Tile; + const ck_tile::index_t K_split = (gemm_k + k_grain - 1) / k_grain * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + float ave_time{0}; + + printf("gemm_k: %d num_loop: %d, Xdot: %ld YDot: %ld\n", gemm_k, num_loop, XDot, YDot); + + const auto Run = + [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto memory_operation = memory_operation_.value; + + auto kernel = CreateKernel(); + using Kernel = decltype(kernel); + + auto kargs = Kernel::MakeKernelArgs(args); + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); + + printf("grid: (%u, %u, %u)\n", grids.x, grids.y, grids.z); + + ck_tile::stream_config s{nullptr, time_kernel, 1, n_warmup, n_repeat}; + + ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(kernel, grids, blocks, 0, kargs)); + + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, tail_number_, MemoryOpSet{}); + } + else + { + Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{}); + } + }; + + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + return ave_time; + }; + + std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const override + { + std::stringstream min_occupancy; + min_occupancy << "_blk_per_cu_" << kBlockPerCu; + if(args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + } + using Kernel = 
decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + }; + + GroupedConvolutionBackwardDataInvoker() = default; + GroupedConvolutionBackwardDataInvoker(const GroupedConvolutionBackwardDataInvoker&) = default; + GroupedConvolutionBackwardDataInvoker& + operator=(const GroupedConvolutionBackwardDataInvoker&) = default; + GroupedConvolutionBackwardDataInvoker(GroupedConvolutionBackwardDataInvoker&&) = default; + GroupedConvolutionBackwardDataInvoker& + operator=(GroupedConvolutionBackwardDataInvoker&&) = default; + ~GroupedConvolutionBackwardDataInvoker() override = default; +}; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_bf16_instances.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_bf16_instances.hpp new file mode 100644 index 0000000000..7ce4d4961a --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_bf16_instances.hpp @@ -0,0 +1,161 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_weight_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple< +// clang-format off + //#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //#####################################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +#if defined(__gfx950__) + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, +#endif + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + 
GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, +// // ConvolutionSpecialization::Default, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +// // + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, +// // ConvolutionSpecialization::Filter1x1Stride1Pad0, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, 
+GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_bf16_instances_opt.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_bf16_instances_opt.hpp new file mode 100644 index 0000000000..4fac278043 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_bf16_instances_opt.hpp @@ -0,0 +1,128 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_weight_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_bwd_weight_bf16_instances_opt = std::tuple< +// clang-format off + //#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + // Skinny GEMM-K kernels. 
ConvolutionSpecialization::Default, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + // OLD CK ConvolutionSpecialization::Default, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, +// + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + // Skinny GEMM-K kernels. 
ConvolutionSpecialization::Default, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + // OLD CK ConvolutionSpecialization::Default, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, // ta + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker + + + + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_factory.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_factory.hpp new file mode 100644 index 0000000000..dee2999ead --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_factory.hpp @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
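+//
+// Instance factory for the grouped convolution backward-weight invokers.  As in
+// the backward-data factory, GetInstances() returns the enabled bf16 instance
+// lists (tile_grouped_conv_bwd_weight_bf16_instances and the "_opt" variant)
+// type-erased behind GroupedConvolutionBackwardWeightBaseInvoker; the f16 list
+// is still commented out.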
+ +#pragma once + +#include +#include +#include +#include + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_weight_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using DeviceOp2DF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + ck_tile::half_t, + ck_tile::half_t, + ck_tile::half_t, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::half_t, + ck_tile::half_t>; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t>; + +using DeviceOp2DF32 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + float, + float, + float, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + float, + float>; + +// Forward declarations for instance factory functions +// void add_grouped_conv2d_bwd_weight_f16_instances(std::vector>& instances); +void add_grouped_conv2d_bwd_weight_bf16_instances(std::vector>& instances); +void add_grouped_conv2d_bwd_weight_bf16_instances_opt(std::vector>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = GroupedConvolutionBackwardWeightBaseInvoker; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2) + { + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + // add_grouped_conv2d_bwd_weight_f16_instances(op_ptrs); + } + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_bwd_weight_bf16_instances(op_ptrs); + add_grouped_conv2d_bwd_weight_bf16_instances_opt(op_ptrs); + } + } + } + + return op_ptrs; + } +}; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_fp16_instances.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_fp16_instances.hpp new file mode 100644 index 0000000000..6799826062 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_fp16_instances.hpp @@ -0,0 +1,88 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
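+//
+// 2D NHWGC/GKYXC/NHWGK fp16 backward-weight instance list.  All candidate
+// configurations in this tuple are currently commented out, and the matching
+// add_grouped_conv2d_bwd_weight_f16_instances() registration in the factory is
+// disabled as well.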
+ +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_weight_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using F16 = ck_tile::half_t; + +using DeviceOp2DF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + F16, + F16, + F16, + PassThrough, + PassThrough, + PassThrough, + F16, + F16>; + +template +using tile_grouped_conv_bwd_weight_f16_instances = std::tuple< +// clang-format off + //#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| +#if defined(__gfx950__) + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, +#endif + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker +// clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_invoker.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_invoker.hpp new file mode 100644 index 0000000000..863bf1518a --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_bwd_weight_invoker.hpp @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
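+//
+// Invoker layer for grouped convolution backward weight, mirroring the
+// backward-data invoker: the base class provides the type-erased
+// IsSupportedArgument / Run / GetName interface, and the derived
+// GroupedConvolutionBackwardWeightInvoker builds a
+// ck_tile::GroupedConvolutionBackwardWeightKernel from its tile/warp template
+// parameters.  The GEMM K dimension here is N times the output spatial size,
+// and Run() again picks plain stores for k_batch == 1 and atomic_add for
+// split-K.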
+ +#pragma once + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile_profiler/gemm_configs.hpp" + +namespace ck_tile { +namespace ops { + +template +struct GroupedConvolutionBackwardWeightBaseInvoker +{ + virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; + virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, + bool time_kernel, + int n_warmup, + int n_repeat) const = 0; + virtual std::string GetName(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; + GroupedConvolutionBackwardWeightBaseInvoker() = default; + GroupedConvolutionBackwardWeightBaseInvoker( + const GroupedConvolutionBackwardWeightBaseInvoker&) = default; + GroupedConvolutionBackwardWeightBaseInvoker& + operator=(const GroupedConvolutionBackwardWeightBaseInvoker&) = default; + GroupedConvolutionBackwardWeightBaseInvoker(GroupedConvolutionBackwardWeightBaseInvoker&&) = + default; + GroupedConvolutionBackwardWeightBaseInvoker& + operator=(GroupedConvolutionBackwardWeightBaseInvoker&&) = default; + virtual ~GroupedConvolutionBackwardWeightBaseInvoker() = default; +}; + +template +struct GroupedConvolutionBackwardWeightInvoker + : public GroupedConvolutionBackwardWeightBaseInvoker +{ + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence, + GemmConfigBase::PermuteA, + GemmConfigBase::PermuteB>; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using GroupedConvTraitsType = ck_tile::GroupedConvTraits, // = DsLayout + OutLayout, + VectorSizeA, + VectorSizeB, + VectorSizeC>; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits< + GemmConfigBase::kPadM, + GemmConfigBase::kPadN, + GemmConfigBase::kPadK, + DoubleSmemBuffer, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdWeight< + 1>::AsLayout, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdWeight< + 1>::BsLayout, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdWeight<1>::CLayout, + GemmConfigBase::TransposeC, + GemmConfigBase::UseStructuredSparsity, + false, // Persistent, + GemmConfigBase::NumWaveGroups>; + + using AccDataType = float; + using GemmPipelineProblem = ck_tile::GemmPipelineProblem< + OutDataType, + InDataType, + AccDataType, + GemmShape, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdWeight<1>, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + WeiDataType, + true, + VectorSizeA, + VectorSizeB>; + + using BaseGemmPipeline = typename PipelineTypeTraits< + PipelineVersion>::template UniversalGemmPipeline; + + template + auto CreateKernel() const + { + constexpr auto scheduler = GemmConfigBase::Scheduler; + + using UniversalGemmProblem = + ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = typename PipelineTypeTraits::template GemmPipeline< + UniversalGemmProblem>; + + using CDEElementWise = ck_tile::element_wise::PassThrough; + + using ConvEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem, // = DsDataType + AccDataType, + WeiDataType, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + ck_tile::tensor_layout::gemm::RowMajor, + CDEElementWise, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + M_Warp, + N_Warp, + M_Warp_Tile, + N_Warp_Tile, + K_Warp_Tile, + 
GemmConfigBase::TransposeC, + MemOp, + 1, + true, + GroupedConvTraitsType::VectorSizeC>>; + + return ck_tile::GroupedConvolutionBackwardWeightKernel{}; + } + + bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const override + { + if(args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + } + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + }; + + float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, + bool time_kernel, + int n_warmup = 5, + int n_repeat = 50) const override + { + const ck_tile::index_t gemm_k = + args.N_ * std::accumulate(args.output_spatial_lengths_.begin(), + args.output_spatial_lengths_.end(), + 1, + std::multiplies()); + + const ck_tile::index_t k_grain = args.k_batch * K_Tile; + const ck_tile::index_t K_split = (gemm_k + k_grain - 1) / k_grain * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + float ave_time{0}; + + const auto Run = + [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto memory_operation = memory_operation_.value; + + auto kernel = CreateKernel(); + using Kernel = decltype(kernel); + + auto kargs = Kernel::MakeKernelArgs(args); + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); + + ck_tile::stream_config s{nullptr, time_kernel, 1, n_warmup, n_repeat}; + + ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(kernel, grids, blocks, 0, kargs)); + + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, tail_number_, MemoryOpSet{}); + } + else + { + Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{}); + } + }; + + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + return ave_time; + }; + + std::string GetName(const ck_tile::GroupedConvBwdWeightHostArgs& args) const override + { + std::stringstream min_occupancy; + min_occupancy << "_blk_per_cu_" << kBlockPerCu; + if(args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + } + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + }; + + GroupedConvolutionBackwardWeightInvoker() = default; + GroupedConvolutionBackwardWeightInvoker(const GroupedConvolutionBackwardWeightInvoker&) = + default; + GroupedConvolutionBackwardWeightInvoker& + operator=(const GroupedConvolutionBackwardWeightInvoker&) = default; + GroupedConvolutionBackwardWeightInvoker(GroupedConvolutionBackwardWeightInvoker&&) = default; + GroupedConvolutionBackwardWeightInvoker& + operator=(GroupedConvolutionBackwardWeightInvoker&&) = default; + ~GroupedConvolutionBackwardWeightInvoker() override = default; +}; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances.hpp new file mode 100644 index 0000000000..7c7b5d88b6 --- /dev/null +++ 
b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances.hpp @@ -0,0 +1,148 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profler/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + + +template +using tile_grouped_conv_fwd_bf16_instances = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, 
+GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_2.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_2.hpp new file mode 100644 index 0000000000..6bde40eaf6 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_2.hpp @@ -0,0 +1,147 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
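This header continues the 2D NHWGC/GKYXC/NHWGK bf16 forward instance list begun in tile_grouped_conv_fwd_bf16_instances.hpp; the list is split across numbered headers (_2 through _6 in this patch), and tile_grouped_conv_fwd_factory.hpp declares one registration function per header. A sketch of how such a registration function could be defined in its (not shown) source file, assuming the op-pointer element type is std::unique_ptr<DeviceOpFwd2DBF16> and using illustrative template arguments for the tuple:

// Hypothetical definition matching the declaration in tile_grouped_conv_fwd_factory.hpp;
// the tuple's template arguments and the vector element type are assumptions, not taken
// from this patch.
void add_grouped_conv2d_fwd_bf16_instances_2(
    std::vector<std::unique_ptr<DeviceOpFwd2DBF16>>& instances)
{
    // add_device_operation_instances (tile_grouped_conv_instance_factory.hpp) copies every
    // tuple element into the instance vector, skipping nullptr_t placeholders.
    add_device_operation_instances(
        instances, tile_grouped_conv_fwd_bf16_instances_2<2, NHWGC, GKYXC, NHWGK>{});
}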
+ +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_fwd_bf16_instances_2 = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, 
+GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + + + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_3.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_3.hpp new file mode 100644 index 0000000000..5aadade58b --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_3.hpp @@ -0,0 +1,148 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + + +template +using tile_grouped_conv_fwd_bf16_instances_3 = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, 
+GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + + + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_4.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_4.hpp new file mode 100644 index 0000000000..fab913ec0f --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_4.hpp @@ -0,0 +1,147 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
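For orientation, every entry in these tuples is a full specialization of GroupedConvolutionForwardInvoker whose template arguments follow the column header above: layouts and data types, the three element-wise operations, then the blocks-per-CU hint, block tile (M/N/K), warp grid, per-warp tile, vector widths for A/B/C, the double-LDS-buffer switch, and the GEMM pipeline version. Purely as an illustration of that shape (the values and exact argument order below are hypothetical, not an entry from this list):

// Illustrative only; argument order follows the column header, values are made up.
// GroupedConvolutionForwardInvoker<2, NHWGC, GKYXC, NHWGK, BF16, BF16, BF16,
//     PassThrough, PassThrough, PassThrough,
//     /*blocks per CU*/ 1, /*M-tile*/ 256, /*N-tile*/ 256, /*K-tile*/ 32,
//     /*M-warp*/ 2, /*N-warp*/ 2, /*K-warp*/ 1,
//     /*M-warp tile*/ 32, /*N-warp tile*/ 32, /*K-warp tile*/ 16,
//     /*vector A*/ 8, /*vector B*/ 8, /*vector C*/ 8,
//     /*double smem buffer*/ false, /*pipeline*/ CK_TILE_PIPELINE_COMPUTE_V3>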
+ +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_fwd_bf16_instances_4 = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, 
+GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + + + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_5.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_5.hpp new file mode 100644 index 0000000000..7ca319d8ad --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_5.hpp @@ -0,0 +1,147 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_fwd_bf16_instances_5 = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, 
+GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + + + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_6.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_6.hpp new file mode 100644 index 0000000000..f5e2246210 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_6.hpp @@ -0,0 +1,147 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
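All of these instance lists are ultimately enumerated through DeviceOperationInstanceFactory::GetInstances() and exercised one by one by the profiler implementations later in this patch. A condensed sketch of that loop, with the factory's template arguments elided:

// Condensed from tile_profile_grouped_conv_*_impl.hpp; verification, split-K sweep and
// best-result bookkeeping omitted.
// const auto ops = ck_tile::ops::DeviceOperationInstanceFactory</*device op*/>::GetInstances();
// for(const auto& op : ops)
// {
//     if(!op->IsSupportedArgument(args))
//         continue;                                      // skip unsupported problem shapes
//     const float avg_ms = op->Run(args, /*time_kernel=*/true, /*n_warmup=*/5, /*n_repeat=*/50);
//     std::cout << op->GetName(args) << ": " << avg_ms << " ms\n";
// }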
+ +#pragma once + +#include "ck_tile_profler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profler/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_fwd_bf16_instances_6 = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, 
+GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + + + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_factory.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_factory.hpp new file mode 100644 index 0000000000..afae09888d --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_factory.hpp @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck_tile_profler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profler/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +using DeviceOpFwd2DF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + F16, + F16, + F16, + PassThrough, + PassThrough, + PassThrough, + F16, + F16>; + +void add_grouped_conv2d_fwd_bf16_instances(std::vector>& instances); +void add_grouped_conv2d_fwd_bf16_instances_2(std::vector>& instances); +void add_grouped_conv2d_fwd_bf16_instances_3(std::vector>& instances); +void add_grouped_conv2d_fwd_bf16_instances_4(std::vector>& instances); +void add_grouped_conv2d_fwd_bf16_instances_5(std::vector>& instances); +void add_grouped_conv2d_fwd_bf16_instances_6(std::vector>& instances); + +void add_grouped_conv2d_fwd_f16_instances(std::vector>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = GroupedConvolutionForwardBaseInvoker; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2) + { + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_fwd_f16_instances(op_ptrs); + } + else if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_fwd_bf16_instances(op_ptrs); + add_grouped_conv2d_fwd_bf16_instances_2(op_ptrs); + 
add_grouped_conv2d_fwd_bf16_instances_3(op_ptrs); + add_grouped_conv2d_fwd_bf16_instances_4(op_ptrs); + add_grouped_conv2d_fwd_bf16_instances_5(op_ptrs); + add_grouped_conv2d_fwd_bf16_instances_6(op_ptrs); + } + else + { + std::cout << "Unsupported data type combination for GroupedConv2dFwd\n"; + } + } + else + { + std::cout << "Unsupported layout combination for GroupedConv2dFwd\n"; + } + } + + return op_ptrs; + } +}; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_fp16_instances.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_fp16_instances.hpp new file mode 100644 index 0000000000..bb7782cab3 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_fp16_instances.hpp @@ -0,0 +1,40 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile_profler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profler/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using F16 = ck_tile::half_t; + +using DeviceOpFwd2DF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + F16, + F16, + F16, + PassThrough, + PassThrough, + PassThrough, + F16, + F16>; + +template +using tile_grouped_conv_fwd_fp16_instances = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| + + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_invoker.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_invoker.hpp new file mode 100644 index 0000000000..c8aa74a22d --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_fwd_invoker.hpp @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
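The forward invoker defined below follows the same pattern as the backward-weight one: Run() derives the implicit-GEMM K extent from C and the filter spatial lengths, rounds it for split-K, and dispatches through the pipeline's tail handler, choosing the plain-store epilogue for k_batch == 1 and the atomic-add epilogue otherwise. A worked example of the split-K rounding performed in Run(), with illustrative numbers:

// gemm_k  = C * prod(filter_spatial_lengths)             e.g. 1000
// k_grain = k_batch * K_Tile                              e.g. 4 * 64 = 256
// K_split = (gemm_k + k_grain - 1) / k_grain * K_Tile     e.g. (1000 + 255) / 256 * 64 = 4 * 64 = 256
// i.e. each split covers a whole number of K-tiles, and num_loop is derived from K_split.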
+ +#pragma once + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile_profler/gemm_configs.hpp" + +namespace ck_tile { +namespace ops { + +using GroupedConvFwdHostArgs = ck_tile::GroupedConvFwdHostArgs; + +template +struct GroupedConvolutionForwardBaseInvoker +{ + virtual bool IsSupportedArgument(const GroupedConvFwdHostArgs& args) const = 0; + virtual float + Run(const GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0; + virtual std::string GetName(const GroupedConvFwdHostArgs& args) const = 0; + GroupedConvolutionForwardBaseInvoker() = default; + GroupedConvolutionForwardBaseInvoker(const GroupedConvolutionForwardBaseInvoker&) = default; + GroupedConvolutionForwardBaseInvoker& + operator=(const GroupedConvolutionForwardBaseInvoker&) = default; + GroupedConvolutionForwardBaseInvoker(GroupedConvolutionForwardBaseInvoker&&) = default; + GroupedConvolutionForwardBaseInvoker& + operator=(GroupedConvolutionForwardBaseInvoker&&) = default; + virtual ~GroupedConvolutionForwardBaseInvoker() = default; +}; + +template +struct GroupedConvolutionForwardInvoker + : public GroupedConvolutionForwardBaseInvoker +{ + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence, + GemmConfigBase::PermuteA, + GemmConfigBase::PermuteB>; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using GroupedConvTraitsType = ck_tile::GroupedConvTraits, // = DsLayout + OutLayout, + VectorSizeA, + VectorSizeB, + VectorSizeC>; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits< + GemmConfigBase::kPadM, + GemmConfigBase::kPadN, + GemmConfigBase::kPadK, + DoubleSmemBuffer, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsFwd<1>::AsLayout, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsFwd<1>::BsLayout, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsFwd<1>::CLayout, + GemmConfigBase::TransposeC, + GemmConfigBase::UseStructuredSparsity, + false, // Persistent, + GemmConfigBase::NumWaveGroups, + GemmConfigBase::Preshuffle>; + + using AccDataType = float; + using GemmPipelineProblem = ck_tile::GemmPipelineProblem< + InDataType, + WeiDataType, + AccDataType, + GemmShape, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsFwd<1>, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + OutDataType, + true, + VectorSizeA, + VectorSizeB>; + + using BaseGemmPipeline = typename PipelineTypeTraits< + PipelineVersion>::template UniversalGemmPipeline; + + template + auto CreateKernel() const + { + constexpr auto scheduler = GemmConfigBase::Scheduler; + + using UniversalGemmProblem = + ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = typename PipelineTypeTraits::template GemmPipeline< + UniversalGemmProblem>; + + using CDEElementWise = ck_tile::element_wise::PassThrough; + + using ConvEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem, // = DsDataType + AccDataType, + OutDataType, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + ck_tile::tensor_layout::gemm::RowMajor, + CDEElementWise, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + M_Warp, + N_Warp, + M_Warp_Tile, + N_Warp_Tile, + K_Warp_Tile, + GemmConfigBase::TransposeC, + MemOp, + 1, + true, + 
GroupedConvTraitsType::VectorSizeC>>; + + // std::cout << std::endl << "Vector size A: " << GemmPipeline::GetVectorSizeA() + // << ", Vector size B: " << GemmPipeline::GetVectorSizeB() + // << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; + + return ck_tile::GroupedConvolutionForwardKernel{}; + } + + bool IsSupportedArgument(const GroupedConvFwdHostArgs& args) const override + { + if(args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + } + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + }; + + float Run(const GroupedConvFwdHostArgs& args, + bool time_kernel, + int n_warmup = 5, + int n_repeat = 50) const override + { + const ck_tile::index_t gemm_k = + args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(), + args.filter_spatial_lengths_.end(), + 1, + std::multiplies()); + + const ck_tile::index_t k_grain = args.k_batch * K_Tile; + const ck_tile::index_t K_split = (gemm_k + k_grain - 1) / k_grain * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + float ave_time{0}; + + const auto Run = + [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto memory_operation = memory_operation_.value; + + auto kernel = CreateKernel(); + using Kernel = decltype(kernel); + + auto kargs = Kernel::MakeKernelArgs(args); + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); + + // std::cout << grids.x << " " << grids.y << " " << grids.z << std::endl; + // std::cout << std::endl << has_hot_loop_v << " " << + // static_cast(tail_number_.value) << " " << gemm_k << " " << num_loop; + + ck_tile::stream_config s{nullptr, time_kernel, 1, n_warmup, n_repeat}; + + ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(kernel, grids, blocks, 0, kargs)); + + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, tail_number_, MemoryOpSet{}); + } + else + { + Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{}); + } + }; + + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + return ave_time; + }; + + std::string GetName(const GroupedConvFwdHostArgs& args) const override + { + std::stringstream min_occupancy; + min_occupancy << "_blk_per_cu_" << kBlockPerCu; + if(args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + } + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + }; + + GroupedConvolutionForwardInvoker() = default; + GroupedConvolutionForwardInvoker(const GroupedConvolutionForwardInvoker&) = default; + GroupedConvolutionForwardInvoker& operator=(const GroupedConvolutionForwardInvoker&) = default; + GroupedConvolutionForwardInvoker(GroupedConvolutionForwardInvoker&&) = default; + GroupedConvolutionForwardInvoker& operator=(GroupedConvolutionForwardInvoker&&) = default; + ~GroupedConvolutionForwardInvoker() override = default; +}; + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_instance_factory.hpp 
b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_instance_factory.hpp new file mode 100644 index 0000000000..083385ab72 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_grouped_conv_instance_factory.hpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" + +#define CK_TILE_PIPELINE_COMPUTE_V3 1 +#define CK_TILE_PIPELINE_MEMORY 2 +#define CK_TILE_PIPELINE_COMPUTE_V4 3 +#define CK_TILE_PIPELINE_COMPUTE_V5 4 + +namespace ck_tile { +namespace ops { + +template +struct DeviceOperationInstanceFactory; + +using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; +using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; +using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + +using PassThrough = ck_tile::element_wise::PassThrough; + +template +void add_device_operation_instances(std::vector>& op_instances, + const NewOpInstances& new_op_instances) +{ + ck_tile::static_for<0, std::tuple_size_v, 1>{}([&](auto i) { + const auto new_op_instance = std::get(new_op_instances); + + using NewOpInstance = remove_cvref_t; + if constexpr(std::is_same_v) + { + return; // We can use nullptr_t to enable trailing comma + } + else + { + static_assert(std::is_base_of_v, + "NewOpInstance must be derived from BaseOp"); + + op_instances.push_back(std::make_unique(new_op_instance)); + } + }); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_bwd_data_impl.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_bwd_data_impl.hpp new file mode 100644 index 0000000000..df77b3608b --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_bwd_data_impl.hpp @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
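The profiler implementation below verifies each candidate kernel against a host reference before timing it. Its tolerances come from calculate_rtol_atol, which combines two bounds and keeps the larger of each: one for accumulating a single split of length GemmK / kbatch, and one for the final split-K reduction over kbatch partial results. With illustrative numbers (the helper's template arguments are elided here):

// e.g. GemmK = 4096 accumulated over kbatch = 4 splits: thresholds are computed for a
// 1024-element accumulation and for a 4-element split-K reduction, and the maxima are used.
// const auto rtol_atol = calculate_rtol_atol</*data types*/>(4096, 4, max_accumulated_value);
// const auto rtol = rtol_atol.at(ck_tile::number<0>{});
// const auto atol = rtol_atol.at(ck_tile::number<1>{});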
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" +#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_factory.hpp" +#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp" +#include "ck_tile/host/reference/reference_grouped_conv_bwd_data.hpp" +namespace ck_tile { +namespace profiler { + +template +auto calculate_rtol_atol(const ck_tile::index_t GemmK, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(GemmK, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = + ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +bool profile_grouped_conv_bwd_data_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + const ck_tile::conv::ConvParam& conv_param, + const std::string& split_k, + ck_tile::index_t instance_index = -1) +{ + using AccDataType = float; + using InElementOp = ck_tile::element_wise::PassThrough; + using WeiElementOp = ck_tile::element_wise::PassThrough; + using OutElementOp = ck_tile::element_wise::PassThrough; + + const auto in_g_n_c_wis_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + const auto wei_g_k_c_xs_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + const auto out_g_n_k_wos_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input(in_g_n_c_wis_desc); + ck_tile::HostTensor weight(wei_g_k_c_xs_desc); + ck_tile::HostTensor output(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + switch(init_method) + { + case 0: + ck_tile::FillUniformDistribution{-1.f, 1.f}(weight); + ck_tile::FillUniformDistribution{-1.f, 1.f}(output); + break; + case 1: + ck_tile::FillMonotonicSeq{}(weight); + ck_tile::FillMonotonicSeq{}(output); + break; + case 2: + ck_tile::FillUniformDistribution{-1.f, 1.f}(weight); + ck_tile::FillUniformDistribution{-1.f, 1.f}(output); + break; + default: weight.SetZero(); output.SetZero(); + } + + using DeviceOp = ops::GroupedConvolutionBackwardDataBaseInvoker; + + // get device op instances + const auto ops = ck_tile::ops::DeviceOperationInstanceFactory::GetInstances(); + + std::cout << "found " << ops.size() << " instances" << std::endl; + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + std::string best_split_k("1"); + + // std::vector split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, + // 128, 152, 256, 304}; + std::vector split_k_list = {1, 2, 3, 4, 6, 8, 12, 16}; + if(split_k != "all") + { + try + { + ck_tile::index_t split_k_value = 
std::stoi(split_k); + split_k_list = {split_k_value}; + } + catch(const std::exception& e) + { + std::cerr << e.what() << '\n'; + exit(EXIT_FAILURE); + } + } + + // First, calculate the reference result if verification is needed. + ck_tile::HostTensor input_host_ref(in_g_n_c_wis_desc); + input_host_ref.SetZero(); + if(do_verification) + { + ck_tile::reference_grouped_conv_bwd_data( + input_host_ref, + weight, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); + } + + // instance_index = 0; + index_t num_kernel = 0; + bool all_pass = true; + for(auto& op : ops) + { + for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++) + { + auto split_k_value = split_k_list[split_k_id]; + auto split_k_param_str = std::to_string(split_k_value); + + ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes()); + + input_dev_buf.SetZero(); + weight_dev_buf.ToDevice(weight.data()); + output_dev_buf.ToDevice(output.data()); + + ck_tile::GroupedConvBwdDataHostArgs args(conv_param, + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + output_dev_buf.GetDeviceBuffer(), + split_k_value); + + // Split-K autodeduction is not supported. + if(op->IsSupportedArgument(args) && split_k_value >= 1) + { + num_kernel++; + if((instance_index != -1) && (instance_index + 1 != num_kernel)) + { + // skip test if instance_index is specified + continue; + } + + std::string op_name = op->GetName(args); + std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." + << std::endl; + + // Run verification first. If it doesn't pass, no need to do performance + // measurement. + bool pass = false; + if(do_verification) + { + constexpr int n_warmup = 0; + constexpr int n_repeat = 1; + + op->Run(args, false, n_warmup, n_repeat); + input_dev_buf.FromDevice(input.data()); + + const ck_tile::index_t GemmK = conv_param.K_ * + conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1]; + + const float max_accumulated_value = + *std::max_element(input_host_ref.mData.begin(), input_host_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, split_k_value, max_accumulated_value); + + pass = ck_tile::check_err(input, + input_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") + << std::endl; + + all_pass &= pass; + } + + bool is_valid = do_verification ? 
pass : true; + + if(is_valid) + { + constexpr int n_warmup = 5; + constexpr int n_repeat = 50; + float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = + conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " + << split_k_param_str << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_split_k = split_k_param_str; + } + } + } + else + { + std::cout << op->GetName(args) << ", SplitK " << split_k_param_str + << " does not support this problem." << std::endl; + } + } + } + + std::stringstream ss; + ss << "\n********************************" + << "\nCK Tile best configuration parameters:" << "\n********************************" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec + << "\nSplitK: " << best_split_k << std::endl; + + std::cout << ss.str(); + + const char* log_file = std::getenv("CK_TILE_PROFILER_LOG_FILE"); + if(log_file != nullptr) + { + std::ofstream out(log_file, std::ios::app); + if(out.is_open()) + { + std::stringstream out_ss; + out_ss << "CK Tile best configuration:" << std::endl + << "name: " << best_op_name << std::endl + << "avg_time: " << best_avg_time << std::endl + << "SplitK: " << best_split_k << std::endl + << "all_pass " << (all_pass ? "true" : "false") << std::endl; + out << out_ss.str(); + out.close(); + } + } + + if(instance_index != -1) + { + std::cout << "grouped_conv_bwd_data_instance (" << instance_index << "/" << num_kernel + << "): Passed" << std::endl; + } + return all_pass; +} + +} // namespace profiler +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_bwd_weight_impl.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_bwd_weight_impl.hpp new file mode 100644 index 0000000000..b1393873f6 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_bwd_weight_impl.hpp @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
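The backward-weight profiler below follows the same structure as the backward-data one above: build host tensors, enumerate instances, verify against the host reference, then time the survivors over a split-K sweep. Two details of the reporting in the backward-data implementation above are worth spelling out: Run() reports the average time in milliseconds, so flop / 1e9 / avg_time yields TFLOP/s and bytes / 1e6 / avg_time yields GB/s, and the best configuration is appended to a file when the CK_TILE_PROFILER_LOG_FILE environment variable is set. With illustrative numbers:

// e.g. flop = 2.4e11, num_btype = 3.2e8 bytes, avg_time = 1.0 ms
//   tflops     = 2.4e11 / 1e9 / 1.0 = 240 TFLOP/s
//   gb_per_sec = 3.2e8  / 1e6 / 1.0 = 320 GB/s
// CK_TILE_PROFILER_LOG_FILE=<path> appends name, avg_time, SplitK and all_pass for the best config.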
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" +#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_weight_factory.hpp" +#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" +#include "ck_tile/host/reference/reference_grouped_conv_bwd_weight.hpp" +namespace ck_tile { +namespace profiler { + +template +auto calculate_rtol_atol(const ck_tile::index_t GemmK, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(GemmK, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = + ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +bool profile_grouped_conv_bwd_weight_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + const ck_tile::conv::ConvParam& conv_param, + const std::string& split_k, + ck_tile::index_t instance_index = -1) +{ + using AccDataType = float; + using InElementOp = ck_tile::element_wise::PassThrough; + using WeiElementOp = ck_tile::element_wise::PassThrough; + using OutElementOp = ck_tile::element_wise::PassThrough; + + const auto in_g_n_c_wis_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + const auto wei_g_k_c_xs_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + const auto out_g_n_k_wos_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input(in_g_n_c_wis_desc); + ck_tile::HostTensor weight(wei_g_k_c_xs_desc); + ck_tile::HostTensor output(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + switch(init_method) + { + case 0: + ck_tile::FillUniformDistribution{-1.f, 1.f}(input); + ck_tile::FillUniformDistribution{-1.f, 1.f}(output); + break; + case 1: + ck_tile::FillMonotonicSeq{}(input); + ck_tile::FillMonotonicSeq{}(output); + break; + case 2: + ck_tile::FillUniformDistribution{0.f, 1.f}(input); + ck_tile::FillUniformDistribution{0.f, 1.f}(output); + break; + default: input.SetZero(); output.SetZero(); + } + + using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker; + + // get device op instances + const auto ops = ck_tile::ops::DeviceOperationInstanceFactory::GetInstances(); + + std::cout << "found " << ops.size() << " instances" << std::endl; + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + std::string best_split_k("1"); + + std::vector split_k_list = { + 1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304}; + if(split_k != "all") + { + try + { + ck_tile::index_t split_k_value = std::stoi(split_k); + split_k_list = {split_k_value}; + } + 
catch(const std::exception& e) + { + std::cerr << e.what() << '\n'; + exit(EXIT_FAILURE); + } + } + + // First, calculate the reference result if verification is needed. + ck_tile::HostTensor weight_host_ref(wei_g_k_c_xs_desc); + weight_host_ref.SetZero(); + if(do_verification) + { + ck_tile:: + reference_grouped_conv_bwd_weight( + input, + weight_host_ref, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); + } + + index_t num_kernel = 0; + bool all_pass = true; + for(auto& op : ops) + { + for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++) + { + auto split_k_value = split_k_list[split_k_id]; + auto split_k_param_str = std::to_string(split_k_value); + + ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes()); + + input_dev_buf.ToDevice(input.data()); + weight_dev_buf.SetZero(); + output_dev_buf.ToDevice(output.data()); + + ck_tile::GroupedConvBwdWeightHostArgs args(conv_param, + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + output_dev_buf.GetDeviceBuffer(), + split_k_value); + + // Split-K autodeduction is not supported. + if(op->IsSupportedArgument(args) && split_k_value >= 1) + { + num_kernel++; + if((instance_index != -1) && (instance_index + 1 != num_kernel)) + { + // skip test if instance_index is specified + continue; + } + + std::string op_name = op->GetName(args); + std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." + << std::endl; + + // Run verification first. If it doesn't pass, no need to do performance + // measurement. + bool pass = false; + if(do_verification) + { + constexpr int n_warmup = 0; + constexpr int n_repeat = 1; + + op->Run(args, false, n_warmup, n_repeat); + weight_dev_buf.FromDevice(weight.data()); + + const ck_tile::index_t GemmK = + weight.get_element_size() / (conv_param.G_ * conv_param.K_); + const float max_accumulated_value = *std::max_element( + weight_host_ref.mData.begin(), weight_host_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, split_k_value, max_accumulated_value); + + pass = ck_tile::check_err(weight, + weight_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") + << std::endl; + + all_pass &= pass; + } + + bool is_valid = do_verification ? 
pass : true; + + if(is_valid) + { + constexpr int n_warmup = 5; + constexpr int n_repeat = 50; + float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = + conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " + << split_k_param_str << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_split_k = split_k_param_str; + } + } + } + else + { + std::cout << op->GetName(args) << ", SplitK " << split_k_param_str + << " does not support this problem." << std::endl; + } + } + } + + std::stringstream ss; + ss << "\n********************************" + << "\nCK Tile best configuration parameters:" << "\n********************************" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec + << "\nSplitK: " << best_split_k << std::endl; + + std::cout << ss.str(); + + const char* log_file = std::getenv("CK_TILE_PROFILER_LOG_FILE"); + if(log_file != nullptr) + { + std::ofstream out(log_file, std::ios::app); + if(out.is_open()) + { + std::stringstream out_ss; + out_ss << "CK Tile best configuration:" << std::endl + << "name: " << best_op_name << std::endl + << "avg_time: " << best_avg_time << std::endl + << "SplitK: " << best_split_k << std::endl + << "all_pass " << (all_pass ? "true" : "false") << std::endl; + out << out_ss.str(); + out.close(); + } + } + + if(instance_index != -1) + { + std::cout << "grouped_conv_bwd_weight_instance (" << instance_index << "/" << num_kernel + << "): Passed" << std::endl; + } + return all_pass; +} + +} // namespace profiler +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_fwd_impl.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_fwd_impl.hpp new file mode 100644 index 0000000000..e82219a8b6 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profile_grouped_conv_fwd_impl.hpp @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
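+//
+// Editorial note (sketch): `instance_index` defaults to -1, which profiles every instance
+// that reports IsSupportedArgument() == true for the given problem. A non-negative value
+// selects one instance by its position in that supported set; e.g. (template arguments
+// elided, call shape assumed from the signature below):
+//
+//   // profile only the third supported instance, with k_batch = 1
+//   profile_grouped_conv_fwd_impl<...>(/*do_verification=*/1, /*init_method=*/2,
+//                                      /*do_log=*/false, /*time_kernel=*/true,
+//                                      conv_param, /*k_batch=*/1, /*instance_index=*/2);
+//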
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" +#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_factory.hpp" +#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" +#include "ck_tile/host/reference/reference_grouped_conv_bwd_weight.hpp" +namespace ck_tile { +namespace profiler { + +template +auto calculate_rtol_atol(const ck_tile::index_t GemmK, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(GemmK, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = + ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +bool profile_grouped_conv_fwd_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + const ck_tile::conv::ConvParam& conv_param, + const ck_tile::index_t k_batch, + ck_tile::index_t instance_index = -1) +{ + using AccDataType = float; + using InElementOp = ck_tile::element_wise::PassThrough; + using WeiElementOp = ck_tile::element_wise::PassThrough; + using OutElementOp = ck_tile::element_wise::PassThrough; + + const auto in_g_n_c_wis_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + const auto wei_g_k_c_xs_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + const auto out_g_n_k_wos_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input(in_g_n_c_wis_desc); + ck_tile::HostTensor weight(wei_g_k_c_xs_desc); + ck_tile::HostTensor output(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + if(init_method == 0) + { + ck_tile::FillUniformDistribution{-5.f, 5.f}(input); + ck_tile::FillUniformDistribution{-5.f, 5.f}(weight); + } + else if(init_method == 1) + { + ck_tile::FillMonotonicSeq{}(input); + ck_tile::FillMonotonicSeq{}(weight); + } + else if(init_method == 2) + { + ck_tile::FillUniformDistribution{0.f, 1.f}(input); + ck_tile::FillUniformDistribution{0.f, 1.f}(weight); + } + else + { + input.SetZero(); + weight.SetZero(); + } + + using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker; + + // get device op instances + const auto ops = ck_tile::ops::DeviceOperationInstanceFactory::GetInstances(); + + std::cout << "found " << ops.size() << " instances" << std::endl; + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + index_t num_kernel = 0; + bool all_pass = true; + + // tmp enforce instance + // instance_index = -1; + + for(auto& op : ops) + { + ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes()); + 
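+        // (editorial comment) the weight and output buffers are allocated the same way from
+        // the host tensor sizes; input and weight are then copied host-to-device and the
+        // output buffer is zeroed before each instance is launched.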
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes()); + + input_dev_buf.ToDevice(input.data()); + weight_dev_buf.ToDevice(weight.data()); + output_dev_buf.SetZero(); + + ck_tile::GroupedConvFwdHostArgs args(conv_param, + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + output_dev_buf.GetDeviceBuffer(), + k_batch); + + if(op->IsSupportedArgument(args)) + { + num_kernel++; + if((instance_index != -1) && (instance_index + 1 != num_kernel)) + { + // skip test if instance_index is specified + continue; + } + + std::string op_name = op->GetName(args); + std::cout << op_name << " is profiled..." << std::endl; + + // Run verification first. If it doesn't pass, no need to do performance measurement. + bool pass = false; + if(do_verification) + { + constexpr int n_warmup = 0; + constexpr int n_repeat = 1; + + op->Run(args, false, n_warmup, n_repeat); + output_dev_buf.FromDevice(output.data()); + + ck_tile::HostTensor output_host_ref(out_g_n_k_wos_desc); + output_host_ref.SetZero(); + + ck_tile:: + reference_grouped_conv_fwd( + input, + weight, + output_host_ref, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); + const ck_tile::index_t GemmK = + weight.get_element_size() / (conv_param.G_ * conv_param.K_); + const float max_accumulated_value = + *std::max_element(output_host_ref.mData.begin(), output_host_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, k_batch, max_accumulated_value); + pass = ck_tile::check_err(output, + output_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") + << std::endl; + all_pass &= pass; + } + + bool is_valid = do_verification ? pass : true; + if(is_valid) + { + constexpr int n_warmup = 5; + constexpr int n_repeat = 50; + float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + } + } + else + { + // std::cout << op->GetName(args) << " does not support this problem." 
<< std::endl; + } + } + + std::cout << "\n********************************" + << "\nBest configuration parameters:" << "\n********************************" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + + const char* log_file = std::getenv("CK_TILE_PROFILER_LOG_FILE"); + if(log_file != nullptr) + { + std::ofstream out(log_file, std::ios::app); + if(out.is_open()) + { + std::stringstream out_ss; + out_ss << "CK Tile best configuration:" << std::endl + << "name: " << best_op_name << std::endl + << "avg_time: " << best_avg_time << std::endl + << "SplitK: " << 1 << std::endl + << "all_pass " << (all_pass ? "true" : "false") << std::endl; + out << out_ss.str(); + out.close(); + } + } + + if(instance_index != -1) + { + std::cout << "grouped_conv_fwd_instance (" << instance_index << "/" << num_kernel + << "): Passed" << std::endl; + } + return all_pass; +} + +} // namespace profiler +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profiler_operation_registry.hpp b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profiler_operation_registry.hpp new file mode 100644 index 0000000000..8bccf6b032 --- /dev/null +++ b/experimental/ck_tile_profiler/include/ck_tile_profiler/tile_profiler_operation_registry.hpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +class ProfilerOperationRegistry final +{ + ProfilerOperationRegistry() = default; + ~ProfilerOperationRegistry() = default; + + public: + using Operation = std::function; + + private: + struct Entry final + { + explicit Entry(std::string_view description, Operation operation) noexcept + : description_(description), operation_(std::move(operation)) + { + } + + std::string_view description_; + Operation operation_; + }; + + std::map entries_; + + friend std::ostream& operator<<(std::ostream& stream, const ProfilerOperationRegistry& registry) + { + stream << "{\n"; + for(auto& [name, entry] : registry.entries_) + { + stream << "\t" << name << ": " << entry.description_ << "\n"; + } + stream << "}"; + + return stream; + } + + public: + static ProfilerOperationRegistry& GetInstance() + { + static ProfilerOperationRegistry registry; + return registry; + } + + std::optional Get(std::string_view name) const + { + const auto found = entries_.find(name); + if(found == end(entries_)) + { + return std::nullopt; + } + + return (found->second).operation_; + } + + bool Add(std::string_view name, std::string_view description, Operation operation) + { + return entries_ + .emplace(std::piecewise_construct, + std::forward_as_tuple(name), + std::forward_as_tuple(description, std::move(operation))) + .second; + } +}; + +#define PP_CONCAT(x, y) PP_CONCAT_IMPL(x, y) +#define PP_CONCAT_IMPL(x, y) x##y + +#define REGISTER_PROFILER_OPERATION(name, description, operation) \ + static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \ + ::ProfilerOperationRegistry::GetInstance().Add(name, description, operation) diff --git a/experimental/ck_tile_profiler/script/benchmark_ck_vs_ck_tile.py b/experimental/ck_tile_profiler/script/benchmark_ck_vs_ck_tile.py new file mode 100755 index 0000000000..493beeb3da --- /dev/null +++ b/experimental/ck_tile_profiler/script/benchmark_ck_vs_ck_tile.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 + +import os 
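+# Editorial usage sketch (paths below are hypothetical, not part of the original script):
+# first run both profilers over a file of converted test cases, then analyze the results
+# file in a second pass, e.g.
+#
+#   ./benchmark_ck_vs_ck_tile.py --input-file conv_cases_ck_profiler.txt \
+#       --bin-path ./build/bin --results-path ./results
+#   ./benchmark_ck_vs_ck_tile.py --analyze-file ./results/ck_vs_ck_tile_results_<pid>.txt
+#
+# The per-run results file is passed to the profilers through the CK_PROFILER_LOG_FILE and
+# CK_TILE_PROFILER_LOG_FILE environment variables set in run_ck_profiler_cmd() below.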
+import argparse +import subprocess +import sys +import matplotlib.pyplot as plt +# Non-interactive backend for matplotlib +plt.switch_backend('Agg') +import numpy as np + +import xlsxwriter + +def parse_cli_args(): + """Parse command line arguments""" + parser = argparse.ArgumentParser(description="Run CK and CK Tile convolution profilers.") + parser.add_argument("--input-file", type=str, dest="input_file", required=False, help="Path to the file containing test cases.") + parser.add_argument("--log-to-stdout", action="store_true", help="Log profiler output to stdout instead of /dev/null.") + parser.add_argument("--bin-path", type=str, dest="bin_path", required=False, help="Path to the CK/CK Tile profiler executables.") + parser.add_argument("--results-path", type=str, dest="results_path", required=False, help="Path to store profiler results.", default=".") + parser.add_argument("--analyze-file", type=str, dest="analyze_file", required=False, help="Path to store analysis results.", default="") + + args, unknown_args = parser.parse_known_args() + + if unknown_args: + print(f"Unknown arguments: {unknown_args}", file=sys.stderr) + sys.exit(1) + + return args + +class ProfilerType: + CK = 1 + CK_TILE = 2 + +def run_ck_profiler_cmd(cmd_args, profiler_type, bin_path, results_file, log_to_stdout=False): + profiler = "ckTileProfiler" if profiler_type == ProfilerType.CK_TILE else "ckProfiler" + profiler_path = os.path.join(bin_path, profiler) + cmd = [profiler_path] + cmd_args + cmd_str = ' '.join(cmd) + + # Environment variable to specify results file + env = os.environ.copy() + env["CK_PROFILER_LOG_FILE"] = results_file + env["CK_TILE_PROFILER_LOG_FILE"] = results_file + + if log_to_stdout: + subprocess.run(cmd) + else: + with open(os.devnull, 'w') as devnull: + timeoutInSec = 300 * 60 # 300 minutes timeout + try: + subprocess.run(cmd, stdout=devnull, stderr=devnull, timeout=timeoutInSec, env=env) + except subprocess.TimeoutExpired: + print(f"Command '{cmd_str}' timed out after {timeoutInSec} seconds.", file=sys.stderr) + +def get_profiler_commands(file): + profiler_commands = [] + with open(file, 'r') as f: + lines = f.readlines() + lines = lines[1:] # Skip the header line + lines = list(dict.fromkeys(lines)) + for line in lines: + line = line.strip() + cmd = [x.strip() for x in line.split(' ') if x.strip() and x.strip() != ''] + profiler_commands.append(cmd) + return profiler_commands + +def run_analysis(results_file): + """Analyze benchmark results and create performance comparison plots""" + + # Parse the results file + test_cases = [] + current_case = {} + + with open(results_file, 'r') as f: + lines = f.readlines() + + i = 0 + while i < len(lines): + line = lines[i].strip() + + # Look for grouped_conv_* command lines + if line.startswith('grouped_conv_'): + current_case = {'command': line} + i += 1 + + # Parse CK Tile results + while i < len(lines) and not lines[i].strip().startswith('CK Tile best configuration:'): + i += 1 + + if i < len(lines): + i += 1 # Skip "CK Tile best configuration:" line + if i < len(lines) and lines[i].strip().startswith('name:'): + current_case['ck_tile_name'] = lines[i].strip().replace('name:', '').strip() + i += 1 + if i < len(lines) and lines[i].strip().startswith('avg_time:'): + current_case['ck_tile_time'] = float(lines[i].strip().replace('avg_time:', '').strip()) + i += 1 + if i < len(lines) and lines[i].strip().startswith('SplitK:'): + current_case['ck_tile_splitk'] = lines[i].strip().replace('SplitK:', '').strip() + i += 1 + if i < len(lines) and 
lines[i].strip().startswith('all_pass'): + current_case['ck_tile_all_pass'] = lines[i].strip().replace('all_pass', '').strip() + i += 1 + + # Parse CK results + while i < len(lines) and not lines[i].strip().startswith('CK best configuration:'): + i += 1 + + if i < len(lines): + i += 1 # Skip "CK best configuration:" line + if i < len(lines) and lines[i].strip().startswith('name:'): + current_case['ck_name'] = lines[i].strip().replace('name:', '').strip() + i += 1 + if i < len(lines) and lines[i].strip().startswith('avg_time:'): + current_case['ck_time'] = float(lines[i].strip().replace('avg_time:', '').strip()) + i += 1 + if i < len(lines) and lines[i].strip().startswith('SplitK:'): + current_case['ck_splitk'] = lines[i].strip().replace('SplitK:', '').strip() + i += 1 + + # Only add if we have both CK and CK Tile results + if all(key in current_case for key in ['ck_tile_time', 'ck_time']): + # Skip cases where CK Tile failed (time = 0) + if current_case['ck_tile_time'] > 0: + test_cases.append(current_case) + else: + i += 1 + + print(f"Found {len(test_cases)} valid test cases for analysis") + + # Calculate performance ratios (CK Tile performance relative to CK, where 100% = parity) + performance_ratios = [] + ck_times = [] + ck_tile_times = [] + case_labels = [] + + workbook = xlsxwriter.Workbook('conv_perf.xlsx') + worksheet = workbook.add_worksheet() + + header_format = workbook.add_format() + header_format.set_bold() + + offset = 4 + + worksheet.write(offset, 0, "command", header_format) + worksheet.set_column(0, 0, 66) + worksheet.write(offset, 1, "CK Time", header_format) + worksheet.set_column(1, 1, 11) + worksheet.write(offset, 2, "CK Tile Time", header_format) + worksheet.set_column(2, 2, 11) + worksheet.write(offset, 3, "CK / CK Tile", header_format) + worksheet.set_column(3, 3, 11) + worksheet.write(offset, 4, "All pass", header_format) + worksheet.set_column(4, 4, 11) + worksheet.write(offset, 5, "CK best kernel", header_format) + worksheet.set_column(5, 5, 25) + worksheet.write(offset, 6, "CK splitk", header_format) + worksheet.set_column(6, 6, 15) + worksheet.write(offset, 7, "CK tile best kernel", header_format) + worksheet.set_column(7, 7, 25) + worksheet.write(offset, 8, "CK tile splitk", header_format) + worksheet.set_column(8, 8, 15) + + offset += 1 + + num_of_ck_tile_slower = 0 + + for i, case in enumerate(test_cases): + worksheet.write(i + offset, 0, case['command']) + worksheet.write(i + offset, 1, case['ck_time']) + worksheet.write(i + offset, 2, case['ck_tile_time']) + + format = workbook.add_format() + ratio = case['ck_time'] / case['ck_tile_time'] + + if ratio < 1.0: + format.set_bg_color('red') + num_of_ck_tile_slower += 1 + else: + format.set_bg_color('green') + + all_pass = case['ck_tile_all_pass'] + + worksheet.write(i + offset, 3, ratio, format) + + format2 = workbook.add_format() + format2.set_bg_color('green' if all_pass == "true" else 'red') + worksheet.write(i + offset, 4, all_pass, format2) + worksheet.write(i + offset, 5, case['ck_name']) + worksheet.write(i + offset, 6, case['ck_splitk']) + worksheet.write(i + offset, 7, case['ck_tile_name']) + worksheet.write(i + offset, 8, case['ck_tile_splitk']) + + ck_time = case['ck_time'] + ck_tile_time = case['ck_tile_time'] + + # Performance ratio: CK_time / CK_Tile_time * 100% + # >100% means CK Tile is faster, <100% means CK is faster + # ratio = (ck_time / ck_tile_time) * 100 + # performance_ratios.append(ratio) + # ck_times.append(ck_time) + # ck_tile_times.append(ck_tile_time) + + # # Create a short label 
for the test case + # cmd_parts = case['command'].split() + # if len(cmd_parts) >= 8: + # label = f"G{cmd_parts[8]}_N{cmd_parts[9]}_K{cmd_parts[10]}_C{cmd_parts[11]}" + # else: + # label = f"Case_{i+1}" + # case_labels.append(label) + + worksheet.write(0, 0, f"all cases: {len(test_cases)}") + worksheet.write(1, 0, f"ck tile slower: {num_of_ck_tile_slower}") + worksheet.write(2, 0, f"ck tile slower: {(num_of_ck_tile_slower / len(test_cases) * 100):2.1f}%") + + workbook.close() + return + + + max_cases_to_detailed_plot = 10 + if len(test_cases) < max_cases_to_detailed_plot: + # Create performance comparison plots + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12)) + + # Plot 1: Performance ratio bar chart + x_pos = np.arange(len(case_labels)) + colors = ['green' if ratio >= 100 else 'red' for ratio in performance_ratios] + + bars = ax1.bar(x_pos, performance_ratios, color=colors, alpha=0.7) + ax1.set_xlabel('Test Cases') + ax1.set_ylabel('CK Tile Performance (% of CK)') + ax1.set_title('CK Tile vs CK Performance Comparison\n(>100% = CK Tile Faster, <100% = CK Faster)') + ax1.set_xticks(x_pos) + ax1.set_xticklabels(case_labels, rotation=45, ha='right') + ax1.legend() + ax1.grid(True, alpha=0.3) + + # Add value labels on bars + for bar, ratio in zip(bars, performance_ratios): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width()/2., height + 1, + f'{ratio:.1f}%', ha='center', va='bottom', fontsize=8) + + # Plot 2: Absolute timing comparison + x_pos_offset = np.arange(len(case_labels)) + width = 0.35 + + bars1 = ax2.bar(x_pos_offset - width/2, ck_times, width, label='CK', color='blue', alpha=0.7) + bars2 = ax2.bar(x_pos_offset + width/2, ck_tile_times, width, label='CK Tile', color='orange', alpha=0.7) + + ax2.set_xlabel('Test Cases') + ax2.set_ylabel('Average Time (seconds)') + ax2.set_title('Absolute Performance Comparison: CK vs CK Tile') + ax2.set_xticks(x_pos_offset) + ax2.set_xticklabels(case_labels, rotation=45, ha='right') + ax2.legend() + ax2.grid(True, alpha=0.3) + ax2.set_yscale('log') # Use log scale for better visualization + + plt.tight_layout() + + # Save the plot + output_file = results_file.replace('.txt', '_analysis.png') + plt.savefig(output_file, dpi=300, bbox_inches='tight') + print(f"Performance analysis plot saved to: {output_file}") + + # Print summary statistics + print("\n" + "="*80) + print("PERFORMANCE SUMMARY") + print("="*80) + + faster_count = sum(1 for ratio in performance_ratios if ratio > 100) + slower_count = len(performance_ratios) - faster_count + + print(f"Total test cases: {len(test_cases)}") + print(f"CK Tile faster: {faster_count} ({faster_count/len(test_cases)*100:.1f}%)") + print(f"CK faster: {slower_count} ({slower_count/len(test_cases)*100:.1f}%)") + print(f"Average CK Tile performance: {np.mean(performance_ratios):.1f}% of CK") + print(f"Median CK Tile performance: {np.median(performance_ratios):.1f}% of CK") + print(f"Best CK Tile performance: {np.max(performance_ratios):.1f}% of CK") + print(f"Worst CK Tile performance: {np.min(performance_ratios):.1f}% of CK") + + # Show the plot + plt.show() + else: + # Plot the histogram of the performance ratios + plt.figure(figsize=(10, 6)) + + min_ratio = min(performance_ratios) + max_ratio = max(performance_ratios) + + bin_width = 5 + + # Extend range to ensure we capture all data + bin_start = int(min_ratio // bin_width) * bin_width + bin_end = int(max_ratio // bin_width) * bin_width + bin_edges = np.arange(bin_start, bin_end, bin_width) + + # Create the histogram data + counts, bins = 
np.histogram(performance_ratios, bins=bin_edges) + + # Color bars based on whether they're above or below 100% + colors = [] + for i in range(len(counts)): + bin_center = (bin_edges[i] + bin_edges[i+1]) / 2 + if bin_center < 100: + colors.append('red') + else: + colors.append('green') + + # Plot the histogram with custom colors + plt.bar(bin_edges[:-1], counts, width=bin_width, color=colors, edgecolor='black', + alpha=0.7, align='edge') + + plt.xlabel('CK Tile Performance (% of CK)') + plt.ylabel('Number of Test Cases') + plt.title('CK Tile vs CK Performance Distribution\n(>100% = CK Tile Faster, <100% = CK Faster)') + + # Create custom legend + from matplotlib.patches import Patch + legend_elements = [ + Patch(facecolor='red', alpha=0.7, label='CK Faster (<100%)'), + Patch(facecolor='green', alpha=0.7, label='CK Tile Faster (>100%)') + ] + plt.legend(handles=legend_elements) + + plt.grid(True, alpha=0.3) + + # Set x-axis to show percentage marks at logical intervals + x_ticks = np.arange(int(min_ratio//10)*10, int(max_ratio//10)*10 + 20, 10) + plt.xticks(x_ticks) + + # Set y-axis to integer positions only + max_count = max(counts) if len(counts) > 0 else 1 + y_ticks = np.arange(0, max_count + 1, 2) + plt.yticks(y_ticks) + + # Save the histogram + output_file = results_file.replace('.txt', '_analysis_histogram.png') + plt.savefig(output_file, dpi=300, bbox_inches='tight') + print(f"Performance analysis histogram saved to: {output_file}") + plt.close() + + # Collect aggregated statistics for cases where CK is faster + print("\n" + "="*80) + print("CK FASTER TEST CASES - AGGREGATED STATISTICS") + print("="*80) + + ck_faster_cases = [] + ck_faster_ratios = [] + ck_faster_kernels = {} # Track which CK kernels are winning + ck_tile_losing_kernels = {} # Track which CK Tile kernels are losing + + for i, case in enumerate(test_cases): + ratio = performance_ratios[i] + if ratio < 100: + ck_faster_cases.append(case) + ck_faster_ratios.append(ratio) + + # Count CK kernels that are winning + ck_kernel = case.get('ck_name', 'N/A') + if ck_kernel not in ck_faster_kernels: + ck_faster_kernels[ck_kernel] = {'count': 0, 'ratios': []} + ck_faster_kernels[ck_kernel]['count'] += 1 + ck_faster_kernels[ck_kernel]['ratios'].append(ratio) + + # Count CK Tile kernels that are losing + ck_tile_kernel = case.get('ck_tile_name', 'N/A') + if ck_tile_kernel not in ck_tile_losing_kernels: + ck_tile_losing_kernels[ck_tile_kernel] = {'count': 0, 'ratios': []} + ck_tile_losing_kernels[ck_tile_kernel]['count'] += 1 + ck_tile_losing_kernels[ck_tile_kernel]['ratios'].append(ratio) + + if ck_faster_cases: + print(f"Number of cases where CK is faster: {len(ck_faster_cases)}/{len(test_cases)} ({len(ck_faster_cases)/len(test_cases)*100:.1f}%)") + print(f"Average CK performance advantage: {100 - np.mean(ck_faster_ratios):.1f}%") + print(f"Median CK performance advantage: {100 - np.median(ck_faster_ratios):.1f}%") + print(f"Best CK performance advantage: {100 - np.min(ck_faster_ratios):.1f}%") + print(f"Worst CK performance advantage: {100 - np.max(ck_faster_ratios):.1f}%") + + print(f"\nTop CK kernels that outperform CK Tile:") + sorted_ck_kernels = sorted(ck_faster_kernels.items(), key=lambda x: x[1]['count'], reverse=True) + for kernel, stats in sorted_ck_kernels[:5]: # Show top 5 + avg_advantage = 100 - np.mean(stats['ratios']) + print(f" {kernel}: {stats['count']} wins, avg advantage: {avg_advantage:.1f}%") + + print(f"\nCK Tile kernels that lose most often:") + sorted_ck_tile_kernels = sorted(ck_tile_losing_kernels.items(), 
key=lambda x: x[1]['count'], reverse=True) + for kernel, stats in sorted_ck_tile_kernels[:5]: # Show top 5 + avg_disadvantage = np.mean(stats['ratios']) + print(f" {kernel}: {stats['count']} losses, avg performance: {avg_disadvantage:.1f}% of CK") + else: + print("No cases found where CK is faster than CK Tile.") + +def main(): + args = parse_cli_args() + + if (args.analyze_file): + print(f"Analyzing results from file: {args.analyze_file}") + run_analysis(args.analyze_file) + return + else: + print(f"Running profilers using test cases from file: {args.input_file}") + profiler_commands = get_profiler_commands(args.input_file) + print(f"Got {len(profiler_commands)} unique commands to run.") + + if not os.path.exists(args.results_path): + os.makedirs(args.results_path) + + results_file = os.path.join(args.results_path, f"ck_vs_ck_tile_results_{os.getpid()}.txt") + + for i, cmd in enumerate(profiler_commands): + cmd_concatenated_str = ' '.join(cmd) + print(f"\n####################################################################################################################") + print(f"Running command {i + 1}/{len(profiler_commands)}: {cmd_concatenated_str}") + print(f"######################################################################################################################") + with open(results_file, 'a') as f: + f.write(cmd_concatenated_str + "\n") + run_ck_profiler_cmd(cmd, ProfilerType.CK_TILE, args.bin_path, results_file, args.log_to_stdout) + + # For the old CK, we don't want to run verification. We assume CK already works correctly. + cmd[3] = '0' # Set verification flag to 0 (no verification) + + run_ck_profiler_cmd(cmd, ProfilerType.CK, args.bin_path, results_file, args.log_to_stdout) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/experimental/ck_tile_profiler/script/convert_miopen_driver_commands.py b/experimental/ck_tile_profiler/script/convert_miopen_driver_commands.py new file mode 100755 index 0000000000..df539d8abe --- /dev/null +++ b/experimental/ck_tile_profiler/script/convert_miopen_driver_commands.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 + +import sys +import argparse +import os + +def parse_miopen_command(miopen_cmd): + """Parse MIOpen driver command and extract parameters""" + # Remove 'convbfp16' or similar prefix and split into arguments + parts = miopen_cmd.strip().split() + if not parts: + return None + + # Skip the command name (convbfp16, convfp16, etc.) + args = parts[1:] if parts[0].startswith('conv') else parts + + params = {} + i = 0 + while i < len(args): + if args[i].startswith('-'): + key = args[i] + if i + 1 < len(args) and not args[i + 1].startswith('-'): + params[key] = args[i + 1] + i += 2 + else: + params[key] = True + i += 1 + else: + i += 1 + + return params + +def determine_operation_type(params): + """Determine the operation type based on MIOpen parameters""" + # TODO: Current data is for bwd weight. 
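+    # Editorial note (sketch, illustrative values): the string returned here becomes the
+    # first token of the generated ckTileProfiler command. With the positional layout built
+    # in convert_miopen_to_ck_profiler() below, an MIOpen driver line such as
+    #   convbfp16 -n 64 -c 128 -H 28 -W 28 -k 256 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -g 1
+    # converts to
+    #   grouped_conv_bwd_data 2 1 1 2 0 1 2 1 64 256 128 3 3 28 28 1 1 1 1 1 1 1 1 all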
+ return "grouped_conv_bwd_data" + #return "grouped_conv_bwd_weight" + #return "grouped_conv_fwd"#"grouped_conv_bwd_weight" + +def convert_miopen_to_ck_profiler(miopen_cmd): + """Convert MIOpen driver command to CK profiler command""" + params = parse_miopen_command(miopen_cmd) + if not params: + return None + + # Determine operation type + operation = determine_operation_type(params) + + data_type = 2 #2 for bwd data 2 FOR FWD 5 FOR BWD WEI # BF16 + layout = 1 #1 FIR BWD DATA 1 FOR FWD 2 FOR BWE WEI # channels last + verification = 1 # with verification + init_method = 2 # uniform data + print_output = 0 # no print output + time_kernel = 1 # time kernel + n_dim = 2 # 2D convolution by default + + # Build CK profiler command + ck_cmd = [operation, str(data_type), str(layout), str(verification), + str(init_method), str(print_output), str(time_kernel), str(n_dim)] + + # Add tensor dimensions + G = params.get('-g', '1') + N = params.get('-n', '1') + K = params.get('-k', '1') + C = params.get('-c', '1') + + ck_cmd.extend([G, N, K, C]) + + Y = params.get('-y', '1') + X = params.get('-x', '1') + ck_cmd.extend([Y, X]) + + # Input dimensions + Hi = params.get('-H', '1') + Wi = params.get('-W', '1') + ck_cmd.extend([Hi, Wi]) + + # Stride + stride_h = params.get('-u', '1') + stride_w = params.get('-v', '1') + ck_cmd.extend([stride_h, stride_w]) + + # Dilation + dilation_h = params.get('-l', '1') + dilation_w = params.get('-j', '1') + ck_cmd.extend([dilation_h, dilation_w]) + + # Padding + pad_h = params.get('-p', '0') + pad_w = params.get('-q', '0') + ck_cmd.extend([pad_h, pad_w, pad_h, pad_w]) # Assuming symmetric padding + + # Split-K + split_k = "all" + ck_cmd.append(split_k) + + return ' '.join(ck_cmd) + +def convert_file(input_file, output_file): + """Convert MIOpen commands from input file and save CK profiler commands to output file""" + converted_commands = [] + failed_conversions = [] + + try: + with open(input_file, 'r') as f: + lines = f.readlines() + + # Skip header line if present + start_idx = 0 + if lines and (lines[0].strip().lower() == 'shape' or 'conv' not in lines[0].lower()): + start_idx = 1 + + for line_num, line in enumerate(lines[start_idx:], start_idx + 1): + line = line.strip() + if not line or line.startswith('#'): + continue + + ck_cmd = convert_miopen_to_ck_profiler(line) + if ck_cmd: + converted_commands.append(ck_cmd) + else: + failed_conversions.append((line_num, line)) + + # Write converted commands to output file + with open(output_file, 'w') as f: + for cmd in converted_commands: + f.write(cmd + '\n') + + print(f"Conversion completed successfully!") + print(f"Input file: {input_file}") + print(f"Output file: {output_file}") + print(f"Converted {len(converted_commands)} commands") + + if failed_conversions: + print(f"\nFailed to convert {len(failed_conversions)} commands:") + for line_num, line in failed_conversions[:5]: # Show first 5 failures + print(f" Line {line_num}: {line[:80]}...") + if len(failed_conversions) > 5: + print(f" ... 
and {len(failed_conversions) - 5} more") + + return True + + except FileNotFoundError: + print(f"Error: Input file '{input_file}' not found") + return False + except Exception as e: + print(f"Error during conversion: {e}") + return False + +def generate_output_filename(input_file): + """Generate output filename based on input filename""" + base_name = os.path.splitext(input_file)[0] + return f"{base_name}_ck_profiler.txt" + +def parse_arguments(): + """Parse command line arguments""" + parser = argparse.ArgumentParser( + description='Convert MIOpen driver commands to CK profiler commands', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="""Examples: + python3 convert_miopen_driver_to_profiler.py miopen_commands.txt + python3 convert_miopen_driver_to_profiler.py miopen_commands.txt -o ck_commands.txt + python3 convert_miopen_driver_to_profiler.py --validate miopen_commands.txt""" + ) + + parser.add_argument('input_file', + help='Input file containing MIOpen driver commands') + parser.add_argument('-o', '--output', + help='Output file for CK profiler commands (default: auto-generated)') + parser.add_argument('--validate', action='store_true', + help='Validate converted commands (dry run)') + + return parser.parse_args() + +def main(): + args = parse_arguments() + print(args.input_file) + + # Check if input file exists + if not os.path.exists(args.input_file): + print(f"Error: Input file '{args.input_file}' does not exist") + sys.exit(1) + + print(args.input_file) + + # Generate output filename if not provided + output_file = args.output if args.output else generate_output_filename(args.input_file) + + # Validate mode - just show what would be converted + if args.validate: + print("Validation mode - showing first 5 conversions:") + try: + with open(args.input_file, 'r') as f: + lines = f.readlines()[:6] # Header + 5 commands + + start_idx = 1 if lines and 'conv' not in lines[0].lower() else 0 + + for i, line in enumerate(lines[start_idx:start_idx+5]): + line = line.strip() + if not line: + continue + + ck_cmd = convert_miopen_to_ck_profiler(line) + print(f"\n{i+1}. MIOpen: {line[:80]}{'...' if len(line) > 80 else ''}") + print(f" CK: {ck_cmd if ck_cmd else 'CONVERSION FAILED'}") + + print(f"\nWould write to: {output_file}") + except Exception as e: + print(f"Validation error: {e}") + + return + + # Perform the actual conversion + success = convert_file(args.input_file, output_file) + + if success: + print(f"\nConversion completed! 
You can now use the converted commands with:") + print(f" # For individual command testing:") + print(f" ./profiler/ck_tile/ckTileProfiler ") + print(f" ") + print(f" # For batch processing, you can create a script that reads from {os.path.basename(output_file)}") + else: + sys.exit(1) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/experimental/ck_tile_profiler/script/convert_old_ck_conv_bwd_data_to_ck_tile.py b/experimental/ck_tile_profiler/script/convert_old_ck_conv_bwd_data_to_ck_tile.py new file mode 100644 index 0000000000..32a62a8812 --- /dev/null +++ b/experimental/ck_tile_profiler/script/convert_old_ck_conv_bwd_data_to_ck_tile.py @@ -0,0 +1,128 @@ +import re + +def extract_template_parameters(template_str): + # Extract everything inside the outermost <> + match = re.search(r"<(.*)>", template_str, re.DOTALL) + if not match: + return [] + + inside = match.group(1).strip() + + params = [] + current = [] + depth = 0 # track nested < > + + for char in inside: + if char == '<': + depth += 1 + current.append(char) + elif char == '>': + depth -= 1 + current.append(char) + elif char == ',' and depth == 0: + param = ''.join(current).strip() + if param: + params.append(param) + current = [] + else: + current.append(char) + + # Append last parameter if any + if current: + params.append(''.join(current).strip()) + + return params + + +input_path = "inputkernel.txt" +output_path = "outputkernel_bwd_data.txt" + +with open(input_path, 'r') as f: + lines = f.readlines() + +for line in lines: + + # Example usage + #input_str = " DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>" + + params = extract_template_parameters(line) + + NDimSpatial = params[0] + ALayout = params[1] + BLayout = params[2] + DsLayout = params[3] + ELayout = params[4] + + ADataType = params[5] + BDataType = params[6] + AccDataType = params[7] + CshuffleDataType= params[8] + DsDataTypes = params[9] + EDataType = params[10] + + AElementwiseOp = params[11] + BElementwiseOp = params[12] + CElementwiseOp = "PassThrough"#params[13] + ConvFwdSpec = params[14] + + DoPadGemmM = params[15] + DoPadGemmN = params[16] + NumGemmK = params[17] + + BlockSize = params[18] + MPerBlock = params[19] + NPerBlock = params[20] + KPerBlock = params[21] + AK1 = params[22] + BK1 = params[23] + MPerXDL = params[24] + NPerXDL = params[25] + MXdlPerWave = params[26] + NXdlPerWave = params[27] + ABlockTransferClusterLengths = params[28] + ABlockTransferArrangeOrder = params[29] + ABlockTransferSrcAccessOrder = params[30] + ABlockTransferSrcVectorDim = params[31] + ABlockTransferSrcScalarPerVector = params[32] + ABlockTransferDstScalarPerVector_K1 = params[33] + ABlockLdsAddExtraM = params[34] + BBlockTransferClusterLengths = params[35] + BBlockTransferArrangeOrder = params[36] + BBlockTransferSrcVectorDim = params[37] + BBlockTransferSrcAccessOrder = params[38] + BBlockTransferSrcScalarPerVector = params[39] + BBlockTransferDstScalarPerVector_K1 = params[40] + BBlockLdsAddExtraM = params[41] + CShuffleMXdlPerwave = params[42] + CShuffleNXdlPerWavePerShuffle = params[43] + CBlockTransferClusterLengths = params[44] + CBlockTransferScalarPerVector = params[45] + + + KBlockPerCu = 1 + MWarp = int(MPerBlock) // (int(MPerXDL) * int(MXdlPerWave)) + NWarp = int(NPerBlock) // (int(NPerXDL) * int(NXdlPerWave)) + KWarp = 1 + KPerXdl = 16 if MPerXDL == "32" else 32 + DoubleSMemBuffer = 'false' + GemmPipelineVersion = 
"CK_TILE_PIPELINE_COMPUTE_V3" + + print(MPerBlock, NPerBlock, KPerBlock) + + pipelines = ["CK_TILE_PIPELINE_MEMORY", "CK_TILE_PIPELINE_COMPUTE_V3", "CK_TILE_PIPELINE_COMPUTE_V4"] + + for pipeline in pipelines: + DoubleSMemBuffer = 'false' if pipeline != 'CK_TILE_PIPELINE_COMPUTE_V4' else 'true' + with open(output_path, 'a') as f: + f.write(f'GroupedConvolutionBackwardDataInvoker<{NDimSpatial}, {ALayout}, {BLayout}, {ELayout}, {ADataType},' + f'{BDataType}, {EDataType}, {AElementwiseOp}, {BElementwiseOp}, {CElementwiseOp},' + f'{KBlockPerCu}, {MPerBlock}, {NPerBlock}, {KPerBlock}, {MWarp}, {NWarp}, {KWarp},' + f'{MPerXDL}, {NPerXDL}, {KPerXdl}, {ABlockTransferSrcScalarPerVector}, {BBlockTransferSrcScalarPerVector},' + f'{CBlockTransferScalarPerVector}, {DoubleSMemBuffer}, {pipeline}>,\n') + + +# print(params[0]) + +# # Print each parameter as a separate variable +# for i, p in enumerate(params, start=1): +# print(f"param_{i} = '{p}'") \ No newline at end of file diff --git a/experimental/ck_tile_profiler/script/convert_old_ck_conv_bwd_wei_to_ck_tile.py b/experimental/ck_tile_profiler/script/convert_old_ck_conv_bwd_wei_to_ck_tile.py new file mode 100644 index 0000000000..b8b2f0832c --- /dev/null +++ b/experimental/ck_tile_profiler/script/convert_old_ck_conv_bwd_wei_to_ck_tile.py @@ -0,0 +1,120 @@ +import re + +def extract_template_parameters(template_str): + # Extract everything inside the outermost <> + match = re.search(r"<(.*)>", template_str, re.DOTALL) + if not match: + return [] + + inside = match.group(1).strip() + + params = [] + current = [] + depth = 0 # track nested < > + + for char in inside: + if char == '<': + depth += 1 + current.append(char) + elif char == '>': + depth -= 1 + current.append(char) + elif char == ',' and depth == 0: + param = ''.join(current).strip() + if param: + params.append(param) + current = [] + else: + current.append(char) + + # Append last parameter if any + if current: + params.append(''.join(current).strip()) + + return params + + +input_path = "inputkernel.txt" +output_path = "outputkernel_bwd_wei.txt" + +with open(input_path, 'r') as f: + lines = f.readlines() + +for line in lines: + + # Example usage + #input_str = " DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>" + + params = extract_template_parameters(line) + + NDimSpatial = params[0] + ALayout = params[1] + BLayout = params[2] + ELayout = params[3] + + ADataType = params[4] + BDataType = params[5] + EDataType = params[6] + AccDataType = params[7] + + AElementwiseOp = params[8] + BElementwiseOp = params[9] + CElementwiseOp = "PassThrough"#params[13] + ConvFwdSpec = params[11] + + BlockSize = params[12] + MPerBlock = params[13] + NPerBlock = params[14] + KPerBlock = params[15] + K1 = params[16] + MPerXDL = params[17] + NPerXDL = params[18] + MXdlPerWave = params[19] + NXdlPerWave = params[20] + ABlockTransferClusterLengths = params[21] + ABlockTransferArrangeOrder = params[22] + ABlockTransferSrcAccessOrder = params[23] + ABlockTransferSrcVectorDim = params[24] + ABlockTransferSrcScalarPerVector = params[25] + ABlockTransferDstScalarPerVector_K1 = params[26] + ABlockLdsAddExtraM = params[27] + BBlockTransferClusterLengths = params[28] + BBlockTransferArrangeOrder = params[29] + BBlockTransferSrcVectorDim = params[30] + BBlockTransferSrcAccessOrder = params[31] + BBlockTransferSrcScalarPerVector = params[32] + BBlockTransferDstScalarPerVector_K1 = params[33] + BBlockLdsAddExtraM = 
params[34] + CShuffleMXdlPerwave = params[35] + CShuffleNXdlPerWavePerShuffle = params[36] + CBlockTransferClusterLengths = params[37] + CBlockTransferScalarPerVector = params[38] + + + KBlockPerCu = 1 + MWarp = int(MPerBlock) // (int(MPerXDL) * int(MXdlPerWave)) + NWarp = int(NPerBlock) // (int(NPerXDL) * int(NXdlPerWave)) + KWarp = 1 + KPerXdl = 16 if MPerXDL == "32" else 32 + DoubleSMemBuffer = 'false' + GemmPipelineVersion = "CK_TILE_PIPELINE_COMPUTE_V3" + + print(MPerBlock, NPerBlock, KPerBlock) + + pipelines = ["CK_TILE_PIPELINE_MEMORY", "CK_TILE_PIPELINE_COMPUTE_V3", "CK_TILE_PIPELINE_COMPUTE_V4"] + + for pipeline in pipelines: + DoubleSMemBuffer = 'false' if pipeline != 'CK_TILE_PIPELINE_COMPUTE_V4' else 'true' + with open(output_path, 'a') as f: + f.write(f'GroupedConvolutionBackwardWeightInvoker<{NDimSpatial}, {ALayout}, {BLayout}, {ELayout}, {ADataType},' + f'{BDataType}, {EDataType}, {AElementwiseOp}, {BElementwiseOp}, {CElementwiseOp},' + f'{KBlockPerCu}, {MPerBlock}, {NPerBlock}, {KPerBlock}, {MWarp}, {NWarp}, {KWarp},' + f'{MPerXDL}, {NPerXDL}, {KPerXdl}, {ABlockTransferSrcScalarPerVector}, {BBlockTransferSrcScalarPerVector},' + f'{CBlockTransferScalarPerVector}, {DoubleSMemBuffer}, {pipeline}>,\n') + + +# print(params[0]) + +# # Print each parameter as a separate variable +# for i, p in enumerate(params, start=1): +# print(f"param_{i} = '{p}'") \ No newline at end of file diff --git a/experimental/ck_tile_profiler/script/convert_old_ck_conv_fwd_to_ck_tile.py b/experimental/ck_tile_profiler/script/convert_old_ck_conv_fwd_to_ck_tile.py new file mode 100644 index 0000000000..69cb949b86 --- /dev/null +++ b/experimental/ck_tile_profiler/script/convert_old_ck_conv_fwd_to_ck_tile.py @@ -0,0 +1,123 @@ +import re + +def extract_template_parameters(template_str): + # Extract everything inside the outermost <> + match = re.search(r"<(.*)>", template_str, re.DOTALL) + if not match: + return [] + + inside = match.group(1).strip() + + params = [] + current = [] + depth = 0 # track nested < > + + for char in inside: + if char == '<': + depth += 1 + current.append(char) + elif char == '>': + depth -= 1 + current.append(char) + elif char == ',' and depth == 0: + param = ''.join(current).strip() + if param: + params.append(param) + current = [] + else: + current.append(char) + + # Append last parameter if any + if current: + params.append(''.join(current).strip()) + + return params + + +input_path = "inputkernel_fwd.txt" +output_path = "outputkernel_fwd.txt" + +with open(input_path, 'r') as f: + lines = f.readlines() + +for line in lines: + print(1) + params = extract_template_parameters(line) + print(1) + NDimSpatial = params[0] + ALayout = params[1] + BLayout = params[2] + DsLayout = params[3] + ELayout = params[4] + ADataType = params[5] + BDataType = params[6] + AccDataType = params[7] + CShuffleDataType= params[8] + DsDataTypes = params[9] + EDataType = params[10] + AElementwiseOp = params[11] + BElementwiseOp = params[12] + CElementwiseOp = "PassThrough"#params[13] + ConvFwdSpec = params[14] + GemmSpec = params[15] + NummGemmKPref = params[16] + BlockSize = params[17] + MPerBlock = params[18] + NPerBlock = params[19] + KPerBlock = params[20] + AK1 = params[21] + BK1 = params[22] + MPerXDL = params[23] + NPerXDL = params[24] + MXdlPerWave = params[25] + NXdlPerWave = params[26] + ABlockTransferClusterLengths = params[27] + ABlockTransferArrangeOrder = params[28] + ABlockTransferSrcAccessOrder = params[29] + ABlockTransferSrcVectorDim = params[30] + ABlockTransferSrcScalarPerVector = 
params[31]
+    ABlockTransferDstScalarPerVector_K1 = params[32]
+    ABlockLdsAddExtraM = params[33]
+    BBlockTransferClusterLengths = params[34]
+    BBlockTransferArrangeOrder = params[35]
+    BBlockTransferSrcVectorDim = params[36]
+    BBlockTransferSrcAccessOrder = params[37]
+    BBlockTransferSrcScalarPerVector = params[38]
+    BBlockTransferDstScalarPerVector_K1 = params[39]
+    BBlockLdsAddExtraM = params[40]
+    CShuffleMXdlPerwave = params[41]
+    CShuffleNXdlPerWavePerShuffle = params[42]
+    CBlockTransferClusterLengths = params[43]
+    CBlockTransferScalarPerVector = params[44]
+
+    print(1)
+    KBlockPerCu = 1
+    MWarp = int(MPerBlock) // (int(MPerXDL) * int(MXdlPerWave))
+    NWarp = int(NPerBlock) // (int(NPerXDL) * int(NXdlPerWave))
+    KWarp = 1
+    KPerXdl = 16 if MPerXDL == "32" else 32
+    DoubleSMemBuffer = 'false'
+    GemmPipelineVersion = "CK_TILE_PIPELINE_COMPUTE_V3"
+
+    pipelines = ["CK_TILE_PIPELINE_MEMORY", "CK_TILE_PIPELINE_COMPUTE_V3", "CK_TILE_PIPELINE_COMPUTE_V4"]
+    convspecs = ["Filter1x1Stride1Pad0", "Filter1x1Pad0", "Filter3x3", "Default"]
+
+    for pipeline in pipelines:
+        print(1)
+        for convSpec in convspecs:
+            DoubleSMemBuffer = 'false' if pipeline != 'CK_TILE_PIPELINE_COMPUTE_V4' else 'true'
+            with open(output_path, 'a') as f:
+                f.write(f'GroupedConvolutionForwardInvoker<{NDimSpatial}, {ALayout}, {BLayout}, {ELayout}, {ADataType},'
+                        f'{BDataType}, {EDataType}, {AElementwiseOp}, {BElementwiseOp}, {CElementwiseOp}, ConvolutionSpecialization::{convSpec},'
+                        f'{KBlockPerCu}, {MPerBlock}, {NPerBlock}, {KPerBlock}, {MWarp}, {NWarp}, {KWarp},'
+                        f'{MPerXDL}, {NPerXDL}, {KPerXdl}, {ABlockTransferSrcScalarPerVector}, {BBlockTransferSrcScalarPerVector},'
+                        f'{CBlockTransferScalarPerVector}, {DoubleSMemBuffer}, {pipeline}>,\n')
+
+    print(1)
+
+
+# print(params[0])
+
+# # Print each parameter as a separate variable
+# for i, p in enumerate(params, start=1):
+#     print(f"param_{i} = '{p}'")
\ No newline at end of file
diff --git a/experimental/ck_tile_profiler/src/CMakeLists.txt b/experimental/ck_tile_profiler/src/CMakeLists.txt
new file mode 100644
index 0000000000..dadfc19ce4
--- /dev/null
+++ b/experimental/ck_tile_profiler/src/CMakeLists.txt
@@ -0,0 +1,261 @@
+# ckTileProfiler
+
+
+add_library(ck_tile_grouped_conv_bwd_data_bf16_instances STATIC
+    tile_grouped_conv_bwd_data_bf16_instances.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_data_bf16_instances_2 STATIC
+    tile_grouped_conv_bwd_data_bf16_instances_2.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_data_bf16_instances_3 STATIC
+    tile_grouped_conv_bwd_data_bf16_instances_3.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_data_bf16_instances_4 STATIC
+    tile_grouped_conv_bwd_data_bf16_instances_4.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_data_bf16_instances_5 STATIC
+    tile_grouped_conv_bwd_data_bf16_instances_5.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_data_bf16_instances_6 STATIC
+    tile_grouped_conv_bwd_data_bf16_instances_6.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_data_bf16_instances_7 STATIC
+    tile_grouped_conv_bwd_data_bf16_instances_7.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_weight_fp16_instances STATIC
+    tile_grouped_conv_bwd_weight_fp16_instances.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_weight_bf16_instances STATIC
+    tile_grouped_conv_bwd_weight_bf16_instances.cpp
+)
+
+add_library(ck_tile_grouped_conv_bwd_weight_bf16_instances_opt STATIC
+    tile_grouped_conv_bwd_weight_bf16_instances_opt.cpp
+)
+
+add_library(ck_tile_grouped_conv_fwd_fp16_instances STATIC
+    tile_grouped_conv_fwd_fp16_instances.cpp
+)
+
+add_library(ck_tile_grouped_conv_fwd_bf16_instances STATIC + tile_grouped_conv_fwd_bf16_instances.cpp +) + +add_library(ck_tile_grouped_conv_fwd_bf16_instances_2 STATIC + tile_grouped_conv_fwd_bf16_instances_2.cpp +) + +add_library(ck_tile_grouped_conv_fwd_bf16_instances_3 STATIC + tile_grouped_conv_fwd_bf16_instances_3.cpp +) + +add_library(ck_tile_grouped_conv_fwd_bf16_instances_4 STATIC + tile_grouped_conv_fwd_bf16_instances_4.cpp +) + +add_library(ck_tile_grouped_conv_fwd_bf16_instances_5 STATIC + tile_grouped_conv_fwd_bf16_instances_5.cpp +) + +add_library(ck_tile_grouped_conv_fwd_bf16_instances_6 STATIC + tile_grouped_conv_fwd_bf16_instances_6.cpp +) + +target_include_directories(ck_tile_grouped_conv_bwd_data_bf16_instances PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_data_bf16_instances_2 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_data_bf16_instances_3 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_data_bf16_instances_4 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_data_bf16_instances_5 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_data_bf16_instances_6 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_data_bf16_instances_7 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_weight_fp16_instances PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_weight_bf16_instances PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_bwd_weight_bf16_instances_opt PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_fwd_fp16_instances PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances_2 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + 
+target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances_3 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances_4 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances_5 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + +target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances_6 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include +) + + +set(CK_PROFILER_OP_FILTER "" CACHE STRING "Filter for the operators to be profiled. Default is to include all") +set(CK_PROFILER_INSTANCE_FILTER "" CACHE STRING "Filter for the kernels instances to be profiled. Default is to be the same as the operator filter") +if (CK_PROFILER_OP_FILTER STREQUAL "") + set(CK_PROFILER_OP_FILTER ".+") +endif() +if (CK_PROFILER_INSTANCE_FILTER STREQUAL "") + set(CK_PROFILER_INSTANCE_FILTER ${CK_PROFILER_OP_FILTER}) +endif() +message(STATUS "CK_PROFILER_OP_FILTER: ${CK_PROFILER_OP_FILTER}") +message(STATUS "CK_PROFILER_INSTANCE_FILTER: ${CK_PROFILER_INSTANCE_FILTER}") + +if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]") + list(APPEND PROFILER_OPS tile_profile_grouped_conv_bwd_data.cpp) + list(APPEND PROFILER_OPS tile_profile_grouped_conv_bwd_weight.cpp) + #list(APPEND PROFILER_OPS tile_profile_grouped_conv_fwd.cpp) +endif() + +set(PROFILER_SOURCES tile_profiler.cpp) +foreach(SOURCE ${PROFILER_OPS}) + string(REGEX REPLACE "tile_profile_(.+)\.cpp" "\\1" OP_NAME ${SOURCE}) + if (OP_NAME STREQUAL "") + message(FATAL_ERROR "Unexpected source file name: ${SOURCE}") + endif() + if("${OP_NAME}" MATCHES "${CK_PROFILER_OP_FILTER}") + list(APPEND PROFILER_SOURCES ${SOURCE}) + endif() +endforeach() +message(VERBOSE "ckTileProfiler sources: ${PROFILER_SOURCES}") + +set(PROFILER_EXECUTABLE ckTileProfiler) + +add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES}) +target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors) +# flags to compress the library +if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132) + message(DEBUG "Adding --offload-compress flag for ${PROFILER_EXECUTABLE}") + target_compile_options(${PROFILER_EXECUTABLE} PRIVATE --offload-compress) +endif() + +set(DEVICE_INSTANCES "") +if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]") + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_data_bf16_instances) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_data_bf16_instances_2) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_data_bf16_instances_3) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_data_bf16_instances_4) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_data_bf16_instances_5) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_data_bf16_instances_6) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_data_bf16_instances_7) + + #list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_weight_fp16_instances) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_weight_bf16_instances) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_weight_bf16_instances_opt) 
+ #list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_fp16_instances) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances_2) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances_3) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances_4) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances_5) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances_6) + +endif() + +if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]") +endif() + +if(DL_KERNELS) +endif() + +set(PROFILER_LIBS utility getopt::getopt) +foreach(LIB ${DEVICE_INSTANCES}) + # Handle both traditional device_*_instance and ck_tile_*_instances patterns + string(REGEX REPLACE "device_(.+)_instance" "\\1" INSTANCE_NAME ${LIB}) + if (INSTANCE_NAME STREQUAL ${LIB}) + # If no match, try ck_tile pattern + string(REGEX REPLACE "ck_tile_(.+)_instances" "\\1" INSTANCE_NAME ${LIB}) + endif() + if (INSTANCE_NAME STREQUAL "") + message(FATAL_ERROR "Unexpected kernel instance name: ${LIB}") + endif() + if("${INSTANCE_NAME}" MATCHES "${CK_PROFILER_INSTANCE_FILTER}") + list(APPEND PROFILER_LIBS ${LIB}) + endif() +endforeach() +message(VERBOSE "ckTileProfiler libs: ${PROFILER_LIBS}") +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE ${PROFILER_LIBS}) + +rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances.cpp new file mode 100644 index 0000000000..d5dc079675 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_data_bf16_instances(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_data_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_2.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_2.cpp new file mode 100644 index 0000000000..8e74524b71 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_2.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_2.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_data_bf16_instances_2(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_data_bf16_instances_2< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_3.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_3.cpp new file mode 100644 index 0000000000..23a2967f41 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_3.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_3.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_data_bf16_instances_3(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_data_bf16_instances_3< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_4.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_4.cpp new file mode 100644 index 0000000000..4bdb3e3b84 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_4.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_4.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_data_bf16_instances_4(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_data_bf16_instances_4< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_5.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_5.cpp new file mode 100644 index 0000000000..59e50a9d25 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_5.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_5.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_data_bf16_instances_5(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_data_bf16_instances_5< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_6.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_6.cpp new file mode 100644 index 0000000000..d29fdc59ea --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_6.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_6.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_data_bf16_instances_6(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_data_bf16_instances_6< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_7.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_7.cpp new file mode 100644 index 0000000000..7bce8604a3 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_data_bf16_instances_7.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_data_bf16_instances_7.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_data_bf16_instances_7(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_data_bf16_instances_7< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_bf16_instances.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_bf16_instances.cpp new file mode 100644 index 0000000000..8c7b580a2e --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_bf16_instances.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_weight_bf16_instances.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_weight_bf16_instances(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_weight_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_bf16_instances_opt.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_bf16_instances_opt.cpp new file mode 100644 index 0000000000..a5702ae901 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_bf16_instances_opt.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_weight_bf16_instances_opt.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_weight_bf16_instances_opt(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_weight_bf16_instances_opt< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_fp16_instances.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_fp16_instances.cpp new file mode 100644 index 0000000000..c7074c4202 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_bwd_weight_fp16_instances.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_bwd_weight_fp16_instances.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_weight_f16_instances(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_weight_f16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances.cpp new file mode 100644 index 0000000000..a85920290d --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_bf16_instances(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_2.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_2.cpp new file mode 100644 index 0000000000..7fe133fb4b --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_2.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_2.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_bf16_instances_2(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_bf16_instances_2< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_3.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_3.cpp new file mode 100644 index 0000000000..5d99b597ea --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_3.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_3.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_bf16_instances_3(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_bf16_instances_3< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_4.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_4.cpp new file mode 100644 index 0000000000..d5f10adfe1 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_4.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_4.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_bf16_instances_4(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_bf16_instances_4< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_5.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_5.cpp new file mode 100644 index 0000000000..ccd68f2fb6 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_5.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_5.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_bf16_instances_5(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_bf16_instances_5< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_6.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_6.cpp new file mode 100644 index 0000000000..b909d7730a --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_bf16_instances_6.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_bf16_instances_6.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_bf16_instances_6(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_bf16_instances_6< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_fp16_instances.cpp b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_fp16_instances.cpp new file mode 100644 index 0000000000..799aa44816 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_grouped_conv_fwd_fp16_instances.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile_profiler/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile_profiler/tile_grouped_conv_fwd_fp16_instances.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_f16_instances(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_fp16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_data.cpp b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_data.cpp new file mode 100644 index 0000000000..b69745d3eb --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_data.cpp @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include + +#include "ck_tile_profiler/tile_profile_grouped_conv_bwd_data_impl.hpp" +#include "ck_tile_profiler/tile_profiler_operation_registry.hpp" + +// CK Tile library dependencies +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" + +namespace { + +enum struct ConvLayout +{ + GNHWC_GKYXC_GNHWK, // 0 + NHWGC_GKYXC_NHWGK, // 1 + NGCHW_GKYXC_NGKHW, // 2 + NGCHW_GKCYX_NGKHW, // 3 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 +}; + +#define OP_NAME "grouped_conv_bwd_data" +#define OP_DESC "Grouped Convolution Backward Data" + +static void print_helper_msg() +{ + std::string conv_param_parser_helper_msg; + + conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n" + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; + + std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight fp32, Output bf16\n" + << " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n" + << " 4: Input int8, Weight int8, Output int8\n" + << " 5: Input bf16, Weight bf16, Output bf16\n" + << " 6: Input fp32, Weight fp32, Output fp32, Compute tf32)\n" + << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, " + "N, K, Ho, Wo]\n" + << " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, " + "N, Ho, Wo, K]\n" + << " 2: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, " + "Ho, Wo, G, K]\n" + << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, " + "G, K, Ho, Wo]\n" + << " 4: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " + "G, K, Ho, Wo]\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << conv_param_parser_helper_msg + << " SplitK (-1 for internally computed split-K value, positive value to set k " + "batches explicitly, or 'all' to test all internal split-K values)\n" + << std::endl; +} + +} // namespace + +int tile_profile_grouped_conv_bwd_data(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) + { + print_helper_msg(); + return 1; + } + + const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv); + + const auto& split_k = std::string(argv[8 + 1 + 4 + 6 * num_dim_spatial]); + + // using F32 = float; + // using F16 = ck_tile::half_t; + using BF16 = ck_tile::bfloat16_t; + // using F8 = ck_tile::fp8_t; + // using BF8 = ck_tile::bf8_t; + + using NHWGC = 
ck_tile::tensor_layout::convolution::NHWGC; + // using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + // using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + // using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + constexpr auto I2 = ck_tile::number<2>{}; + // constexpr auto I3 = ck_tile::number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type, + auto compute_type_a, + auto compute_type_b) { + constexpr ck_tile::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + using ComputeTypeA = decltype(compute_type_a); + using ComputeTypeB = decltype(compute_type_b); + + bool pass = ck_tile::profiler::profile_grouped_conv_bwd_data_impl( + do_verification, init_method, do_log, time_kernel, params, split_k); + + return pass ? 0 : 1; + }; + + if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + // if(data_type == ConvDataType::F32_F32_F32) + // { + // return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + // } + // if(data_type == ConvDataType::F16_F16_F16) + // { + // return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + // } + // if(data_type == ConvDataType::BF16_F32_BF16) + // { + // // fp32 atomic add is used for weight tensor in bf16 kernel + // return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + // } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + } + + // if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + // { + // if(data_type == ConvDataType::F32_F32_F32) + // { + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + // } + // if(data_type == ConvDataType::F16_F16_F16) + // { + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + // } + // if(data_type == ConvDataType::BF16_F32_BF16) + // { + // // fp32 atomic add is used for weight tensor in bf16 kernel + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, + // BF16{}); + // } + // if(data_type == ConvDataType::BF16_BF16_BF16) + // { + // return profile( + // I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + // } + // if(data_type == ConvDataType::F16_F16_F16_BF8_F8) + // { + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{}); + // } + // else if(data_type == ConvDataType::I8_I8_I8) + // { + // return profile( + // I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, + // int8_t{}); + // } + // } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, tile_profile_grouped_conv_bwd_data); diff --git a/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_weight.cpp b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_weight.cpp new file mode 100644 index 0000000000..086c1e8492 --- /dev/null +++ 
b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_weight.cpp @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck_tile_profiler/tile_profile_grouped_conv_bwd_weight_impl.hpp" +#include "ck_tile_profiler/tile_profiler_operation_registry.hpp" + +// CK Tile library dependencies +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" + +namespace { + +enum struct ConvLayout +{ + GNCHW_GKCYX_GNKHW, // 0 + GNHWC_GKYXC_GNHWK, // 1 + NHWGC_GKYXC_NHWGK, // 2 + NGCHW_GKYXC_NGKHW, // 3 + NGCHW_GKCYX_NGKHW, // 4 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_F32_BF16, // 2 + F16_F16_F16_BF8_F8, // 3 + I8_I8_I8, // 4 + BF16_BF16_BF16, // 5 + F32_F32_F32_TF32, // 6 +}; + +#define OP_NAME "grouped_conv_bwd_weight" +#define OP_DESC "Grouped Convolution Backward Weight" + +static void print_helper_msg() +{ + std::string conv_param_parser_helper_msg; + + conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n" + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; + + std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight fp32, Output bf16\n" + << " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n" + << " 4: Input int8, Weight int8, Output int8\n" + << " 5: Input bf16, Weight bf16, Output bf16\n" + << " 6: Input fp32, Weight fp32, Output fp32, Compute tf32)\n" + << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, " + "N, K, Ho, Wo]\n" + << " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, " + "N, Ho, Wo, K]\n" + << " 2: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, " + "Ho, Wo, G, K]\n" + << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, " + "G, K, Ho, Wo]\n" + << " 4: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " + "G, K, Ho, Wo]\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << conv_param_parser_helper_msg + << " SplitK (-1 for internally computed split-K value, positive value to set k " + "batches explicitly, or 'all' to test all internal split-K values)\n" + << std::endl; +} + +} // namespace + +int tile_profile_grouped_conv_bwd_weight(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) + { + print_helper_msg(); + return 1; + } + + const auto params = 
ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv); + + const auto& split_k = std::string(argv[8 + 1 + 4 + 6 * num_dim_spatial]); + + using F32 = float; + using F16 = ck_tile::half_t; + using BF16 = ck_tile::bfloat16_t; + using F8 = ck_tile::fp8_t; + using BF8 = ck_tile::bf8_t; + + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + constexpr auto I2 = ck_tile::number<2>{}; + constexpr auto I3 = ck_tile::number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type, + auto compute_type_a, + auto compute_type_b) { + constexpr ck_tile::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + using ComputeTypeA = decltype(compute_type_a); + using ComputeTypeB = decltype(compute_type_b); + + bool pass = ck_tile::profiler::profile_grouped_conv_bwd_weight_impl( + do_verification, init_method, do_log, time_kernel, params, split_k); + + return pass ? 0 : 1; + }; + + if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + } + + if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::F16_F16_F16_BF8_F8) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{}); + } + else if(data_type == ConvDataType::I8_I8_I8) + { + return profile( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}); + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, tile_profile_grouped_conv_bwd_weight); diff --git a/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp 
b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp new file mode 100644 index 0000000000..b5854e48f2 --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck_tile_profiler/tile_profile_grouped_conv_fwd_impl.hpp" +#include "ck_tile_profiler/tile_profiler_operation_registry.hpp" + +// CK Tile library dependencies +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" + +namespace { + +enum struct ConvLayout +{ + GNHWC_GKYXC_GNHWK, // 0 + NHWGC_GKYXC_NHWGK, // 1 + NGCHW_GKYXC_NGKHW, // 2 + NGCHW_GKCYX_NGKHW, // 3 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + I8_I8_I8, // 3 +}; + +#define OP_NAME "grouped_conv_fwd" +#define OP_DESC "Grouped Convolution Forward" + +static void print_helper_msg() +{ + std::string conv_param_parser_helper_msg; + + conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n" + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; + + std::cout + // clang-format off + << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight bf16, Output bf16\n" + << " 3: Input int8, Weight int8, Output int8\n" + << " 4: Input fp8, Weight fp8, Output fp8\n" + << " 5: Input bf8, Weight bf8, Output fp8\n" + << " 6: Input fp8, Weight bf8, Output fp8\n" + << " 7: Input bf8, Weight fp8, Output fp8\n" + << " 8: Input fp32, Weight fp32, Output fp32, Compute tf32)\n" + << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n" + << " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]\n" + << " 2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, " + "G, K, Ho, Wo]\n" + << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " + "G, K, Ho, Wo])\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << conv_param_parser_helper_msg << std::endl; + // clang-format on +} + +} // namespace + +int tile_profile_grouped_conv_fwd(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) + { + std::cout << argc << std::endl; + print_helper_msg(); + return 1; + } + + const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv); + constexpr ck_tile::index_t k_batch = 1; + + using F32 
= float; + using F16 = ck_tile::half_t; + using BF16 = ck_tile::bfloat16_t; + + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + constexpr auto I2 = ck_tile::number<2>{}; + constexpr auto I3 = ck_tile::number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type, + auto compute_type_a, + auto compute_type_b) { + constexpr ck_tile::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + using ComputeTypeA = decltype(compute_type_a); + using ComputeTypeB = decltype(compute_type_b); + + bool pass = ck_tile::profiler::profile_grouped_conv_fwd_impl( + do_verification, init_method, do_log, time_kernel, params, k_batch); + + return pass ? 0 : 1; + }; + + if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + } + + if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::I8_I8_I8) + { + return profile( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}); + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, tile_profile_grouped_conv_fwd); diff --git a/experimental/ck_tile_profiler/src/tile_profiler.cpp b/experimental/ck_tile_profiler/src/tile_profiler.cpp new file mode 100644 index 0000000000..18bf0a7aff --- /dev/null +++ b/experimental/ck_tile_profiler/src/tile_profiler.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include + +#include "ck_tile_profiler/tile_profiler_operation_registry.hpp" + +static void print_helper_message() +{ + std::cout << "arg1: tensor operation " << ProfilerOperationRegistry::GetInstance() << std::endl; +} + +int main(int argc, char* argv[]) +{ + if(argc == 1) + { + print_helper_message(); + } + else if(const auto operation = ProfilerOperationRegistry::GetInstance().Get(argv[1]); + operation.has_value()) + { + return (*operation)(argc, argv); + } + else + { + std::cerr << "cannot find operation: " << argv[1] << std::endl; + return EXIT_FAILURE; + } +} diff --git a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp index 5b00e53af8..c4680a60d8 100644 --- a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp +++ b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp @@ -5,6 +5,8 @@ #include "ck_tile/core.hpp" #include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp" +#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp" #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" namespace ck_tile {
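Note: tile_profiler.cpp dispatches through ProfilerOperationRegistry and the REGISTER_PROFILER_OPERATION macro, but tile_profiler_operation_registry.hpp itself is not part of this change. The following is only a rough sketch of a registry shape that would support the lookup and self-registration used above; the class name is taken from the call sites, while the member names, the Operation alias, and the macro body are assumptions, not the contents of the real header.

// Illustrative sketch only; signatures and member names are assumed, not taken
// from tile_profiler_operation_registry.hpp (which is not shown in this diff).
#include <functional>
#include <map>
#include <optional>
#include <ostream>
#include <string>

class ProfilerOperationRegistry
{
    public:
    using Operation = std::function<int(int, char*[])>;

    static ProfilerOperationRegistry& GetInstance()
    {
        static ProfilerOperationRegistry instance; // constructed on first use
        return instance;
    }

    bool Add(const std::string& name, Operation op)
    {
        // returns false if an operation with this name was already registered
        return operations_.emplace(name, std::move(op)).second;
    }

    std::optional<Operation> Get(const std::string& name) const
    {
        const auto it = operations_.find(name);
        if(it == operations_.end())
            return std::nullopt;
        return it->second;
    }

    // The help message streams the registry, so list the registered names.
    friend std::ostream& operator<<(std::ostream& os, const ProfilerOperationRegistry& registry)
    {
        for(const auto& [name, op] : registry.operations_)
            os << "\n\t" << name;
        return os;
    }

    private:
    std::map<std::string, Operation> operations_;
};

// desc is accepted for parity with the call sites but unused in this sketch.
#define REGISTER_PROFILER_OPERATION(name, desc, fn) \
    static const bool fn##_registered =             \
        ProfilerOperationRegistry::GetInstance().Add(name, fn)

With a registry along these lines, the REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, ...) line at the end of each tile_profile_*.cpp translation unit registers its entry point at static-initialization time, so main() in tile_profiler.cpp can resolve argv[1] without a central dispatch table.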