@@ -147,11 +147,24 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_
147147 DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple<Row, Col>, Row, F8, F8, Tuple<F32, F32>, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256 , 64 , 256 , 512 , 16 , 16 , 16 , 16 , 4 , 4 , S<32 , 8 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , S<32 , 8 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , 2 , 1 , S<1 , 32 , 1 , 8 >, S<8 , 8 , 1 >, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
148148 DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple<Row, Col>, Row, F8, F8, Tuple<F32, F32>, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256 , 32 , 256 , 512 , 16 , 16 , 16 , 16 , 2 , 4 , S<32 , 8 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , S<32 , 8 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , 2 , 1 , S<1 , 32 , 1 , 8 >, S<8 , 8 , 1 >, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
149149 // N 512
150- DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple<Row, Col>, Row, F8, F8, Tuple<F32, F32>, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256 , 64 , 512 , 256 , 16 , 16 , 16 , 16 , 4 , 8 , S<16 , 16 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , S<16 , 16 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , 2 , 1 , S<1 , 32 , 1 , 8 >, S<8 , 8 , 1 >, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
151150 DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple<Row, Col>, Row, F8, F8, Tuple<F32, F32>, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256 , 32 , 512 , 256 , 16 , 16 , 16 , 16 , 2 , 8 , S<16 , 16 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , S<16 , 16 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , 2 , 1 , S<1 , 32 , 1 , 8 >, S<8 , 8 , 1 >, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
152151 // clang-format on
153152 >;
154153
154+ // Instances split out of the p3 instance list into their own tuple because they may not work on gfx950.
155+ template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
156+ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_instances_part2 =
157+ std::tuple<
158+ // clang-format off
159+ // ##########################################| ALayout| BLayout| DsLayout| ELayout|AData| BData| DsData| EData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm|
160+ // ##########################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| Pipeline| Pipeline|
161+ // ##########################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NWaveNPerXdl| Scheduler| Version|
162+ // ##########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
163+ // Known issue: v1 of this instance with kbatch > 1 failed verification on test_gemm_multiply_multiply_wp_xdl_fp8.
164+ DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple<Row, Col>, Row, F8, F8, Tuple<F32, F32>, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256 , 64 , 512 , 256 , 16 , 16 , 16 , 16 , 4 , 8 , S<16 , 16 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , S<16 , 16 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 16 , 16 , 0 , 2 , 1 , S<1 , 32 , 1 , 8 >, S<8 , 8 , 1 >, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
165+ // clang-format on
166+ >;
167+
155168template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
156169using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_instances =
157170 std::tuple<
0 commit comments