@@ -298,14 +298,21 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
    using WeightType = typename TypeTuple_::WeightType;
    using OutputType = typename TypeTuple_::OutputType;
    constexpr static bool INT4 = std::is_same_v<WeightType, cutlass::uint4b_t>;
-   constexpr static bool FP4 = std::is_same_v<DataType, SafeFP4>;
-   constexpr static bool FP8 = std::is_same_v<DataType, SafeFP8>;
-   constexpr static bool INT_QUANT = !std::is_same_v<DataType, WeightType>;
-   using InputType = std::conditional_t<FP4, OutputType, DataType>;
-   using WeightStorage = std::conditional_t<INT_QUANT || FP4, uint8_t, WeightType>;
-   constexpr static int WEIGHT_ELEM_PER_BYTE = (INT4 || FP4) ? 2 : 1;
+   constexpr static bool NVFP4 = std::is_same_v<DataType, SafeFP4> && std::is_same_v<WeightType, SafeFP4>;
+   constexpr static bool FP8 = std::is_same_v<DataType, SafeFP8> && std::is_same_v<WeightType, SafeFP8>;
+   constexpr static bool WFP4AFP8 = std::is_same_v<WeightType, SafeFP4> && std::is_same_v<DataType, SafeFP8>;
+   constexpr static bool INT_QUANT = !std::is_same_v<DataType, WeightType>
+       && (std::is_same_v<WeightType, cutlass::uint4b_t> || std::is_same_v<WeightType, uint8_t>);
+   constexpr static bool ANY_FP4 = NVFP4 || WFP4AFP8;
+   using InputType = std::conditional_t<NVFP4, OutputType, DataType>;
+   using WeightStorage = std::conditional_t<INT_QUANT || ANY_FP4, uint8_t, WeightType>;
+   constexpr static int WEIGHT_ELEM_PER_BYTE = (INT4 || ANY_FP4) ? 2 : 1;

    int const BASE_HIDDEN_SIZE = 64 / sizeof(WeightType) * WEIGHT_ELEM_PER_BYTE;
+   constexpr static int64_t FP4_VECTOR_SIZE = NVFP4
+       ? tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
+       : tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
+
    std::vector<BufferManager::IBufferPtr> managed_buffers;
    int* mSelectedExperts{};
    DataType* mInputTensor{};
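
Note: the flag block above is plain compile-time trait dispatch. A minimal, self-contained sketch of the same pattern, with illustrative stand-in tag types in place of the real TensorRT-LLM/CUTLASS types:

    #include <cstdint>
    #include <type_traits>

    struct SafeFP4 {}; // stand-in tags for illustration only
    struct SafeFP8 {};

    template <class DataType, class WeightType>
    struct FormatFlags
    {
        // NVFP4: activations and weights are both FP4.
        constexpr static bool NVFP4 = std::is_same_v<DataType, SafeFP4> && std::is_same_v<WeightType, SafeFP4>;
        // WFP4AFP8: FP4 weights driven by FP8 activations (mixed format).
        constexpr static bool WFP4AFP8 = std::is_same_v<WeightType, SafeFP4> && std::is_same_v<DataType, SafeFP8>;
        constexpr static bool ANY_FP4 = NVFP4 || WFP4AFP8;
        // FP4 packs two elements per byte, so storage falls back to uint8_t.
        using WeightStorage = std::conditional_t<ANY_FP4, uint8_t, WeightType>;
        constexpr static int WEIGHT_ELEM_PER_BYTE = ANY_FP4 ? 2 : 1;
    };

    static_assert(FormatFlags<SafeFP8, SafeFP4>::WFP4AFP8);
    static_assert(FormatFlags<SafeFP8, SafeFP4>::WEIGHT_ELEM_PER_BYTE == 2);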
@@ -316,12 +323,12 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture

    constexpr static nvinfer1::DataType toDTypeID()
    {
-       if (FP8)
+       if (FP8 || WFP4AFP8)
            return nvinfer1::DataType::kFP8;
-       if (FP4)
+       if (NVFP4)
            return nvinfer1::DataType::kFP4;
        if (INT_QUANT && INT4)
-           return nvinfer1::DataType::kINT4; // Hack to distinguish int4, use unsigned
+           return nvinfer1::DataType::kINT4;
        if (INT_QUANT)
            return nvinfer1::DataType::kINT8;
        if (std::is_same_v<DataType, float>)
@@ -331,9 +338,29 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
#ifdef ENABLE_BF16
        if (std::is_same_v<DataType, nv_bfloat16>)
            return nvinfer1::DataType::kBF16;
-#else
+#endif
        TLLM_THROW("Unrecognised format");
+   };
+
+   constexpr static nvinfer1::DataType toWTypeID()
+   {
+       if (FP8)
+           return nvinfer1::DataType::kFP8;
+       if (NVFP4 || WFP4AFP8)
+           return nvinfer1::DataType::kFP4;
+       if (INT_QUANT && INT4)
+           return nvinfer1::DataType::kINT4;
+       if (INT_QUANT)
+           return nvinfer1::DataType::kINT8;
+       if (std::is_same_v<DataType, float>)
+           return nvinfer1::DataType::kFLOAT;
+       if (std::is_same_v<DataType, half>)
+           return nvinfer1::DataType::kHALF;
+#ifdef ENABLE_BF16
+       if (std::is_same_v<DataType, nv_bfloat16>)
+           return nvinfer1::DataType::kBF16;
#endif
+       TLLM_THROW("Unrecognised format");
    };

    template <class T>
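
Note: for the mixed WFP4AFP8 case the two IDs deliberately diverge — toDTypeID() reports the activation format (FP8) while the new toWTypeID() reports the weight format (FP4). A reduced sketch of that behaviour, with a stand-in enum replacing nvinfer1::DataType and only the FP8/FP4 branches kept:

    enum class DType { kFP8, kFP4 }; // stand-in for nvinfer1::DataType

    template <bool FP8, bool NVFP4, bool WFP4AFP8>
    struct Ids
    {
        constexpr static DType toDTypeID() { return (FP8 || WFP4AFP8) ? DType::kFP8 : DType::kFP4; }
        constexpr static DType toWTypeID() { return (NVFP4 || WFP4AFP8) ? DType::kFP4 : DType::kFP8; }
    };

    // WFP4AFP8: activations report FP8, weights report FP4.
    static_assert(Ids<false, false, true>::toDTypeID() == DType::kFP8);
    static_assert(Ids<false, false, true>::toWTypeID() == DType::kFP4);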
@@ -345,7 +372,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
        }
        else if constexpr (std::is_same_v<T, SafeFP4>)
        {
-           return nvinfer1::DataType::kINT64;
+           return nvinfer1::DataType::kFP4;
        }
        else if constexpr (std::is_same_v<T, uint8_t>)
        {
@@ -380,10 +407,10 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
        static_assert(!FP8, "FP8 Tests enabled on unsupported CUDA version");
#endif
#ifndef ENABLE_FP4
-       static_assert(!FP4, "FP4 Tests enabled on unsupported CUDA version");
+       static_assert(!ANY_FP4, "FP4 Tests enabled on unsupported CUDA version");
#endif
        bool should_skip_unsupported_fp8 = getSMVersion() < 89 && FP8;
-       bool should_skip_unsupported_fp4 = (getSMVersion() < 100 || getSMVersion() >= 120) && FP4;
+       bool should_skip_unsupported_fp4 = (getSMVersion() < 100 || getSMVersion() >= 120) && ANY_FP4;
        return should_skip_unsupported_fp8 || should_skip_unsupported_fp4;
    }

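Note: the gating above encodes the hardware support matrix: FP8 needs SM89 or newer, and these FP4 paths are only expected on SM100–SM119. A standalone sketch of the predicate, with a hypothetical `sm` parameter standing in for the runtime getSMVersion() call:

    // Hypothetical free-function version of the skip predicate.
    constexpr bool shouldSkip(int sm, bool fp8, bool any_fp4)
    {
        bool skip_fp8 = sm < 89 && fp8;                     // FP8 requires SM89+
        bool skip_fp4 = (sm < 100 || sm >= 120) && any_fp4; // FP4 only on SM100..119
        return skip_fp8 || skip_fp4;
    }

    static_assert(shouldSkip(90, /*fp8=*/true, /*any_fp4=*/true));   // SM90: FP4 unsupported
    static_assert(!shouldSkip(100, /*fp8=*/true, /*any_fp4=*/true)); // SM100: both supported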
@@ -496,8 +523,9 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
        mGatedMultiplier = mIsGated ? 2 : 1;
        auto const gated_inter = mInterSize * mGatedMultiplier;

-       size_t workspace_size = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK,
-           mActType, {}, mUseLora, /*use_fp8_block_scaling=*/ false, /*min_latency_mode=*/ false, mUsePrequantScale);
+       size_t workspace_size
+           = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType, {},
+               mUseLora, /*use_deepseek_fp8_block_scale=*/ false, /*min_latency_mode=*/ false, mUsePrequantScale);

        mWorkspace = allocBuffer<char>(workspace_size);
        size_t const expert_matrix_size = mNumExperts * mHiddenSize * mInterSize;
@@ -528,20 +556,19 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture

            mQuantParams = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
        }
-       else if constexpr (FP4)
+       else if constexpr (ANY_FP4)
        {
            mExpertFP4ActScale1 = allocBuffer<float>(1);
-           mExpertFP4WeightSf1 = allocBuffer<ElementSF>(num_experts * gated_inter * mHiddenSize
-               / tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::BlockScaleVectorSize);
+           mExpertFP4WeightSf1 = allocBuffer<ElementSF>(num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE);
            mExpertFP4GlobalScale1 = allocBuffer<float>(num_experts);

            mExpertFP4ActScale2 = allocBuffer<float>(1);
-           mExpertFP4WeightSf2 = allocBuffer<ElementSF>(num_experts * mInterSize * mHiddenSize
-               / tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::BlockScaleVectorSize);
+           mExpertFP4WeightSf2 = allocBuffer<ElementSF>(num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE);
            mExpertFP4GlobalScale2 = allocBuffer<float>(num_experts);

-           mQuantParams = QuantParams::FP4(mExpertFP4ActScale1, mExpertFP4WeightSf1, mExpertFP4GlobalScale1,
-               mExpertFP4ActScale2, mExpertFP4WeightSf2, mExpertFP4GlobalScale2);
+           auto func = NVFP4 ? QuantParams::FP4 : QuantParams::FP8MXFP4;
+           mQuantParams = func(mExpertFP4ActScale1, mExpertFP4WeightSf1, mExpertFP4GlobalScale1, mExpertFP4ActScale2,
+               mExpertFP4WeightSf2, mExpertFP4GlobalScale2);
        }

        mSelectedExperts = allocBuffer<int>(mTotalTokens * mK);
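
Note: `auto func = NVFP4 ? QuantParams::FP4 : QuantParams::FP8MXFP4;` is only well-formed because both factories are static member functions with identical signatures, so each operand decays to the same function-pointer type. A minimal sketch of the pattern with hypothetical factories (not the real QuantParams API):

    #include <iostream>

    struct Params { int mode; };

    struct Factory // hypothetical stand-in for QuantParams
    {
        static Params makeNvfp4(float) { return {1}; }
        static Params makeMxfp4(float) { return {2}; }
    };

    int main()
    {
        constexpr bool use_nvfp4 = false;
        // Both operands decay to Params (*)(float), so the ternary is well-formed.
        auto func = use_nvfp4 ? Factory::makeNvfp4 : Factory::makeMxfp4;
        std::cout << func(1.0f).mode << '\n'; // prints 2
    }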
@@ -734,7 +761,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
            mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
            mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
            parallelism_config, mUseLora, mLoraParams,
-           /*use_fp8_block_scaling=*/ false, /*min_latency_mode=*/ false, min_latency_params, stream);
+           /*use_deepseek_fp8_block_scale=*/ false, /*min_latency_mode=*/ false, min_latency_params, stream);
    }

    void runBenchmark(benchmark::State& state);
@@ -772,6 +799,7 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
    state.counters["act_fn"] = (int) mActType;
    state.counters["routing_config"] = (int) routing_config;
    state.counters["dtype"] = (int) toDTypeID();
+   state.counters["wtype"] = (int) toWTypeID();

    std::stringstream ss;
    ss << "Experts,K,Hidden,Inter,TP,EP,Rank,Tokens,Bias,Scale,Actfn,Tactic,Routing=";