NVIDIA
diff --git a/‎cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp‎
Lines changed: 37 additions & 14 deletions b/‎cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp‎
Lines changed: 37 additions & 14 deletions
diff --git a/‎tensorrt_llm/_torch/autotuner.py‎
Lines changed: 214 additions & 2 deletions b/‎tensorrt_llm/_torch/autotuner.py‎
Lines changed: 214 additions & 2 deletions
@@ -15,13 +15,16 @@
  */
 
 #include "tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h"
+#include "tensorrt_llm/thop/thUtils.h"
 
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/EmptyTensor.h>
 #include <torch/library.h>
 
 #include <cstdint>
+#include <memory>
+#include <unordered_map>
 
 namespace torch_ext
 {
@@ -316,16 +319,30 @@ class FP8BlockScaleMoeRunner : public torch::CustomClassHolder
 {
 
 public:
-    explicit FP8BlockScaleMoeRunner(int64_t tileTokensDim)
-        : mTileTokensDim(tileTokensDim)
+    explicit FP8BlockScaleMoeRunner()
+        : mSupportedTileN{8, 16, 32, 64}
     {
-        mRunner = std::make_unique<RunnerType>(mDtypeElt, mUseDeepSeekFp8, mTileTokensDim);
+        for (int tileN : mSupportedTileN)
+        {
+            mRunners.emplace(tileN, std::make_unique<RunnerType>(mDtypeElt, mUseDeepSeekFp8, tileN));
+        }
     }
 
-    [[nodiscard]] std::vector<int64_t> getValidConfigs(
+    [[nodiscard]] std::vector<std::vector<int64_t>> getValidConfigs(
         int64_t topK, int64_t hiddenSize, int64_t intermediateSize, int64_t numLocalExperts, int64_t numTokens) const
     {
-        return mRunner->getValidConfigIndices(topK, hiddenSize, intermediateSize, numLocalExperts, numTokens);
+        // returns (tileN, config)
+        std::vector<std::vector<int64_t>> tactics;
+        for (auto& [tileN, runner] : mRunners)
+        {
+            auto config_indices_per_runner
+                = runner->getValidConfigIndices(topK, hiddenSize, intermediateSize, numLocalExperts, numTokens);
+            for (auto cfg : config_indices_per_runner)
+            {
+                tactics.push_back({tileN, cfg});
+            }
+        }
+        return tactics;
     }
 
     [[nodiscard]] at::Tensor run(at::optional<at::Tensor> const& routing_logits,
@@ -334,42 +351,48 @@ class FP8BlockScaleMoeRunner : public torch::CustomClassHolder
         at::Tensor const& gemm2_weights, at::Tensor const& gemm2_weights_scale, int64_t num_experts, int64_t top_k,
         std::optional<int64_t> const n_group, std::optional<int64_t> const topk_group, int64_t const intermediate_size,
         int64_t const local_expert_offset, int64_t const local_num_experts,
-        std::optional<double> const routed_scaling_factor, int64_t routing_method_type, int64_t moeConfigIndex,
-        std::optional<at::Tensor> const& topk_weights, std::optional<at::Tensor> const& topk_ids)
+        std::optional<double> const routed_scaling_factor, int64_t routing_method_type,
+        std::vector<int64_t> tile_config_pair, std::optional<at::Tensor> const& topk_weights,
+        std::optional<at::Tensor> const& topk_ids)
     {
+        // tile_config_pair corresponds to pair (tileN, config)
+        auto [tileN, config] = std::tie(tile_config_pair[0], tile_config_pair[1]);
 
         // Autotuner has requested a default or 'fallback' config index
-        if (moeConfigIndex == -1)
+        if (tileN == -1 || config == -1)
         {
             auto const num_tokens = hidden_states.sizes()[0];
             auto const hidden_size = hidden_states.sizes()[1];
 
-            moeConfigIndex = mRunner->getDefaultValidConfigIndex(
+            float const avg_tokens_per_expert = static_cast<float>(num_tokens * top_k) / local_num_experts;
+            tileN = std::clamp(nextPowerOfTwo(avg_tokens_per_expert), mSupportedTileN.front(), mSupportedTileN.back());
+
+            config = mRunners.at(tileN)->getDefaultValidConfigIndex(
                 top_k, hidden_size, intermediate_size, local_num_experts, num_tokens);
         }
 
         return run_fp8_block_scale_moe(routing_logits, routing_bias, hidden_states, hidden_states_scale, gemm1_weights,
             gemm1_weights_scale, gemm2_weights, gemm2_weights_scale, num_experts, top_k, n_group, topk_group,
-            intermediate_size, local_expert_offset, local_num_experts, routed_scaling_factor, mTileTokensDim,
-            routing_method_type, *mRunner, moeConfigIndex, topk_weights, topk_ids);
+            intermediate_size, local_expert_offset, local_num_experts, routed_scaling_factor, tileN,
+            routing_method_type, *mRunners.at(tileN), config, topk_weights, topk_ids);
     }
 
 private:
     using RunnerType = tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::Runner;
 
-    std::unique_ptr<RunnerType> mRunner;
+    std::vector<int32_t> const mSupportedTileN;
+    std::unordered_map<int32_t, std::unique_ptr<RunnerType>> mRunners;
 
     btg::Dtype mDtypeElt{btg::Dtype::E4m3}; // FP8 runner so hard-coded
     bool mUseDeepSeekFp8{true};             // Always true for BlockScaleMoe
-    int64_t mTileTokensDim;
 };
 
 } // namespace torch_ext
 
 TORCH_LIBRARY_FRAGMENT(trtllm, m)
 {
     m.class_<torch_ext::FP8BlockScaleMoeRunner>("FP8BlockScaleMoERunner")
-        .def(torch::init<int64_t>())
+        .def(torch::init<>())
         .def("get_valid_configs", &torch_ext::FP8BlockScaleMoeRunner::getValidConfigs)
         .def("run_moe", &torch_ext::FP8BlockScaleMoeRunner::run);
 }
@@ -10,7 +10,7 @@
 from dataclasses import dataclass, field
 from functools import lru_cache
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
 import torch
 
@@ -127,7 +127,7 @@ def _opt(self):
 class OptimizationProfile:
     '''Ranges of all tensors, all dimension
     '''
-    shapes: List[List[Dim]]
+    shapes: List[List[Dim]] = field(default_factory=lambda: [[]])
 
     def get_hash_key(self):
         return self.get_opt_shapes()
@@ -536,12 +536,90 @@ def __init__(self, warmup=3, repeat=10, stream_delay_micro_secs=1000):
 
         self.profiling_debug = True
 
+        # Current captured choose_one() contexts
+        self._active_capture: Optional['AutoTuner.TacticsCapture'] = None
+        # Last captured choose_one() contexts
+        self._last_capture: Optional['AutoTuner.TacticsCapture'] = None
+
     @classmethod
     def get(cls):
         if cls._instance is None:
             cls._instance = AutoTuner()
         return cls._instance
 
+    class TacticsCapture:
+        """Object returned by capture() that can be iterated to get all tactic combinations.
+
+        This class encapsulates all state related to capturing and replaying tactics:
+        - Captured execution contexts
+        - Generated tactic configurations
+        - Current replay state (which config and call index)
+        """
+
+        def __init__(self, autotuner):
+            # State for captured contexts
+            self._captured_contexts: List[Dict[str, Any]] = []
+            self._configurations = None
+            # State for replay mode
+            self._replay_runner_tactic_list: Optional[List[Tuple[int,
+                                                                 int]]] = None
+            self._replay_context_idx: int = 0
+
+        def __iter__(self):
+            """Iterate through all tactic configurations.
+
+            For single context: yields (runner, tactic)
+            For multiple contexts: yields ((runner_ctx0, tactic_ctx0), (runner_ctx1, tactic_ctx1), ...)
+            """
+            if self._configurations is None:
+                self._configurations = self._generate_configurations()
+
+            for config in self._configurations:
+                # config is a tuple of (runner_idx, tactic) for each context
+                # Convert to (runner, tactic) format for user
+                runner_tactic_pairs = []
+                for ctx_idx, (runner_idx, tactic) in enumerate(config):
+                    runners = self._captured_contexts[ctx_idx]['runners']
+                    runner = runners[runner_idx]
+                    runner_tactic_pairs.append((runner, tactic))
+
+                yield tuple(runner_tactic_pairs)
+
+        def _generate_configurations(self):
+            """Generate all valid tactic combinations."""
+            if not self._captured_contexts:
+                raise RuntimeError(
+                    "No context available for testing.\n"
+                    "Use capture() to capture the operation context first:\n"
+                    "  with AutoTuner.get().capture() as tactics_capture:\n"
+                    "      output = operation.forward(...)\n")
+
+            # Collect valid tactics for each context separately
+            context_tactics_lists = []
+
+            for context in self._captured_contexts:
+                runners = context['runners']
+                inputs = context['inputs']
+                kwargs = context.get('kwargs', {})
+
+                # Collect all valid (runner, tactic) combinations for this context
+                tactics_lists = []
+                for runner_idx, runner in enumerate(runners):
+                    valid_tactics = runner.get_valid_tactics(
+                        inputs, OptimizationProfile(), **kwargs)
+                    for tactic in valid_tactics:
+                        tactics_lists.append((runner_idx, tactic))
+                context_tactics_lists.append(tactics_lists)
+
+            # Generate cartesian product from context and tactics where all_configrations[i][ctx] = (runner, tactic)
+            # Such that each element in all_configrations is a replay of multiple contexts of all possible replays
+            all_configurations = list(itertools.product(*context_tactics_lists))
+            return all_configurations
+
+        def is_replaying(self) -> bool:
+            """Check if this TacticsCapture is currently in replay mode."""
+            return self._replay_runner_tactic_list is not None
+
     def choose_one(
         self,
         custom_op: str,
@@ -573,6 +651,52 @@ def choose_one(
             Runner authors are suggested to provide a fallback implementation for each runner to avoid potential issues.
         """
 
+        # Check if we're in replay mode via active TacticsCapture
+        if self._active_capture is not None and self._active_capture.is_replaying(
+        ):
+            tactics_capture = self._active_capture
+            call_idx = tactics_capture._replay_context_idx
+
+            assert call_idx < len(tactics_capture._replay_runner_tactic_list
+                                  ), "call_idx out of range"
+            assert call_idx < len(
+                tactics_capture._captured_contexts), "call_idx out of range"
+            assert len(tactics_capture._replay_runner_tactic_list) == len(
+                tactics_capture._captured_contexts)
+
+            # Check if we have a forced tactic for this call and both custom_op match
+            captured_custom_op = tactics_capture._captured_contexts[
+                call_idx].get('custom_op')
+            if captured_custom_op != custom_op:
+                raise RuntimeError(
+                    f"Custom op mismatch in kernel testing mode.\n"
+                    f"Expected operation: '{captured_custom_op}'\n"
+                    f"Actual operation: '{custom_op}'\n"
+                    f"Context index: {call_idx}\n"
+                    f"Make sure the forward() call in test mode uses the same operation as captured."
+                )
+
+            runner_idx, tactic = tactics_capture._replay_runner_tactic_list[
+                call_idx]
+            # Increment context counter
+            tactics_capture._replay_context_idx += 1
+            # Reset counter after all contexts have been used
+            if tactics_capture._replay_context_idx >= len(
+                    tactics_capture._replay_runner_tactic_list):
+                tactics_capture._replay_context_idx = 0
+            return (runners[runner_idx], tactic)
+
+        # Capture context for testing all underlying kernels
+        if self._active_capture is not None and not self._active_capture.is_replaying(
+        ):
+            self._active_capture._captured_contexts.append({
+                'custom_op': custom_op,
+                'runners': runners,
+                'tuning_config': tuning_config,
+                'inputs': inputs,
+                'kwargs': kwargs,
+            })
+
         input_shapes = tuple(self._get_input_sizes(inputs))
         # Early return if it's not tuning, use cache found one or fallback one
         if not self.is_tuning_mode:
@@ -957,3 +1081,91 @@ def print_profiling_cache(self):
             logger.debug(
                 f"[Autotuner] {key}: (runner_id={runner_id}, tactic={tactic}, min_time={min_time})"
             )
+
+    @contextlib.contextmanager
+    def capture(self):
+        """Context manager for capturing execution contexts for testing.
+
+        Returns a TacticsCapture object that can be iterated to get all valid
+        (runner, tactic) combinations.
+
+        Example:
+            >>> # Single context case
+            >>> with AutoTuner.get().capture() as tactics_capture:
+            ...     y = custom_op.forward(x)
+            >>>
+            >>> for runner, tactic in tactics_capture:
+            ...     with AutoTuner.get().replay(runner, tactic):
+            ...         y = custom_op.forward(x)
+
+            >>> # Multiple contexts case
+            >>> with AutoTuner.get().capture() as tactics_capture:
+            ...     y = custom_op1.forward(x)
+            ...     z = custom_op2.forward(y)
+            >>>
+            >>> for config in tactics_capture:
+            ...     with AutoTuner.get().replay(config):
+            ...         y = custom_op1.forward(x)
+            ...         z = custom_op2.forward(y)
+        """
+        tactics_capture = self.TacticsCapture(self)
+        self._active_capture = tactics_capture
+        try:
+            yield tactics_capture
+        finally:
+            self._active_capture = None
+            self._last_capture = tactics_capture
+
+    @contextlib.contextmanager
+    def replay(self, *config: Tuple[Tuple[TunableRunner, int], ...]):
+        """Context manager for replaying with specific runner/tactic configuration.
+
+        Args:
+            config:
+                - A tuple of (runner, tactic) pairs. The tuple size matches the number of captured choose_one() contexts.
+        """
+        # Parse config argument
+        if len(config) == 1:
+            if isinstance(config[0], tuple):
+                # Multiple contexts: replay(((r0,t0), (r1,t1), ...))
+                runner_tactic_pairs = list(config[0])
+            else:
+                # Also handle single context passed as replay((runner, tactic))
+                runner_tactic_pairs = [config[0]]
+        else:
+            raise ValueError(
+                f"Invalid config for replay: {config}\n"
+                "Expected replay(((runner, tactic), (runner, tactic), ...))")
+
+        # Find the TacticsCapture to use
+        tactics_capture = self._active_capture or self._last_capture
+
+        if tactics_capture is None:
+            raise RuntimeError(
+                "No TacticsCapture available for replay. "
+                "Make sure you've called capture() before replay().")
+
+        # Temporarily set as active capture during replay
+        prev_active = self._active_capture
+        self._active_capture = tactics_capture
+
+        runner_tactic_list = []
+        for ctx_idx, (runner, tactic) in enumerate(runner_tactic_pairs):
+            runners = tactics_capture._captured_contexts[ctx_idx]['runners']
+            runner_idx = runners.index(runner)
+            runner_tactic_list.append((runner_idx, tactic))
+
+        logger.debug(
+            f"[Autotuner][replay]: Testing configuration: {runner_tactic_list}")
+
+        # Replay the contexts with given (runner, tactic) pairs
+        tactics_capture._replay_runner_tactic_list = runner_tactic_list
+        tactics_capture._replay_context_idx = 0
+
+        try:
+            yield
+        finally:
+            tactics_capture._replay_runner_tactic_list = None
+            tactics_capture._replay_context_idx = 0
+            # Restore previous active capture state
+            self._active_capture = prev_active