
Commit 696e668

pytorchmergebot authored and Chao1Han committed
Revert "[inductor] Expand use of generic benchmark function (pytorch#164938)"
This reverts commit 5c583e2. Reverted pytorch#164938 on behalf of https://github.com/clee2000 because it appears to have broken test/inductor/test_cuda_repro.py::CudaReproTests::test_epilogue_fusion_with_view on both rocm and the slow grad check for linux ([GH job link](https://github.com/pytorch/pytorch/actions/runs/18529735968/job/52813191763), [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/f58f301313d4fc89499fb35cdfb2ffb91d14d896)). It did run successfully on the cuda workflow on trunk, so this may be a gpu capability thing; no clue though ([comment](pytorch#164938 (comment)))
1 parent f99e6bb commit 696e668
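At its core, the revert moves Inductor's benchmarking call sites off the generic `benchmarker.benchmark(...)` entry point and back onto the device-specific helpers. A before/after sketch of the pattern that recurs throughout the hunks below (values such as `kernel_mod`, `args`, and `device_type` are illustrative):

# Direction being reverted (pytorch#164938): one generic entry point,
# with the target device passed explicitly.
ms = benchmarker.benchmark(kernel_mod.call, fn_args=(args,), device=device_type, rep=40)

# Direction being restored: choose the device-specific implementation
# at the call site and hand it a zero-argument callable.
ms = benchmarker.benchmark_gpu(lambda: kernel_mod.call(args), rep=40)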

File tree

10 files changed: +45 −103 lines changed

torch/_inductor/codegen/multi_kernel.py

Lines changed: 8 additions & 13 deletions
@@ -8,7 +8,6 @@
 
 from torch._inductor.ir import MultiTemplateBuffer
 from torch._inductor.metrics import get_metric_table, is_metric_table_enabled
-from torch._inductor.runtime.triton_heuristics import CachingAutotuner
 from torch.utils._ordered_set import OrderedSet
 
 from .. import config
@@ -370,20 +369,16 @@ def benchmark_sub_kernels(self, *args, **kwargs):
         be picked.
         """
 
-        def get_args_kwargs(kernel, index) -> tuple[tuple, dict[str, Any]]:  # type: ignore[type-arg]
-            filtered_args = self._get_filtered_args(args, index)
-            args_clone, kwargs_clone = kernel.clone_args(*filtered_args, **kwargs)
-            return args_clone, kwargs_clone
+        def wrap_fn(kernel, index):
+            def inner():
+                filtered_args = self._get_filtered_args(args, index)
+                args_clone, kwargs_clone = kernel.clone_args(*filtered_args, **kwargs)
+                return kernel.run(*args_clone, **kwargs_clone)
+
+            return inner
 
         return [
-            benchmarker.benchmark(
-                kernel.run,
-                *get_args_kwargs(kernel, index),
-                device=kernel.device_props.type
-                if isinstance(kernel, CachingAutotuner)
-                else None,
-                rep=40,
-            )
+            benchmarker.benchmark_gpu(wrap_fn(kernel, index), rep=40)
             for index, kernel in enumerate(self.kernels)
         ]
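The restored `wrap_fn` is a closure factory: binding `kernel` and `index` as parameters snapshots them per sub-kernel, so each `benchmark_gpu` call gets its own zero-argument callable (a bare lambda closing over the comprehension variables would be vulnerable to Python's late-binding pitfall). A minimal standalone sketch of the same pattern, with purely illustrative stand-ins for the kernels:

def make_timed_fn(kernel, args):
    # Passing `kernel` and `args` as parameters binds their current values;
    # `lambda: kernel(*args)` defined in a loop would instead see whatever
    # the loop variables hold when the lambda is finally invoked.
    def inner():
        return kernel(*args)

    return inner

kernels = [sum, max, min]  # stand-ins for compiled sub-kernels
arg_lists = [(range(10),), (range(5),), ((3, 1, 2),)]
timed_fns = [make_timed_fn(k, a) for k, a in zip(kernels, arg_lists)]
print([fn() for fn in timed_fns])  # [45, 4, 1]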

torch/_inductor/codegen/subgraph.py

Lines changed: 1 addition & 4 deletions
@@ -109,10 +109,7 @@ def benchmark(self, *args: list[Any], out: torch.Tensor) -> float:
         bm_func([*sym_inputs, *args])
         if config.profile_bandwidth_with_do_bench_using_profiling:
             return do_bench_using_profiling(lambda: bm_func([*sym_inputs, *args]))
-        return benchmarker.benchmark(
-            bm_func,
-            fn_args=([*sym_inputs, *args],),
-        )
+        return benchmarker.benchmark_gpu(lambda: bm_func([*sym_inputs, *args]))
 
     def hash_key(self) -> str:
         return "-".join(

torch/_inductor/codegen/triton.py

Lines changed: 9 additions & 15 deletions
@@ -4682,7 +4682,7 @@ def codegen_kernel_benchmark(self, num_gb: Optional[float]) -> IndentedBuffer:
 
         result.writeline("args = get_args()")
         result.writeline(
-            f"ms = benchmarker.benchmark(lambda: call(args), device={V.graph.get_current_device_or_throw().type}, rep=40)"  # noqa: B950 line too long
+            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)"
         )
         result.writeline(f"num_gb = {num_gb}")
         result.writeline("gb_per_s = num_gb / (ms / 1e3)")
@@ -5624,21 +5624,18 @@ def load_cache():
             # skip benchmarking the kernel if there are register spills
             ms = float("inf")
         else:
-            device = V.graph.get_current_device_or_throw()
             # We have to clone the inplace updated arguments to avoid earlier calls
             # generating out of range indices for later calls.
-            ms = benchmarker.benchmark(
-                lambda: call(wrapped_jit_function.clone_args(*args)[0]),
-                device=device,
+            ms = benchmarker.benchmark_gpu(
+                lambda: call(wrapped_jit_function.clone_args(*args)[0])
             )
             # overhead of cloning args gives bias for fusing the kernel
             # in the case of mutating/in-placeable second fusion
             # TODO - would be better as a hook in triton do_bench that reset
             # the input values between benchmarking
             if len(wrapped_jit_function.mutated_arg_names) > 0:
-                ms = ms - benchmarker.benchmark(
-                    lambda: wrapped_jit_function.clone_args(*args),
-                    device=str(device),
+                ms = ms - benchmarker.benchmark_gpu(
+                    lambda: wrapped_jit_function.clone_args(*args)
                 )
 
             log.debug(
@@ -5807,16 +5804,13 @@ def store_cache():
             # skip benchmarking the kernel if there are register spills
             ms = ms_clone = float("inf")
         else:
-            device = V.graph.get_current_device_or_throw()
             # We have to clone the inplace updated arguments to avoid earlier calls
             # generating out of range indices for later calls.
-            ms = benchmarker.benchmark(
-                lambda: call(wrapped_jit_function.clone_args(*args)[0]),
-                device=device,
+            ms = benchmarker.benchmark_gpu(
+                lambda: call(wrapped_jit_function.clone_args(*args)[0])
             )
-            ms_clone = benchmarker.benchmark(
-                lambda: wrapped_jit_function.clone_args(*args)[0],
-                device=device,
+            ms_clone = benchmarker.benchmark_gpu(
+                lambda: wrapped_jit_function.clone_args(*args)[0]
             )
 
             log.debug(
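Both `load_cache` hunks time the kernel together with its argument cloning, then subtract a separate measurement of the clone alone when the kernel mutates its inputs, so clone overhead does not bias the fusion decision. The correction is plain arithmetic: if `t_total` covers clone + call and `t_clone` covers the clone only, the kernel's own cost is roughly `t_total - t_clone`. A self-contained sketch with wall-clock stand-ins (none of these helpers are the Inductor benchmarker):

import time

def bench(fn, iters=200):
    # Crude wall-clock stand-in for benchmarker.benchmark_gpu; returns ms/iter.
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    return (time.perf_counter() - start) * 1e3 / iters

def clone_args(xs):
    return [list(x) for x in xs]  # stand-in for cloning mutated tensor args

def call(xs):
    return sum(map(sum, xs))  # stand-in for the compiled kernel

args = [[1.0] * 1024 for _ in range(8)]
t_total = bench(lambda: call(clone_args(args)))  # clone + kernel together
t_clone = bench(lambda: clone_args(args))        # clone overhead alone
kernel_ms = t_total - t_clone                    # corrected kernel estimate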

torch/_inductor/codegen/triton_combo_kernel.py

Lines changed: 1 addition & 2 deletions
@@ -889,7 +889,6 @@ def codegen_kernel_benchmark(self, num_gb: float) -> IndentedBuffer:
         result.writeline(f"return {', '.join(var_names)},")
 
         result.writelines(["\n", "\n", "def call(args):"])
-        device = V.graph.get_current_device_or_throw()
         index = V.graph.get_current_device_or_throw().index
         with result.indent():
             result.writeline(f"with {V.graph.device_ops.device_guard(index)}:")
@@ -924,7 +923,7 @@ def codegen_kernel_benchmark(self, num_gb: float) -> IndentedBuffer:
 
         result.writeline("args = get_args()")
         result.writeline(
-            f"ms = benchmarker.benchmark(call, fn_args=(args,), device={device.type},rep=40)"
+            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)"
         )
         result.writeline(f"num_gb = {num_gb}")
         result.writeline("gb_per_s = num_gb / (ms / 1e3)")

torch/_inductor/ir.py

Lines changed: 1 addition & 3 deletions
@@ -5050,9 +5050,7 @@ def benchmark(self, *args: Any, out: torch.Tensor) -> float:
         }
         if config.profile_bandwidth_with_do_bench_using_profiling:
             return do_bench_using_profiling(lambda: algo(*args), **benchmark_configs)  # type: ignore[arg-type]
-        return benchmarker.benchmark(
-            algo, args, {"out": out}, device=None, **benchmark_configs
-        )
+        return benchmarker.benchmark(algo, args, {"out": out}, **benchmark_configs)
 
     def call_name(self) -> str:
         raise NotImplementedError

torch/_inductor/runtime/benchmarking.py

Lines changed: 13 additions & 46 deletions
@@ -92,21 +92,15 @@ def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
 
 
 class Benchmarker:
-    """
-    A device-agnostic benchmarking utility for measuring the runtime of
-    inductor generated callables.
-    """
-
     def __init__(self: Self) -> None:
         pass
 
     @time_and_count
     def benchmark(
         self: Self,
         fn: Callable[..., Any],
-        fn_args: Optional[tuple[Any, ...]] = None,
-        fn_kwargs: Optional[dict[str, Any]] = None,
-        device: Optional[Union[str, torch.device]] = None,
+        fn_args: tuple[Any, ...],
+        fn_kwargs: dict[str, Any],
         **kwargs: Any,
     ) -> float:
         """Benchmark `fn(*fn_args, *fn_kwargs)` and return the runtime, in milliseconds (the
@@ -115,61 +109,34 @@ def benchmark(
         device-specific implementations, like `benchmark_cpu` and `benchmark_gpu`. Raises
         `ValueError(...)` if we can't safely infer the device type of `fn`; for example,
         if multiple device types are found in `fn_args` and `fn_kwargs`, or if no device
-        types are found. To bypass device inference, provide the device to the `device`
-        parameter.
+        types are found.
 
         Arguments:
         - fn: The function to benchmark.
         - fn_args: The function's arguments.
         - fn_kwargs: The function's kwargs.
 
         Keyword Arguments:
-        - device: Which device to use for benchmarking. If not provided the device will be attempted
-        to be inferred from `fn_args` and `fn_kwargs`.
         - **kwargs: The benchmarking implementation's kwargs.
 
         Returns:
         - The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds.
         """
-        inferred_device: Optional[torch.device] = None
-        if device is not None:
-            inferred_device = (
-                torch.device(device) if isinstance(device, str) else device
-            )
-        else:
-            if fn_args is None and fn_kwargs is None:
+        inferred_device = None
+        for arg_or_kwarg in chain(fn_args, fn_kwargs.values()):
+            if not isinstance(arg_or_kwarg, torch.Tensor):
+                continue
+            if inferred_device is None:
+                inferred_device = arg_or_kwarg.device
+            elif arg_or_kwarg.device != inferred_device:
                 raise ValueError(
-                    "`fn_args` and `fn_kwargs` cannot both be None if `device` is not provided."
+                    "Can't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!"
                 )
-
-        fn_args = fn_args or tuple()
-        fn_kwargs = fn_kwargs or {}
-        for arg_or_kwarg in chain(fn_args, fn_kwargs.values()):
-            if not isinstance(arg_or_kwarg, torch.Tensor):
-                continue
-            if inferred_device is None:
-                inferred_device = arg_or_kwarg.device
-            elif arg_or_kwarg.device != inferred_device:
-                raise ValueError(
-                    "Can't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!"
-                )
-
         if inferred_device is None:
             raise ValueError(
-                "Can't safely infer the device type of `fn` with no device types"
-                " in `fn_args` or `fn_kwargs` and `device` not explicitly provided!"
-                " You should be calling `.benchmark_cpu` or `.benchmark_gpu` directly."
+                "Can't safely infer the device type of `fn` with no device types in `fn_args` or `fn_kwargs`! You should be calling `.benchmark_cpu` or `.benchmark_gpu` directly."  # noqa: B950
            )
-
-        fn_args = fn_args or tuple()
-        fn_kwargs = fn_kwargs or {}
-
-        # No need to wrap if the callable takes no arguments
-        if len(fn_args) == 0 and len(fn_kwargs) == 0:
-            _callable = fn
-        else:
-            _callable = lambda: fn(*fn_args, **fn_kwargs)  # noqa: E731
-
+        _callable = lambda: fn(*fn_args, **fn_kwargs)  # noqa: E731
         if inferred_device == torch.device("cpu"):
             return self.benchmark_cpu(_callable, **kwargs)
         # TODO(nmacchioni): For non-CPU functions we default to using the GPU-specific benchmarking
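For reference, the device-inference loop this revert restores reduces to the following standalone helper; it is a simplified restatement of the `+` lines above (the function name is illustrative, not part of the module):

from itertools import chain

import torch

def infer_device(fn_args, fn_kwargs):
    # Scan every tensor in args/kwargs; all tensors must agree on one device.
    inferred_device = None
    for arg_or_kwarg in chain(fn_args, fn_kwargs.values()):
        if not isinstance(arg_or_kwarg, torch.Tensor):
            continue
        if inferred_device is None:
            inferred_device = arg_or_kwarg.device
        elif arg_or_kwarg.device != inferred_device:
            raise ValueError("multiple device types in `fn_args` and `fn_kwargs`")
    if inferred_device is None:
        # No tensors at all: call `.benchmark_cpu` or `.benchmark_gpu` directly.
        raise ValueError("no device types in `fn_args` or `fn_kwargs`")
    return inferred_device

print(infer_device((torch.ones(2), 3), {"out": torch.zeros(2)}))  # cpu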

torch/_inductor/runtime/triton_heuristics.py

Lines changed: 5 additions & 5 deletions
@@ -927,11 +927,11 @@ def kernel_call():
 
             return do_bench_using_profiling(kernel_call, warmup=10, rep=40)
 
-        benchmark_kwargs = {"rep": 40} if self.device_props.type == "cuda" else {}
-        return benchmarker.benchmark(
-            fn=kernel_call,
-            device=self.device_props.type,
-            **benchmark_kwargs,  # type: ignore[arg-type]
+        if self.device_props.type == "cpu":
+            return benchmarker.benchmark_cpu(kernel_call)
+
+        return benchmarker.benchmark_gpu(
+            kernel_call, rep=40, is_vetted_benchmarking=True
         )
 
     def copy_args_to_cpu_if_needed(self, *args, **kwargs):
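The restored call path makes the CPU/GPU split explicit at the call site. A hedged sketch of the same dispatch shape, assuming the `benchmarker` singleton from `torch._inductor.runtime.benchmarking` and treating `kernel_call`/`device_type` as placeholders (`is_vetted_benchmarking` is passed through exactly as in the hunk above):

from torch._inductor.runtime.benchmarking import benchmarker

def time_kernel(kernel_call, device_type):
    # Mirrors the restored branch: CPU timing, otherwise GPU timing with rep=40.
    if device_type == "cpu":
        return benchmarker.benchmark_cpu(kernel_call)
    return benchmarker.benchmark_gpu(kernel_call, rep=40, is_vetted_benchmarking=True)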

torch/_inductor/scheduler.py

Lines changed: 4 additions & 4 deletions
@@ -3269,8 +3269,8 @@ def speedup_by_fusion(
         device = node_list_1[0].get_device()
         assert device
 
-        # don't support benchmark fusion for CPU C++ backend right now.
-        if device.type == "cpu" and config.cpu_backend != "triton":
+        # don't support benchmark fusion for CPU right now.
+        if device.type == "cpu":
             return True
 
         node_list_2 = node2.get_nodes()
@@ -5569,8 +5569,8 @@ def speedup_by_combo_kernel(self, nodes: list[BaseSchedulerNode]) -> bool:
         subkernel_nodes = nodes
         device = subkernel_nodes[0].get_device()
 
-        # don't support benchmark fusion for CPU C++ backend right now.
-        if device is None or (device.type == "cpu" and config.cpu_backend != "triton"):
+        # don't support benchmark fusion for CPU right now.
+        if device is None or device.type == "cpu":
             return True
 
         from triton.compiler.errors import CompilationError

torch/_inductor/select_algorithm.py

Lines changed: 2 additions & 4 deletions
@@ -2671,10 +2671,8 @@ def __call__(
 
         # Templates selected with input_gen_fns require specific input data to avoid IMA
         # Passing custom input gen fns to benchmark_fusion NYI, so skip deferred template selection
-        # TODO(jgong5): support multi-template on CPU C++ backend
-        if input_gen_fns is not None or (
-            layout.device.type == "cpu" and config.cpu_backend != "triton"
-        ):
+        # TODO(jgong5): support multi-template on CPU
+        if input_gen_fns is not None or layout.device.type == "cpu":
             return_multi_template = False
 
         # TODO - assert that we have not mutating kernels here

torch/_inductor/wrapper_benchmark.py

Lines changed: 1 addition & 7 deletions
@@ -93,7 +93,6 @@ def benchmark_all_kernels(
             continue
 
         triton_kernel = get_triton_kernel(kernel_mod)
-        device_type = triton_kernel.device_props.type
         kernel_category = get_kernel_category(kernel_mod)
         args = kernel_mod.get_args()
         num_in_out_ptrs = len(
@@ -138,12 +137,7 @@ def get_info_str(
                     f" {get_info_str(ms, launcher.n_regs, launcher.n_spills, launcher.shared)} @ {launcher.config}"
                 )
         else:
-            ms = benchmarker.benchmark(
-                kernel_mod.call,
-                fn_args=(args,),
-                device=device_type,
-                rep=40,
-            )
+            ms = benchmarker.benchmark_gpu(lambda: kernel_mod.call(args), rep=40)
             assert len(triton_kernel.launchers) == 1, (
                 "Autotuner should have selected the best config"
             )
