
Commit 9c942d6

Fixed vLLM compatibility, added more perf improvements
1 parent: ad39966

File tree: 4 files changed, +25 / -30 lines

torch/_inductor/codegen/triton.py
Lines changed: 1 addition & 1 deletion

@@ -3222,7 +3222,7 @@ def codegen_body(self):
                 "rsplit_end" if self.cooperative_reduction else f"{prefix}numel"
             )
             self.body.writeline(
-                f"for {prefix}offset in range({loop_start}, {loop_end}, {prefix.upper()}BLOCK):"
+                f"for {prefix}offset in tl.range({loop_start}, {loop_end}, {prefix.upper()}BLOCK, num_stages = 2):"
             )
             with self.body.indent(offset=level + 1):
                 self.iteration_ranges_codegen_header(tree, self.body)
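
For context on the change above: tl.range is Triton's explicit loop-iteration helper, and its num_stages argument asks the compiler to software-pipeline the loop, overlapping the loads for the next iteration with the compute of the current one. The hand-written reduction kernel below is only a minimal illustration of that tl.range usage on a recent Triton release; it is not the code Inductor actually emits.

import torch
import triton
import triton.language as tl


@triton.jit
def row_sum_kernel(x_ptr, out_ptr, rnumel, RBLOCK: tl.constexpr):
    row = tl.program_id(0)
    acc = tl.zeros([RBLOCK], dtype=tl.float32)
    # num_stages=2 hints the compiler to pipeline this loop, as in the patched codegen.
    for roffset in tl.range(0, rnumel, RBLOCK, num_stages=2):
        rindex = roffset + tl.arange(0, RBLOCK)
        mask = rindex < rnumel
        acc += tl.load(x_ptr + row * rnumel + rindex, mask=mask, other=0.0)
    tl.store(out_ptr + row, tl.sum(acc, axis=0))


x = torch.randn(128, 4096, device="cuda")
out = torch.empty(128, device="cuda")
row_sum_kernel[(128,)](x, out, x.shape[1], RBLOCK=256)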

torch/_inductor/config.py
Lines changed: 1 addition & 1 deletion

@@ -1138,7 +1138,7 @@ class triton:
     # So far we see a fixed 8 spilled registers for kernels using sin/cos.
     # Raise the threshold to 16 to be safe.
     # We should revisit this once we understand more of the source of register spills.
-    spill_threshold: int = 16
+    spill_threshold: int = 32
 
     # Generate code containing the newer tl.make_block_ptr() API for loads/store
     use_block_ptr = False
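
As an aside on how this knob is consumed: spill_threshold flows from this config class into inductor_meta, where the autotuner compares it against the register-spill count of each benchmarked config (see the triton_heuristics.py hunk below). A minimal sketch of overriding it at runtime instead of patching the source, assuming the torch._inductor.config.triton namespace shown in this diff:

import torch
import torch._inductor.config as inductor_config

# Allow autotuning configs with up to 32 spilled registers before they are skipped.
inductor_config.triton.spill_threshold = 32


@torch.compile
def sin_cos(x):
    # sin/cos kernels are the register-spill case cited in the config comment above.
    return torch.sin(x) * torch.cos(x)


y = sin_cos(torch.randn(4096, 4096, device="cuda"))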

torch/_inductor/kernel/mm_scaled.py
Lines changed: 3 additions & 1 deletion

@@ -469,7 +469,9 @@ def scaled_mm_options(  # type: ignore[no-untyped-def]
         f"or 1-dimensional tensors with the same size. Got scale_a: {len(size_a)} and scale_b: {len(size_b)}."
     )
     return dict(
-        GROUP_M=8,
+        # this change is incompatible with vllm, can't make it into our release
+        # should be fixed by them
+        # GROUP_M=8,
         EVEN_K=even_k_symbolic,
         ACC_TYPE="tl.float32",
         USE_FAST_ACCUM=use_fast_accum,
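
For readers unfamiliar with the knob being commented out: GROUP_M is the tile-grouping factor that Triton matmul templates use to reorder program IDs so that a group of GROUP_M output-tile rows is finished before moving on, which keeps operand tiles hot in L2. The pure-Python model below shows the standard swizzle such a parameter controls; the names and grid sizes are illustrative and this is not Inductor's template code.

def swizzled_tile(pid: int, grid_m: int, grid_n: int, group_m: int) -> tuple[int, int]:
    """Map a linear program id to (tile_row, tile_col) using grouped ordering."""
    width = group_m * grid_n                      # program ids covered by one row-group
    group_id = pid // width
    first_m = group_id * group_m
    group_size = min(grid_m - first_m, group_m)   # last group may have fewer rows
    pid_m = first_m + (pid % width) % group_size
    pid_n = (pid % width) // group_size
    return pid_m, pid_n


# With group_m=8, consecutive program ids stay within 8 tile-rows for one tile-column
# before advancing, so the corresponding operand tiles are likely reused from L2.
print([swizzled_tile(p, grid_m=16, grid_n=16, group_m=8) for p in range(4)])
# [(0, 0), (1, 0), (2, 0), (3, 0)]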

torch/_inductor/runtime/triton_heuristics.py
Lines changed: 20 additions & 27 deletions

@@ -577,7 +577,7 @@ def bench(self, launcher, *args, with_profiler=False, **kwargs):
         # for some (complicated) custom Triton kernels, a register-spilling
         # config may yield the best latency.
         if not self.custom_kernel and launcher.n_spills > self.inductor_meta.get(
-            "spill_threshold", 16
+            "spill_threshold", 32
         ):
             log.debug(
                 "Skip config %s because of register spilling: %d",
@@ -1874,11 +1874,8 @@ def pointwise(
             triton_config_with_settings(
                 size_hints, bs // 2, num_elements_per_warp=64
             ),
-            # triton_config_with_settings(
-            #     size_hints, 8192, num_warps=8, num_stages=1, matrix_instr=0, waves_per_eu=2
-            # ),
             triton_config_with_settings(
-                size_hints, TRITON_MAX_BLOCK["X"], waves_per_eu=2
+                size_hints, TRITON_MAX_BLOCK["X"]
             ),
             *hinted_configs,
         ]
@@ -1975,14 +1972,14 @@ def _reduction_configs(
     if inductor_meta.get("max_autotune") or inductor_meta.get("max_autotune_pointwise"):
         pass  # skip all these cases
     elif reduction_hint == ReductionHint.INNER:
-        return [contiguous_config]
+        result_configs = [contiguous_config]
     elif reduction_hint == ReductionHint.OUTER:
-        return [outer_config]
+        result_configs = [outer_config]
     elif reduction_hint == ReductionHint.OUTER_TINY:
-        return [tiny_config]
+        result_configs = [tiny_config]
     if disable_pointwise_autotuning(inductor_meta):
-        return [triton_config_reduction(size_hints, 32, 128)]
-    return [
+        result_configs = [triton_config_reduction(size_hints, 32, 128)]
+    result_configs = [
         contiguous_config,
         outer_config,
         tiny_config,

@@ -1994,6 +1991,19 @@ def _reduction_configs(
         triton_config_reduction(size_hints, 64, 4, num_warps=8),
     ]
 
+    # Additional reduction configs appended for ROCm builds
+    if torch.version.hip:
+        # New config
+        result_configs.append(triton_config_reduction(
+            size_hints,
+            1024,
+            8,
+            num_warps=4,
+            num_stages=1
+        ))
+
+    return result_configs
+
 
 def reduction(
     size_hints,
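
Taken together, the two hunks above stop _reduction_configs from returning early: candidates are now collected in result_configs and, on ROCm builds, an extra XBLOCK=1024 / RBLOCK=8 config is appended before the single return. (Note that, as the diff reads, the unconditional rebuild of result_configs overwrites the hint-specific single-config lists assigned just above it.) A standalone, simplified model of the resulting flow, with illustrative stand-in configs rather than Inductor's real helpers:

def triton_config_reduction(size_hints, xblock, rblock, num_warps=None, num_stages=1):
    # Stand-in for the real helper, which builds a triton.Config; here we just record the shape.
    return {"XBLOCK": xblock, "RBLOCK": rblock, "num_warps": num_warps, "num_stages": num_stages}


def reduction_configs(size_hints, is_hip):
    # Base candidate list (values here are illustrative, not Inductor's defaults).
    result_configs = [
        triton_config_reduction(size_hints, 1, 2048),   # contiguous-style reduction
        triton_config_reduction(size_hints, 64, 8),     # outer-style reduction
        triton_config_reduction(size_hints, 64, 4, num_warps=8),
    ]
    # ROCm-only candidate appended by this patch.
    if is_hip:
        result_configs.append(
            triton_config_reduction(size_hints, 1024, 8, num_warps=4, num_stages=1)
        )
    return result_configs


print(len(reduction_configs((8192, 8192), is_hip=True)))  # 4 candidates on ROCm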
@@ -2012,23 +2022,6 @@ def reduction(
 
     configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta)
 
-    # Additional tuning confirgs for ROCm builds
-    # Add checks for reduction autotuning bools
-    # if torch.version.hip and inductor_meta.get("max_autotune"):
-    #     configs = [
-    #         triton_config_with_settings(size_hints, bs, num_elements_per_warp=256),
-    #         triton_config_with_settings(
-    #             size_hints, bs // 2, num_elements_per_warp=64
-    #         ),
-    #         # triton_config_with_settings(
-    #         #     size_hints, 8192, num_warps=8, num_stages=1, matrix_instr=0, waves_per_eu=2
-    #         # ),
-    #         triton_config_with_settings(
-    #             size_hints, TRITON_MAX_BLOCK["X"], waves_per_eu=2
-    #         ),
-    #         *hinted_configs,
-    #     ]
-
     return cached_autotune(
         size_hints,
         configs=configs,
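
A small end-to-end way to exercise the reduction autotuning path touched in this file: compiling a reduction with max-autotune makes Inductor benchmark the candidates assembled by _reduction_configs, including the ROCm-only one when torch.version.hip is set. This is a sketch only; the shapes are arbitrary.

import torch


@torch.compile(mode="max-autotune")
def row_logsumexp(x):
    # A fused reduction; Inductor autotunes it over the configs assembled above.
    return torch.logsumexp(x, dim=-1)


y = row_logsumexp(torch.randn(4096, 4096, device="cuda"))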
