
Commit df354bf

hsharma35 authored and facebook-github-bot committed
Enable transpose-quantized_relu-transpose fusion. (#10337)
Summary: Add quantized_relu support when fusing transpose pairs.

Reviewed By: mcremon-meta

Differential Revision: D73300693
1 parent 647e1f1 commit df354bf
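Why the fusion is safe: quantized_relu acts elementwise, so it commutes with transposition, and a cancelling transpose pair around it can be folded away just as it already is for the quantize/dequantize ops in the bypass list. A minimal PyTorch sanity check of that underlying identity (a sketch only: plain torch.relu on an int8 tensor stands in for the Cadence quantized_relu kernel, whose extra quantization arguments do not affect layout):

import torch

# Plain ReLU on an int8 tensor stands in for cadence.quantized_relu here;
# only its elementwise nature matters for the transpose-pair fusion.
x = torch.randint(-128, 127, (2, 3), dtype=torch.int8)

# transpose -> relu -> transpose yields the same values as relu alone,
# which is why the surrounding transpose pair can be fused away.
with_transposes = torch.relu(x.transpose(0, 1)).transpose(0, 1)
assert torch.equal(with_transposes, torch.relu(x))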

File tree

3 files changed: +50 −32 lines changed

backends/cadence/aot/TARGETS (+20 −22)

@@ -6,12 +6,12 @@
 
 load("@fbcode_macros//build_defs:export_files.bzl", "export_file")
 load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
+load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
 load(
     "@fbsource//tools/build_defs:default_platform_defs.bzl",
     "CXX",
 )
 load("@fbsource//xplat/executorch/codegen:codegen.bzl", "executorch_generated_lib")
-load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
 
 oncall("odai_jarvis")
 
@@ -36,18 +36,18 @@ python_library(
         "compiler.py",
     ],
     deps = [
-        ":passes",
-        ":utils",
+        ":memory_planning",
         ":ops_registrations",
+        ":passes",
         ":replace_ops",
-        ":memory_planning",
+        ":utils",
         "//caffe2:torch",
         "//executorch/backends/cadence/aot/quantizer:fusion_pass",
         "//executorch/backends/cadence/aot/quantizer:quantizer",
         "//executorch/backends/transforms:decompose_sdpa",
         "//executorch/backends/transforms:remove_clone_ops",
-        "//executorch/exir:lib",
         "//executorch/devtools:lib",
+        "//executorch/exir:lib",
     ],
 )
 
@@ -57,19 +57,19 @@ python_library(
         "export_example.py",
     ],
     deps = [
-        ":passes",
-        ":utils",
         ":ops_registrations",
+        ":passes",
         ":replace_ops",
+        ":utils",
         "//caffe2:torch",
         "//executorch/backends/cadence/aot/quantizer:fusion_pass",
-        "//executorch/backends/cadence/runtime:runtime",
         "//executorch/backends/cadence/aot/quantizer:quantizer",
-        "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer",
+        "//executorch/backends/cadence/runtime:runtime",
         "//executorch/backends/transforms:decompose_sdpa",
         "//executorch/backends/transforms:remove_clone_ops",
-        "//executorch/exir:lib",
+        "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer",
         "//executorch/devtools:lib",
+        "//executorch/exir:lib",
     ],
 )
 
@@ -94,12 +94,12 @@ python_library(
         "passes.py",
     ],
     deps = [
-        ":utils",
         ":fuse_ops",
-        ":simplify_ops",
-        ":replace_ops",
-        ":reorder_ops",
         ":remove_ops",
+        ":reorder_ops",
+        ":replace_ops",
+        ":simplify_ops",
+        ":utils",
         "//caffe2:torch",
         "//executorch/exir:pass_base",
         "//executorch/exir/dialects:lib",
@@ -131,7 +131,6 @@ python_library(
     ],
 )
 
-
 export_file(name = "functions.yaml")
 
 executorch_generated_lib(
@@ -191,9 +190,9 @@ python_library(
     ],
     typing = True,
     deps = [
-        "//caffe2:torch",
-        ":ops_registrations",
         ":compiler_utils",
+        ":ops_registrations",
+        "//caffe2:torch",
         "//executorch/backends/cadence/aot:pass_utils",
         "//executorch/backends/cadence/aot:utils",
         "//executorch/exir:pass_base",
@@ -228,11 +227,11 @@ python_library(
         "//caffe2:torch",
         "//executorch/backends/cadence/aot:pass_utils",
         "//executorch/backends/cadence/aot:simplify_ops",
+        "//executorch/backends/transforms:remove_clone_ops",
         "//executorch/exir:pass_base",
         "//executorch/exir/dialects:lib",
         "//executorch/exir/dialects/edge:lib",
         "//executorch/exir/passes:spec_prop_pass",
-        "//executorch/backends/transforms:remove_clone_ops"
     ],
 )
 
@@ -283,13 +282,13 @@ python_unittest(
     ],
     typing = True,
     deps = [
+        ":ops_registrations",
         "//caffe2:torch",
         "//executorch/backends/cadence/aot:graph_builder",
         "//executorch/backends/cadence/aot:pass_utils",
         "//executorch/exir:pass_base",
         "//executorch/exir/dialects:lib",
         "//later:lib",
-        ":ops_registrations"
     ],
 )
 
@@ -319,8 +318,10 @@ python_unittest(
     srcs = [
         "tests/test_fusion_ops_passes.py",
     ],
+    supports_static_listing = False,
     typing = True,
     deps = [
+        "fbsource//third-party/pypi/parameterized:parameterized",
         ":compiler",
         "//caffe2:torch",
         "//executorch/backends/cadence/aot:compiler",
@@ -391,7 +392,6 @@ python_unittest(
     ],
 )
 
-
 python_library(
     name = "memory_planning",
     srcs = [
@@ -409,7 +409,6 @@ python_library(
     ],
 )
 
-
 python_library(
     name = "memory_constraints",
     srcs = [
@@ -425,7 +424,6 @@ python_library(
     ],
 )
 
-
 python_unittest(
     name = "test_memory_passes",
    srcs = [

backends/cadence/aot/fuse_ops.py (+2 −3)

@@ -901,9 +901,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
 @register_cadence_pass(CadencePassAttribute(opt_level=1))
 class FuseTransposeOpPairsPass(FuseOpPairsAcrossBranchesPass):
     """
-    Fuse dequantize-quantize op pairs to a single requantize op.
-    For the special case where quant params match, this will remove
-    both dequant and quant ops.
+    Fuse transpose op pairs to a single view op.
     """
 
     # A list of ops that can be bypassed when looking for a
@@ -915,6 +913,7 @@ class FuseTransposeOpPairsPass(FuseOpPairsAcrossBranchesPass):
         exir_ops.edge.cadence.dequantize_per_tensor.default,
         exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
         exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+        exir_ops.edge.cadence.quantized_relu.per_tensor,
     }
 
     def can_fuse_for_chain(
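The functional change is the one added line: exir_ops.edge.cadence.quantized_relu.per_tensor joins the set of ops the pass may bypass while pairing up transposes (the stale docstring, apparently copied from the dequantize-quantize fusion pass, is also corrected). As a rough illustration only, and not the actual FuseTransposeOpPairsPass (which works on edge-dialect ops across branches and fuses the pair into a view op rather than deleting it), here is a self-contained torch.fx sketch of removing a cancelling transpose pair across a bypassable elementwise op, with plain torch.relu as the stand-in:

import torch
import torch.fx as fx

# Ops a transpose may be "bypassed" across; plain relu stands in for
# cadence.quantized_relu in this sketch.
BYPASSABLE = {torch.relu}


def fuse_transpose_pairs(gm: fx.GraphModule) -> fx.GraphModule:
    """Remove transpose -> elementwise -> transpose chains where the two
    transposes swap the same pair of dims and therefore cancel."""
    for first in list(gm.graph.nodes):
        if first.op != "call_function" or first.target is not torch.transpose:
            continue
        users = list(first.users)
        if len(users) != 1 or users[0].target not in BYPASSABLE:
            continue
        mid = users[0]
        mid_users = list(mid.users)
        if len(mid_users) != 1 or mid_users[0].target is not torch.transpose:
            continue
        second = mid_users[0]
        if first.args[1:] != second.args[1:]:  # must swap the same dims
            continue
        # Feed the elementwise op directly from the original input, bypass
        # both transposes, then drop them from the graph.
        mid.replace_input_with(first, first.args[0])
        second.replace_all_uses_with(mid)
        gm.graph.erase_node(second)
        gm.graph.erase_node(first)
    gm.graph.lint()
    gm.recompile()
    return gm


class Toy(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        t = torch.transpose(x, 0, 1)
        y = torch.relu(t)
        return torch.transpose(y, 0, 1)


gm = fx.symbolic_trace(Toy())
fuse_transpose_pairs(gm)
x = torch.randn(2, 3)
assert torch.equal(gm(x), torch.relu(x))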

backends/cadence/aot/tests/test_fusion_ops_passes.py (+28 −7)

@@ -23,6 +23,8 @@
 from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.pass_base import ProxyValue
+from parameterized import parameterized
 from torch import nn
 
 
@@ -485,18 +487,37 @@ def test_fuse_then_transpose_pass(self):
 
 
 class TestFuseTransposeOpPairsPass(TestFusionPassesBase):
-    def test_fuse_transpose_pairs(self):
+    def _create_operator(
+        self, builder: GraphBuilder, op: torch._ops.OpOverload, x: ProxyValue
+    ) -> ProxyValue:
+        if op == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default:
+            return builder.call_operator(
+                op=op,
+                args=(x, 1.2, 3, 0, 127, torch.int8),
+            )
+        elif op == exir_ops.edge.cadence.quantized_relu.per_tensor:
+            return builder.call_operator(
+                op=op,
+                args=(x, 0, 0, 0, 0),
+            )
+        else:
+            raise ValueError(f"Unsupported op: {op}")
+
+    @parameterized.expand(
+        [
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            exir_ops.edge.cadence.quantized_relu.per_tensor,
+        ],
+    )
+    def test_fuse_transpose_pairs(self, op: torch._ops.OpOverload):
         # Create a graph with transpose -> quant -> transpose.
         builder = GraphBuilder()
         x = builder.placeholder("x", torch.randn(2, 3))
         transpose_node = builder.call_operator(
             op=exir_ops.edge.aten.transpose_copy.int,
             args=(x, 0, 1),
         )
-        quant_node = builder.call_operator(
-            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-            args=(transpose_node, 1.2, 3, 0, 127, torch.int8),
-        )
+        quant_node = self._create_operator(builder, op, transpose_node)
         transpose_node = builder.call_operator(
             op=exir_ops.edge.aten.transpose_copy.int,
             args=(quant_node, 0, 1),
@@ -507,7 +528,7 @@ def test_fuse_transpose_pairs(self):
             gm,
             expected_op_counts={
                 exir_ops.edge.aten.transpose_copy.int: 2,
-                exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
+                op: 1,
             },
         )
 
@@ -517,7 +538,7 @@
             gm_after_pass,
             expected_op_counts={
                 exir_ops.edge.aten.transpose_copy.int: 0,
-                exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
+                op: 1,
             },
         )

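With parameterized.expand, the test body now runs once per op: the original quantize_per_tensor case plus the new cadence.quantized_relu.per_tensor case, with _create_operator supplying the op-specific arguments. The parameterized dependency and supports_static_listing = False added in TARGETS presumably support this, since the expanded test names are only generated at runtime. For reference, a minimal standalone example of the decorator (hypothetical test and values, not from this diff):

import unittest

from parameterized import parameterized


class DemoTest(unittest.TestCase):
    # Each entry in the list becomes its own generated test method,
    # so one body yields three separately reported test cases.
    @parameterized.expand([(1,), (2,), (3,)])
    def test_is_positive(self, n: int) -> None:
        self.assertGreater(n, 0)


if __name__ == "__main__":
    unittest.main()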