From 5d386c24605a606b7ee451c16469f9f0cd1dad26 Mon Sep 17 00:00:00 2001
From: Zonglin Peng <zonglinpeng@fb.com>
Date: Fri, 31 Oct 2025 14:05:20 -0700
Subject: [PATCH] [Falcon][NMS][non-functional] Migrate BoxWithNMSLimit to
 Cadence namespace as a custom op from Caffe2

To register the op before calling `to_edge` or `to_executorch`, add `import executorch.backends.cadence.aot.ops_registrations  # noqa`.

Differential Revision: [D78835800](https://our.internmc.facebook.com/intern/diff/D78835800/)

[ghstack-poisoned]
---
 backends/cadence/aot/ops_registrations.py   | 51 +++++++++++++++++++++
 backends/cadence/aot/ref_implementations.py | 44 ++++++++++++++++++
 backends/cadence/aot/replace_ops.py         | 25 ++++++++++
 3 files changed, 120 insertions(+)

diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
index 0220baa593f..8744b1219ff 100644
--- a/backends/cadence/aot/ops_registrations.py
+++ b/backends/cadence/aot/ops_registrations.py
@@ -81,6 +81,13 @@ def register_fake(
     _REGISTERED_META_KERNELS.add(op_name)
     return _register_fake_original(op_name)
 
+lib.define(
+    "box_with_nms_limit.out(Tensor scores, Tensor boxes, Tensor batch_splits, float score_thresh, float nms, int detections_per_im, bool soft_nms_enabled, str soft_nms_method, float soft_nms_sigma, float soft_nms_min_score_thres, bool rotated, bool cls_agnostic_bbox_reg, bool input_boxes_include_bg_cls, bool output_classes_include_bg_cls, bool legacy_plus_one, Tensor[]? _caffe2_preallocated_outputs=None, *, Tensor(a!) out_scores, Tensor(b!) out_boxes, Tensor(c!) out_classes, Tensor(d!) batch_splits_out, Tensor(e!) out_keeps, Tensor(f!) out_keeps_size) -> (Tensor(a!) scores, Tensor(b!) boxes, Tensor(c!) classes, Tensor(d!) batch_splits, Tensor(e!) keeps, Tensor(f!) keeps_size)"
+)
+# Functional variant: same inputs, results returned rather than written to out tensors.
+lib.define(
+    "box_with_nms_limit(Tensor scores, Tensor boxes, Tensor batch_splits, float score_thresh, float nms, int detections_per_im, bool soft_nms_enabled, str soft_nms_method, float soft_nms_sigma, float soft_nms_min_score_thres, bool rotated, bool cls_agnostic_bbox_reg, bool input_boxes_include_bg_cls, bool output_classes_include_bg_cls, bool legacy_plus_one, Tensor[]? _caffe2_preallocated_outputs=None) -> (Tensor scores, Tensor boxes, Tensor classes, Tensor batch_splits, Tensor keeps, Tensor keeps_size)"
+)
 
 lib.define(
     "quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)"
@@ -2734,6 +2743,48 @@ def quantized_w8a32_gru_meta(
     return hidden.new_empty((2, hidden.shape[-1]), dtype=torch.float32)
 
 
+@register_fake("cadence::box_with_nms_limit")
+def box_with_nms_limit_meta(
+    tscores: torch.Tensor,
+    tboxes: torch.Tensor,
+    tbatch_splits: torch.Tensor,
+    score_thres: float,
+    nms_thres: float,
+    detections_per_im: int,
+    soft_nms_enabled: bool,
+    soft_nms_method_str: str,
+    soft_nms_sigma: float,
+    soft_nms_min_score_thres: float,
+    rotated: bool,
+    cls_agnostic_bbox_reg: bool,
+    input_boxes_include_bg_cls: bool,
+    output_classes_include_bg_cls: bool,
+    legacy_plus_one: bool,
+    optional_tensor_list: Optional[list[torch.Tensor]] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor
+]:
+    """Fake kernel for cadence::box_with_nms_limit; only allocates outputs.
+
+    Per-detection outputs get ``detections_per_im`` rows; box rows have 5
+    columns when ``rotated`` else 4. Other arguments do not affect the shapes.
+    NOTE(review): sizes look like a single image's worth of detections --
+    confirm for inputs where ``tbatch_splits`` has more than one entry.
+    """
+    # detections_per_im is used directly as an output dimension below.
+    assert detections_per_im > 0, "detections_per_im must be positive"
+    box_dim = 5 if rotated else 4
+    batch_size, num_classes = tbatch_splits.size(0), tscores.size(1)
+    return (
+        tscores.new_empty([detections_per_im]),  # scores
+        tscores.new_empty([detections_per_im, box_dim]),  # boxes
+        tscores.new_empty([detections_per_im]),  # classes
+        tscores.new_empty([batch_size]),  # batch_splits
+        tscores.new_empty([detections_per_im], dtype=torch.int32),  # keeps
+        tscores.new_empty([batch_size, num_classes], dtype=torch.int32),  # keeps_size
+    )
+
+
 # Validate that all meta kernels have reference implementations
 # This is called at module import time to catch missing implementations early
 _validate_ref_impl_exists()
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 5a8cba0361d..f199f411b26 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -15,6 +15,8 @@
 from executorch.exir.scalar_type import ScalarType
 from torch.library import impl, Library
 
+from typing import Optional
+
 m = Library("cadence", "IMPL", "CompositeExplicitAutograd")
 torch.ops.load_library("//executorch/kernels/quantized:custom_ops_generated_lib")
 
@@ -2146,3 +2148,45 @@ def quantized_softmax(
         out_scale,
         out_zero_point,
     )
+
+
+@impl_tracked(m, "box_with_nms_limit")
+def meta_box_with_nms_limit(
+    tscores: torch.Tensor,
+    tboxes: torch.Tensor,
+    tbatch_splits: torch.Tensor,
+    score_thres: float,
+    nms_thres: float,
+    detections_per_im: int,
+    soft_nms_enabled: bool,
+    soft_nms_method_str: str,
+    soft_nms_sigma: float,
+    soft_nms_min_score_thres: float,
+    rotated: bool,
+    cls_agnostic_bbox_reg: bool,
+    input_boxes_include_bg_cls: bool,
+    output_classes_include_bg_cls: bool,
+    legacy_plus_one: bool,
+    optional_tensor_list: Optional[list[torch.Tensor]] = None,
+) -> tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor
+]:
+    """Reference implementation: forwards all arguments to Caffe2's BoxWithNMSLimit."""
+    return torch.ops._caffe2.BoxWithNMSLimit(
+        tscores,
+        tboxes,
+        tbatch_splits,
+        score_thres,
+        nms_thres,
+        detections_per_im,
+        soft_nms_enabled,
+        soft_nms_method_str,
+        soft_nms_sigma,
+        soft_nms_min_score_thres,
+        rotated,
+        cls_agnostic_bbox_reg,
+        input_boxes_include_bg_cls,
+        output_classes_include_bg_cls,
+        legacy_plus_one,
+        optional_tensor_list,
+    )
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index d430e95c470..a8494027c1e 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -195,6 +195,30 @@ def call_operator(
         )
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceCaffe2BoxWithNMSLimitWithCadenceBoxWithNMSLimit(ExportPass):
+    """Replaces _caffe2 BoxWithNMSLimit ops with Cadence BoxWithNMSLimit ops.
+
+    Arguments are forwarded unchanged. Graphs are left untouched when the
+    _caffe2 op is not registered in the running process.
+    """
+
+    def call_operator(
+        self,
+        op,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        ns = exir_ops.edge if isinstance(op, EdgeOpOverload) else torch.ops
+        # Looking up an unregistered op raises AttributeError, so probe for it.
+        caffe2_nms = getattr(ns._caffe2, "BoxWithNMSLimit", None)
+        if caffe2_nms is None or op != caffe2_nms.default:
+            return super().call_operator(op, args, kwargs, meta)
+        cadence_nms = exir_ops.edge.cadence.box_with_nms_limit.default
+        return super().call_operator(cadence_nms, args, kwargs, meta)
+
+
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
 class ReplaceSqueezeAndUnsqueezeWithViewPass(ExportPass):
     """
@@ -2162,6 +2186,7 @@ class CadenceReplaceOpsInGraph:
         ReplaceScalarTensorWithFullPass,
         ReplaceInfArgInFullWithValuePass,
         ReplaceLogicalNotBooleanWhereWithWherePass,
+        ReplaceCaffe2BoxWithNMSLimitWithCadenceBoxWithNMSLimit,
         ReplaceAdaptiveAvgPoolWithAtenAvgPoolPass,
         ReplaceAtenAvgPoolWithCadenceAvgPoolPass,
         ReplaceWhereWithFullArgsWithWhereScalar,