From e30a74ddb61f41609f3a0e24ef4754a1ff48d51a Mon Sep 17 00:00:00 2001
From: Star Yuan <ysh329@apache.org>
Date: Wed, 17 Jan 2024 22:18:14 +0800
Subject: [PATCH 1/5] [COMMUNITY] Add new key for release signing

---
 KEYS | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/KEYS b/KEYS
index 3cee902acd32..32e451097045 100644
--- a/KEYS
+++ b/KEYS
@@ -642,3 +642,62 @@ WTgrESErlqNLN5ZTTW/1jBELJCfJKxgHUip+Yo6qNZoWwNLP1BaIcoA3miSG3DXf
 wS/UuN04NxDy7V6mPXE=
 =MTba
 -----END PGP PUBLIC KEY BLOCK-----
+pub   rsa4096 2024-01-15 [SC]
+      A4D9228E55761E665BF01CBB5CE869CB7DEC048C
+uid           [ultimate] Star Yuan (CODE SIGNING KEY) <ysh329@apache.org>
+sig 3        5CE869CB7DEC048C 2024-01-15  Star Yuan (CODE SIGNING KEY) <ysh329@apache.org>
+sub   rsa4096 2024-01-15 [E]
+sig          5CE869CB7DEC048C 2024-01-15  Star Yuan (CODE SIGNING KEY) <ysh329@apache.org>
+
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQINBGWlStcBEADaslyfbUNARhWftJoRAChoak0cFU6NxahhvyZfyTGtSuwuHNDD
+2eyvhnDIaYXVClxoNgikiQ5Nkd1jtbA4rFCw6Pdbq+98fkpcr8N4o+jlbpu6Ff3j
+dJ2Qu000MV5qe9FZ4QasdfglJElvizgfNbJv/Fz1ERl/BS1U0c7lyQF9jGGh7EY2
+1y+JFp5OMG6A9SpfaOd+iOw5/cfCQk8+sHQC4dp3hOJPK4NLvjotK+hlOhRsF7gU
+goYYT2IP56kPQb6U/Uiv4/R6HbKugzqSMl6BMwAb9uG6UX0xUfAA8ciHoaITCJCQ
+9e/jGWnDnqYlAMNqLkHEmW7THxJ3hHXcac/Z1C3PeLDJU0rpTxDcjuYkM5jFCu7H
+TgT7lWBP/PyAAVSsLqMQbLJOWm0a14tb/oRoeYr/B2prIbJY5qJBM1nherKGMg0G
+7Oqugo6A1VqgUxg7Chj73PledaNwvm5Lxpl6D+wPDSifhlz0vnwOCMoOon0pTjK4
+DXDEXnEXZtzkZgXI6g7AkVyt0gkqyUi+01ibmlBfcVHh3PVvU4oNdkaywQd5s29R
+DsA4WOqt9cLv+iqIzM1juygfR6ooA1jHDIyIPmmC/kOrcxKXEFvIGXDDCbXAvdXc
+uXgZeZqI3pbKjQaU3fF8HwJ956HTM8rywtVGH9BWRl/i6qn5sq9CcukcuQARAQAB
+tDBTdGFyIFl1YW4gKENPREUgU0lHTklORyBLRVkpIDx5c2gzMjlAYXBhY2hlLm9y
+Zz6JAk4EEwEKADgWIQSk2SKOVXYeZlvwHLtc6GnLfewEjAUCZaVK1wIbAwULCQgH
+AgYVCgkICwIEFgIDAQIeAQIXgAAKCRBc6GnLfewEjBAiD/0cfaYfQ0DL7CPsP0lS
+yezPDDTnDPIo//G1cuSYG0gnXQ1SpbJSzDE7deew+P506/sWFneOY5Kuv6DuSE8J
+nM6vv1EYR4/9x/XstA4F04lQPngKKBV+UKrWj8zIA2Drn345Ece1150bWvrUD7mT
++ps1gfe8SGYpOmR/kRc8qra2zizcWBC1Dl4qd+RcY7Ac6Cu3G/JG2KvZnrUSVev9
+nzSl2V0JtFVIla2odSJqv0Zdj5E2vLvQd3Dxbf3BODCdL3iQqxrQhj+0T3QLEhPg
+y2XOtqW7a96XosoQ44wUiHaS5LwFViG8LoiPADtSdXYb8m4FtMfB8t4mzXVqBjpz
+2csMqOnNvo7bctfpJkjM14UKib39MR2wUv9fD6Qa+OAAIeXGTQH+wlXmlYjji9+A
+4tgq/+d75qUC/tyHSgbZLNXobHF8v77g60cBvFXVL02W53xhVDZP4gwu5iSSN8BJ
+a2hqwo4UO53mRUNkwFZONYxJE7MhLl22r08eu0xNYhoGtpHzDVoyHg26+2FUgFDd
+TNsdqjMyJ+3GXEE3PdKVDTj9To+RoHLuCczk5uvtFYGhseRwIWbVhmTLKUL+wgSa
++b90slkv+CBJvLjvKbVCmCLXwiH8Cx+MZSu0oM5v8fbHuWOhkb7bJd1V+U7qV/OA
+CCqBICt64F+ooQ0oEdC0oLvr2LkCDQRlpUrXARAA1DKsF2ZNUdPIn4VcsjRk/+qF
+13VC9SaqMp+J+8m1XTIeXdr27uUa2vT4j8pAM4gwMVkpEqE0rmHK+S1SeEAlcizC
+Bvp7vvso/glcOg9Sgt9PXvvEDPL/Hnsn1+3YX+Gye4cOTiDDgVW1RKcgGj9Xsir+
+5BS9Secj5CGo92cuaqIo/mMjxGlsuW/LvTU5qQhz7aOaBibe5EHPlGMqM6XJN0BZ
+MHRfBiGDs2n/egMnTPL0JcTlAeird+yxDPULKzhQWkd8rfQKpwcRiY6IcYFHlWdM
+VhZkXNRrxh6+q3rR7FKmxlvG/12YyT6Y1BocGLgROzKIeoEp+6vsU5LJ90jy82ig
+oGSHwNjm2RRukjV3eebovl1dCo6IaI/j4idCv7NlcBnln/Unk4YOZbneMT5r+3Zy
+Q4azLB8KHfHOrUwAxRAGPygdLtqbjs4mF45HDe6h3IOVoiOQlZNpesrwEumlK+Il
+taU0T8hfxyMpIcTLUZpIddSxo0sVby2XZ+z00En3JvtqbpRcfA87thxpsE7uHxwT
+YT8mPPDxo1R4I4LSzsDnekD8EB/7woz4n5I1RBoPB1LSoo0B2os+4vHGkiwZ0TN0
+ICcUYdM623Bv2wJQbVKEDvwjHZTkotjLx7R2lyqMRwFYrMXHxevOfbARJQCqrcY2
+ouLzQme9rE5MPQbKj2cAEQEAAYkCNgQYAQoAIBYhBKTZIo5Vdh5mW/Acu1zoact9
+7ASMBQJlpUrXAhsMAAoJEFzoact97ASMNsIP/3tlsvwUVfy19lUjxWT4rPw2GGz8
+lbPiaetgigK1F1rlzYnIVo32Fcj/GNNwWEdxxEzeaQR/AJmZLWB8sBDThoTGeSDK
+fjKXeDjZh+ElpIKWyk7f3ddHN2TpBz698kZ7fYCciRE9T4d3xgbqx2rCfupxUFSj
+lxLFRkasByJnLdAZI50NZjW838IHMaGsvgbWEqRuvKZOES6gFhrK1NTSxj5iuiHk
+Uxj1KzMhOW+m1eZ0pQcCVXJDY6KYhmrZzw9q6kzSO9ukmS5yRf0EnD7Fsca4iIXP
+Y28xs3zBxYHV4IGU1PtcIwNewmTnjnEy0apHPz0zDplHi1meXuhA7bBMjs/AouJg
+6FIDNSQqDuFXufqvVQ6LZZgob+LklMAoGcka4/5ZLPjipj5SWNeZZunJujSqWK7f
+KJaIfn7ILXqxjaTFrjBN3cm60rO1+zEektrjtWMmSBn0L76pY2ucenrqewruYYdD
+12VQra/6QAS5R0HG8gzOfsZcrHaiIuLoTbsOgnqLVcdb9lO7f3oMbKPwejZ5yhyz
+SraXHvmixlhf4uUYwsWyhw3UgHrv1psB8Z9NfdH9/T2BvRg0qy6ZmI0n0OagPNgz
+v+SZrqrWkSjyPdl6j7x8EmePfNidqw/CnncYI2rEVSmP28W0Uhg5JLgroGYmycv6
+HeZaRpYvkV8UNmnE
+=BtHq
+-----END PGP PUBLIC KEY BLOCK-----

From e5fa1f7c768912d05e82b3ccb638d1a8d3aaff22 Mon Sep 17 00:00:00 2001
From: ysh329 <ysh329@users.noreply.github.com>
Date: Wed, 30 Jul 2025 08:10:38 +0000
Subject: [PATCH 2/5] [Relax] Add AllClassNMS Operator

---
 python/tvm/relax/op/vision/__init__.py |  18 ++
 python/tvm/relax/op/vision/nms.py      | 347 +++++++++++++++++++++++++
 2 files changed, 365 insertions(+)
 create mode 100644 python/tvm/relax/op/vision/__init__.py
 create mode 100644 python/tvm/relax/op/vision/nms.py

diff --git a/python/tvm/relax/op/vision/__init__.py b/python/tvm/relax/op/vision/__init__.py
new file mode 100644
index 000000000000..be45458d3647
--- /dev/null
+++ b/python/tvm/relax/op/vision/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""VISION operators."""
+from .nms import *
diff --git a/python/tvm/relax/op/vision/nms.py b/python/tvm/relax/op/vision/nms.py
new file mode 100644
index 000000000000..54b89582ed87
--- /dev/null
+++ b/python/tvm/relax/op/vision/nms.py
@@ -0,0 +1,347 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=import-error, invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements, too-many-function-args
+"""Non-maximum suppression operator"""
+import tvm
+from tvm import te
+
+from tvm.tir import if_then_else
+
+from ..sort import argsort
+from ..math import cast
+from ..transform import reshape, gather
+from .. import reduction
+from ..scan import cumsum
+from .nms_util import (
+    binary_search,
+    collect_selected_indices,
+    collect_selected_indices_and_scores,
+    run_all_class_nms,
+)
+
+
+def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
+    """Get valid count of bounding boxes given a score threshold.
+    Also moves valid boxes to the top of input data.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        Input data. 3-D tensor with shape [batch_size, num_anchors, 6]
+        or [batch_size, num_anchors, 5].
+
+    score_threshold : optional, float
+        Lower limit of score for valid bounding boxes.
+
+    id_index : optional, int
+        index of the class categories, -1 to disable.
+
+    score_index: optional, int
+        Index of the scores/confidence of boxes.
+
+    Returns
+    -------
+    valid_count : tvm.te.Tensor
+        1-D tensor for valid number of boxes.
+
+    out_tensor : tvm.te.Tensor
+        Rearranged data tensor.
+
+    out_indices: tvm.te.Tensor or numpy NDArray
+        Related index in input data.
+    """
+    if isinstance(score_threshold, (float, int)):
+        score_threshold = tvm.tir.const(score_threshold, dtype=data.dtype)
+    id_index_const = tvm.tir.const(id_index, "int32")
+    score_index_const = tvm.tir.const(score_index, "int32")
+    return hybrid_get_valid_counts(
+        data,
+        score_threshold,
+        id_index_const,
+        score_index_const,
+        tvm.tir.const(1, data.dtype),
+        data.shape[0],
+        data.shape[1],
+    )
+
+
+def _nms_loop(
+    ib,
+    batch_size,
+    top_k,
+    iou_threshold,
+    max_output_size,
+    valid_count,
+    on_new_valid_box_func,
+    on_new_invalidated_box_func,
+    needs_bbox_check_func,
+    calc_overlap_func,
+    out_scores,
+    num_valid_boxes,
+):
+    def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local):
+        # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
+        on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j)
+        num_valid_boxes_local[0] += 1
+
+        num_boxes_to_check = nkeep - (j + 1)
+
+        with ib.for_range(0, num_boxes_to_check, name="_k", kind="parallel") as _k:
+            k = j + 1 + _k
+
+            with ib.if_scope(
+                tvm.tir.all(
+                    k < nkeep,
+                    out_scores[i, k] > 0,  # is the box k still valid?
+                    needs_bbox_check_func(i, j, k),
+                )
+            ):
+                iou = calc_overlap_func(i, j, k)
+
+                with ib.if_scope(iou >= iou_threshold):
+                    # invalidate the box k
+                    out_scores[i, k] = -1.0
+                    on_new_invalidated_box_func(i, k)
+
+    with ib.for_range(0, batch_size, name="i") as i:
+        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i])
+        max_output_size = if_then_else(max_output_size > te.const(0), max_output_size, nkeep)
+
+        with ib.if_scope(tvm.tir.all(iou_threshold > te.const(0), valid_count[i] > te.const(0))):
+            num_valid_boxes_local = ib.allocate(
+                "int32", (1,), name="num_valid_boxes_local", scope="local"
+            )
+            box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local")
+            num_valid_boxes_local[0] = 0
+            box_idx[0] = 0
+
+            # Apply nms
+            # No need to do more iteration if we have already reached max_output_size boxes
+            with ib.while_loop(
+                tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)
+            ):
+                # Proceed to the inner loop if the box with id box_idx is still valid
+                with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
+                    nms_inner_loop(ib, i, box_idx[0], nkeep, num_valid_boxes_local)
+                box_idx[0] += 1
+
+            num_valid_boxes[i] = num_valid_boxes_local[0]
+
+        with ib.else_scope():
+            num_valid_boxes[i] = 0
+
+    return ib.get()
+
+
+def _get_valid_box_count(scores, score_threshold):
+    batch_classes, num_boxes = scores.shape
+
+    def searchsorted_ir(scores, valid_count):
+        ib = tvm.tir.ir_builder.create()
+        scores = ib.buffer_ptr(scores)
+        valid_count = ib.buffer_ptr(valid_count)
+
+        with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
+            binary_search(ib, i, num_boxes, scores, score_threshold, valid_count)
+
+        return ib.get()
+
+    scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8)
+    searchsorted_buf = tvm.tir.decl_buffer(
+        (batch_classes,), "int32", "searchsorted", data_alignment=8
+    )
+
+    return te.extern(
+        [(batch_classes,)],
+        [scores],
+        lambda ins, outs: searchsorted_ir(ins[0], outs[0]),
+        dtype=["int32"],
+        in_buffers=[scores_buf],
+        out_buffers=[searchsorted_buf],
+        name="searchsorted",
+        tag="searchsorted",
+    )
+
+
+def _collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out):
+    batch_classes, _ = selected_indices.shape
+
+    ib = tvm.tir.ir_builder.create()
+
+    selected_indices = ib.buffer_ptr(selected_indices)
+    num_detections = ib.buffer_ptr(num_detections)
+    row_offsets = ib.buffer_ptr(row_offsets)
+    out = ib.buffer_ptr(out)
+
+    with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
+        i = cast(i, "int64")
+        batch_id = i // num_class
+        class_id = i % num_class
+
+        with ib.for_range(0, num_detections[i], name="j") as j:
+            out[row_offsets[i] + j, 0] = batch_id
+            out[row_offsets[i] + j, 1] = class_id
+            out[row_offsets[i] + j, 2] = cast(selected_indices[i, j], "int64")
+
+    return ib.get()
+
+
+def _collect_selected_indices_and_scores_ir(
+    selected_indices,
+    selected_scores,
+    num_detections,
+    row_offsets,
+    num_total_detections,
+    collected_indices,
+    collected_scores,
+):
+    batch_size, num_class = row_offsets.shape
+    num_boxes = selected_indices.shape[1]
+
+    ib = tvm.tir.ir_builder.create()
+
+    selected_indices = ib.buffer_ptr(selected_indices)
+    selected_scores = ib.buffer_ptr(selected_scores)
+    num_detections = ib.buffer_ptr(num_detections)
+    row_offsets = ib.buffer_ptr(row_offsets)
+    num_total_detections = ib.buffer_ptr(num_total_detections)
+    collected_indices = ib.buffer_ptr(collected_indices)
+    collected_scores = ib.buffer_ptr(collected_scores)
+    zero = cast(0, "int64")
+
+    with ib.for_range(0, batch_size * num_class, name="i", kind="parallel") as i:
+        i = cast(i, "int64")
+        batch_id = i // num_class
+        class_id = i % num_class
+
+        with ib.for_range(0, num_boxes, name="j") as j:
+            with ib.if_scope(j < num_detections[batch_id, class_id]):
+                offset = row_offsets[batch_id, class_id] + j
+                collected_indices[batch_id, offset, 0] = class_id
+                collected_indices[batch_id, offset, 1] = cast(selected_indices[i, j], "int64")
+                collected_scores[batch_id, offset] = selected_scores[i, j]
+            with ib.else_scope():
+                offset = (
+                    num_total_detections[batch_id]
+                    + class_id * num_boxes
+                    - row_offsets[batch_id, class_id]
+                    + j
+                    - num_detections[batch_id, class_id]
+                )
+                collected_indices[batch_id, offset, 0] = zero
+                collected_indices[batch_id, offset, 1] = zero
+                collected_scores[batch_id, offset] = 0.0
+
+    return ib.get()
+
+
+def all_class_non_max_suppression(
+    boxes,
+    scores,
+    max_output_boxes_per_class,
+    iou_threshold,
+    score_threshold,
+    output_format="onnx",
+):
+    """Non-maximum suppression operator for object detection, corresponding to ONNX
+    NonMaxSuppression and TensorFlow combined_non_max_suppression.
+    NMS is performed for each class separately.
+
+    Parameters
+    ----------
+    boxes : tvm.te.Tensor
+        3-D tensor with shape (batch_size, num_boxes, 4)
+
+    scores: tvm.te.Tensor
+        3-D tensor with shape (batch_size, num_classes, num_boxes)
+
+    max_output_boxes_per_class : int or tvm.te.Tensor, optional
+        The maxinum number of output selected boxes per class
+
+    iou_threshold : float or tvm.te.Tensor, optionaIl
+        IoU test threshold
+
+    score_threshold : float or tvm.te.Tensor, optional
+        Score threshold to filter out low score boxes early
+
+    output_format : str, optional
+        "onnx" or "tensorflow", see below.
+
+    Returns
+    -------
+    out : list of tvm.te.Tensor
+        If `output_format` is "onnx", the output is two tensors. The first is `indices` of size
+        `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor
+        `num_total_detection` of shape `(1,)` representing the total number of selected
+        boxes. The three values in `indices` encode batch, class, and box indices.
+        Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come
+        first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of
+        `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection`
+        rows are valid.
+
+        If `output_format` is "tensorflow", the output is three tensors, the first
+        is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of
+        size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size
+        `(batch_size,)` representing the total number of selected boxes per batch. The two values
+        in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at
+        batch b, only the first `num_total_detection[b]` entries are valid. The second axis of
+        `indices` and `scores` are sorted within each class by box scores, but not across classes.
+        So the box indices and scores for the class 0 come first in a sorted order, followed by
+        the class 1 etc.
+    """
+    batch, num_class, num_boxes = scores.shape
+    scores = reshape(scores, (batch * num_class, num_boxes))
+
+    sorted_indices = argsort(scores, axis=1, is_ascend=False, dtype="int32")
+    sorted_scores = gather(scores, 1, sorted_indices)
+
+    valid_count = _get_valid_box_count(sorted_scores, score_threshold)
+
+    selected_indices, selected_scores, num_detections = run_all_class_nms(
+        boxes,
+        sorted_scores,
+        sorted_indices,
+        valid_count,
+        max_output_boxes_per_class,
+        iou_threshold,
+        _nms_loop,
+        return_scores=(output_format == "tensorflow"),
+    )
+
+    if output_format == "onnx":
+        row_offsets = cumsum(num_detections, exclusive=True, dtype="int64")
+        num_total_detections = reduction.sum(cast(num_detections, "int64"), axis=1)
+
+        selected_indices = collect_selected_indices(
+            num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir
+        )
+        return [selected_indices, num_total_detections]
+
+    num_detections_per_batch = reshape(num_detections, (batch, num_class))
+    row_offsets = cumsum(num_detections_per_batch, exclusive=True, dtype="int64", axis=1)
+    num_total_detections = reduction.sum(cast(num_detections_per_batch, "int64"), axis=1)
+
+    selected_indices, selected_scores = collect_selected_indices_and_scores(
+        selected_indices,
+        selected_scores,
+        num_detections_per_batch,
+        row_offsets,
+        num_total_detections,
+        _collect_selected_indices_and_scores_ir,
+    )
+
+    return [selected_indices, selected_scores, num_total_detections]

From 3f91dca089fa9941f52cc9bca1b9f36c45abff5a Mon Sep 17 00:00:00 2001
From: ysh329 <ysh329@users.noreply.github.com>
Date: Wed, 30 Jul 2025 08:42:12 +0000
Subject: [PATCH 3/5] update nms

---
 .../tvm/relax/frontend/onnx/onnx_frontend.py  | 50 ++++++++++++++++++-
 python/tvm/relax/op/__init__.py               |  1 +
 .../relax/transform/legalize_ops/__init__.py  |  1 +
 python/tvm/script/ir_builder/relax/ir.py      |  2 +
 src/te/operation/create_primfunc.cc           |  5 +-
 tests/python/relax/test_frontend_onnx.py      | 21 ++++++++
 6 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relax/frontend/onnx/onnx_frontend.py b/python/tvm/relax/frontend/onnx/onnx_frontend.py
index 926da7f022dc..fd71c2e87f7f 100644
--- a/python/tvm/relax/frontend/onnx/onnx_frontend.py
+++ b/python/tvm/relax/frontend/onnx/onnx_frontend.py
@@ -3108,6 +3108,54 @@ def _impl_v9(cls, bb, inputs, attr, params):
             relax.op.nonzero(inputs[0]), relax.TensorStructInfo((ndim, nonzero_numbers), "int64")
         )
 
+class NonMaxSuppression(OnnxOpConverter):
+    """Converts an onnx NonMaxSuppression node into an equivalent Relax expression."""
+
+    @classmethod
+    def _impl_v10(cls, bb, inputs, attr, params):
+        # Get parameter values
+        boxes = inputs[0]
+        scores = inputs[1]
+        max_output_boxes_per_class = inputs[2] if len(inputs) >= 3 else relax.const([0], "int64")
+        iou_threshold = inputs[3] if len(inputs) >= 4 else relax.const([0.0], "float32")
+        score_threshold = inputs[4] if len(inputs) >= 5 else relax.const([0.0], "float32")
+
+        boxes_dtype = boxes.struct_info.dtype
+        if attr.get("center_point_box", 0) != 0:
+            xc, yc, w, h = relax.op.split(boxes, 4, axis=2)
+            half_w = w / relax.const(2.0, boxes_dtype)
+            half_h = h / relax.const(2.0, boxes_dtype)
+            x1 = xc - half_w
+            x2 = xc + half_w
+            y1 = yc - half_h
+            y2 = yc + half_h
+            boxes = relax.op.concat([y1, x1, y2, x2], axis=2)
+
+        def conditionally_squeeze_scalar(x):
+            rank = x.struct_info.ndim
+            assert rank <= 1, "nms thresholds must be scalars"
+            return relax.op.squeeze(x, [0]) if rank == 1 else x
+
+        max_output_boxes_per_class = conditionally_squeeze_scalar(max_output_boxes_per_class)
+        iou_threshold = conditionally_squeeze_scalar(iou_threshold)
+        score_threshold = conditionally_squeeze_scalar(score_threshold)
+
+        nms_out = bb.normalize(
+            relax.op.vision.all_class_non_max_suppression(
+                boxes,
+                scores,
+                max_output_boxes_per_class,
+                iou_threshold,
+                score_threshold,
+            )
+        )
+        return relax.op.dynamic_strided_slice(
+            nms_out[0],
+            begin=relax.const([0, 0], dtype="int64"),
+            end=relax.op.concat([nms_out[1], relax.const([3], dtype="int64")], axis=0),
+            strides=relax.const([1, 1], dtype="int64"),
+        )
+
 
 class HardSigmoid(OnnxOpConverter):
     """Converts an onnx HardSigmoid node into an equivalent Relax expression."""
@@ -3499,7 +3547,7 @@ def _get_convert_map():
         # "LRN": LRN,
         # "MaxRoiPool": MaxRoiPool,
         # "RoiAlign": RoiAlign,
-        # "NonMaxSuppression": NonMaxSuppression,
+        "NonMaxSuppression": NonMaxSuppression,
         # "GridSample": GridSample,
         # "Upsample": Upsample,
         # others
diff --git a/python/tvm/relax/op/__init__.py b/python/tvm/relax/op/__init__.py
index fd3672368b68..e1635d64e63a 100644
--- a/python/tvm/relax/op/__init__.py
+++ b/python/tvm/relax/op/__init__.py
@@ -154,6 +154,7 @@
     tanh,
     trunc,
 )
+from .vision import all_class_non_max_suppression
 
 
 def _register_op_make():
diff --git a/python/tvm/relax/transform/legalize_ops/__init__.py b/python/tvm/relax/transform/legalize_ops/__init__.py
index b4aba0291fc1..5614d0229646 100644
--- a/python/tvm/relax/transform/legalize_ops/__init__.py
+++ b/python/tvm/relax/transform/legalize_ops/__init__.py
@@ -31,3 +31,4 @@
 from . import search
 from . import statistical
 from . import unary
+from . import vision
diff --git a/python/tvm/script/ir_builder/relax/ir.py b/python/tvm/script/ir_builder/relax/ir.py
index e61e563b706b..6be506f9c837 100644
--- a/python/tvm/script/ir_builder/relax/ir.py
+++ b/python/tvm/script/ir_builder/relax/ir.py
@@ -186,6 +186,7 @@
     wrap_param,
     zeros,
     zeros_like,
+    vision,
 )
 from tvm.relax.op.builtin import stop_lift_params
 from tvm.relax.struct_info import StructInfo
@@ -896,4 +897,5 @@ def dtype(value: Union[py_str, DataType]) -> Expr:
     "nn",
     "ccl",
     "erf",
+    "vision",
 ]
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 7408eb46eb51..a2977884396e 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -649,7 +649,10 @@ Stmt GenerateStmtFromExternOp(const te::ExternOp& extern_op, CreateFuncInfo* inf
   // reads/writes filled in.
 
   BufferSubstituter substituter(var_map, input_buffer_map);
-  Stmt body = substituter(extern_op->body);
+  Stmt substituted_body = substituter(extern_op->body);
+
+  ProducerToBufferTransformer transformer(info->tensor2buffers);
+  Stmt body = transformer(substituted_body);
 
   // Step 4. Generate opaque block as body.
   return BlockRealize(/*iter_values=*/{},
diff --git a/tests/python/relax/test_frontend_onnx.py b/tests/python/relax/test_frontend_onnx.py
index 7a0a7d7bc952..9c96fe0f155b 100644
--- a/tests/python/relax/test_frontend_onnx.py
+++ b/tests/python/relax/test_frontend_onnx.py
@@ -737,6 +737,27 @@ def test_gemm(alpha, beta, useC):
     check_correctness(model)
 
 
+def test_nms():
+    nms_node = helper.make_node(
+        "NonMaxSuppression", ["boxes", "scores"], ["selected_indices"], center_point_box=0
+    )
+
+    inputs = [
+        helper.make_tensor_value_info("boxes", TensorProto.FLOAT, [1, 10647, 4]),
+        helper.make_tensor_value_info("scores", TensorProto.FLOAT, [1, 80, 10647]),
+    ]
+
+    graph = helper.make_graph(
+        [nms_node],
+        "nms_test",
+        inputs=inputs,
+        outputs=[helper.make_tensor_value_info("selected_indices", TensorProto.INT64, [0, 3])],
+    )
+
+    model = helper.make_model(graph, producer_name="nms_test")
+    check_correctness(model)
+
+
 @pytest.mark.parametrize(
     "in_shape, shape, out_shape",
     [

From aae2345ebaba471c518a4dba1ffced15c7e4e3e0 Mon Sep 17 00:00:00 2001
From: ysh329 <ysh329@users.noreply.github.com>
Date: Wed, 30 Jul 2025 08:43:11 +0000
Subject: [PATCH 4/5] update nms

---
 include/tvm/relax/attrs/vision.h              |  49 +++
 python/tvm/relax/op/vision/_ffi_api.py        |  19 +
 .../relax/transform/legalize_ops/vision.py    |  34 ++
 python/tvm/topi/vision/nms.py                 | 302 +++++++++++++++
 python/tvm/topi/vision/nms_util.py            | 345 ++++++++++++++++++
 src/relax/op/vision/nms.cc                    |  94 +++++
 src/relax/op/vision/nms.h                     |  41 +++
 tests/python/relax/test_op_vision.py          |  69 ++++
 .../relax/test_tvmscript_parser_op_vision.py  |  64 ++++
 9 files changed, 1017 insertions(+)
 create mode 100644 include/tvm/relax/attrs/vision.h
 create mode 100644 python/tvm/relax/op/vision/_ffi_api.py
 create mode 100644 python/tvm/relax/transform/legalize_ops/vision.py
 create mode 100644 python/tvm/topi/vision/nms.py
 create mode 100644 python/tvm/topi/vision/nms_util.py
 create mode 100644 src/relax/op/vision/nms.cc
 create mode 100644 src/relax/op/vision/nms.h
 create mode 100644 tests/python/relax/test_op_vision.py
 create mode 100644 tests/python/relax/test_tvmscript_parser_op_vision.py

diff --git a/include/tvm/relax/attrs/vision.h b/include/tvm/relax/attrs/vision.h
new file mode 100644
index 000000000000..828d30a35249
--- /dev/null
+++ b/include/tvm/relax/attrs/vision.h
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \file tvm/relax/attrs/vision.h
+ * \brief Auxiliary attributes for vision operators.
+ */
+#ifndef TVM_RELAX_ATTRS_VISION_H_
+#define TVM_RELAX_ATTRS_VISION_H_
+
+#include <tvm/relax/expr.h>
+
+namespace tvm {
+namespace relax {
+
+/*! \brief Attributes used in AllClassNonMaximumSuppression operator */
+struct AllClassNonMaximumSuppressionAttrs
+    : public tvm::AttrsNode<AllClassNonMaximumSuppressionAttrs> {
+  String output_format;
+
+  TVM_DECLARE_ATTRS(AllClassNonMaximumSuppressionAttrs,
+                    "relax.attrs.AllClassNonMaximumSuppressionAttrs") {
+    TVM_ATTR_FIELD(output_format)
+        .set_default("onnx")
+        .describe(
+            "Output format, onnx or tensorflow. Returns outputs in a way that can be easily "
+            "consumed by each frontend.");
+  }
+};  // struct AllClassNonMaximumSuppressionAttrs
+
+}  // namespace relax
+}  // namespace tvm
+
+#endif  // TVM_RELAX_ATTRS_VISION_H_
diff --git a/python/tvm/relax/op/vision/_ffi_api.py b/python/tvm/relax/op/vision/_ffi_api.py
new file mode 100644
index 000000000000..fec2d4d4b28c
--- /dev/null
+++ b/python/tvm/relax/op/vision/_ffi_api.py
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+"""Constructor APIs"""
+import tvm._ffi
+
+tvm._ffi._init_api("relax.op.vision", __name__)
diff --git a/python/tvm/relax/transform/legalize_ops/vision.py b/python/tvm/relax/transform/legalize_ops/vision.py
new file mode 100644
index 000000000000..2943385228f9
--- /dev/null
+++ b/python/tvm/relax/transform/legalize_ops/vision.py
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Default legalization function for vision network related operators."""
+from tvm import topi
+from ...block_builder import BlockBuilder
+from ...expr import Call, Expr
+from .common import register_legalize
+
+
+@register_legalize("relax.vision.all_class_non_max_suppression")
+def _vision_all_class_non_max_suppression(bb: BlockBuilder, call: Call) -> Expr:
+    return bb.call_te(
+        topi.vision.all_class_non_max_suppression,
+        call.args[0],
+        call.args[1],
+        call.args[2],
+        call.args[3],
+        call.args[4],
+        output_format=call.attrs.output_format,
+    )
diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py
new file mode 100644
index 000000000000..40b42d3a8080
--- /dev/null
+++ b/python/tvm/topi/vision/nms.py
@@ -0,0 +1,302 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=import-error, invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements, too-many-function-args
+"""Non-maximum suppression operator"""
+import tvm
+from tvm import te
+
+from tvm.tir import if_then_else
+
+from ..sort import argsort
+from ..math import cast
+from ..transform import reshape, gather
+from .. import reduction
+from ..scan import cumsum
+from .nms_util import (
+    binary_search,
+    collect_selected_indices,
+    collect_selected_indices_and_scores,
+    run_all_class_nms,
+)
+
+
+def _nms_loop(
+    ib,
+    batch_size,
+    top_k,
+    iou_threshold,
+    max_output_size,
+    valid_count,
+    on_new_valid_box_func,
+    on_new_invalidated_box_func,
+    needs_bbox_check_func,
+    calc_overlap_func,
+    out_scores,
+    num_valid_boxes,
+):
+    def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local):
+        # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
+        on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j)
+        num_valid_boxes_local[0] += 1
+
+        num_boxes_to_check = nkeep - (j + 1)
+
+        with ib.for_range(0, num_boxes_to_check, name="_k", kind="parallel") as _k:
+            k = j + 1 + _k
+
+            with ib.if_scope(
+                tvm.tir.all(
+                    k < nkeep,
+                    out_scores[i, k] > 0,  # is the box k still valid?
+                    needs_bbox_check_func(i, j, k),
+                )
+            ):
+                iou = calc_overlap_func(i, j, k)
+
+                with ib.if_scope(iou >= iou_threshold):
+                    # invalidate the box k
+                    out_scores[i, k] = -1.0
+                    on_new_invalidated_box_func(i, k)
+
+    with ib.for_range(0, batch_size, name="i") as i:
+        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i])
+        max_output_size = if_then_else(max_output_size > te.const(0), max_output_size, nkeep)
+
+        with ib.if_scope(tvm.tir.all(iou_threshold > te.const(0), valid_count[i] > te.const(0))):
+            num_valid_boxes_local = ib.allocate(
+                "int32", (1,), name="num_valid_boxes_local", scope="local"
+            )
+            box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local")
+            num_valid_boxes_local[0] = 0
+            box_idx[0] = 0
+
+            # Apply nms
+            # No need to do more iteration if we have already reached max_output_size boxes
+            with ib.while_loop(
+                tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)
+            ):
+                # Proceed to the inner loop if the box with id box_idx is still valid
+                with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
+                    nms_inner_loop(ib, i, box_idx[0], nkeep, num_valid_boxes_local)
+                box_idx[0] += 1
+
+            num_valid_boxes[i] = num_valid_boxes_local[0]
+
+        with ib.else_scope():
+            num_valid_boxes[i] = 0
+
+    return ib.get()
+
+
+def _get_valid_box_count(scores, score_threshold):
+    batch_classes, num_boxes = scores.shape
+
+    def searchsorted_ir(scores, valid_count):
+        ib = tvm.tir.ir_builder.create()
+        scores = ib.buffer_ptr(scores)
+        valid_count = ib.buffer_ptr(valid_count)
+
+        with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
+            binary_search(ib, i, num_boxes, scores, score_threshold, valid_count)
+
+        return ib.get()
+
+    scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8)
+    searchsorted_buf = tvm.tir.decl_buffer(
+        (batch_classes,), "int32", "searchsorted", data_alignment=8
+    )
+
+    return te.extern(
+        [(batch_classes,)],
+        [scores],
+        lambda ins, outs: searchsorted_ir(ins[0], outs[0]),
+        dtype=["int32"],
+        in_buffers=[scores_buf],
+        out_buffers=[searchsorted_buf],
+        name="searchsorted",
+        tag="searchsorted",
+    )
+
+
+def _collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out):
+    batch_classes, _ = selected_indices.shape
+
+    ib = tvm.tir.ir_builder.create()
+
+    selected_indices = ib.buffer_ptr(selected_indices)
+    num_detections = ib.buffer_ptr(num_detections)
+    row_offsets = ib.buffer_ptr(row_offsets)
+    out = ib.buffer_ptr(out)
+
+    with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
+        i = cast(i, "int64")
+        batch_id = i // num_class
+        class_id = i % num_class
+
+        with ib.for_range(0, num_detections[i], name="j") as j:
+            out[row_offsets[i] + j, 0] = batch_id
+            out[row_offsets[i] + j, 1] = class_id
+            out[row_offsets[i] + j, 2] = cast(selected_indices[i, j], "int64")
+
+    return ib.get()
+
+
+def _collect_selected_indices_and_scores_ir(
+    selected_indices,
+    selected_scores,
+    num_detections,
+    row_offsets,
+    num_total_detections,
+    collected_indices,
+    collected_scores,
+):
+    batch_size, num_class = row_offsets.shape
+    num_boxes = selected_indices.shape[1]
+
+    ib = tvm.tir.ir_builder.create()
+
+    selected_indices = ib.buffer_ptr(selected_indices)
+    selected_scores = ib.buffer_ptr(selected_scores)
+    num_detections = ib.buffer_ptr(num_detections)
+    row_offsets = ib.buffer_ptr(row_offsets)
+    num_total_detections = ib.buffer_ptr(num_total_detections)
+    collected_indices = ib.buffer_ptr(collected_indices)
+    collected_scores = ib.buffer_ptr(collected_scores)
+    zero = cast(0, "int64")
+
+    with ib.for_range(0, batch_size * num_class, name="i", kind="parallel") as i:
+        i = cast(i, "int64")
+        batch_id = i // num_class
+        class_id = i % num_class
+
+        with ib.for_range(0, num_boxes, name="j") as j:
+            with ib.if_scope(j < num_detections[batch_id, class_id]):
+                offset = row_offsets[batch_id, class_id] + j
+                collected_indices[batch_id, offset, 0] = class_id
+                collected_indices[batch_id, offset, 1] = cast(selected_indices[i, j], "int64")
+                collected_scores[batch_id, offset] = selected_scores[i, j]
+            with ib.else_scope():
+                offset = (
+                    num_total_detections[batch_id]
+                    + class_id * num_boxes
+                    - row_offsets[batch_id, class_id]
+                    + j
+                    - num_detections[batch_id, class_id]
+                )
+                collected_indices[batch_id, offset, 0] = zero
+                collected_indices[batch_id, offset, 1] = zero
+                collected_scores[batch_id, offset] = 0.0
+
+    return ib.get()
+
+
+def all_class_non_max_suppression(
+    boxes,
+    scores,
+    max_output_boxes_per_class,
+    iou_threshold,
+    score_threshold,
+    output_format="onnx",
+):
+    """Non-maximum suppression operator for object detection, corresponding to ONNX
+    NonMaxSuppression and TensorFlow combined_non_max_suppression.
+    NMS is performed for each class separately.
+
+    Parameters
+    ----------
+    boxes : tvm.te.Tensor
+        3-D tensor with shape (batch_size, num_boxes, 4)
+
+    scores: tvm.te.Tensor
+        3-D tensor with shape (batch_size, num_classes, num_boxes)
+
+    max_output_boxes_per_class : int or tvm.te.Tensor, optional
+        The maxinum number of output selected boxes per class
+
+    iou_threshold : float or tvm.te.Tensor, optionaIl
+        IoU test threshold
+
+    score_threshold : float or tvm.te.Tensor, optional
+        Score threshold to filter out low score boxes early
+
+    output_format : str, optional
+        "onnx" or "tensorflow", see below.
+
+    Returns
+    -------
+    out : list of tvm.te.Tensor
+        If `output_format` is "onnx", the output is two tensors. The first is `indices` of size
+        `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor
+        `num_total_detection` of shape `(1,)` representing the total number of selected
+        boxes. The three values in `indices` encode batch, class, and box indices.
+        Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come
+        first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of
+        `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection`
+        rows are valid.
+
+        If `output_format` is "tensorflow", the output is three tensors, the first
+        is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of
+        size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size
+        `(batch_size,)` representing the total number of selected boxes per batch. The two values
+        in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at
+        batch b, only the first `num_total_detection[b]` entries are valid. The second axis of
+        `indices` and `scores` are sorted within each class by box scores, but not across classes.
+        So the box indices and scores for the class 0 come first in a sorted order, followed by
+        the class 1 etc.
+    """
+    batch, num_class, num_boxes = scores.shape
+    scores = reshape(scores, (batch * num_class, num_boxes))
+
+    sorted_indices = argsort(scores, axis=1, is_ascend=False, dtype="int32")
+    sorted_scores = gather(scores, 1, sorted_indices)
+
+    valid_count = _get_valid_box_count(sorted_scores, score_threshold)
+
+    selected_indices, selected_scores, num_detections = run_all_class_nms(
+        boxes,
+        sorted_scores,
+        sorted_indices,
+        valid_count,
+        max_output_boxes_per_class,
+        iou_threshold,
+        _nms_loop,
+        return_scores=(output_format == "tensorflow"),
+    )
+
+    if output_format == "onnx":
+        row_offsets = cumsum(num_detections, exclusive=True, dtype="int64")
+        num_total_detections = reduction.sum(cast(num_detections, "int64"), axis=1)
+
+        selected_indices = collect_selected_indices(
+            num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir
+        )
+        return [selected_indices, num_total_detections]
+
+    num_detections_per_batch = reshape(num_detections, (batch, num_class))
+    row_offsets = cumsum(num_detections_per_batch, exclusive=True, dtype="int64", axis=1)
+    num_total_detections = reduction.sum(cast(num_detections_per_batch, "int64"), axis=1)
+
+    selected_indices, selected_scores = collect_selected_indices_and_scores(
+        selected_indices,
+        selected_scores,
+        num_detections_per_batch,
+        row_offsets,
+        num_total_detections,
+        _collect_selected_indices_and_scores_ir,
+    )
+
+    return [selected_indices, selected_scores, num_total_detections]
diff --git a/python/tvm/topi/vision/nms_util.py b/python/tvm/topi/vision/nms_util.py
new file mode 100644
index 000000000000..5cf3a459abd7
--- /dev/null
+++ b/python/tvm/topi/vision/nms_util.py
@@ -0,0 +1,345 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Common utilities used in Non-maximum suppression operators"""
+import tvm
+from tvm import te
+
+
+def _get_boundaries(output, box_idx):
+    l = tvm.te.min(
+        output[box_idx],
+        output[box_idx + 2],
+    )
+    t = tvm.te.min(
+        output[box_idx + 1],
+        output[box_idx + 3],
+    )
+    r = tvm.te.max(
+        output[box_idx],
+        output[box_idx + 2],
+    )
+    b = tvm.te.max(
+        output[box_idx + 1],
+        output[box_idx + 3],
+    )
+    return l, t, r, b
+
+
+def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
+    """Calculate overlap of two boxes."""
+    a_l, a_t, a_r, a_b = _get_boundaries(out_tensor, box_a_idx)
+    b_l, b_t, b_r, b_b = _get_boundaries(out_tensor, box_b_idx)
+
+    # Overlapping width and height
+    w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l))
+    h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t))
+
+    # Overlapping area
+    area = h * w
+
+    # total area of the figure formed by box a and box b
+    # except for overlapping area
+    u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
+    return tvm.tir.Select(u <= 0.0, 0.0, area / u)
+
+
+def binary_search(ib, y, num_boxes, scores, score_threshold, out):
+    """Binary search for score_threshold on scores sorted in descending order"""
+    lo = ib.allocate("int32", (1,), name="lo", scope="local")
+    hi = ib.allocate("int32", (1,), name="hi", scope="local")
+
+    lo[0] = 0
+    hi[0] = num_boxes.astype("int32")
+
+    with ib.while_loop(lo[0] < hi[0]):
+        mid = (hi[0] + lo[0]) >> 1
+        with ib.if_scope(scores[y, mid] > score_threshold):
+            lo[0] = mid + 1
+        with ib.else_scope():
+            hi[0] = mid
+
+    out[y] = lo[0]
+
+
+def collect_selected_indices(num_class, selected_indices, num_detections, row_offsets, ir):
+    """Collect selected indices from the core NMS loop into one linear output
+
+    Parameters
+    ----------
+    num_class : int
+
+    selected_indices: tvm.te.Tensor
+        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices
+        of selected boxes by the core NMS loop.
+
+    num_detections tvm.te.Tensor
+        1-D tensor with shape (batch_size * num_classes,), representing
+        the number of boxes selected by the core NMS loop, per batch and class
+
+    row_offsets tvm.te.Tensor
+        1-D tensor with shape (batch_size * num_classes,), this should be the exclusive scan
+        of num_detections
+
+    ir : function
+        A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py
+
+    Returns
+    -------
+    out : tvm.te.Tensor
+        The output is indices of size (batch_size * num_class* num_boxes , 3).
+        Rows of indices are ordered such that selected boxes from batch 0, class 0 come
+        first, in descending of scores, followed by boxes from batch 0, class 1 etc.
+    """
+    batch_class, num_boxes = selected_indices.shape
+    return te.extern(
+        [(batch_class * num_boxes, 3)],
+        [selected_indices, num_detections, row_offsets],
+        lambda ins, outs: ir(num_class, ins[0], ins[1], ins[2], outs[0]),
+        dtype=["int64"],
+        name="collect_indices",
+        tag="collect_indices",
+    )
+
+
+def collect_selected_indices_and_scores(
+    selected_indices, selected_scores, num_detections, row_offsets, num_total_detections, ir
+):
+    """Collect selected indices and scores from the core NMS loop into one linear output
+
+    Parameters
+    ----------
+    num_class : int
+
+    selected_indices: tvm.te.Tensor
+        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices
+        of selected boxes by the core NMS loop.
+
+    selected_indices: tvm.te.Tensor
+        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the scores
+        of selected boxes by the core NMS loop.
+
+    num_detections tvm.te.Tensor
+        2-D tensor with shape (batch_size, num_classes), representing
+        the number of boxes selected by the core NMS loop, per batch and class
+
+    row_offsets tvm.te.Tensor
+        2-D tensor with shape (batch_size, num_classes), this should be the exclusive scan
+        of num_detections along axis 1
+
+    ir : function
+        A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py
+
+    Returns
+    -------
+    out : [tvm.te.Tensor, tvm.te.Tensor]
+        The output is two tensors. The first is indices of size
+        (batch_size, num_class* num_boxes, 2), and the second is scores of size
+        (batch_size, num_class* num_boxes).
+    """
+    batch_size, num_class = row_offsets.shape
+    num_boxes = selected_indices.shape[1]
+    return te.extern(
+        [(batch_size, num_class * num_boxes, 2), (batch_size, num_class * num_boxes)],
+        [selected_indices, selected_scores, num_detections, row_offsets, num_total_detections],
+        lambda ins, outs: ir(ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], outs[1]),
+        dtype=["int64", "float32"],
+        name="collect_indices_and_scores",
+        tag="collect_indices_and_scores",
+    )
+
+
+def _all_class_nms_ir(
+    boxes,
+    sorted_scores,
+    sorted_indices,
+    valid_count,
+    batch_class,
+    num_class,
+    num_anchors,
+    iou_threshold,
+    max_output_size_per_class,
+    box_indices,
+    selected_scores,
+    num_valid_boxes,
+    nms_loop,
+):
+    ib = tvm.tir.ir_builder.create()
+    boxes = ib.buffer_ptr(boxes)
+    sorted_scores = ib.buffer_ptr(sorted_scores)
+    sorted_indices = ib.buffer_ptr(sorted_indices)
+    valid_count = ib.buffer_ptr(valid_count)
+    box_indices = ib.buffer_ptr(box_indices)
+    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)
+
+    if selected_scores is not None:
+        selected_scores = ib.buffer_ptr(selected_scores)
+
+    if isinstance(iou_threshold, float):
+        iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
+
+    if isinstance(max_output_size_per_class, int):
+        max_output_size_per_class = tvm.tir.const(max_output_size_per_class)
+
+    def calc_overlap(i, j, k):
+        offset_j = sorted_indices[i, j] * 4
+        offset_k = sorted_indices[i, k] * 4
+        batch_id = i // num_class
+        base_bbox_idx = batch_id * num_anchors * 4
+        return calculate_overlap(
+            boxes,
+            base_bbox_idx + offset_j,
+            base_bbox_idx + offset_k,
+        )
+
+    def on_new_valid_box(ib, tid, num_current_valid_box, i, j):
+        with ib.if_scope(tid + 0 == 0):
+            box_indices[i, num_current_valid_box] = sorted_indices[i, j]
+
+            if selected_scores is not None:
+                selected_scores[i, num_current_valid_box] = sorted_scores[i, j]
+
+    def on_new_invalidated_box(*_):
+        pass
+
+    def needs_bbox_check(*_):
+        return tvm.tir.const(True)
+
+    return nms_loop(
+        ib,
+        batch_class,
+        tvm.tir.IntImm("int32", -1),  # top_k
+        iou_threshold,
+        max_output_size_per_class,
+        valid_count,
+        on_new_valid_box,
+        on_new_invalidated_box,
+        needs_bbox_check,
+        calc_overlap,
+        sorted_scores,
+        num_valid_boxes,
+    )
+
+
+def run_all_class_nms(
+    boxes,
+    sorted_scores,
+    sorted_indices,
+    valid_count,
+    max_output_size_per_class,
+    iou_threshold,
+    nms_loop,
+    return_scores=False,
+):
+    """The core all class NMS routine
+
+    Parameters
+    ----------
+    boxes : tvm.te.Tensor
+        3-D tensor with shape (batch_size, num_boxes, 4)
+
+    sorted_scores: tvm.te.Tensor
+        2-D tensor with shape (batch_size * num_classes, num_boxes)
+        One of the outputs from argsort
+
+    sorted_indices: tvm.te.Tensor
+        2-D tensor with shape (batch_size * num_classes, num_boxes)
+        The other output from argsort
+
+    valid_count: tvm.te.Tensor
+        1-D tensor with shape (batch_size * num_classes,), representing
+        the number of boxes whose score is above score_threshold, per batch and class
+
+    max_output_boxes_per_class : int or tvm.te.Tensor, optional
+        The maxinum number of output selected boxes per class
+
+    iou_threshold : float or tvm.te.Tensor, optionaIl
+        IoU test threshold
+
+    nms_loop : function
+        A core NMS loop, see its usage in vision/nms.py and cuda/nms.py
+
+    return_scores : bool, optional
+        Whether or not to return selected scores, needed by the tensorflow output format.
+
+    Returns
+    -------
+    out : a list of tvm.te.Tensor
+        The output is three tensors, the first and second are indices and scores of size
+        (batch_size * num_class, num_boxes), and the third is a tensor
+        num_selected_boxes of shape (batch_size * num_class,) representing the total number of
+        selected boxes per batch and class. If return_scores is False, the second output is
+        None.
+    """
+    batch, num_boxes, _ = boxes.shape
+    batch_class = sorted_scores.shape[0]
+    num_class = batch_class // batch
+
+    if return_scores is False:
+        all_class_num0_buf = tvm.tir.decl_buffer(
+            (batch_class, num_boxes), "int32", "all_class_nms0", data_alignment=8
+        )
+        all_class_num1_buf = tvm.tir.decl_buffer(
+            (1, batch_class), "int32", "all_class_nms1", data_alignment=8
+        )
+        selected_indices, num_detections = te.extern(
+            [(batch_class, num_boxes), (1, batch_class)],
+            [boxes, sorted_scores, sorted_indices, valid_count],
+            lambda ins, outs: _all_class_nms_ir(
+                ins[0],  # boxes
+                ins[1],  # sorted_scores
+                ins[2],  # sorted_indices
+                ins[3],  # valid_count
+                batch_class,
+                num_class,
+                num_boxes,
+                iou_threshold,
+                max_output_size_per_class,
+                outs[0],  # box_indices
+                None,  # scores
+                outs[1],  # num_selected_boxes
+                nms_loop,
+            ),
+            out_buffers=[all_class_num0_buf, all_class_num1_buf],
+            dtype=["int32", "int32"],
+            name="all_class_nms",
+            tag="all_class_nms",
+        )
+        return selected_indices, None, num_detections
+
+    return te.extern(
+        [(batch_class, num_boxes), (batch_class, num_boxes), (1, batch_class)],
+        [boxes, sorted_scores, sorted_indices, valid_count],
+        lambda ins, outs: _all_class_nms_ir(
+            ins[0],  # boxes
+            ins[1],  # sorted_scores
+            ins[2],  # sorted_indices
+            ins[3],  # valid_count
+            batch_class,
+            num_class,
+            num_boxes,
+            iou_threshold,
+            max_output_size_per_class,
+            outs[0],  # box_indices
+            outs[1],  # selected scores
+            outs[2],  # num_selected_boxes
+            nms_loop,
+        ),
+        dtype=["int32", "float32", "int32"],
+        name="all_class_nms",
+        tag="all_class_nms",
+    )
diff --git a/src/relax/op/vision/nms.cc b/src/relax/op/vision/nms.cc
new file mode 100644
index 000000000000..163a3d25a904
--- /dev/null
+++ b/src/relax/op/vision/nms.cc
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "nms.h"
+
+#include <utility>
+#include <vector>
+
+namespace tvm {
+namespace relax {
+
+/* relax.vision.all_class_non_max_suppression */
+TVM_REGISTER_NODE_TYPE(AllClassNonMaximumSuppressionAttrs);
+
+Expr all_class_non_max_suppression(Expr boxes, Expr scores, Expr max_output_boxes_per_class,
+                                   Expr iou_threshold, Expr score_threshold, String output_format) {
+  ObjectPtr<AllClassNonMaximumSuppressionAttrs> attrs =
+      make_object<AllClassNonMaximumSuppressionAttrs>();
+  attrs->output_format = output_format;
+
+  static const Op& op = Op::Get("relax.vision.all_class_non_max_suppression");
+  return Call(op,
+              {std::move(boxes), std::move(scores), std::move(max_output_boxes_per_class),
+               std::move(iou_threshold), std::move(score_threshold)},
+              Attrs{attrs}, {});
+}
+
+TVM_REGISTER_GLOBAL("relax.op.vision.all_class_non_max_suppression")
+    .set_body_typed(all_class_non_max_suppression);
+
+StructInfo InferStructInfoAllClassNMS(const Call& call, const BlockBuilder& ctx) {
+  Array<TensorStructInfo> input_sinfo = GetInputTensorStructInfo(call, ctx);
+  const auto boxes_sinfo = input_sinfo[0];
+  const auto scores_sinfo = input_sinfo[1];
+  ICHECK(!boxes_sinfo->IsUnknownNdim()) << "Only support known ndim";
+  ICHECK(!scores_sinfo->IsUnknownNdim()) << "Only support known ndim";
+  ICHECK_EQ(boxes_sinfo->ndim, 3) << "AllClassNMS input boxes should be 3-D.";
+  ICHECK_EQ(scores_sinfo->ndim, 3) << "AllClassNMS input scores count should be 3-D.";
+
+  const auto batch = boxes_sinfo->shape.as<ShapeExprNode>()->values[0];
+  const auto num_classes = scores_sinfo->shape.as<ShapeExprNode>()->values[1];
+  const auto num_boxes = boxes_sinfo->shape.as<ShapeExprNode>()->values[1];
+
+  auto vdev = input_sinfo[0]->vdevice;
+  const auto* attrs = call->attrs.as<AllClassNonMaximumSuppressionAttrs>();
+  if (attrs->output_format == "onnx") {
+    auto vdev = input_sinfo[0]->vdevice;
+    auto num_total_boxes = batch * num_classes * num_boxes;
+    ShapeExpr oshape{Array<PrimExpr>({num_total_boxes, 3})};
+    ShapeExpr counts_shape{Array<PrimExpr>({1})};
+    return TupleStructInfo({TensorStructInfo(oshape, DataType::Int(64), vdev),
+                            TensorStructInfo(counts_shape, DataType::Int(64), vdev)});
+  }
+
+  auto num_total_boxes_per_batch = num_classes * num_boxes;
+  ShapeExpr indices_shape{Array<PrimExpr>({batch, num_total_boxes_per_batch, 2})};
+  ShapeExpr scores_shape{Array<PrimExpr>({batch, num_total_boxes_per_batch})};
+  ShapeExpr counts_shape{Array<PrimExpr>({batch})};
+  return TupleStructInfo({TensorStructInfo(indices_shape, DataType::Int(64), vdev),
+                          TensorStructInfo(scores_shape, DataType::Float(32), vdev),
+                          TensorStructInfo(counts_shape, DataType::Int(64), vdev)});
+}
+
+TVM_REGISTER_OP("relax.vision.all_class_non_max_suppression")
+    .set_attrs_type<AllClassNonMaximumSuppressionAttrs>()
+    .set_num_inputs(5)
+    .add_argument("boxes", "Tensor", "The input boxes in the format [batch, num_boxes, 4].")
+    .add_argument("scores", "Tensor",
+                  "Scores for each box and class in the format [batch, num_classes, num_boxes].")
+    .add_argument("max_output_boxes_per_class", "Tensor",
+                  "The maximum number of output boxes per class.")
+    .add_argument("iou_threshold", "Tensor", "The IoU threshold for box the overlap test.")
+    .add_argument("score_threshold", "Tensor",
+                  "The score threshold to filter out low score boxes early.")
+    .set_attr<FInferStructInfo>("FInferStructInfo", InferStructInfoAllClassNMS)
+    .set_attr<Bool>("FPurity", Bool(true));
+
+}  // namespace relax
+}  // namespace tvm
diff --git a/src/relax/op/vision/nms.h b/src/relax/op/vision/nms.h
new file mode 100644
index 000000000000..e7adc9600ef2
--- /dev/null
+++ b/src/relax/op/vision/nms.h
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \file nms.h
+ * \brief The functions to make Relax Non-maximum suppression operator calls.
+ */
+
+#ifndef TVM_RELAX_OP_VISION_NMS_H_
+#define TVM_RELAX_OP_VISION_NMS_H_
+
+#include <tvm/relax/attrs/vision.h>
+
+#include "../op_common.h"
+
+namespace tvm {
+namespace relax {
+
+/*! \brief Compute All Class NonMaximumSuppression. */
+Expr all_class_non_max_suppression(Expr boxes, Expr scores, Expr max_output_boxes_per_class,
+                                   Expr iou_threshold, Expr score_threshold, String output_format);
+
+}  // namespace relax
+}  // namespace tvm
+
+#endif  // TVM_RELAX_OP_VISION_NMS_H_
diff --git a/tests/python/relax/test_op_vision.py b/tests/python/relax/test_op_vision.py
new file mode 100644
index 000000000000..bb23aabb3cb2
--- /dev/null
+++ b/tests/python/relax/test_op_vision.py
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import tvm
+import tvm.testing
+from tvm import relax, tir
+from tvm import TVMError
+from tvm.ir import Op, VDevice
+from tvm.script import relax as R
+
+
+def _check_inference(bb: relax.BlockBuilder, call: relax.Call, expected_sinfo: relax.StructInfo):
+    ret = bb.normalize(call)
+    tvm.ir.assert_structural_equal(ret.struct_info, expected_sinfo)
+
+
+def test_all_class_non_max_suppression_infer_struct_info():
+    bb = relax.BlockBuilder()
+    batch_size, num_classes, num_boxes = 10, 8, 5
+    boxes = relax.Var("boxes", R.Tensor((batch_size, num_boxes, 4), "int64"))
+    scores = relax.Var("scores", R.Tensor((batch_size, num_classes, num_boxes), "float32"))
+
+    _check_inference(
+        bb,
+        relax.op.vision.all_class_non_max_suppression(boxes, scores, output_format="onnx"),
+        relax.TupleStructInfo(
+            [
+                relax.TensorStructInfo((batch_size * num_classes * num_boxes, 3), "int64"),
+                relax.TensorStructInfo((1,), "int64"),
+            ]
+        ),
+    )
+
+    _check_inference(
+        bb,
+        relax.op.vision.all_class_non_max_suppression(boxes, scores, output_format="tensorflow"),
+        relax.TupleStructInfo(
+            [
+                relax.TensorStructInfo((batch_size, num_classes * num_boxes, 2), "int64"),
+                relax.TensorStructInfo(
+                    (
+                        batch_size,
+                        num_classes * num_boxes,
+                    ),
+                    "float32",
+                ),
+                relax.TensorStructInfo((batch_size,), "int64"),
+            ]
+        ),
+    )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relax/test_tvmscript_parser_op_vision.py b/tests/python/relax/test_tvmscript_parser_op_vision.py
new file mode 100644
index 000000000000..b90dc1e092ad
--- /dev/null
+++ b/tests/python/relax/test_tvmscript_parser_op_vision.py
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Optional, Union
+
+import tvm
+import tvm.script
+import tvm.testing
+from tvm import IRModule, relax
+from tvm.script import relax as R
+
+
+def _check(
+    parsed: Union[relax.Function, IRModule],
+    expect: Optional[Union[relax.Function, IRModule]],
+):
+    test = parsed.script(show_meta=True)
+    roundtrip_mod = tvm.script.from_source(test)
+    tvm.ir.assert_structural_equal(parsed, roundtrip_mod)
+    if expect:
+        tvm.ir.assert_structural_equal(parsed, expect)
+
+
+def test_all_class_non_max_suppression():
+    @R.function
+    def foo(
+        boxes: R.Tensor((10, 5, 4), "int64"),
+        scores: R.Tensor((10, 8, 5), "float32"),
+    ) -> R.Tuple(R.Tensor((400, 3), "int64"), R.Tensor((1,), "int64")):
+        gv: R.Tuple(
+            R.Tensor((400, 3), "int64"), R.Tensor((1,), "int64")
+        ) = R.vision.all_class_non_max_suppression(
+            boxes,
+            scores,
+        )
+        return gv
+
+    boxes = relax.Var("boxes", R.Tensor((10, 5, 4), "int64"))
+    scores = relax.Var("scores", R.Tensor((10, 8, 5), "float32"))
+
+    bb = relax.BlockBuilder()
+    with bb.function("foo", [boxes, scores]):
+        gv = bb.emit(relax.op.vision.all_class_non_max_suppression(boxes, scores))
+        bb.emit_func_output(gv)
+
+    _check(foo, bb.get()["foo"])
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From ce6478b83f296ee7410deea266b4def8183f017c Mon Sep 17 00:00:00 2001
From: ysh329 <ysh329@users.noreply.github.com>
Date: Wed, 30 Jul 2025 09:02:23 +0000
Subject: [PATCH 5/5] fix lint

---
 python/tvm/relax/frontend/onnx/onnx_frontend.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/relax/frontend/onnx/onnx_frontend.py b/python/tvm/relax/frontend/onnx/onnx_frontend.py
index fd71c2e87f7f..d5779862a250 100644
--- a/python/tvm/relax/frontend/onnx/onnx_frontend.py
+++ b/python/tvm/relax/frontend/onnx/onnx_frontend.py
@@ -3108,6 +3108,7 @@ def _impl_v9(cls, bb, inputs, attr, params):
             relax.op.nonzero(inputs[0]), relax.TensorStructInfo((ndim, nonzero_numbers), "int64")
         )
 
+
 class NonMaxSuppression(OnnxOpConverter):
     """Converts an onnx NonMaxSuppression node into an equivalent Relax expression."""