From e30a74ddb61f41609f3a0e24ef4754a1ff48d51a Mon Sep 17 00:00:00 2001 From: Star Yuan Date: Wed, 17 Jan 2024 22:18:14 +0800 Subject: [PATCH 1/5] [COMMUNITY] Add new key for release signing --- KEYS | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/KEYS b/KEYS index 3cee902acd32..32e451097045 100644 --- a/KEYS +++ b/KEYS @@ -642,3 +642,62 @@ WTgrESErlqNLN5ZTTW/1jBELJCfJKxgHUip+Yo6qNZoWwNLP1BaIcoA3miSG3DXf wS/UuN04NxDy7V6mPXE= =MTba -----END PGP PUBLIC KEY BLOCK----- +pub rsa4096 2024-01-15 [SC] + A4D9228E55761E665BF01CBB5CE869CB7DEC048C +uid [ultimate] Star Yuan (CODE SIGNING KEY) +sig 3 5CE869CB7DEC048C 2024-01-15 Star Yuan (CODE SIGNING KEY) +sub rsa4096 2024-01-15 [E] +sig 5CE869CB7DEC048C 2024-01-15 Star Yuan (CODE SIGNING KEY) + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBGWlStcBEADaslyfbUNARhWftJoRAChoak0cFU6NxahhvyZfyTGtSuwuHNDD +2eyvhnDIaYXVClxoNgikiQ5Nkd1jtbA4rFCw6Pdbq+98fkpcr8N4o+jlbpu6Ff3j +dJ2Qu000MV5qe9FZ4QasdfglJElvizgfNbJv/Fz1ERl/BS1U0c7lyQF9jGGh7EY2 +1y+JFp5OMG6A9SpfaOd+iOw5/cfCQk8+sHQC4dp3hOJPK4NLvjotK+hlOhRsF7gU +goYYT2IP56kPQb6U/Uiv4/R6HbKugzqSMl6BMwAb9uG6UX0xUfAA8ciHoaITCJCQ +9e/jGWnDnqYlAMNqLkHEmW7THxJ3hHXcac/Z1C3PeLDJU0rpTxDcjuYkM5jFCu7H +TgT7lWBP/PyAAVSsLqMQbLJOWm0a14tb/oRoeYr/B2prIbJY5qJBM1nherKGMg0G +7Oqugo6A1VqgUxg7Chj73PledaNwvm5Lxpl6D+wPDSifhlz0vnwOCMoOon0pTjK4 +DXDEXnEXZtzkZgXI6g7AkVyt0gkqyUi+01ibmlBfcVHh3PVvU4oNdkaywQd5s29R +DsA4WOqt9cLv+iqIzM1juygfR6ooA1jHDIyIPmmC/kOrcxKXEFvIGXDDCbXAvdXc +uXgZeZqI3pbKjQaU3fF8HwJ956HTM8rywtVGH9BWRl/i6qn5sq9CcukcuQARAQAB +tDBTdGFyIFl1YW4gKENPREUgU0lHTklORyBLRVkpIDx5c2gzMjlAYXBhY2hlLm9y +Zz6JAk4EEwEKADgWIQSk2SKOVXYeZlvwHLtc6GnLfewEjAUCZaVK1wIbAwULCQgH +AgYVCgkICwIEFgIDAQIeAQIXgAAKCRBc6GnLfewEjBAiD/0cfaYfQ0DL7CPsP0lS +yezPDDTnDPIo//G1cuSYG0gnXQ1SpbJSzDE7deew+P506/sWFneOY5Kuv6DuSE8J +nM6vv1EYR4/9x/XstA4F04lQPngKKBV+UKrWj8zIA2Drn345Ece1150bWvrUD7mT ++ps1gfe8SGYpOmR/kRc8qra2zizcWBC1Dl4qd+RcY7Ac6Cu3G/JG2KvZnrUSVev9 +nzSl2V0JtFVIla2odSJqv0Zdj5E2vLvQd3Dxbf3BODCdL3iQqxrQhj+0T3QLEhPg +y2XOtqW7a96XosoQ44wUiHaS5LwFViG8LoiPADtSdXYb8m4FtMfB8t4mzXVqBjpz +2csMqOnNvo7bctfpJkjM14UKib39MR2wUv9fD6Qa+OAAIeXGTQH+wlXmlYjji9+A +4tgq/+d75qUC/tyHSgbZLNXobHF8v77g60cBvFXVL02W53xhVDZP4gwu5iSSN8BJ +a2hqwo4UO53mRUNkwFZONYxJE7MhLl22r08eu0xNYhoGtpHzDVoyHg26+2FUgFDd +TNsdqjMyJ+3GXEE3PdKVDTj9To+RoHLuCczk5uvtFYGhseRwIWbVhmTLKUL+wgSa ++b90slkv+CBJvLjvKbVCmCLXwiH8Cx+MZSu0oM5v8fbHuWOhkb7bJd1V+U7qV/OA +CCqBICt64F+ooQ0oEdC0oLvr2LkCDQRlpUrXARAA1DKsF2ZNUdPIn4VcsjRk/+qF +13VC9SaqMp+J+8m1XTIeXdr27uUa2vT4j8pAM4gwMVkpEqE0rmHK+S1SeEAlcizC +Bvp7vvso/glcOg9Sgt9PXvvEDPL/Hnsn1+3YX+Gye4cOTiDDgVW1RKcgGj9Xsir+ +5BS9Secj5CGo92cuaqIo/mMjxGlsuW/LvTU5qQhz7aOaBibe5EHPlGMqM6XJN0BZ +MHRfBiGDs2n/egMnTPL0JcTlAeird+yxDPULKzhQWkd8rfQKpwcRiY6IcYFHlWdM +VhZkXNRrxh6+q3rR7FKmxlvG/12YyT6Y1BocGLgROzKIeoEp+6vsU5LJ90jy82ig +oGSHwNjm2RRukjV3eebovl1dCo6IaI/j4idCv7NlcBnln/Unk4YOZbneMT5r+3Zy +Q4azLB8KHfHOrUwAxRAGPygdLtqbjs4mF45HDe6h3IOVoiOQlZNpesrwEumlK+Il +taU0T8hfxyMpIcTLUZpIddSxo0sVby2XZ+z00En3JvtqbpRcfA87thxpsE7uHxwT +YT8mPPDxo1R4I4LSzsDnekD8EB/7woz4n5I1RBoPB1LSoo0B2os+4vHGkiwZ0TN0 +ICcUYdM623Bv2wJQbVKEDvwjHZTkotjLx7R2lyqMRwFYrMXHxevOfbARJQCqrcY2 +ouLzQme9rE5MPQbKj2cAEQEAAYkCNgQYAQoAIBYhBKTZIo5Vdh5mW/Acu1zoact9 +7ASMBQJlpUrXAhsMAAoJEFzoact97ASMNsIP/3tlsvwUVfy19lUjxWT4rPw2GGz8 +lbPiaetgigK1F1rlzYnIVo32Fcj/GNNwWEdxxEzeaQR/AJmZLWB8sBDThoTGeSDK +fjKXeDjZh+ElpIKWyk7f3ddHN2TpBz698kZ7fYCciRE9T4d3xgbqx2rCfupxUFSj +lxLFRkasByJnLdAZI50NZjW838IHMaGsvgbWEqRuvKZOES6gFhrK1NTSxj5iuiHk +Uxj1KzMhOW+m1eZ0pQcCVXJDY6KYhmrZzw9q6kzSO9ukmS5yRf0EnD7Fsca4iIXP +Y28xs3zBxYHV4IGU1PtcIwNewmTnjnEy0apHPz0zDplHi1meXuhA7bBMjs/AouJg +6FIDNSQqDuFXufqvVQ6LZZgob+LklMAoGcka4/5ZLPjipj5SWNeZZunJujSqWK7f +KJaIfn7ILXqxjaTFrjBN3cm60rO1+zEektrjtWMmSBn0L76pY2ucenrqewruYYdD +12VQra/6QAS5R0HG8gzOfsZcrHaiIuLoTbsOgnqLVcdb9lO7f3oMbKPwejZ5yhyz +SraXHvmixlhf4uUYwsWyhw3UgHrv1psB8Z9NfdH9/T2BvRg0qy6ZmI0n0OagPNgz +v+SZrqrWkSjyPdl6j7x8EmePfNidqw/CnncYI2rEVSmP28W0Uhg5JLgroGYmycv6 +HeZaRpYvkV8UNmnE +=BtHq +-----END PGP PUBLIC KEY BLOCK----- From e5fa1f7c768912d05e82b3ccb638d1a8d3aaff22 Mon Sep 17 00:00:00 2001 From: ysh329 Date: Wed, 30 Jul 2025 08:10:38 +0000 Subject: [PATCH 2/5] [Relax] Add AllClassNMS Operator --- python/tvm/relax/op/vision/__init__.py | 18 ++ python/tvm/relax/op/vision/nms.py | 347 +++++++++++++++++++++++++ 2 files changed, 365 insertions(+) create mode 100644 python/tvm/relax/op/vision/__init__.py create mode 100644 python/tvm/relax/op/vision/nms.py diff --git a/python/tvm/relax/op/vision/__init__.py b/python/tvm/relax/op/vision/__init__.py new file mode 100644 index 000000000000..be45458d3647 --- /dev/null +++ b/python/tvm/relax/op/vision/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""VISION operators.""" +from .nms import * diff --git a/python/tvm/relax/op/vision/nms.py b/python/tvm/relax/op/vision/nms.py new file mode 100644 index 000000000000..54b89582ed87 --- /dev/null +++ b/python/tvm/relax/op/vision/nms.py @@ -0,0 +1,347 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=import-error, invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements, too-many-function-args +"""Non-maximum suppression operator""" +import tvm +from tvm import te + +from tvm.tir import if_then_else + +from ..sort import argsort +from ..math import cast +from ..transform import reshape, gather +from .. import reduction +from ..scan import cumsum +from .nms_util import ( + binary_search, + collect_selected_indices, + collect_selected_indices_and_scores, + run_all_class_nms, +) + + +def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): + """Get valid count of bounding boxes given a score threshold. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : tvm.te.Tensor + Input data. 3-D tensor with shape [batch_size, num_anchors, 6] + or [batch_size, num_anchors, 5]. + + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + + id_index : optional, int + index of the class categories, -1 to disable. + + score_index: optional, int + Index of the scores/confidence of boxes. + + Returns + ------- + valid_count : tvm.te.Tensor + 1-D tensor for valid number of boxes. + + out_tensor : tvm.te.Tensor + Rearranged data tensor. + + out_indices: tvm.te.Tensor or numpy NDArray + Related index in input data. + """ + if isinstance(score_threshold, (float, int)): + score_threshold = tvm.tir.const(score_threshold, dtype=data.dtype) + id_index_const = tvm.tir.const(id_index, "int32") + score_index_const = tvm.tir.const(score_index, "int32") + return hybrid_get_valid_counts( + data, + score_threshold, + id_index_const, + score_index_const, + tvm.tir.const(1, data.dtype), + data.shape[0], + data.shape[1], + ) + + +def _nms_loop( + ib, + batch_size, + top_k, + iou_threshold, + max_output_size, + valid_count, + on_new_valid_box_func, + on_new_invalidated_box_func, + needs_bbox_check_func, + calc_overlap_func, + out_scores, + num_valid_boxes, +): + def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local): + # The box j is valid, invalidate other boxes that overlap with j above iou_threshold + on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j) + num_valid_boxes_local[0] += 1 + + num_boxes_to_check = nkeep - (j + 1) + + with ib.for_range(0, num_boxes_to_check, name="_k", kind="parallel") as _k: + k = j + 1 + _k + + with ib.if_scope( + tvm.tir.all( + k < nkeep, + out_scores[i, k] > 0, # is the box k still valid? + needs_bbox_check_func(i, j, k), + ) + ): + iou = calc_overlap_func(i, j, k) + + with ib.if_scope(iou >= iou_threshold): + # invalidate the box k + out_scores[i, k] = -1.0 + on_new_invalidated_box_func(i, k) + + with ib.for_range(0, batch_size, name="i") as i: + nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) + max_output_size = if_then_else(max_output_size > te.const(0), max_output_size, nkeep) + + with ib.if_scope(tvm.tir.all(iou_threshold > te.const(0), valid_count[i] > te.const(0))): + num_valid_boxes_local = ib.allocate( + "int32", (1,), name="num_valid_boxes_local", scope="local" + ) + box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local") + num_valid_boxes_local[0] = 0 + box_idx[0] = 0 + + # Apply nms + # No need to do more iteration if we have already reached max_output_size boxes + with ib.while_loop( + tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size) + ): + # Proceed to the inner loop if the box with id box_idx is still valid + with ib.if_scope(out_scores[i, box_idx[0]] > -1.0): + nms_inner_loop(ib, i, box_idx[0], nkeep, num_valid_boxes_local) + box_idx[0] += 1 + + num_valid_boxes[i] = num_valid_boxes_local[0] + + with ib.else_scope(): + num_valid_boxes[i] = 0 + + return ib.get() + + +def _get_valid_box_count(scores, score_threshold): + batch_classes, num_boxes = scores.shape + + def searchsorted_ir(scores, valid_count): + ib = tvm.tir.ir_builder.create() + scores = ib.buffer_ptr(scores) + valid_count = ib.buffer_ptr(valid_count) + + with ib.for_range(0, batch_classes, name="i", kind="parallel") as i: + binary_search(ib, i, num_boxes, scores, score_threshold, valid_count) + + return ib.get() + + scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8) + searchsorted_buf = tvm.tir.decl_buffer( + (batch_classes,), "int32", "searchsorted", data_alignment=8 + ) + + return te.extern( + [(batch_classes,)], + [scores], + lambda ins, outs: searchsorted_ir(ins[0], outs[0]), + dtype=["int32"], + in_buffers=[scores_buf], + out_buffers=[searchsorted_buf], + name="searchsorted", + tag="searchsorted", + ) + + +def _collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out): + batch_classes, _ = selected_indices.shape + + ib = tvm.tir.ir_builder.create() + + selected_indices = ib.buffer_ptr(selected_indices) + num_detections = ib.buffer_ptr(num_detections) + row_offsets = ib.buffer_ptr(row_offsets) + out = ib.buffer_ptr(out) + + with ib.for_range(0, batch_classes, name="i", kind="parallel") as i: + i = cast(i, "int64") + batch_id = i // num_class + class_id = i % num_class + + with ib.for_range(0, num_detections[i], name="j") as j: + out[row_offsets[i] + j, 0] = batch_id + out[row_offsets[i] + j, 1] = class_id + out[row_offsets[i] + j, 2] = cast(selected_indices[i, j], "int64") + + return ib.get() + + +def _collect_selected_indices_and_scores_ir( + selected_indices, + selected_scores, + num_detections, + row_offsets, + num_total_detections, + collected_indices, + collected_scores, +): + batch_size, num_class = row_offsets.shape + num_boxes = selected_indices.shape[1] + + ib = tvm.tir.ir_builder.create() + + selected_indices = ib.buffer_ptr(selected_indices) + selected_scores = ib.buffer_ptr(selected_scores) + num_detections = ib.buffer_ptr(num_detections) + row_offsets = ib.buffer_ptr(row_offsets) + num_total_detections = ib.buffer_ptr(num_total_detections) + collected_indices = ib.buffer_ptr(collected_indices) + collected_scores = ib.buffer_ptr(collected_scores) + zero = cast(0, "int64") + + with ib.for_range(0, batch_size * num_class, name="i", kind="parallel") as i: + i = cast(i, "int64") + batch_id = i // num_class + class_id = i % num_class + + with ib.for_range(0, num_boxes, name="j") as j: + with ib.if_scope(j < num_detections[batch_id, class_id]): + offset = row_offsets[batch_id, class_id] + j + collected_indices[batch_id, offset, 0] = class_id + collected_indices[batch_id, offset, 1] = cast(selected_indices[i, j], "int64") + collected_scores[batch_id, offset] = selected_scores[i, j] + with ib.else_scope(): + offset = ( + num_total_detections[batch_id] + + class_id * num_boxes + - row_offsets[batch_id, class_id] + + j + - num_detections[batch_id, class_id] + ) + collected_indices[batch_id, offset, 0] = zero + collected_indices[batch_id, offset, 1] = zero + collected_scores[batch_id, offset] = 0.0 + + return ib.get() + + +def all_class_non_max_suppression( + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + output_format="onnx", +): + """Non-maximum suppression operator for object detection, corresponding to ONNX + NonMaxSuppression and TensorFlow combined_non_max_suppression. + NMS is performed for each class separately. + + Parameters + ---------- + boxes : tvm.te.Tensor + 3-D tensor with shape (batch_size, num_boxes, 4) + + scores: tvm.te.Tensor + 3-D tensor with shape (batch_size, num_classes, num_boxes) + + max_output_boxes_per_class : int or tvm.te.Tensor, optional + The maxinum number of output selected boxes per class + + iou_threshold : float or tvm.te.Tensor, optionaIl + IoU test threshold + + score_threshold : float or tvm.te.Tensor, optional + Score threshold to filter out low score boxes early + + output_format : str, optional + "onnx" or "tensorflow", see below. + + Returns + ------- + out : list of tvm.te.Tensor + If `output_format` is "onnx", the output is two tensors. The first is `indices` of size + `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor + `num_total_detection` of shape `(1,)` representing the total number of selected + boxes. The three values in `indices` encode batch, class, and box indices. + Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come + first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of + `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection` + rows are valid. + + If `output_format` is "tensorflow", the output is three tensors, the first + is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of + size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size + `(batch_size,)` representing the total number of selected boxes per batch. The two values + in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at + batch b, only the first `num_total_detection[b]` entries are valid. The second axis of + `indices` and `scores` are sorted within each class by box scores, but not across classes. + So the box indices and scores for the class 0 come first in a sorted order, followed by + the class 1 etc. + """ + batch, num_class, num_boxes = scores.shape + scores = reshape(scores, (batch * num_class, num_boxes)) + + sorted_indices = argsort(scores, axis=1, is_ascend=False, dtype="int32") + sorted_scores = gather(scores, 1, sorted_indices) + + valid_count = _get_valid_box_count(sorted_scores, score_threshold) + + selected_indices, selected_scores, num_detections = run_all_class_nms( + boxes, + sorted_scores, + sorted_indices, + valid_count, + max_output_boxes_per_class, + iou_threshold, + _nms_loop, + return_scores=(output_format == "tensorflow"), + ) + + if output_format == "onnx": + row_offsets = cumsum(num_detections, exclusive=True, dtype="int64") + num_total_detections = reduction.sum(cast(num_detections, "int64"), axis=1) + + selected_indices = collect_selected_indices( + num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir + ) + return [selected_indices, num_total_detections] + + num_detections_per_batch = reshape(num_detections, (batch, num_class)) + row_offsets = cumsum(num_detections_per_batch, exclusive=True, dtype="int64", axis=1) + num_total_detections = reduction.sum(cast(num_detections_per_batch, "int64"), axis=1) + + selected_indices, selected_scores = collect_selected_indices_and_scores( + selected_indices, + selected_scores, + num_detections_per_batch, + row_offsets, + num_total_detections, + _collect_selected_indices_and_scores_ir, + ) + + return [selected_indices, selected_scores, num_total_detections] From 3f91dca089fa9941f52cc9bca1b9f36c45abff5a Mon Sep 17 00:00:00 2001 From: ysh329 Date: Wed, 30 Jul 2025 08:42:12 +0000 Subject: [PATCH 3/5] update nms --- .../tvm/relax/frontend/onnx/onnx_frontend.py | 50 ++++++++++++++++++- python/tvm/relax/op/__init__.py | 1 + .../relax/transform/legalize_ops/__init__.py | 1 + python/tvm/script/ir_builder/relax/ir.py | 2 + src/te/operation/create_primfunc.cc | 5 +- tests/python/relax/test_frontend_onnx.py | 21 ++++++++ 6 files changed, 78 insertions(+), 2 deletions(-) diff --git a/python/tvm/relax/frontend/onnx/onnx_frontend.py b/python/tvm/relax/frontend/onnx/onnx_frontend.py index 926da7f022dc..fd71c2e87f7f 100644 --- a/python/tvm/relax/frontend/onnx/onnx_frontend.py +++ b/python/tvm/relax/frontend/onnx/onnx_frontend.py @@ -3108,6 +3108,54 @@ def _impl_v9(cls, bb, inputs, attr, params): relax.op.nonzero(inputs[0]), relax.TensorStructInfo((ndim, nonzero_numbers), "int64") ) +class NonMaxSuppression(OnnxOpConverter): + """Converts an onnx NonMaxSuppression node into an equivalent Relax expression.""" + + @classmethod + def _impl_v10(cls, bb, inputs, attr, params): + # Get parameter values + boxes = inputs[0] + scores = inputs[1] + max_output_boxes_per_class = inputs[2] if len(inputs) >= 3 else relax.const([0], "int64") + iou_threshold = inputs[3] if len(inputs) >= 4 else relax.const([0.0], "float32") + score_threshold = inputs[4] if len(inputs) >= 5 else relax.const([0.0], "float32") + + boxes_dtype = boxes.struct_info.dtype + if attr.get("center_point_box", 0) != 0: + xc, yc, w, h = relax.op.split(boxes, 4, axis=2) + half_w = w / relax.const(2.0, boxes_dtype) + half_h = h / relax.const(2.0, boxes_dtype) + x1 = xc - half_w + x2 = xc + half_w + y1 = yc - half_h + y2 = yc + half_h + boxes = relax.op.concat([y1, x1, y2, x2], axis=2) + + def conditionally_squeeze_scalar(x): + rank = x.struct_info.ndim + assert rank <= 1, "nms thresholds must be scalars" + return relax.op.squeeze(x, [0]) if rank == 1 else x + + max_output_boxes_per_class = conditionally_squeeze_scalar(max_output_boxes_per_class) + iou_threshold = conditionally_squeeze_scalar(iou_threshold) + score_threshold = conditionally_squeeze_scalar(score_threshold) + + nms_out = bb.normalize( + relax.op.vision.all_class_non_max_suppression( + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + ) + ) + return relax.op.dynamic_strided_slice( + nms_out[0], + begin=relax.const([0, 0], dtype="int64"), + end=relax.op.concat([nms_out[1], relax.const([3], dtype="int64")], axis=0), + strides=relax.const([1, 1], dtype="int64"), + ) + class HardSigmoid(OnnxOpConverter): """Converts an onnx HardSigmoid node into an equivalent Relax expression.""" @@ -3499,7 +3547,7 @@ def _get_convert_map(): # "LRN": LRN, # "MaxRoiPool": MaxRoiPool, # "RoiAlign": RoiAlign, - # "NonMaxSuppression": NonMaxSuppression, + "NonMaxSuppression": NonMaxSuppression, # "GridSample": GridSample, # "Upsample": Upsample, # others diff --git a/python/tvm/relax/op/__init__.py b/python/tvm/relax/op/__init__.py index fd3672368b68..e1635d64e63a 100644 --- a/python/tvm/relax/op/__init__.py +++ b/python/tvm/relax/op/__init__.py @@ -154,6 +154,7 @@ tanh, trunc, ) +from .vision import all_class_non_max_suppression def _register_op_make(): diff --git a/python/tvm/relax/transform/legalize_ops/__init__.py b/python/tvm/relax/transform/legalize_ops/__init__.py index b4aba0291fc1..5614d0229646 100644 --- a/python/tvm/relax/transform/legalize_ops/__init__.py +++ b/python/tvm/relax/transform/legalize_ops/__init__.py @@ -31,3 +31,4 @@ from . import search from . import statistical from . import unary +from . import vision diff --git a/python/tvm/script/ir_builder/relax/ir.py b/python/tvm/script/ir_builder/relax/ir.py index e61e563b706b..6be506f9c837 100644 --- a/python/tvm/script/ir_builder/relax/ir.py +++ b/python/tvm/script/ir_builder/relax/ir.py @@ -186,6 +186,7 @@ wrap_param, zeros, zeros_like, + vision, ) from tvm.relax.op.builtin import stop_lift_params from tvm.relax.struct_info import StructInfo @@ -896,4 +897,5 @@ def dtype(value: Union[py_str, DataType]) -> Expr: "nn", "ccl", "erf", + "vision", ] diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 7408eb46eb51..a2977884396e 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -649,7 +649,10 @@ Stmt GenerateStmtFromExternOp(const te::ExternOp& extern_op, CreateFuncInfo* inf // reads/writes filled in. BufferSubstituter substituter(var_map, input_buffer_map); - Stmt body = substituter(extern_op->body); + Stmt substituted_body = substituter(extern_op->body); + + ProducerToBufferTransformer transformer(info->tensor2buffers); + Stmt body = transformer(substituted_body); // Step 4. Generate opaque block as body. return BlockRealize(/*iter_values=*/{}, diff --git a/tests/python/relax/test_frontend_onnx.py b/tests/python/relax/test_frontend_onnx.py index 7a0a7d7bc952..9c96fe0f155b 100644 --- a/tests/python/relax/test_frontend_onnx.py +++ b/tests/python/relax/test_frontend_onnx.py @@ -737,6 +737,27 @@ def test_gemm(alpha, beta, useC): check_correctness(model) +def test_nms(): + nms_node = helper.make_node( + "NonMaxSuppression", ["boxes", "scores"], ["selected_indices"], center_point_box=0 + ) + + inputs = [ + helper.make_tensor_value_info("boxes", TensorProto.FLOAT, [1, 10647, 4]), + helper.make_tensor_value_info("scores", TensorProto.FLOAT, [1, 80, 10647]), + ] + + graph = helper.make_graph( + [nms_node], + "nms_test", + inputs=inputs, + outputs=[helper.make_tensor_value_info("selected_indices", TensorProto.INT64, [0, 3])], + ) + + model = helper.make_model(graph, producer_name="nms_test") + check_correctness(model) + + @pytest.mark.parametrize( "in_shape, shape, out_shape", [ From aae2345ebaba471c518a4dba1ffced15c7e4e3e0 Mon Sep 17 00:00:00 2001 From: ysh329 Date: Wed, 30 Jul 2025 08:43:11 +0000 Subject: [PATCH 4/5] update nms --- include/tvm/relax/attrs/vision.h | 49 +++ python/tvm/relax/op/vision/_ffi_api.py | 19 + .../relax/transform/legalize_ops/vision.py | 34 ++ python/tvm/topi/vision/nms.py | 302 +++++++++++++++ python/tvm/topi/vision/nms_util.py | 345 ++++++++++++++++++ src/relax/op/vision/nms.cc | 94 +++++ src/relax/op/vision/nms.h | 41 +++ tests/python/relax/test_op_vision.py | 69 ++++ .../relax/test_tvmscript_parser_op_vision.py | 64 ++++ 9 files changed, 1017 insertions(+) create mode 100644 include/tvm/relax/attrs/vision.h create mode 100644 python/tvm/relax/op/vision/_ffi_api.py create mode 100644 python/tvm/relax/transform/legalize_ops/vision.py create mode 100644 python/tvm/topi/vision/nms.py create mode 100644 python/tvm/topi/vision/nms_util.py create mode 100644 src/relax/op/vision/nms.cc create mode 100644 src/relax/op/vision/nms.h create mode 100644 tests/python/relax/test_op_vision.py create mode 100644 tests/python/relax/test_tvmscript_parser_op_vision.py diff --git a/include/tvm/relax/attrs/vision.h b/include/tvm/relax/attrs/vision.h new file mode 100644 index 000000000000..828d30a35249 --- /dev/null +++ b/include/tvm/relax/attrs/vision.h @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * \file tvm/relax/attrs/vision.h + * \brief Auxiliary attributes for vision operators. + */ +#ifndef TVM_RELAX_ATTRS_VISION_H_ +#define TVM_RELAX_ATTRS_VISION_H_ + +#include + +namespace tvm { +namespace relax { + +/*! \brief Attributes used in AllClassNonMaximumSuppression operator */ +struct AllClassNonMaximumSuppressionAttrs + : public tvm::AttrsNode { + String output_format; + + TVM_DECLARE_ATTRS(AllClassNonMaximumSuppressionAttrs, + "relax.attrs.AllClassNonMaximumSuppressionAttrs") { + TVM_ATTR_FIELD(output_format) + .set_default("onnx") + .describe( + "Output format, onnx or tensorflow. Returns outputs in a way that can be easily " + "consumed by each frontend."); + } +}; // struct AllClassNonMaximumSuppressionAttrs + +} // namespace relax +} // namespace tvm + +#endif // TVM_RELAX_ATTRS_VISION_H_ diff --git a/python/tvm/relax/op/vision/_ffi_api.py b/python/tvm/relax/op/vision/_ffi_api.py new file mode 100644 index 000000000000..fec2d4d4b28c --- /dev/null +++ b/python/tvm/relax/op/vision/_ffi_api.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +"""Constructor APIs""" +import tvm._ffi + +tvm._ffi._init_api("relax.op.vision", __name__) diff --git a/python/tvm/relax/transform/legalize_ops/vision.py b/python/tvm/relax/transform/legalize_ops/vision.py new file mode 100644 index 000000000000..2943385228f9 --- /dev/null +++ b/python/tvm/relax/transform/legalize_ops/vision.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Default legalization function for vision network related operators.""" +from tvm import topi +from ...block_builder import BlockBuilder +from ...expr import Call, Expr +from .common import register_legalize + + +@register_legalize("relax.vision.all_class_non_max_suppression") +def _vision_all_class_non_max_suppression(bb: BlockBuilder, call: Call) -> Expr: + return bb.call_te( + topi.vision.all_class_non_max_suppression, + call.args[0], + call.args[1], + call.args[2], + call.args[3], + call.args[4], + output_format=call.attrs.output_format, + ) diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py new file mode 100644 index 000000000000..40b42d3a8080 --- /dev/null +++ b/python/tvm/topi/vision/nms.py @@ -0,0 +1,302 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=import-error, invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements, too-many-function-args +"""Non-maximum suppression operator""" +import tvm +from tvm import te + +from tvm.tir import if_then_else + +from ..sort import argsort +from ..math import cast +from ..transform import reshape, gather +from .. import reduction +from ..scan import cumsum +from .nms_util import ( + binary_search, + collect_selected_indices, + collect_selected_indices_and_scores, + run_all_class_nms, +) + + +def _nms_loop( + ib, + batch_size, + top_k, + iou_threshold, + max_output_size, + valid_count, + on_new_valid_box_func, + on_new_invalidated_box_func, + needs_bbox_check_func, + calc_overlap_func, + out_scores, + num_valid_boxes, +): + def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local): + # The box j is valid, invalidate other boxes that overlap with j above iou_threshold + on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j) + num_valid_boxes_local[0] += 1 + + num_boxes_to_check = nkeep - (j + 1) + + with ib.for_range(0, num_boxes_to_check, name="_k", kind="parallel") as _k: + k = j + 1 + _k + + with ib.if_scope( + tvm.tir.all( + k < nkeep, + out_scores[i, k] > 0, # is the box k still valid? + needs_bbox_check_func(i, j, k), + ) + ): + iou = calc_overlap_func(i, j, k) + + with ib.if_scope(iou >= iou_threshold): + # invalidate the box k + out_scores[i, k] = -1.0 + on_new_invalidated_box_func(i, k) + + with ib.for_range(0, batch_size, name="i") as i: + nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) + max_output_size = if_then_else(max_output_size > te.const(0), max_output_size, nkeep) + + with ib.if_scope(tvm.tir.all(iou_threshold > te.const(0), valid_count[i] > te.const(0))): + num_valid_boxes_local = ib.allocate( + "int32", (1,), name="num_valid_boxes_local", scope="local" + ) + box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local") + num_valid_boxes_local[0] = 0 + box_idx[0] = 0 + + # Apply nms + # No need to do more iteration if we have already reached max_output_size boxes + with ib.while_loop( + tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size) + ): + # Proceed to the inner loop if the box with id box_idx is still valid + with ib.if_scope(out_scores[i, box_idx[0]] > -1.0): + nms_inner_loop(ib, i, box_idx[0], nkeep, num_valid_boxes_local) + box_idx[0] += 1 + + num_valid_boxes[i] = num_valid_boxes_local[0] + + with ib.else_scope(): + num_valid_boxes[i] = 0 + + return ib.get() + + +def _get_valid_box_count(scores, score_threshold): + batch_classes, num_boxes = scores.shape + + def searchsorted_ir(scores, valid_count): + ib = tvm.tir.ir_builder.create() + scores = ib.buffer_ptr(scores) + valid_count = ib.buffer_ptr(valid_count) + + with ib.for_range(0, batch_classes, name="i", kind="parallel") as i: + binary_search(ib, i, num_boxes, scores, score_threshold, valid_count) + + return ib.get() + + scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8) + searchsorted_buf = tvm.tir.decl_buffer( + (batch_classes,), "int32", "searchsorted", data_alignment=8 + ) + + return te.extern( + [(batch_classes,)], + [scores], + lambda ins, outs: searchsorted_ir(ins[0], outs[0]), + dtype=["int32"], + in_buffers=[scores_buf], + out_buffers=[searchsorted_buf], + name="searchsorted", + tag="searchsorted", + ) + + +def _collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out): + batch_classes, _ = selected_indices.shape + + ib = tvm.tir.ir_builder.create() + + selected_indices = ib.buffer_ptr(selected_indices) + num_detections = ib.buffer_ptr(num_detections) + row_offsets = ib.buffer_ptr(row_offsets) + out = ib.buffer_ptr(out) + + with ib.for_range(0, batch_classes, name="i", kind="parallel") as i: + i = cast(i, "int64") + batch_id = i // num_class + class_id = i % num_class + + with ib.for_range(0, num_detections[i], name="j") as j: + out[row_offsets[i] + j, 0] = batch_id + out[row_offsets[i] + j, 1] = class_id + out[row_offsets[i] + j, 2] = cast(selected_indices[i, j], "int64") + + return ib.get() + + +def _collect_selected_indices_and_scores_ir( + selected_indices, + selected_scores, + num_detections, + row_offsets, + num_total_detections, + collected_indices, + collected_scores, +): + batch_size, num_class = row_offsets.shape + num_boxes = selected_indices.shape[1] + + ib = tvm.tir.ir_builder.create() + + selected_indices = ib.buffer_ptr(selected_indices) + selected_scores = ib.buffer_ptr(selected_scores) + num_detections = ib.buffer_ptr(num_detections) + row_offsets = ib.buffer_ptr(row_offsets) + num_total_detections = ib.buffer_ptr(num_total_detections) + collected_indices = ib.buffer_ptr(collected_indices) + collected_scores = ib.buffer_ptr(collected_scores) + zero = cast(0, "int64") + + with ib.for_range(0, batch_size * num_class, name="i", kind="parallel") as i: + i = cast(i, "int64") + batch_id = i // num_class + class_id = i % num_class + + with ib.for_range(0, num_boxes, name="j") as j: + with ib.if_scope(j < num_detections[batch_id, class_id]): + offset = row_offsets[batch_id, class_id] + j + collected_indices[batch_id, offset, 0] = class_id + collected_indices[batch_id, offset, 1] = cast(selected_indices[i, j], "int64") + collected_scores[batch_id, offset] = selected_scores[i, j] + with ib.else_scope(): + offset = ( + num_total_detections[batch_id] + + class_id * num_boxes + - row_offsets[batch_id, class_id] + + j + - num_detections[batch_id, class_id] + ) + collected_indices[batch_id, offset, 0] = zero + collected_indices[batch_id, offset, 1] = zero + collected_scores[batch_id, offset] = 0.0 + + return ib.get() + + +def all_class_non_max_suppression( + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + output_format="onnx", +): + """Non-maximum suppression operator for object detection, corresponding to ONNX + NonMaxSuppression and TensorFlow combined_non_max_suppression. + NMS is performed for each class separately. + + Parameters + ---------- + boxes : tvm.te.Tensor + 3-D tensor with shape (batch_size, num_boxes, 4) + + scores: tvm.te.Tensor + 3-D tensor with shape (batch_size, num_classes, num_boxes) + + max_output_boxes_per_class : int or tvm.te.Tensor, optional + The maxinum number of output selected boxes per class + + iou_threshold : float or tvm.te.Tensor, optionaIl + IoU test threshold + + score_threshold : float or tvm.te.Tensor, optional + Score threshold to filter out low score boxes early + + output_format : str, optional + "onnx" or "tensorflow", see below. + + Returns + ------- + out : list of tvm.te.Tensor + If `output_format` is "onnx", the output is two tensors. The first is `indices` of size + `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor + `num_total_detection` of shape `(1,)` representing the total number of selected + boxes. The three values in `indices` encode batch, class, and box indices. + Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come + first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of + `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection` + rows are valid. + + If `output_format` is "tensorflow", the output is three tensors, the first + is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of + size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size + `(batch_size,)` representing the total number of selected boxes per batch. The two values + in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at + batch b, only the first `num_total_detection[b]` entries are valid. The second axis of + `indices` and `scores` are sorted within each class by box scores, but not across classes. + So the box indices and scores for the class 0 come first in a sorted order, followed by + the class 1 etc. + """ + batch, num_class, num_boxes = scores.shape + scores = reshape(scores, (batch * num_class, num_boxes)) + + sorted_indices = argsort(scores, axis=1, is_ascend=False, dtype="int32") + sorted_scores = gather(scores, 1, sorted_indices) + + valid_count = _get_valid_box_count(sorted_scores, score_threshold) + + selected_indices, selected_scores, num_detections = run_all_class_nms( + boxes, + sorted_scores, + sorted_indices, + valid_count, + max_output_boxes_per_class, + iou_threshold, + _nms_loop, + return_scores=(output_format == "tensorflow"), + ) + + if output_format == "onnx": + row_offsets = cumsum(num_detections, exclusive=True, dtype="int64") + num_total_detections = reduction.sum(cast(num_detections, "int64"), axis=1) + + selected_indices = collect_selected_indices( + num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir + ) + return [selected_indices, num_total_detections] + + num_detections_per_batch = reshape(num_detections, (batch, num_class)) + row_offsets = cumsum(num_detections_per_batch, exclusive=True, dtype="int64", axis=1) + num_total_detections = reduction.sum(cast(num_detections_per_batch, "int64"), axis=1) + + selected_indices, selected_scores = collect_selected_indices_and_scores( + selected_indices, + selected_scores, + num_detections_per_batch, + row_offsets, + num_total_detections, + _collect_selected_indices_and_scores_ir, + ) + + return [selected_indices, selected_scores, num_total_detections] diff --git a/python/tvm/topi/vision/nms_util.py b/python/tvm/topi/vision/nms_util.py new file mode 100644 index 000000000000..5cf3a459abd7 --- /dev/null +++ b/python/tvm/topi/vision/nms_util.py @@ -0,0 +1,345 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Common utilities used in Non-maximum suppression operators""" +import tvm +from tvm import te + + +def _get_boundaries(output, box_idx): + l = tvm.te.min( + output[box_idx], + output[box_idx + 2], + ) + t = tvm.te.min( + output[box_idx + 1], + output[box_idx + 3], + ) + r = tvm.te.max( + output[box_idx], + output[box_idx + 2], + ) + b = tvm.te.max( + output[box_idx + 1], + output[box_idx + 3], + ) + return l, t, r, b + + +def calculate_overlap(out_tensor, box_a_idx, box_b_idx): + """Calculate overlap of two boxes.""" + a_l, a_t, a_r, a_b = _get_boundaries(out_tensor, box_a_idx) + b_l, b_t, b_r, b_b = _get_boundaries(out_tensor, box_b_idx) + + # Overlapping width and height + w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l)) + h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t)) + + # Overlapping area + area = h * w + + # total area of the figure formed by box a and box b + # except for overlapping area + u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area + return tvm.tir.Select(u <= 0.0, 0.0, area / u) + + +def binary_search(ib, y, num_boxes, scores, score_threshold, out): + """Binary search for score_threshold on scores sorted in descending order""" + lo = ib.allocate("int32", (1,), name="lo", scope="local") + hi = ib.allocate("int32", (1,), name="hi", scope="local") + + lo[0] = 0 + hi[0] = num_boxes.astype("int32") + + with ib.while_loop(lo[0] < hi[0]): + mid = (hi[0] + lo[0]) >> 1 + with ib.if_scope(scores[y, mid] > score_threshold): + lo[0] = mid + 1 + with ib.else_scope(): + hi[0] = mid + + out[y] = lo[0] + + +def collect_selected_indices(num_class, selected_indices, num_detections, row_offsets, ir): + """Collect selected indices from the core NMS loop into one linear output + + Parameters + ---------- + num_class : int + + selected_indices: tvm.te.Tensor + 2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices + of selected boxes by the core NMS loop. + + num_detections tvm.te.Tensor + 1-D tensor with shape (batch_size * num_classes,), representing + the number of boxes selected by the core NMS loop, per batch and class + + row_offsets tvm.te.Tensor + 1-D tensor with shape (batch_size * num_classes,), this should be the exclusive scan + of num_detections + + ir : function + A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py + + Returns + ------- + out : tvm.te.Tensor + The output is indices of size (batch_size * num_class* num_boxes , 3). + Rows of indices are ordered such that selected boxes from batch 0, class 0 come + first, in descending of scores, followed by boxes from batch 0, class 1 etc. + """ + batch_class, num_boxes = selected_indices.shape + return te.extern( + [(batch_class * num_boxes, 3)], + [selected_indices, num_detections, row_offsets], + lambda ins, outs: ir(num_class, ins[0], ins[1], ins[2], outs[0]), + dtype=["int64"], + name="collect_indices", + tag="collect_indices", + ) + + +def collect_selected_indices_and_scores( + selected_indices, selected_scores, num_detections, row_offsets, num_total_detections, ir +): + """Collect selected indices and scores from the core NMS loop into one linear output + + Parameters + ---------- + num_class : int + + selected_indices: tvm.te.Tensor + 2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices + of selected boxes by the core NMS loop. + + selected_indices: tvm.te.Tensor + 2-D tensor with shape (batch_size * num_classes, num_boxes), representing the scores + of selected boxes by the core NMS loop. + + num_detections tvm.te.Tensor + 2-D tensor with shape (batch_size, num_classes), representing + the number of boxes selected by the core NMS loop, per batch and class + + row_offsets tvm.te.Tensor + 2-D tensor with shape (batch_size, num_classes), this should be the exclusive scan + of num_detections along axis 1 + + ir : function + A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py + + Returns + ------- + out : [tvm.te.Tensor, tvm.te.Tensor] + The output is two tensors. The first is indices of size + (batch_size, num_class* num_boxes, 2), and the second is scores of size + (batch_size, num_class* num_boxes). + """ + batch_size, num_class = row_offsets.shape + num_boxes = selected_indices.shape[1] + return te.extern( + [(batch_size, num_class * num_boxes, 2), (batch_size, num_class * num_boxes)], + [selected_indices, selected_scores, num_detections, row_offsets, num_total_detections], + lambda ins, outs: ir(ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], outs[1]), + dtype=["int64", "float32"], + name="collect_indices_and_scores", + tag="collect_indices_and_scores", + ) + + +def _all_class_nms_ir( + boxes, + sorted_scores, + sorted_indices, + valid_count, + batch_class, + num_class, + num_anchors, + iou_threshold, + max_output_size_per_class, + box_indices, + selected_scores, + num_valid_boxes, + nms_loop, +): + ib = tvm.tir.ir_builder.create() + boxes = ib.buffer_ptr(boxes) + sorted_scores = ib.buffer_ptr(sorted_scores) + sorted_indices = ib.buffer_ptr(sorted_indices) + valid_count = ib.buffer_ptr(valid_count) + box_indices = ib.buffer_ptr(box_indices) + num_valid_boxes = ib.buffer_ptr(num_valid_boxes) + + if selected_scores is not None: + selected_scores = ib.buffer_ptr(selected_scores) + + if isinstance(iou_threshold, float): + iou_threshold = tvm.tir.FloatImm("float32", iou_threshold) + + if isinstance(max_output_size_per_class, int): + max_output_size_per_class = tvm.tir.const(max_output_size_per_class) + + def calc_overlap(i, j, k): + offset_j = sorted_indices[i, j] * 4 + offset_k = sorted_indices[i, k] * 4 + batch_id = i // num_class + base_bbox_idx = batch_id * num_anchors * 4 + return calculate_overlap( + boxes, + base_bbox_idx + offset_j, + base_bbox_idx + offset_k, + ) + + def on_new_valid_box(ib, tid, num_current_valid_box, i, j): + with ib.if_scope(tid + 0 == 0): + box_indices[i, num_current_valid_box] = sorted_indices[i, j] + + if selected_scores is not None: + selected_scores[i, num_current_valid_box] = sorted_scores[i, j] + + def on_new_invalidated_box(*_): + pass + + def needs_bbox_check(*_): + return tvm.tir.const(True) + + return nms_loop( + ib, + batch_class, + tvm.tir.IntImm("int32", -1), # top_k + iou_threshold, + max_output_size_per_class, + valid_count, + on_new_valid_box, + on_new_invalidated_box, + needs_bbox_check, + calc_overlap, + sorted_scores, + num_valid_boxes, + ) + + +def run_all_class_nms( + boxes, + sorted_scores, + sorted_indices, + valid_count, + max_output_size_per_class, + iou_threshold, + nms_loop, + return_scores=False, +): + """The core all class NMS routine + + Parameters + ---------- + boxes : tvm.te.Tensor + 3-D tensor with shape (batch_size, num_boxes, 4) + + sorted_scores: tvm.te.Tensor + 2-D tensor with shape (batch_size * num_classes, num_boxes) + One of the outputs from argsort + + sorted_indices: tvm.te.Tensor + 2-D tensor with shape (batch_size * num_classes, num_boxes) + The other output from argsort + + valid_count: tvm.te.Tensor + 1-D tensor with shape (batch_size * num_classes,), representing + the number of boxes whose score is above score_threshold, per batch and class + + max_output_boxes_per_class : int or tvm.te.Tensor, optional + The maxinum number of output selected boxes per class + + iou_threshold : float or tvm.te.Tensor, optionaIl + IoU test threshold + + nms_loop : function + A core NMS loop, see its usage in vision/nms.py and cuda/nms.py + + return_scores : bool, optional + Whether or not to return selected scores, needed by the tensorflow output format. + + Returns + ------- + out : a list of tvm.te.Tensor + The output is three tensors, the first and second are indices and scores of size + (batch_size * num_class, num_boxes), and the third is a tensor + num_selected_boxes of shape (batch_size * num_class,) representing the total number of + selected boxes per batch and class. If return_scores is False, the second output is + None. + """ + batch, num_boxes, _ = boxes.shape + batch_class = sorted_scores.shape[0] + num_class = batch_class // batch + + if return_scores is False: + all_class_num0_buf = tvm.tir.decl_buffer( + (batch_class, num_boxes), "int32", "all_class_nms0", data_alignment=8 + ) + all_class_num1_buf = tvm.tir.decl_buffer( + (1, batch_class), "int32", "all_class_nms1", data_alignment=8 + ) + selected_indices, num_detections = te.extern( + [(batch_class, num_boxes), (1, batch_class)], + [boxes, sorted_scores, sorted_indices, valid_count], + lambda ins, outs: _all_class_nms_ir( + ins[0], # boxes + ins[1], # sorted_scores + ins[2], # sorted_indices + ins[3], # valid_count + batch_class, + num_class, + num_boxes, + iou_threshold, + max_output_size_per_class, + outs[0], # box_indices + None, # scores + outs[1], # num_selected_boxes + nms_loop, + ), + out_buffers=[all_class_num0_buf, all_class_num1_buf], + dtype=["int32", "int32"], + name="all_class_nms", + tag="all_class_nms", + ) + return selected_indices, None, num_detections + + return te.extern( + [(batch_class, num_boxes), (batch_class, num_boxes), (1, batch_class)], + [boxes, sorted_scores, sorted_indices, valid_count], + lambda ins, outs: _all_class_nms_ir( + ins[0], # boxes + ins[1], # sorted_scores + ins[2], # sorted_indices + ins[3], # valid_count + batch_class, + num_class, + num_boxes, + iou_threshold, + max_output_size_per_class, + outs[0], # box_indices + outs[1], # selected scores + outs[2], # num_selected_boxes + nms_loop, + ), + dtype=["int32", "float32", "int32"], + name="all_class_nms", + tag="all_class_nms", + ) diff --git a/src/relax/op/vision/nms.cc b/src/relax/op/vision/nms.cc new file mode 100644 index 000000000000..163a3d25a904 --- /dev/null +++ b/src/relax/op/vision/nms.cc @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "nms.h" + +#include +#include + +namespace tvm { +namespace relax { + +/* relax.vision.all_class_non_max_suppression */ +TVM_REGISTER_NODE_TYPE(AllClassNonMaximumSuppressionAttrs); + +Expr all_class_non_max_suppression(Expr boxes, Expr scores, Expr max_output_boxes_per_class, + Expr iou_threshold, Expr score_threshold, String output_format) { + ObjectPtr attrs = + make_object(); + attrs->output_format = output_format; + + static const Op& op = Op::Get("relax.vision.all_class_non_max_suppression"); + return Call(op, + {std::move(boxes), std::move(scores), std::move(max_output_boxes_per_class), + std::move(iou_threshold), std::move(score_threshold)}, + Attrs{attrs}, {}); +} + +TVM_REGISTER_GLOBAL("relax.op.vision.all_class_non_max_suppression") + .set_body_typed(all_class_non_max_suppression); + +StructInfo InferStructInfoAllClassNMS(const Call& call, const BlockBuilder& ctx) { + Array input_sinfo = GetInputTensorStructInfo(call, ctx); + const auto boxes_sinfo = input_sinfo[0]; + const auto scores_sinfo = input_sinfo[1]; + ICHECK(!boxes_sinfo->IsUnknownNdim()) << "Only support known ndim"; + ICHECK(!scores_sinfo->IsUnknownNdim()) << "Only support known ndim"; + ICHECK_EQ(boxes_sinfo->ndim, 3) << "AllClassNMS input boxes should be 3-D."; + ICHECK_EQ(scores_sinfo->ndim, 3) << "AllClassNMS input scores count should be 3-D."; + + const auto batch = boxes_sinfo->shape.as()->values[0]; + const auto num_classes = scores_sinfo->shape.as()->values[1]; + const auto num_boxes = boxes_sinfo->shape.as()->values[1]; + + auto vdev = input_sinfo[0]->vdevice; + const auto* attrs = call->attrs.as(); + if (attrs->output_format == "onnx") { + auto vdev = input_sinfo[0]->vdevice; + auto num_total_boxes = batch * num_classes * num_boxes; + ShapeExpr oshape{Array({num_total_boxes, 3})}; + ShapeExpr counts_shape{Array({1})}; + return TupleStructInfo({TensorStructInfo(oshape, DataType::Int(64), vdev), + TensorStructInfo(counts_shape, DataType::Int(64), vdev)}); + } + + auto num_total_boxes_per_batch = num_classes * num_boxes; + ShapeExpr indices_shape{Array({batch, num_total_boxes_per_batch, 2})}; + ShapeExpr scores_shape{Array({batch, num_total_boxes_per_batch})}; + ShapeExpr counts_shape{Array({batch})}; + return TupleStructInfo({TensorStructInfo(indices_shape, DataType::Int(64), vdev), + TensorStructInfo(scores_shape, DataType::Float(32), vdev), + TensorStructInfo(counts_shape, DataType::Int(64), vdev)}); +} + +TVM_REGISTER_OP("relax.vision.all_class_non_max_suppression") + .set_attrs_type() + .set_num_inputs(5) + .add_argument("boxes", "Tensor", "The input boxes in the format [batch, num_boxes, 4].") + .add_argument("scores", "Tensor", + "Scores for each box and class in the format [batch, num_classes, num_boxes].") + .add_argument("max_output_boxes_per_class", "Tensor", + "The maximum number of output boxes per class.") + .add_argument("iou_threshold", "Tensor", "The IoU threshold for box the overlap test.") + .add_argument("score_threshold", "Tensor", + "The score threshold to filter out low score boxes early.") + .set_attr("FInferStructInfo", InferStructInfoAllClassNMS) + .set_attr("FPurity", Bool(true)); + +} // namespace relax +} // namespace tvm diff --git a/src/relax/op/vision/nms.h b/src/relax/op/vision/nms.h new file mode 100644 index 000000000000..e7adc9600ef2 --- /dev/null +++ b/src/relax/op/vision/nms.h @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * \file nms.h + * \brief The functions to make Relax Non-maximum suppression operator calls. + */ + +#ifndef TVM_RELAX_OP_VISION_NMS_H_ +#define TVM_RELAX_OP_VISION_NMS_H_ + +#include + +#include "../op_common.h" + +namespace tvm { +namespace relax { + +/*! \brief Compute All Class NonMaximumSuppression. */ +Expr all_class_non_max_suppression(Expr boxes, Expr scores, Expr max_output_boxes_per_class, + Expr iou_threshold, Expr score_threshold, String output_format); + +} // namespace relax +} // namespace tvm + +#endif // TVM_RELAX_OP_VISION_NMS_H_ diff --git a/tests/python/relax/test_op_vision.py b/tests/python/relax/test_op_vision.py new file mode 100644 index 000000000000..bb23aabb3cb2 --- /dev/null +++ b/tests/python/relax/test_op_vision.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest +import tvm +import tvm.testing +from tvm import relax, tir +from tvm import TVMError +from tvm.ir import Op, VDevice +from tvm.script import relax as R + + +def _check_inference(bb: relax.BlockBuilder, call: relax.Call, expected_sinfo: relax.StructInfo): + ret = bb.normalize(call) + tvm.ir.assert_structural_equal(ret.struct_info, expected_sinfo) + + +def test_all_class_non_max_suppression_infer_struct_info(): + bb = relax.BlockBuilder() + batch_size, num_classes, num_boxes = 10, 8, 5 + boxes = relax.Var("boxes", R.Tensor((batch_size, num_boxes, 4), "int64")) + scores = relax.Var("scores", R.Tensor((batch_size, num_classes, num_boxes), "float32")) + + _check_inference( + bb, + relax.op.vision.all_class_non_max_suppression(boxes, scores, output_format="onnx"), + relax.TupleStructInfo( + [ + relax.TensorStructInfo((batch_size * num_classes * num_boxes, 3), "int64"), + relax.TensorStructInfo((1,), "int64"), + ] + ), + ) + + _check_inference( + bb, + relax.op.vision.all_class_non_max_suppression(boxes, scores, output_format="tensorflow"), + relax.TupleStructInfo( + [ + relax.TensorStructInfo((batch_size, num_classes * num_boxes, 2), "int64"), + relax.TensorStructInfo( + ( + batch_size, + num_classes * num_boxes, + ), + "float32", + ), + relax.TensorStructInfo((batch_size,), "int64"), + ] + ), + ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relax/test_tvmscript_parser_op_vision.py b/tests/python/relax/test_tvmscript_parser_op_vision.py new file mode 100644 index 000000000000..b90dc1e092ad --- /dev/null +++ b/tests/python/relax/test_tvmscript_parser_op_vision.py @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Optional, Union + +import tvm +import tvm.script +import tvm.testing +from tvm import IRModule, relax +from tvm.script import relax as R + + +def _check( + parsed: Union[relax.Function, IRModule], + expect: Optional[Union[relax.Function, IRModule]], +): + test = parsed.script(show_meta=True) + roundtrip_mod = tvm.script.from_source(test) + tvm.ir.assert_structural_equal(parsed, roundtrip_mod) + if expect: + tvm.ir.assert_structural_equal(parsed, expect) + + +def test_all_class_non_max_suppression(): + @R.function + def foo( + boxes: R.Tensor((10, 5, 4), "int64"), + scores: R.Tensor((10, 8, 5), "float32"), + ) -> R.Tuple(R.Tensor((400, 3), "int64"), R.Tensor((1,), "int64")): + gv: R.Tuple( + R.Tensor((400, 3), "int64"), R.Tensor((1,), "int64") + ) = R.vision.all_class_non_max_suppression( + boxes, + scores, + ) + return gv + + boxes = relax.Var("boxes", R.Tensor((10, 5, 4), "int64")) + scores = relax.Var("scores", R.Tensor((10, 8, 5), "float32")) + + bb = relax.BlockBuilder() + with bb.function("foo", [boxes, scores]): + gv = bb.emit(relax.op.vision.all_class_non_max_suppression(boxes, scores)) + bb.emit_func_output(gv) + + _check(foo, bb.get()["foo"]) + + +if __name__ == "__main__": + tvm.testing.main() From ce6478b83f296ee7410deea266b4def8183f017c Mon Sep 17 00:00:00 2001 From: ysh329 Date: Wed, 30 Jul 2025 09:02:23 +0000 Subject: [PATCH 5/5] fix lint --- python/tvm/relax/frontend/onnx/onnx_frontend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/relax/frontend/onnx/onnx_frontend.py b/python/tvm/relax/frontend/onnx/onnx_frontend.py index fd71c2e87f7f..d5779862a250 100644 --- a/python/tvm/relax/frontend/onnx/onnx_frontend.py +++ b/python/tvm/relax/frontend/onnx/onnx_frontend.py @@ -3108,6 +3108,7 @@ def _impl_v9(cls, bb, inputs, attr, params): relax.op.nonzero(inputs[0]), relax.TensorStructInfo((ndim, nonzero_numbers), "int64") ) + class NonMaxSuppression(OnnxOpConverter): """Converts an onnx NonMaxSuppression node into an equivalent Relax expression."""